Created
May 19, 2016 17:49
-
-
Save y3nr1ng/75fd0f04abd40f169f5ef01baa0e7b77 to your computer and use it in GitHub Desktop.
Task 2 - Voting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os, sys, argparse, logging | |
# Tab-separated reference file: "<sentence id>\t<emoticon id>\t<text>".
baseline = '/tmp2/b03902036/train-punc.pro'
# Prediction file to score: a header line, then "<sentence id>,<cand> <cand> ...".
result = '/tmp2/b03902036/result.cmb'

# Score granted when the true emoticon is the 1st, 2nd or 3rd candidate.
eval_weight = [1, 0.5, 0.333]


def load_ground_truth(path):
    """Read the reference file and return a ``{sentence id: emoticon id}`` dict.

    Every non-blank line must hold a sentence id, a tab and an emoticon id,
    optionally followed by another tab and the text.  Lines whose text column
    is missing are reported on stdout but their emoticon id is still
    recorded.

    Raises:
        ValueError: if a line has no tab at all or an id is not an integer.
    """
    truth = dict()
    with open(path, 'r') as in_file:
        for line in in_file:
            line = line.strip()
            if not line:
                continue
            sid, content = line.split('\t', maxsplit=1)
            sid = int(sid)
            # partition() never fails, so the emoticon id always comes from
            # the current line even when the text column is absent.
            emot, sep, _ = content.strip().partition('\t')
            if not sep:
                print('{:d} has value error'.format(sid))
            truth[sid] = int(emot)
    return truth


def score_results(path, ground_truth, weights=None):
    """Score a prediction file against ``ground_truth``.

    The first line of the file is treated as a header and skipped.  For each
    remaining row the candidates are compared, in order, with the true
    emoticon; the first hit adds the weight of its rank to the score.  Only
    as many candidates as there are weights are considered, and a row with
    fewer candidates is simply a miss on the absent ranks.

    Args:
        path: prediction file to read.
        ground_truth: mapping returned by ``load_ground_truth``.
        weights: score per candidate rank; defaults to ``eval_weight``.

    Returns:
        A ``(total_score, total_trial)`` tuple, where ``total_trial`` is the
        number of scored rows.

    Raises:
        KeyError: if a row refers to a sentence id missing from
            ``ground_truth``.
    """
    if weights is None:
        weights = eval_weight
    total_score = 0
    total_trial = 0
    with open(path, 'r') as in_file:
        # skip the header; the default keeps an empty file from raising
        next(in_file, None)
        for line in in_file:
            line = line.strip()
            if not line:
                continue
            sid, emot = line.split(',', maxsplit=1)
            truth = ground_truth[int(sid)]
            emot_cand = emot.split()
            for weight, cand in zip(weights, emot_cand):
                if int(cand) == truth:
                    print('{:d} -> {:s}'.format(truth, str(emot_cand)))
                    total_score += weight
                    break
            total_trial += 1
    return total_score, total_trial


if __name__ == '__main__':
    ground_truth = load_ground_truth(baseline)
    total_score, total_trial = score_results(result, ground_truth)
    if total_trial:
        accuracy = total_score / total_trial
        print('{:f} / {:f} = {:f}'.format(total_score, total_trial, accuracy))
        print('accuracy = {:f}'.format(accuracy))
    else:
        # nothing was scored, so there is no ratio to report
        print('no prediction found in {:s}'.format(result))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os, sys, argparse, logging | |
# search for files | |
import glob | |
# Size of each per-sentence vote table; emoticon id e is stored at index e-1.
TOTAL_EMOTICON_TYPES = 40
# Number of best-voted emoticons kept for every sentence.
N_TOP = 3

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s')


def load_file(file_path, sent_dict, weight=(1, 1, 1), logger=None):
    """Accumulate the votes of one answer file into ``sent_dict``.

    The first line of the file is treated as a header and skipped.  Every
    other non-blank line looks like ``<sentence id>,<emot> <emot> ...``; the
    i-th emoticon of a line receives ``weight[i]`` votes, and emoticons
    beyond ``len(weight)`` are ignored.

    Args:
        file_path: path of the answer file to read.
        sent_dict: mapping ``sentence id -> list of TOTAL_EMOTICON_TYPES vote
            totals``; updated in place, new sentences start at all zeros.
        weight: votes granted per candidate rank.
        logger: optional logger receiving the running totals at DEBUG level.

    Returns:
        The same ``sent_dict`` object that was passed in.

    Raises:
        IndexError: if a counted emoticon id is outside
            ``1..TOTAL_EMOTICON_TYPES``.
        ValueError: if a line has no comma or holds a non-integer field.
    """
    with open(file_path, 'r') as in_file:
        # skip the header; the default keeps an empty file from raising
        next(in_file, None)
        for line in in_file:
            line = line.strip()
            if not line:
                continue
            sid, emot = line.split(',', maxsplit=1)
            candidates = [int(x) for x in emot.split()]
            sid = int(sid)
            if sid not in sent_dict:
                sent_dict[sid] = [0] * TOTAL_EMOTICON_TYPES
            votes = sent_dict[sid]
            for w, e in zip(weight, candidates):
                # Without this check an id <= 0 would wrap around through
                # negative indexing and credit the wrong emoticon.
                if not 1 <= e <= TOTAL_EMOTICON_TYPES:
                    raise IndexError(
                        'emoticon id {:d} of sentence {:d} is out of range'
                        .format(e, sid))
                votes[e - 1] += w
            if logger:
                logger.debug('%d %s', sid, votes)
    return sent_dict
def get_args():
    """Parse the command line arguments of the voting script.

    Returns:
        An ``argparse.Namespace`` with:
        ``verbose`` - number of ``-v`` flags given (0 by default),
        ``out_dir`` - directory passed with ``-o`` (default
        ``/tmp2/b03902036``),
        ``ans_dir`` - list of one or more answer directories.
    """
    # The description and the --outdir help used to describe a doc2vec
    # training script; they now describe what this script does.
    parser = argparse.ArgumentParser(
        description='Combine the answer files of several runs by voting.')
    parser.add_argument('--verbose', '-v', dest='verbose',
                        action='count', default=0,
                        help='control the display level of output logs')
    parser.add_argument('--outdir', '-o', dest='out_dir',
                        default='/tmp2/b03902036',
                        help='destination directory for the combined result file')
    parser.add_argument('ans_dir', nargs='+',
                        help='The directory that stores the answer files')
    return parser.parse_args()
if __name__ == '__main__':
    # parse the command line arguments
    args = get_args()

    # get the logger object
    logger = logging.getLogger()

    # set the log level: -vv -> DEBUG, -v -> INFO, otherwise WARNING
    if args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    elif args.verbose >= 1:
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARNING)

    # only the first directory is used; extra ones are reported and ignored
    if len(args.ans_dir) > 1:
        logger.warning('currently only 1 directory is supported')
    ans_dir = args.ans_dir[0]

    logger.info('scanning in %s', ans_dir)
    # glob returns matches in arbitrary order; sorting keeps the row order of
    # the output file reproducible between runs
    file_list = sorted(
        glob.glob(os.path.join(ans_dir, '**/*.ans'), recursive=True))
    if not file_list:
        # otherwise the script would silently write a header-only result
        logger.warning('no .ans file found in %s', ans_dir)

    sent_dict = dict()
    # traverse all the .ans file
    for file_path in file_list:
        logger.info('processing "%s"...', file_path)
        sent_dict = load_file(file_path, sent_dict, logger=logger)

    # find the top N emoticons
    logger.info('voting in progress')
    top_dict = dict()
    for key, votes in sent_dict.items():
        # Best-voted first; among equal vote counts the higher emoticon id
        # wins, which is the order the original ascending stable sort gave.
        ranked = sorted(range(1, len(votes) + 1),
                        key=lambda i: (votes[i - 1], i), reverse=True)
        top_dict[key] = ranked[:N_TOP]
        logger.debug('%d %s', key, top_dict[key])

    # create the destination up front so the voting work is not lost on a
    # missing directory
    os.makedirs(args.out_dir, exist_ok=True)
    new_filepath = os.path.join(args.out_dir, 'result.cmb')
    with open(new_filepath, 'w') as out_file:
        out_file.write('Id,Emoticon\n')
        for key, value in top_dict.items():
            prediction = ' '.join([str(x) for x in value])
            out_file.write('{:d},{:s}\n'.format(key, prediction))
    logger.info('saved to %s', new_filepath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment