Created
May 16, 2016 13:11
-
-
Save danijar/c255527a1685daa6b568a98f322365f0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import collections
import re
# A token is a maximal run of ASCII letters; digits, punctuation and any
# non-ASCII character act as separators.
TOKEN_REGEX = re.compile(r'[A-Za-z]+')

# Lowercased terms that are dropped from every tokenized line.
BLACKLIST = {
    'pdf', 'and', 'the', 'proceedings', 'conference', 'ieee', 'for',
    'about', 'details', 'data', 'with', 'arxiv', 'preprint', 'advances',
}
def tokenize(line):
    """Split a line of text into normalized tokens.

    Args:
        line: The string to tokenize.

    Returns:
        List of lowercased matches of TOKEN_REGEX, in order of appearance,
        keeping only those longer than two characters that are not in
        BLACKLIST.
    """
    lowered = (word.lower() for word in TOKEN_REGEX.findall(line))
    return [
        word for word in lowered
        if len(word) > 2 and word not in BLACKLIST
    ]
def get_counts(document):
    """Compute the relative frequency of every token in a document.

    Args:
        document: Iterable of text lines, e.g. an open text file.

    Returns:
        Dict mapping each token to its number of occurrences divided by
        the total number of tokens returned by tokenize() over all lines.
        The dict is empty when no line yields a token.
    """
    counter = collections.Counter()
    for line in document:
        counter.update(tokenize(line))
    # Each token increments exactly one counter entry, so the sum of the
    # entries is the total number of tokens seen.
    overall = sum(counter.values())
    # With no tokens the comprehension iterates nothing, so the division
    # by zero is never evaluated.
    return {term: count / overall for term, count in counter.items()}
def get_scores(document, reference):
    """Score the terms of a document against a reference corpus.

    Args:
        document: Path of the text file whose terms are scored.
        reference: Path of the text file used as the reference corpus.

    Returns:
        Dict mapping every term of the document to
        ``frequency / (base + 1)``, where ``frequency`` is the term's
        relative frequency in the document and ``base`` is its relative
        frequency in the reference (0 when the term does not occur there).
    """
    # The reference is read first, then the document.
    with open(reference) as lines:
        reference_counts = get_counts(lines)
    with open(document) as lines:
        document_counts = get_counts(lines)
    return {
        term: frequency / (reference_counts.get(term, 0) + 1)
        for term, frequency in document_counts.items()
    }
if __name__ == '__main__':
    # Command line: <reference> <document> [-n N]
    parser = argparse.ArgumentParser()
    parser.add_argument('reference', help='path to reference corpus')
    parser.add_argument('document', help='path to input document')
    parser.add_argument('-n', type=int, help='number of terms to print')
    args = parser.parse_args()
    scores = get_scores(args.document, args.reference)
    # Sort by score, highest first, so the slice keeps the top ``n`` terms.
    # Without ``-n`` the value is None and the slice keeps every term.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best = ['{1:1.5f} {0}'.format(*item) for item in ranked[:args.n]]
    # Printed in ascending order so the best terms are on the last lines.
    print('\n'.join(reversed(best)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment