Skip to content

Instantly share code, notes, and snippets.

@danijar
Created May 16, 2016 13:11
Show Gist options
  • Save danijar/c255527a1685daa6b568a98f322365f0 to your computer and use it in GitHub Desktop.
Save danijar/c255527a1685daa6b568a98f322365f0 to your computer and use it in GitHub Desktop.
import argparse
import collections
import re
# A token is a maximal run of ASCII letters; digits, punctuation and
# whitespace all act as separators.
TOKEN_REGEX = re.compile(r'[A-Za-z]+')

# Lowercase terms that are discarded during tokenization.
BLACKLIST = {
    'pdf', 'and', 'the', 'proceedings', 'conference', 'ieee', 'for',
    'about', 'details', 'data', 'with', 'arxiv', 'preprint', 'advances',
}
def tokenize(line):
    """Split a line of text into filtered, lowercased word tokens.

    Words are the matches of TOKEN_REGEX in `line`. Matches of one or two
    characters are dropped, the remaining ones are lowercased, and any
    resulting term contained in BLACKLIST is removed.

    Returns the surviving tokens as a list, in order of appearance.
    """
    # Length is checked on the raw match, before lowercasing.
    lowered = (
        match.lower()
        for match in TOKEN_REGEX.findall(line)
        if len(match) > 2)
    return [word for word in lowered if word not in BLACKLIST]
def get_counts(document):
    """Compute the relative frequency of every token in a document.

    `document` is an iterable of text lines; each line is passed through
    tokenize().

    Returns a dict mapping each token to its number of occurrences divided
    by the total number of tokens seen. A document that produces no tokens
    gives an empty dict.
    """
    occurrences = collections.Counter()
    for line in document:
        occurrences.update(tokenize(line))
    # Every token increments exactly one counter entry, so the sum of the
    # counts is the total number of tokens.
    total = sum(occurrences.values())
    return {term: count / total for term, count in occurrences.items()}
def get_scores(document, reference):
    """Score each term of a document against a reference corpus.

    `document` and `reference` are paths to text files. Token frequencies
    are computed for both files with get_counts().

    Returns a dict mapping every term found in the document to its document
    frequency divided by (reference frequency + 1). Terms missing from the
    reference use a reference frequency of 0.
    """
    # The reference file is opened first, then the document.
    with open(reference) as handle:
        baseline = get_counts(handle)
    with open(document) as handle:
        observed = get_counts(handle)
    return {
        term: frequency / (baseline.get(term, 0) + 1)
        for term, frequency in observed.items()
    }
if __name__ == '__main__':
    # Command line: <reference> <document> [-n N]
    parser = argparse.ArgumentParser()
    parser.add_argument('reference', help='path to reference corpus')
    parser.add_argument('document', help='path to input document')
    parser.add_argument('-n', type=int, help='number of terms to print')
    args = parser.parse_args()
    # Rank terms from highest to lowest score.
    ranked = sorted(
        get_scores(args.document, args.reference).items(),
        key=lambda pair: pair[1],
        reverse=True)
    # When -n is omitted args.n is None, so the slice keeps every term.
    rows = [
        '{1:1.5f} {0}'.format(term, score)
        for term, score in ranked[:args.n]]
    # Print in ascending order so the highest-scoring term is the last line.
    rows.reverse()
    print('\n'.join(rows))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment