danijar · May 16, 2016 13:11
diff --git a/script_tfidf.py b/script_tfidf.py
 import argparse
 import collections
 import re

 # A word is a maximal run of ASCII letters; digits and punctuation split words.
 TOKEN_REGEX = re.compile(r'[A-Za-z]+')
 # Lowercase terms that tokenize() discards.
 BLACKLIST = {
     'pdf', 'and', 'the', 'proceedings', 'conference', 'ieee', 'for',
     'about', 'details', 'data', 'with', 'arxiv', 'preprint', 'advances',
 }

 def tokenize(line):
     """Split a line of text into lowercase word tokens.

     Words are the matches of TOKEN_REGEX in `line`. A word is dropped when it
     has two characters or fewer, or when its lowercase form is in BLACKLIST.
     The surviving words are returned lowercased, in order of appearance.
     """
     words = []
     for raw in TOKEN_REGEX.findall(line):
         if len(raw) <= 2:
             continue
         word = raw.lower()
         if word in BLACKLIST:
             continue
         words.append(word)
     return words

 def get_counts(document):
     """Return the relative frequency of every token in a document.

     `document` is an iterable of text lines. Each line is tokenized with
     tokenize(); the result maps each token to its occurrence count divided
     by the total number of tokens seen. A document that yields no tokens
     produces an empty dict.
     """
     occurrences = collections.Counter()
     for line in document:
         occurrences.update(tokenize(line))
     # Every token adds exactly one to the counter, so the sum of the values
     # is the total number of tokens.
     total = sum(occurrences.values())
     return {token: count / total for token, count in occurrences.items()}

 def get_scores(document, reference):
     """Score each term of a document against a reference corpus.

     Both arguments are file paths. The reference file is read first, then
     the document; each is reduced to relative term frequencies by
     get_counts(). Every term found in the document is mapped to its
     document frequency divided by (reference frequency + 1), where a term
     absent from the reference counts as frequency 0.
     """
     with open(reference) as handle:
         baseline = get_counts(handle)
     with open(document) as handle:
         frequencies = get_counts(handle)
     return {
         term: frequency / (baseline.get(term, 0) + 1)
         for term, frequency in frequencies.items()
     }

 if __name__ == '__main__':
     # Command-line entry point: rank the document's terms against the
     # reference corpus and print one "score term" line per term.
     cli = argparse.ArgumentParser()
     cli.add_argument('reference', help='path to reference corpus')
     cli.add_argument('document', help='path to input document')
     cli.add_argument('-n', type=int, help='number of terms to print')
     options = cli.parse_args()
     ranked = sorted(
         get_scores(options.document, options.reference).items(),
         key=lambda item: item[1], reverse=True)
     # Keep the top -n entries (all of them when -n is omitted) and print
     # them lowest score first, so the best term ends up on the last line.
     top = ranked[:options.n]
     lines = ['{1:1.5f} {0}'.format(term, score) for term, score in top]
     print('\n'.join(lines[::-1]))
	import argparse
	import collections
	import re

	# A word is a maximal run of ASCII letters; digits and punctuation split words.
	TOKEN_REGEX = re.compile(r'[A-Za-z]+')
	# Lowercase terms that tokenize() discards.
	BLACKLIST = {
	    'pdf', 'and', 'the', 'proceedings', 'conference', 'ieee', 'for',
	    'about', 'details', 'data', 'with', 'arxiv', 'preprint', 'advances',
	}

	def tokenize(line):
	    """Split a line of text into lowercase word tokens.

	    Words are the matches of TOKEN_REGEX in `line`. A word is dropped when
	    it has two characters or fewer, or when its lowercase form is in
	    BLACKLIST. The surviving words are returned lowercased, in order of
	    appearance.
	    """
	    words = []
	    for raw in TOKEN_REGEX.findall(line):
	        if len(raw) <= 2:
	            continue
	        word = raw.lower()
	        if word in BLACKLIST:
	            continue
	        words.append(word)
	    return words

	def get_counts(document):
	    """Return the relative frequency of every token in a document.

	    `document` is an iterable of text lines. Each line is tokenized with
	    tokenize(); the result maps each token to its occurrence count divided
	    by the total number of tokens seen. A document that yields no tokens
	    produces an empty dict.
	    """
	    occurrences = collections.Counter()
	    for line in document:
	        occurrences.update(tokenize(line))
	    # Every token adds exactly one to the counter, so the sum of the
	    # values is the total number of tokens.
	    total = sum(occurrences.values())
	    return {token: count / total for token, count in occurrences.items()}

	def get_scores(document, reference):
	    """Score each term of a document against a reference corpus.

	    Both arguments are file paths. The reference file is read first, then
	    the document; each is reduced to relative term frequencies by
	    get_counts(). Every term found in the document is mapped to its
	    document frequency divided by (reference frequency + 1), where a term
	    absent from the reference counts as frequency 0.
	    """
	    with open(reference) as handle:
	        baseline = get_counts(handle)
	    with open(document) as handle:
	        frequencies = get_counts(handle)
	    return {
	        term: frequency / (baseline.get(term, 0) + 1)
	        for term, frequency in frequencies.items()
	    }

	if __name__ == '__main__':
	    # Command-line entry point: rank the document's terms against the
	    # reference corpus and print one "score term" line per term.
	    cli = argparse.ArgumentParser()
	    cli.add_argument('reference', help='path to reference corpus')
	    cli.add_argument('document', help='path to input document')
	    cli.add_argument('-n', type=int, help='number of terms to print')
	    options = cli.parse_args()
	    ranked = sorted(
	        get_scores(options.document, options.reference).items(),
	        key=lambda item: item[1], reverse=True)
	    # Keep the top -n entries (all of them when -n is omitted) and print
	    # them lowest score first, so the best term ends up on the last line.
	    top = ranked[:options.n]
	    lines = ['{1:1.5f} {0}'.format(term, score) for term, score in top]
	    print('\n'.join(lines[::-1]))