Created
June 16, 2020 14:24
-
-
Save hughdbrown/c7ec42c074b679b76e75fd22fd8ae77d to your computer and use it in GitHub Desktop.
TF-IDF sample code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import log | |
from collections import Counter | |
class TFIDF(object):
    """Corpus-level term-frequency / inverse-document-frequency scorer.

    Each document in *corpus* is tokenised by splitting on whitespace.
    Term frequency is computed over the whole corpus (all documents pooled),
    not per document.

    Attributes:
        corpus: the sequence of document strings passed in.
        ndocs: number of documents in the corpus.
        documents: one ``Counter`` of token counts per document.
        words: total number of tokens across all documents.
        vocab: set of distinct tokens across all documents.
        term_freq: ``Counter`` of token counts pooled over all documents.
    """

    def __init__(self, corpus):
        """Tokenise every document in *corpus* and pre-compute the counts."""
        self.corpus = corpus
        self.ndocs = len(corpus)
        self.documents = [Counter(doc.split()) for doc in corpus]
        self.words = sum(sum(doc.values()) for doc in self.documents)
        # Built without ``reduce``: it is not a builtin on Python 3 and this
        # module never imports it from ``functools``.
        self.vocab = set().union(*self.documents)
        self.term_freq = Counter()
        for doc in self.documents:
            self.term_freq.update(doc)

    def tf(self, word):
        """Return the share of all corpus tokens that are *word*.

        Returns 0.0 when the corpus contains no tokens at all.
        """
        if not self.words:
            return 0.0
        return self.term_freq.get(word, 0) / float(self.words)

    def n_containing(self, word):
        """Return the number of documents in which *word* occurs."""
        return sum(1 for doc in self.documents if word in doc)

    def idf(self, word):
        """Return ``log(words / (1 + n_containing(word)))``.

        Raises:
            ValueError: if the corpus contains no tokens (``log(0)``).
        """
        # NOTE(review): the numerator is the total token count, not
        # ``self.ndocs`` as in the textbook IDF definition. Kept as-is
        # because the single-document scoring in this file relies on it.
        return log(self.words / (1.0 + self.n_containing(word)))

    def tfidf(self, word):
        """Return ``tf(word) * idf(word)``; 0.0 for a word never seen."""
        term_frequency = self.tf(word)
        if term_frequency == 0:
            # Same value the product would give, but also safe for an
            # empty corpus where idf() cannot be evaluated.
            return 0.0
        return term_frequency * self.idf(word)
# Sample corpus: three short passages that each use "Python" in a different
# sense (a TV film, a snake genus, a revolver).
document1 = """Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
California and Malibu, California. Python was followed by two sequels: Python
II (2002) and Boa vs. Python (2004), both also made-for-TV films."""

document2 = """Python, from the Greek word (πύθων/πύθωνας), is a genus of
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are
recognised.[2] A member of this genus, P. reticulatus, is among the longest
snakes known."""

document3 = """The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum".[1] It was first introduced
in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made."""
def unpunct(doc):
    """Return *doc* with every ``string.punctuation`` character removed."""
    from string import punctuation

    # Keep only the characters that are not punctuation, in original order.
    return "".join(ch for ch in doc if ch not in punctuation)
def prepare(doc):
    """Normalise *doc* for scoring: lower-case it, then strip punctuation."""
    lowered = doc.lower()
    return unpunct(lowered)
def _print_top_words(tfidf, heading, limit=3):
    """Print *heading*, then the *limit* highest-scoring words of *tfidf*."""
    print(heading)
    scores = {word: tfidf.tfidf(word) for word in tfidf.vocab}
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for word, score in ranked[:limit]:
        print("Word: {} TF-IDF: {}".format(word, round(score, 5)))


def main():
    """Print the top three TF-IDF words per sample document and overall.

    Each document is first scored on its own (a one-document corpus), then
    all three are scored together as a single corpus.
    """
    doclist = [document1, document2, document3]
    separator = '-' * 30

    for i, doc in enumerate(doclist, start=1):
        print(separator)
        _print_top_words(
            TFIDF([prepare(doc)]),
            "Top words in document {}".format(i),
        )

    print(separator)
    _print_top_words(
        TFIDF([prepare(doc) for doc in doclist]),
        "Top words in corpus",
    )


# Run the demo only when executed as a script; importing stays side-effect free.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment