@bllchmbrs
Last active December 29, 2021 14:10
TF-IDF Explained in Python Along with a Scikit-Learn Implementation
from __future__ import division  # Python 2 compatibility: make / true division
import math

# Naive whitespace tokenizer: lowercase the text and split on single spaces.
# Note that punctuation stays attached to tokens (e.g. "economy." != "economy").
tokenize = lambda doc: doc.lower().split(" ")
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"
all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]
def jaccard_similarity(query, document):
    """Ratio of shared tokens to total distinct tokens across both texts."""
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection) / len(union)
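# Illustrative sanity check (not part of the original gist): Jaccard similarity
# is |intersection| / |union| of the two token sets, so a document compared
# with itself scores 1.0 and documents sharing no tokens score 0.0.
assert jaccard_similarity(tokenize(document_0), tokenize(document_0)) == 1.0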
def term_frequency(term, tokenized_document):
    """Raw count of the term in the document."""
    return tokenized_document.count(term)

def sublinear_term_frequency(term, tokenized_document):
    """Logarithmically damped count; 0 when the term is absent."""
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return 1 + math.log(count)

def augmented_term_frequency(term, tokenized_document):
    """Count normalized by the document's most frequent term."""
    max_count = max(term_frequency(t, tokenized_document) for t in tokenized_document)
    return 0.5 + (0.5 * term_frequency(term, tokenized_document)) / max_count
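# Illustrative comparison of the three TF variants (not in the original gist):
# "putin" appears twice in document_6, so the raw count is 2, sublinear TF
# damps it to 1 + ln(2) ~= 1.69, and augmented TF scales it against the
# document's most frequent term to limit the bias toward long documents.
sample_doc = tokenize(document_6)
print(term_frequency("putin", sample_doc))            # 2
print(sublinear_term_frequency("putin", sample_doc))  # 1 + ln(2) ~= 1.693
print(augmented_term_frequency("putin", sample_doc))  # always in [0.5, 1.0]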
def inverse_document_frequencies(tokenized_documents):
    """Non-smoothed IDF: 1 + log(N / document_frequency) for every token."""
    idf_values = {}
    all_tokens_set = set(item for sublist in tokenized_documents for item in sublist)
    for tkn in all_tokens_set:
        contains_token = sum(1 for doc in tokenized_documents if tkn in doc)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents) / contains_token)
    return idf_values
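# Worked IDF example (not in the original gist): "putin" occurs in 2 of the 7
# documents, so its weight is 1 + ln(7/2) ~= 2.25; a token appearing in every
# document would bottom out at 1 + ln(1) = 1 under this non-smoothed formula.
idf_check = inverse_document_frequencies([tokenize(d) for d in all_documents])
print(idf_check["putin"])  # ~2.2528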
def tfidf(documents):
    """Build one dense TF-IDF vector per document over the full vocabulary."""
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
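# Quick shape check (not in the original gist): tfidf() returns one dense
# vector per document, each with a weight for every vocabulary term.
vectors = tfidf(all_documents)
print(len(vectors), len(vectors[0]))  # 7 documents x vocabulary size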
# The same weighting scheme in scikit-learn: sublinear TF, non-smoothed IDF,
# and our whitespace tokenizer. Note sklearn additionally L2-normalizes rows.
from sklearn.feature_extraction.text import TfidfVectorizer

sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False,
                                sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)
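# Inspecting the fitted vectorizer (illustrative, not in the original gist):
# the sparse matrix has one row per document and one column per vocabulary
# term, and vocabulary_ maps each token to its column index.
print(sklearn_representation.shape)
print(sklearn_tfidf.vocabulary_["putin"])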
########### END BLOG POST 1 #############
def cosine_similarity(vector1, vector2):
    """Dot product over the product of magnitudes; 0 for a zero vector."""
    dot_product = sum(p * q for p, q in zip(vector1, vector2))
    magnitude = math.sqrt(sum(val ** 2 for val in vector1)) * math.sqrt(sum(val ** 2 for val in vector2))
    if not magnitude:
        return 0
    return dot_product / magnitude
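# Cosine similarity is scale invariant (illustrative check, not in the original
# gist): parallel vectors score 1.0 regardless of magnitude, which is why our
# unnormalized vectors can be ranked against sklearn's L2-normalized rows below.
print(cosine_similarity([1, 2, 3], [2, 4, 6]))  # 1.0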
tfidf_representation = tfidf(all_documents)

# Compare every pair of documents under both implementations.
our_tfidf_comparisons = []
for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

skl_tfidf_comparisons = []
sklearn_dense = sklearn_representation.toarray()  # materialize once, not on every loop pass
for count_0, doc_0 in enumerate(sklearn_dense):
    for count_1, doc_1 in enumerate(sklearn_dense):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

# The two rankings should agree pair for pair.
for x in zip(sorted(our_tfidf_comparisons, reverse=True), sorted(skl_tfidf_comparisons, reverse=True)):
    print(x)
@BurakaKrishna

Hi, I was trying the code in Python 3 and found that sublinear_term_frequency cannot handle a term whose count is zero in a document when building tfidf_representation. Is this expected?
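A minimal sketch of a guard for that case (math.log(0) raises ValueError on both Python 2 and 3); the listing above already includes it, and returning 0 for an absent term keeps its TF-IDF weight at zero:

def sublinear_term_frequency(term, tokenized_document):
    count = tokenized_document.count(term)
    if count == 0:
        return 0  # avoid math.log(0); an absent term contributes no weight
    return 1 + math.log(count)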

@irfanandratama

Hi, I was trying to adapt your code to take a user's query as input rather than a document, but I am having trouble with the IDF step. Could you make something like that? Anyway, thanks for sharing the code.
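One way to approach this, sketched with hypothetical names (rank_documents is not part of the gist): learn the IDF values from the corpus only, then build the query vector over the same term ordering as the document vectors so cosine similarity can rank the documents against the query.

def rank_documents(query, documents):
    # Learn the vocabulary and IDF weights from the corpus only.
    tokenized = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized)
    terms = list(idf.keys())  # fix one term ordering for all vectors
    q_tokens = tokenize(query)
    q_vec = [sublinear_term_frequency(t, q_tokens) * idf[t] for t in terms]
    doc_vecs = [[sublinear_term_frequency(t, d) * idf[t] for t in terms]
                for d in tokenized]
    # Highest-scoring documents first: (score, document_index) pairs.
    return sorted(((cosine_similarity(q_vec, dv), i)
                   for i, dv in enumerate(doc_vecs)), reverse=True)

print(rank_documents("russian economy and the ruble", all_documents)[:3])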
