tfidf_explaine
import string
import math

tokenize = lambda doc: doc.lower().split(" ")

document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin was found to be riding a horse, again, without a shirt on while hunting deer. Vladimir Putin always seems so serious about things - even riding horses."

all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

tokenized_documents = [tokenize(d) for d in all_documents]  # tokenized docs
all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
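# Added note (not part of the original gist): the whitespace tokenizer above keeps punctuation
# attached to words, so "Economy." becomes the token 'economy.' and will not match 'economy'
# from another document. That is fine for a demo, but it explains some near-misses in the
# similarity scores below.
# print(tokenize("the US Economy."))  # ['the', 'us', 'economy.']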
def jaccard_similarity(query, document):
    intersection = set(document).intersection(set(query))
    union = set(document).union(set(query))
    return len(intersection) / len(union)
# print(jaccard_similarity(tokenized_documents[2], tokenized_documents[4]))
# Problems with this approach:
# 1. Document length influences the score.
# print(set(tokenized_documents[2]))
# print(set(tokenized_documents[4]))
# print(set(tokenized_documents[2]).intersection(set(tokenized_documents[4])))
# 2. Common words affect the score.
# print(jaccard_similarity(tokenized_documents[1], tokenized_documents[6]))
# print(set(tokenized_documents[1]).intersection(set(tokenized_documents[6])))
# -> {'about', 'seems', 'serious'}
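# Small added illustration (not in the original gist) of problem 1: with Jaccard similarity a
# short query is penalised against a long document even when every query word is present,
# because the longer document inflates the union. The toy token lists below are made up.
# short_query = ["putin", "economy"]
# long_document = ["putin", "economy", "ruble", "sanctions", "oil", "exports", "inflation", "growth"]
# print(jaccard_similarity(short_query, long_document))   # 2/8 = 0.25, despite every query word matching
# print(jaccard_similarity(short_query, short_query))     # 1.0 for an identical short pair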
def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

# test functions
# print(term_frequency('china', tokenized_documents[0]))
# works only with lowercase terms, since the documents were lowercased during tokenization
def sublinear_term_frequency(term, tokenized_document):
    count = tokenized_document.count(term)
    # 0 for absent terms (not 1), otherwise the standard 1 + log(count) damping
    return 0 if count == 0 else 1 + math.log(count)
def augmented_term_frequency(term, tokenized_document):
    max_count = max(term_frequency(t, tokenized_document) for t in tokenized_document)
    return 0.5 + (0.5 * term_frequency(term, tokenized_document)) / max_count
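# Added illustration (not in the original gist): compare the three tf variants on a term that
# repeats within a document. 'vladimir' appears twice in document_6, so raw tf is 2, sublinear tf
# is damped to 1 + log(2), and augmented tf is normalised by the count of the most frequent term
# in that document.
# print(term_frequency('vladimir', tokenized_documents[6]))
# print(sublinear_term_frequency('vladimir', tokenized_documents[6]))
# print(augmented_term_frequency('vladimir', tokenized_documents[6]))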
def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for token in all_tokens_set:
        contains_token = map(lambda doc: token in doc, tokenized_documents)
        idf_values[token] = 1 + math.log(len(tokenized_documents) / sum(contains_token))
    return idf_values

idf_values = inverse_document_frequencies(tokenized_documents)
print(idf_values['abenomics?'])

# token = 'china'
# print(list(map(lambda doc: token in doc, tokenized_documents)))
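# Added illustration (not in the original gist): a frequent token such as 'the' appears in most
# of the documents, so its idf is comparatively low, while a token that occurs in only one
# document (like 'abenomics?') gets the maximum idf of 1 + log(7).
# print(idf_values['the'])          # low: 'the' occurs in several documents
# print(idf_values['abenomics?'])   # high: occurs in a single document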
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
tfidf_representation = tfidf(all_documents)
print(tfidf_representation)
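# Added illustration (not in the original gist): pair each vocabulary term with its weight in
# document_0 and show the highest-weighted tokens. This assumes idf_values (computed above) and
# the idf built inside tfidf() iterate their keys in the same order, which should hold here
# within a single run because both are constructed from the same tokenized documents.
# doc0_weights = dict(zip(idf_values.keys(), tfidf_representation[0]))
# print(sorted(doc0_weights.items(), key=lambda kv: kv[1], reverse=True)[:5])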
# def get_count_term(documents, idf):
#     for document in documents:
#         for term in idf.keys():
#             print((term, term_frequency(term, document)))
#
# get_count_term(all_documents, idf_values)
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=1 keeps every term (recent scikit-learn versions reject min_df=0 as an integer), and
# token_pattern=None silences the warning about the unused default pattern when a custom
# tokenizer is supplied.
sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False,
                                sublinear_tf=True, tokenizer=tokenize, token_pattern=None)

sklearn_representation = sklearn_tfidf.fit_transform(all_documents)

# print(tfidf_representation[0])
# print(sklearn_representation.toarray()[0].tolist())
# print(document_0)
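# Added sanity check (not in the original gist): both representations should have one weight per
# vocabulary token, and since both use the same whitespace tokenizer the vocabulary sizes should
# match.
# print(len(tfidf_representation[0]), len(all_tokens_set), sklearn_representation.shape)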
def cosine_similarity(vector1, vector2):
    dot_product = sum(p * q for p, q in zip(vector1, vector2))
    magnitude = math.sqrt(sum(p * p for p in vector1)) * math.sqrt(sum(q * q for q in vector2))
    if not magnitude:
        return 0
    return dot_product / magnitude
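# Added sanity check (not in the original gist): a document compared with itself should score ~1.0.
# print(cosine_similarity(tfidf_representation[0], tfidf_representation[0]))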
our_tfidf_comparisons = []
for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

skl_tfidf_comparisons = []
for count_0, doc_0 in enumerate(sklearn_representation.toarray()):
    for count_1, doc_1 in enumerate(sklearn_representation.toarray()):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

for x in zip(sorted(our_tfidf_comparisons, reverse=True), sorted(skl_tfidf_comparisons, reverse=True)):
    print(x)
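# Added illustration (not in the original gist): the top of each ranking is dominated by the
# trivial self-comparisons (document i against itself). Dropping those makes it easier to see
# whether the hand-rolled weights and scikit-learn broadly agree on the closest document pairs.
# print([t for t in sorted(our_tfidf_comparisons, reverse=True) if t[1] != t[2]][:5])
# print([t for t in sorted(skl_tfidf_comparisons, reverse=True) if t[1] != t[2]][:5])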