tfidf_explaine
import string
import math

tokenize = lambda doc: doc.lower().split(" ")

document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin was found to be riding a horse, again, without a shirt on while hunting deer. Vladimir Putin always seems so serious about things - even riding horses."

all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

tokenized_documents = [tokenize(d) for d in all_documents]  # tokenized docs
all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
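# Added note (not part of the original gist): the whitespace tokenizer above keeps punctuation
# attached to words, so "Economy." becomes the token 'economy.' and will not match 'economy'
# from another document. That is fine for a demo, but it explains some near-misses in the
# similarity scores below.
# print(tokenize("the US Economy."))  # ['the', 'us', 'economy.']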
def jaccard_similarity(query, document):
    intersection = set(document).intersection(set(query))
    union = set(document).union(set(query))
    return len(intersection) / len(union)
# print(jaccard_similarity(tokenized_documents[2], tokenized_documents[4]))
# Problems with this approach:
# 1. Document length influences the score.
# print(set(tokenized_documents[2]))
# print(set(tokenized_documents[4]))
# print(set(tokenized_documents[2]).intersection(set(tokenized_documents[4])))
# 2. Common words affect the score.
# print(jaccard_similarity(tokenized_documents[1], tokenized_documents[6]))
# print(set(tokenized_documents[1]).intersection(set(tokenized_documents[6])))
# -> {'about', 'seems', 'serious'}
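# Small added illustration (not in the original gist) of problem 1: with Jaccard similarity a
# short query is penalised against a long document even when every query word is present,
# because the longer document inflates the union. The toy token lists below are made up.
# short_query = ["putin", "economy"]
# long_document = ["putin", "economy", "ruble", "sanctions", "oil", "exports", "inflation", "growth"]
# print(jaccard_similarity(short_query, long_document))   # 2/8 = 0.25, despite every query word matching
# print(jaccard_similarity(short_query, short_query))     # 1.0 for an identical short pair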
def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

# test functions
# print(term_frequency('china', tokenized_documents[0]))
# works only with lowercase terms, since the documents were lowercased during tokenization
def sublinear_term_frequency(term, tokenized_document):
    count = tokenized_document.count(term)
    # 0 for absent terms (not 1), otherwise the standard 1 + log(count) damping
    return 0 if count == 0 else 1 + math.log(count)
def augmented_term_frequency(term, tokenized_document):
    max_count = max(term_frequency(t, tokenized_document) for t in tokenized_document)
    return 0.5 + (0.5 * term_frequency(term, tokenized_document)) / max_count
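# Added illustration (not in the original gist): compare the three tf variants on a term that
# repeats within a document. 'vladimir' appears twice in document_6, so raw tf is 2, sublinear tf
# is damped to 1 + log(2), and augmented tf is normalised by the count of the most frequent term
# in that document.
# print(term_frequency('vladimir', tokenized_documents[6]))
# print(sublinear_term_frequency('vladimir', tokenized_documents[6]))
# print(augmented_term_frequency('vladimir', tokenized_documents[6]))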
def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for token in all_tokens_set:
        contains_token = map(lambda doc: token in doc, tokenized_documents)
        idf_values[token] = 1 + math.log(len(tokenized_documents) / sum(contains_token))
    return idf_values

idf_values = inverse_document_frequencies(tokenized_documents)
print(idf_values['abenomics?'])

# token = 'china'
# print(list(map(lambda doc: token in doc, tokenized_documents)))
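# Added illustration (not in the original gist): a frequent token such as 'the' appears in most
# of the documents, so its idf is comparatively low, while a token that occurs in only one
# document (like 'abenomics?') gets the maximum idf of 1 + log(7).
# print(idf_values['the'])          # low: 'the' occurs in several documents
# print(idf_values['abenomics?'])   # high: occurs in a single document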
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
tfidf_representation = tfidf(all_documents)
print(tfidf_representation)
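# Added illustration (not in the original gist): pair each vocabulary term with its weight in
# document_0 and show the highest-weighted tokens. This assumes idf_values (computed above) and
# the idf built inside tfidf() iterate their keys in the same order, which should hold here
# within a single run because both are constructed from the same tokenized documents.
# doc0_weights = dict(zip(idf_values.keys(), tfidf_representation[0]))
# print(sorted(doc0_weights.items(), key=lambda kv: kv[1], reverse=True)[:5])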
# def get_count_term(documents, idf):
#     for document in documents:
#         for term in idf.keys():
#             print((term, term_frequency(term, document)))
#
# get_count_term(all_documents, idf_values)
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=1 keeps every term (recent scikit-learn versions reject min_df=0 as an integer), and
# token_pattern=None silences the warning about the unused default pattern when a custom
# tokenizer is supplied.
sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False,
                                sublinear_tf=True, tokenizer=tokenize, token_pattern=None)

sklearn_representation = sklearn_tfidf.fit_transform(all_documents)

# print(tfidf_representation[0])
# print(sklearn_representation.toarray()[0].tolist())
# print(document_0)
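# Added sanity check (not in the original gist): both representations should have one weight per
# vocabulary token, and since both use the same whitespace tokenizer the vocabulary sizes should
# match.
# print(len(tfidf_representation[0]), len(all_tokens_set), sklearn_representation.shape)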
def cosine_similarity(vector1, vector2):
    dot_product = sum(p * q for p, q in zip(vector1, vector2))
    magnitude = math.sqrt(sum(p * p for p in vector1)) * math.sqrt(sum(q * q for q in vector2))
    if not magnitude:
        return 0
    return dot_product / magnitude
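# Added sanity check (not in the original gist): a document compared with itself should score ~1.0.
# print(cosine_similarity(tfidf_representation[0], tfidf_representation[0]))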
our_tfidf_comparisons = []
for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

skl_tfidf_comparisons = []
for count_0, doc_0 in enumerate(sklearn_representation.toarray()):
    for count_1, doc_1 in enumerate(sklearn_representation.toarray()):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

for x in zip(sorted(our_tfidf_comparisons, reverse=True), sorted(skl_tfidf_comparisons, reverse=True)):
    print(x)
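# Added illustration (not in the original gist): the top of each ranking is dominated by the
# trivial self-comparisons (document i against itself). Dropping those makes it easier to see
# whether the hand-rolled weights and scikit-learn broadly agree on the closest document pairs.
# print([t for t in sorted(our_tfidf_comparisons, reverse=True) if t[1] != t[2]][:5])
# print([t for t in sorted(skl_tfidf_comparisons, reverse=True) if t[1] != t[2]][:5])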