from __future__ import division
import math

# simple whitespace tokenizer: lowercase, then split on single spaces
tokenize = lambda doc: doc.lower().split(" ")

document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"

all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

def jaccard_similarity(query, document):
    # size of the token-set intersection divided by the size of the union
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

def sublinear_term_frequency(term, tokenized_document):
    # 1 + log(count), with a guard so a zero count returns 0 instead of raising
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return 1 + math.log(count)

def augmented_term_frequency(term, tokenized_document):
    max_count = max(term_frequency(t, tokenized_document) for t in tokenized_document)
    return 0.5 + (0.5 * term_frequency(term, tokenized_document)) / max_count

def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        # number of documents that contain the token (works in Python 2 and 3)
        contains_token = sum(1 for doc in tokenized_documents if tkn in doc)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/contains_token)
    return idf_values

def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

# In scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False,
                                sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)

########### END BLOG POST 1 ############# | |
def cosine_similarity(vector1, vector2):
    dot_product = sum(p*q for p, q in zip(vector1, vector2))
    magnitude = math.sqrt(sum(val**2 for val in vector1)) * math.sqrt(sum(val**2 for val in vector2))
    if not magnitude:
        return 0
    return dot_product/magnitude

tfidf_representation = tfidf(all_documents)

our_tfidf_comparisons = []
for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

skl_tfidf_comparisons = []
sklearn_dense = sklearn_representation.toarray()  # densify once, not inside the loops
for count_0, doc_0 in enumerate(sklearn_dense):
    for count_1, doc_1 in enumerate(sklearn_dense):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

for x in zip(sorted(our_tfidf_comparisons, reverse=True), sorted(skl_tfidf_comparisons, reverse=True)):
    print(x)
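
A small follow-up sketch, purely illustrative: each tuple sorts on the similarity score first, so the most similar pair of distinct documents under our representation can be pulled out in one line.

top_pair = max(t for t in our_tfidf_comparisons if t[1] != t[2])
print(top_pair)  # the (similarity, doc_index_a, doc_index_b) triple with the highest score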
This is the linked post: http://billchambers.me/tutorials/2014/12/21/tf-idf-explained-in-python.html
Thanks! Is there any blog post or code for JSON data?
Sample JSON data:
{"intents": [
{"tag": "greetings",
"patterns": ["Hi","hai","Is anyone there?", "Hello", "Are you there","what is happening","how is it going","great to see you","Nice you"],
"responses": ["Hi there, how can I help?"],
"context_set":""
},
{"tag": "open",
"patterns": ["what time does the bank open","when does the bank open","at what time does the bank open"],
"responses": ["usually opens at 9 am and close at 6 pm", "9am-6pm"],
"context_set":"timings"
},
{"tag": "lunch",
"patterns": ["what are the lunch timings of bank?","when does the lunch break starts in bank"],
"responses": ["after noon 1 pm-2 pm bank will be closed for lunch time for bank empolyees"],
"context_set":"lunchtime"
}
]
}
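
A minimal sketch for adapting the gist to JSON like the sample above (intents.json is a hypothetical filename for that sample): flatten each intent's patterns into plain documents and reuse the tfidf() function defined earlier.

import json

with open("intents.json") as f:  # hypothetical file holding the sample above
    data = json.load(f)

pattern_docs = []  # one document per pattern string
pattern_tags = []  # the intent tag each pattern belongs to
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        pattern_docs.append(pattern)
        pattern_tags.append(intent["tag"])

json_tfidf_representation = tfidf(pattern_docs)  # reuses tfidf() from the gist above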
Hi, I would like to know if it's a good idea to use tf-idf scores for text classification.
For example, to classify a review as positive or negative.
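
Tf-idf features are a common baseline for exactly that. A minimal sketch with scikit-learn; the reviews and labels below are made-up placeholders.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

reviews = ["great product, works really well", "terrible, it broke after a day",
           "love it, highly recommended", "awful quality, do not buy"]  # placeholder data
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

vectorizer = TfidfVectorizer(sublinear_tf=True)
X = vectorizer.fit_transform(reviews)
classifier = LogisticRegression().fit(X, labels)
print(classifier.predict(vectorizer.transform(["works great, recommended"])))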
Hi, I was trying the code in Python 3 and found that the sublinear term frequency function is not able to handle a term count of zero for a document when building tfidf_representation. Is this expected?
Hi, I was trying to adapt your code to work from user input rather than a fixed set of documents, but I am having trouble with the idf step. Could you make something like that? Anyway, thanks for sharing the code.
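
One way to handle a free-text query, sketched with the functions above: compute idf once from the fixed document collection, fix a shared term order, then vectorize both the documents and the query against that vocabulary.

tokenized_docs = [tokenize(d) for d in all_documents]
idf = inverse_document_frequencies(tokenized_docs)
vocabulary = list(idf.keys())  # fixed term order shared by documents and queries

def vectorize(tokens):
    # query terms outside the vocabulary are simply ignored
    return [sublinear_term_frequency(term, tokens) * idf[term] for term in vocabulary]

doc_vectors = [vectorize(d) for d in tokenized_docs]
query_vector = vectorize(tokenize("how is the russian economy doing"))
ranked = sorted(((cosine_similarity(query_vector, v), i) for i, v in enumerate(doc_vectors)), reverse=True)
print(ranked[0])  # (similarity, index) of the best-matching document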
Do you have a blog post explaining some of your functions? Just curious, thanks for sharing the code!