tf-idf example
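A small NLTK script that scores unigrams, bigrams, and trigrams with tf-idf. In the functions below, tf(word, doc) = count(word, doc) / len(doc), idf(word) = log(N / (1 + df(word))) where N is the number of documents and df(word) is the number of documents containing the word, and tf-idf is their product. Because of the +1 smoothing in the document frequency, a term that appears in every document ends up with a slightly negative idf.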
# -*- coding: utf-8 -*-
import re
import math

import nltk
from nltk import bigrams, trigrams
from nltk.tokenize import RegexpTokenizer

stopwords = nltk.corpus.stopwords.words('portuguese')
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)

def freq(word, doc):
    """Raw count of `word` in the token list `doc`."""
    return doc.count(word)

def word_count(doc):
    """Total number of tokens in `doc`."""
    return len(doc)

def tf(word, doc):
    """Term frequency: raw count normalized by document length."""
    return freq(word, doc) / float(word_count(doc))

def num_docs_containing(word, list_of_docs):
    """Number of documents containing `word`, plus one for smoothing."""
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count

def idf(word, list_of_docs):
    """Inverse document frequency: log(N / (1 + document frequency))."""
    return math.log(len(list_of_docs) /
                    float(num_docs_containing(word, list_of_docs)))

def tf_idf(word, doc, list_of_docs):
    """tf-idf score of `word` in `doc` relative to the corpus."""
    return tf(word, doc) * idf(word, list_of_docs)
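
# Quick sanity check of the scoring functions on toy token lists
# (illustrative values only, not part of the original script):
#   tf('a', ['a', 'b', 'a'])                        -> 2/3
#   num_docs_containing('a', [['a'], ['b'], ['c']]) -> 1 + 1 = 2
#   idf('a', [['a'], ['b'], ['c']])                 -> log(3 / 2) ≈ 0.405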

# Compute the frequency for each term.
vocabulary = []
docs = {}
for tip in ['document 1', 'document 2']:  # placeholder documents; replace with real text
    tokens = tokenizer.tokenize(tip)
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)

    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]
    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)

    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {},
                 'tf-idf': {}, 'tokens': []}
    for token in final_tokens:
        # The raw frequency of the token within this document.
        docs[tip]['freq'][token] = freq(token, final_tokens)
        # The term frequency (normalized frequency).
        docs[tip]['tf'][token] = tf(token, final_tokens)
    docs[tip]['tokens'] = final_tokens

    vocabulary.append(final_tokens)
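
# At this point `vocabulary` holds one token list per document, and each
# docs[tip] entry maps tokens to their frequencies. With the placeholder
# inputs above, for example:
#   docs['document 1']['tf'] == {'document': 0.5, 'document 1': 0.5}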

for doc in docs:
    for token in docs[doc]['tf']:
        # The inverse document frequency.
        docs[doc]['idf'][token] = idf(token, vocabulary)
        # The tf-idf score.
        docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)

# Now let's find out the most relevant words by tf-idf.
words = {}
for doc in docs:
    for token in docs[doc]['tf-idf']:
        if token not in words:
            words[token] = docs[doc]['tf-idf'][token]
        elif docs[doc]['tf-idf'][token] > words[token]:
            words[token] = docs[doc]['tf-idf'][token]

    print(doc)
    for token in docs[doc]['tf-idf']:
        print(token, docs[doc]['tf-idf'][token])

for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print("%f <= %s" % (item[1], item[0]))