Created
May 5, 2014 13:09
-
-
Save thinrhino/fb286abad5c466fab5ac to your computer and use it in GitHub Desktop.
Calculate tf-idf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ref: http://www.tfidf.com/ | |
# Example: | |
# Consider a document containing 100 words wherein the word cat appears 3 times. | |
# The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we | |
# have 10 million documents and the word cat appears in one thousand of these. | |
# Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. | |
# Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12. | |
# | |
# Hence: | |
# 1. Calculate term frequency | |
# - Find total number of words in each document (w_count_per_doc) | |
# - { word_id : (count/w_count_per_doc) } | |
# | |
# 2. Inverse Document Fequency | |
# - Find total number of documents = D (given constant) | |
# - Find count of doc_id for every word_id | |
# - IDF = log(D/count) (this is per word_id) | |
# | |
# 3. TF-IDF | |
# - tf-idf = tf * idf | |
from collections import defaultdict | |
data_file = open('<data_file>', 'r') | |
data = defaultdict(list) | |
w_count_per_doc = defaultdict(int) | |
w_count = defaultdict(int) | |
while True: | |
l = data_file.readline() | |
if l == '': | |
break | |
doc_id, word_id, word_count = l.split(' ') | |
data[doc_id].append({word_id : int(word_count)}) | |
w_count_per_doc[doc_id] += int(word_count) | |
w_count[word_id] += int(word_count) | |
# Calculate TF | |
tf_dict = defaultdict(list) | |
for key in data.keys(): | |
for i in data[key]: | |
#pass | |
tf = float(i.values()[0])/float(w_count_per_doc[key]) | |
tf_dict[key].append({i.keys()[0]: tf}) | |
# Calculate IDF | |
import math | |
# Document Count | |
D = 300000 | |
idf_dict = defaultdict(float) | |
for key in w_count.keys(): | |
idf_dict[key] = math.log(float(D) / float(w_count[key])) | |
# Calculate tf-idf | |
tfidf_dict = defaultdict(list) | |
for key in data.keys(): | |
for i in data[key]: | |
tfidf = float(i.values()[0]) * float(idf_dict[i.keys()[0]]) | |
tfidf_dict[key].append({i.keys()[0] : tfidf}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment