Skip to content

Instantly share code, notes, and snippets.

@thinrhino
Created May 5, 2014 13:09
Show Gist options
  • Save thinrhino/fb286abad5c466fab5ac to your computer and use it in GitHub Desktop.
Save thinrhino/fb286abad5c466fab5ac to your computer and use it in GitHub Desktop.
Calculate tf-idf
# ref: http://www.tfidf.com/
# Example:
# Consider a document containing 100 words wherein the word cat appears 3 times.
# The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we
# have 10 million documents and the word cat appears in one thousand of these.
# Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4.
# Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
#
# Hence:
# 1. Calculate term frequency
# - Find total number of words in each document (w_count_per_doc)
# - { word_id : (count/w_count_per_doc) }
#
# 2. Inverse Document Fequency
# - Find total number of documents = D (given constant)
# - Find count of doc_id for every word_id
# - IDF = log(D/count) (this is per word_id)
#
# 3. TF-IDF
# - tf-idf = tf * idf
from collections import defaultdict
data_file = open('<data_file>', 'r')
data = defaultdict(list)
w_count_per_doc = defaultdict(int)
w_count = defaultdict(int)
while True:
l = data_file.readline()
if l == '':
break
doc_id, word_id, word_count = l.split(' ')
data[doc_id].append({word_id : int(word_count)})
w_count_per_doc[doc_id] += int(word_count)
w_count[word_id] += int(word_count)
# Calculate TF
tf_dict = defaultdict(list)
for key in data.keys():
for i in data[key]:
#pass
tf = float(i.values()[0])/float(w_count_per_doc[key])
tf_dict[key].append({i.keys()[0]: tf})
# Calculate IDF
import math
# Document Count
D = 300000
idf_dict = defaultdict(float)
for key in w_count.keys():
idf_dict[key] = math.log(float(D) / float(w_count[key]))
# Calculate tf-idf
tfidf_dict = defaultdict(list)
for key in data.keys():
for i in data[key]:
tfidf = float(i.values()[0]) * float(idf_dict[i.keys()[0]])
tfidf_dict[key].append({i.keys()[0] : tfidf})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment