thinrhino · May 5, 2014 13:09
diff --git a/tf-idf.py b/tf-idf.py
 # ref: http://www.tfidf.com/
 # Example:
 # Consider a document containing 100 words wherein the word cat appears 3 times.
 # The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we
 # have 10 million documents and the word cat appears in one thousand of these.
 # Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. 
 # Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
 # 
 # Hence:
 # 1. Calculate term frequency
 # - Find total number of words in each document (w_count_per_doc)
 # - { word_id : (count/w_count_per_doc) }
 # 
 # 2. Inverse Document Fequency
 # - Find total number of documents = D (given constant)
 # - Find count of doc_id for every word_id
 # - IDF = log(D/count) (this is per word_id)
 # 
 # 3. TF-IDF
 # - tf-idf = tf * idf

 from collections import defaultdict
 data_file = open('<data_file>', 'r')

 data = defaultdict(list)
 w_count_per_doc = defaultdict(int)
 w_count = defaultdict(int)

 while True:
    l = data_file.readline()
    if l == '':
        break
    doc_id, word_id, word_count = l.split(' ')
    data[doc_id].append({word_id : int(word_count)})
    w_count_per_doc[doc_id] += int(word_count)
    w_count[word_id] += int(word_count)

 # Calculate TF
 tf_dict = defaultdict(list)

 for key in data.keys():
    for i in data[key]:
        #pass
        tf = float(i.values()[0])/float(w_count_per_doc[key])
        tf_dict[key].append({i.keys()[0]: tf})

 # Calculate IDF
 import math

 # Document Count
 D = 300000

 idf_dict = defaultdict(float)
 for key in w_count.keys():
    idf_dict[key] = math.log(float(D) / float(w_count[key]))

 # Calculate tf-idf
 tfidf_dict = defaultdict(list)
 for key in data.keys():
    for i in data[key]:
        tfidf = float(i.values()[0]) * float(idf_dict[i.keys()[0]])
        tfidf_dict[key].append({i.keys()[0] : tfidf})
	# ref: http://www.tfidf.com/
	# Example:
	# Consider a document containing 100 words wherein the word cat appears 3 times.
	# The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we
	# have 10 million documents and the word cat appears in one thousand of these.
	# Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4.
	# Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
	#
	# Hence:
	# 1. Calculate term frequency
	# - Find total number of words in each document (w_count_per_doc)
	# - { word_id : (count/w_count_per_doc) }
	#
	# 2. Inverse Document Fequency
	# - Find total number of documents = D (given constant)
	# - Find count of doc_id for every word_id
	# - IDF = log(D/count) (this is per word_id)
	#
	# 3. TF-IDF
	# - tf-idf = tf * idf

	from collections import defaultdict
	data_file = open('<data_file>', 'r')

	data = defaultdict(list)
	w_count_per_doc = defaultdict(int)
	w_count = defaultdict(int)

	while True:
	l = data_file.readline()
	if l == '':
	break
	doc_id, word_id, word_count = l.split(' ')
	data[doc_id].append({word_id : int(word_count)})
	w_count_per_doc[doc_id] += int(word_count)
	w_count[word_id] += int(word_count)

	# Calculate TF
	tf_dict = defaultdict(list)

	for key in data.keys():
	for i in data[key]:
	#pass
	tf = float(i.values()[0])/float(w_count_per_doc[key])
	tf_dict[key].append({i.keys()[0]: tf})

	# Calculate IDF
	import math

	# Document Count
	D = 300000

	idf_dict = defaultdict(float)
	for key in w_count.keys():
	idf_dict[key] = math.log(float(D) / float(w_count[key]))

	# Calculate tf-idf
	tfidf_dict = defaultdict(list)
	for key in data.keys():
	for i in data[key]:
	tfidf = float(i.values()[0]) * float(idf_dict[i.keys()[0]])
	tfidf_dict[key].append({i.keys()[0] : tfidf})