NLP: Bigram Vector Generation in Python
#Author: Sobhan Hota
#Date: Oct 20 2012
#The script generates a vector for the bigrams collected from the Source file:
#it captures each bigram's count in the supplied input file (if present),
#then divides by the input file's document length.
import sys
from collections import Counter

total_words = 0
bigrams_ip = Counter()

#Capture total words and build bigrams for the input file(s) passed on the command line
for arg in sys.argv[1:]:
    f_ip = open(arg)
    data_tmp = f_ip.readlines()
    for line in data_tmp:
        all_words = line.split()
        if not all_words:                        #skip blank lines
            continue
        nextword = iter(all_words)
        next(nextword)                           #shift the second iterator by one word
        bigrams_ip.update(zip(all_words, nextword))
        total_words += len(all_words)
    f_ip.close()
print total_words

#Prepare the bigram collection from the Source file
bigrams_c = Counter()
f = open('c:\\Python27\\nlp\\ip\\UPC_Source.txt')
data = f.readlines()
for line in data:
    words = line.split()
    if not words:
        continue
    nextword = iter(words)
    next(nextword)
    bigrams_c.update(zip(words, nextword))
f.close()
print bigrams_c
print bigrams_ip

#Print the vector for the top 100 Source-file bigrams, looking up each count in the input file
for item, count in bigrams_c.most_common(100):
    #print item, count,
    if bigrams_ip[item] > 0:
        print '%f,' % ((bigrams_ip[item] / float(total_words)) * 1000),
    else:
        print '%f,' % ((1 / float(total_words)) * 1000),
print ', ' + arg
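For reference, here is a minimal, self-contained sketch of the counting step the script relies on: a second iterator over the word list is advanced by one word, zipping it against the original list yields the consecutive word pairs, and Counter tallies them. The sample sentence below is made up purely for illustration.

from collections import Counter

#Hypothetical sample line, standing in for one line of an input file
line = "the quick brown fox jumps over the lazy dog the quick brown fox"
words = line.split()

#Pair each word with the one that follows it: words[0]/words[1], words[1]/words[2], ...
nextword = iter(words)
next(nextword)                      #shift the second iterator forward by one word
bigrams = Counter(zip(words, nextword))

print(bigrams.most_common(3))       #e.g. [(('the', 'quick'), 2), (('quick', 'brown'), 2), ...]
print(len(words))                   #document length, the normalising denominator in the script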
You can run this over every .txt file in a directory via a Windows batch loop:
for /f %%a IN ('dir /b *.txt') do c:\bigrams_vectorgen.py %%a
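If a batch file is not convenient, a rough Python equivalent (assuming the script is saved at c:\bigrams_vectorgen.py and the .txt input files sit in the current directory) might look like:

import glob
import subprocess

#Run the vector generator once per .txt file in the current directory,
#mirroring the "for /f ... dir /b *.txt" batch loop above
for txt_file in glob.glob('*.txt'):
    subprocess.call(['python', r'c:\bigrams_vectorgen.py', txt_file])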