Skip to content

Instantly share code, notes, and snippets.

@gupul2k
Created December 10, 2012 23:02
Show Gist options
  • Save gupul2k/4254172 to your computer and use it in GitHub Desktop.
Save gupul2k/4254172 to your computer and use it in GitHub Desktop.
Feature Vector Generation for supplied BoWs
#!/usr/bin/python
#Script to generate feature vector for a supplied BoWs file.
#Date: Nov 2 2012
#Author: Hota Sobhan
import sys
from collections import Counter
from operator import itemgetter
from string import punctuation
# Default location of the reference vocabulary (the "Top 500 Words" list):
# one word per line, each line producing one feature, in file order.
# Raw string so the backslashes are never read as escape sequences.
VOCAB_PATH = r'C:\Python27\Top_Bows_PROD.txt'


def count_words(lines):
    """Tally the words found in an iterable of text lines.

    Lines are split on whitespace. Each token is stripped of leading and
    trailing punctuation and lower-cased before being counted.

    Returns a ``(counts, total)`` pair: ``counts`` maps each normalised
    word to its number of occurrences, ``total`` is the number of raw
    whitespace-separated tokens seen.
    """
    counts = Counter()
    total = 0
    for line in lines:
        tokens = line.split()
        total += len(tokens)
        counts.update(token.strip(punctuation).lower() for token in tokens)
    return counts, total


def feature_vector(counts, total, vocabulary):
    """Return one per-thousand frequency for every vocabulary entry.

    A word present in ``counts`` scores ``count / total * 1000``.  A word
    that is absent is scored as if it had been seen once, so no feature is
    ever zero for a non-empty document.  Every vocabulary entry yields
    exactly one value, in vocabulary order.

    When ``total`` is 0 (empty document) the frequencies are undefined, so
    a vector of zeros is returned instead of dividing by zero.
    """
    if not total:
        return [0.0 for _ in vocabulary]
    denominator = float(total)
    # counts.get(term, 1) supplies the "seen once" smoothing for misses.
    return [(counts.get(term, 1) / denominator) * 1000 for term in vocabulary]


def format_vector(values, label):
    """Render values as ``'v1, v2, ..., label'``.

    Each value is written with ``%f`` and a trailing comma, fields are
    separated by single spaces, and ``label`` closes the line.
    """
    return ' '.join(['%f,' % value for value in values] + [label])


def load_vocabulary(path=VOCAB_PATH):
    """Read the vocabulary file at ``path`` as a list of words.

    Only the line terminator is removed from each line, so the last line
    is usable even when the file does not end with a newline.
    """
    with open(path) as handle:
        return [line.rstrip('\r\n') for line in handle]


def main(argv=None, vocab_path=VOCAB_PATH):
    """Print one feature-vector line for each input file.

    ``argv`` is the list of input file paths; it defaults to the command
    line arguments *after* the script name, so the script never analyses
    its own source.  Word counts are computed independently per file.

    The label ending each line is the first two characters of the path as
    given (presumably a class code encoded in the file name -- confirm
    with the data naming scheme).
    """
    paths = sys.argv[1:] if argv is None else list(argv)
    if not paths:
        sys.stderr.write('usage: script.py FILE [FILE ...]\n')
        return

    # The vocabulary is the same for every document: read it once.
    vocabulary = load_vocabulary(vocab_path)
    for path in paths:
        with open(path) as handle:
            counts, total = count_words(handle)
        values = feature_vector(counts, total, vocabulary)
        # Single-argument print() behaves the same on Python 2 and 3.
        print(format_vector(values, path[0:2]))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment