NLP: Bigram Vector Generation in Python
#Author: Sobhan Hota
#Date: Oct 20 2012
#The script generates a vector for the bigrams collected from the Source file:
#it captures each bigram's count in the supplied input file (if present),
#then divides by the input file's document length.
import sys
from collections import Counter

total_words = 0
bigrams_ip = Counter()

#Capture total words and build bigrams for the input file(s) passed on the command line
for arg in sys.argv[1:]:
    f_ip = open(arg)
    data_tmp = f_ip.readlines()
    for line in data_tmp:
        all_words = line.split()
        if not all_words:                        #skip blank lines
            continue
        nextword = iter(all_words)
        next(nextword)                           #shift the second iterator by one word
        bigrams_ip.update(zip(all_words, nextword))
        total_words += len(all_words)
    f_ip.close()
print total_words

#Prepare the bigram collection from the Source file
bigrams_c = Counter()
f = open('c:\\Python27\\nlp\\ip\\UPC_Source.txt')
data = f.readlines()
for line in data:
    words = line.split()
    if not words:
        continue
    nextword = iter(words)
    next(nextword)
    bigrams_c.update(zip(words, nextword))
f.close()
print bigrams_c
print bigrams_ip

#Print the vector for the top 100 Source-file bigrams, looking up each count in the input file
for item, count in bigrams_c.most_common(100):
    #print item, count,
    if bigrams_ip[item] > 0:
        print '%f,' % ((bigrams_ip[item] / float(total_words)) * 1000),
    else:
        print '%f,' % ((1 / float(total_words)) * 1000),
print ', ' + arg
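For reference, here is a minimal, self-contained sketch of the counting step the script relies on: a second iterator over the word list is advanced by one word, zipping it against the original list yields the consecutive word pairs, and Counter tallies them. The sample sentence below is made up purely for illustration.

from collections import Counter

#Hypothetical sample line, standing in for one line of an input file
line = "the quick brown fox jumps over the lazy dog the quick brown fox"
words = line.split()

#Pair each word with the one that follows it: words[0]/words[1], words[1]/words[2], ...
nextword = iter(words)
next(nextword)                      #shift the second iterator forward by one word
bigrams = Counter(zip(words, nextword))

print(bigrams.most_common(3))       #e.g. [(('the', 'quick'), 2), (('quick', 'brown'), 2), ...]
print(len(words))                   #document length, the normalising denominator in the script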
You can run this over every .txt file in a directory via a Windows batch loop:
for /f %%a IN ('dir /b *.txt') do c:\bigrams_vectorgen.py %%a
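If a batch file is not convenient, a rough Python equivalent (assuming the script is saved at c:\bigrams_vectorgen.py and the .txt input files sit in the current directory) might look like:

import glob
import subprocess

#Run the vector generator once per .txt file in the current directory,
#mirroring the "for /f ... dir /b *.txt" batch loop above
for txt_file in glob.glob('*.txt'):
    subprocess.call(['python', r'c:\bigrams_vectorgen.py', txt_file])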