NLP: Bigram Vector Generation in Python
#Author: Sobhan Hota
#Date: Oct 20 2012
#Generates a vector for the bigrams collected from the Source file:
#for each top Source bigram, take its count in the supplied input file (if present)
#and divide by the input file's document length (total word count).
import sys
from collections import Counter

total_words = 0
bigrams_ip = Counter()

#Capture total words and build bigrams for the input file(s) given on the command line
for arg in sys.argv[1:]:
    f_ip = open(arg)
    for line in f_ip:
        all_words = line.split()
        if not all_words:
            continue
        nextword = iter(all_words)
        next(nextword)                               #offset the iterator by one word
        bigrams_ip.update(zip(all_words, nextword))  #accumulate bigram counts across lines
        total_words += len(all_words)
    f_ip.close()
print total_words

#Prepare the bigram collection from the Source file
bigrams_c = Counter()
f = open('c:\\Python27\\nlp\\ip\\UPC_Source.txt')
for line in f:
    words = line.split()
    if not words:
        continue
    nextword = iter(words)
    next(nextword)
    bigrams_c.update(zip(words, nextword))
f.close()
print bigrams_c
print bigrams_ip

#Print the vector entry for each of the top 100 Source bigrams: its count in the
#input file (1 if absent), divided by total words and scaled by 1000
for item, count in bigrams_c.most_common(100):
    #print item, count,
    if item in bigrams_ip:
        print '%f,' % ((bigrams_ip[item] / float(total_words)) * 1000),
    else:
        print '%f,' % ((1 / float(total_words)) * 1000),
print ', ' + arg
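
For a quick sanity check, here is a minimal sketch of the same Counter/zip bigram counting and per-1000-words scaling, run on two made-up lines (the text is invented for illustration, not taken from UPC_Source.txt):

from collections import Counter

lines = ['the cat sat on the mat', 'the cat ran']   #stand-in for an input file
bigrams = Counter()
total_words = 0
for line in lines:
    words = line.split()
    nextword = iter(words)
    next(nextword)                       #offset by one word so zip() forms (word, next word) pairs
    bigrams.update(zip(words, nextword))
    total_words += len(words)

print bigrams.most_common(3)
#[(('the', 'cat'), 2), ...]  -- ties among count-1 bigrams may appear in any order
print '%f' % ((bigrams[('the', 'cat')] / float(total_words)) * 1000)
#222.222222  -> 2 occurrences / 9 words, scaled by 1000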
gupul2k commented Oct 20, 2012

You can run this via a batch program:
for /f %%a IN ('dir /b *.txt') do c:\bigrams_vectorgen.py %%a
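
The same loop can be written in Python if a batch file is not convenient (a hypothetical sketch, assuming the script is saved as c:\bigrams_vectorgen.py and the .txt files are in the current directory):

import glob
import subprocess

#Invoke the vector generator once per .txt file in the current directory
for path in glob.glob('*.txt'):
    subprocess.call(['python', 'c:\\bigrams_vectorgen.py', path])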
