Tokenizer and computation of unigrams and bigrams (without regex)
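In short: the script reads a file character by character, splits on the delimiter set defined at the top, keeps "." and "," inside a token when they sit between two digits or two letters (so "3.14" and "U.S.A" survive as single tokens), counts every token and every adjacent pair in plain dictionaries, and writes the 100 most frequent of each to unigram-out.txt and bigram-out.txt. A hypothetical run (sample.txt and its contents are assumptions, not part of the gist):

python tokenizer.py sample.txt

If sample.txt contained "Pi is 3.14.", the emitted tokens would be Pi, is, 3.14 and a final "." (the last period is not followed by a digit, so it is split off).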
import sys, os, os.path, glob, codecs

# Force UTF-8 output regardless of the terminal's default encoding
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())

# Set the delimiters
delimiterSet = ";.,!?\"()':[]\n/+-—=≤≥{}><*’”“|"
digits = "0123456789"
chars = "abcdefghijklmnopqrstuvwxyz"
chars = "".join( (chars, chars.upper()) )
spaces = " \t\n"
numberdelimiters = ",."

# Declare unigram and bigram dictionaries
Unigrams = {}
Bigrams = {}

# Previous word, used for bigram concatenation; "START" marks the file beginning
prev_word = "START"

# Max number of highest-frequency unigrams and bigrams written to output
writemax = 100
# Main tokenizer starts here
def main(fname):
    print("starting tokenizer")
    global delimiterSet
    global writemax
    if not os.path.isfile(fname):
        print("Error: Not a file", fname, "\n")
        usage()
        return
    try:
        inStream = open(fname, mode="r", encoding="UTF-8")
        token = ""
        ch = inStream.read(1)
        lookahead = inStream.read(1)
        while True:
            if not ch:
                # End of file: flush the last pending token
                if token:
                    print(token)
                    process(token)
                break
            if ch in delimiterSet:
                # Keep "." and "," inside a token when sandwiched between
                # digits ("3.14") or letters ("U.S.A"); otherwise flush the
                # pending token and emit the delimiter as a token of its own.
                # lookahead must be checked for non-emptiness: at end of file
                # it is "", and "" counts as a substring of any string.
                if token and lookahead and token[-1] in digits and lookahead in digits and ch in numberdelimiters:
                    token = "".join( (token, ch) )
                elif token and lookahead and token[-1] in chars and lookahead in chars and ch in numberdelimiters:
                    token = "".join( (token, ch) )
                else:
                    if token:
                        print(token)
                        process(token)
                        token = ""
                    if ch not in spaces:  # "\n" is both a delimiter and a space
                        print(ch)
                        process(ch)
            elif ch in spaces:
                if token:
                    print(token)
                    process(token)
                    token = ""
            else:
                token = "".join( (token, ch) )
            ch = lookahead
            lookahead = inStream.read(1)
        inStream.close()
    except IOError:
        print("Cannot read from file:", fname, file=sys.stderr)
    # At this point, unigrams and bigrams are complete - print from these
    # --------------------------------------------------------------------
    # Write unigrams to output file - the writemax highest frequencies
    output_file = open("unigram-out.txt", "w", encoding="UTF-8")
    writecount = 0
    for uni in sorted(Unigrams, key=Unigrams.get, reverse=True):
        countU = Unigrams[uni]
        output_file.write(str(countU) + '\t' + str(uni) + '\n')
        writecount += 1
        if writecount >= writemax:
            break
    output_file.close()

    # Write bigrams to output file:
    output_file = open("bigram-out.txt", "w", encoding="UTF-8")
    writecount = 0
    for bi in sorted(Bigrams, key=Bigrams.get, reverse=True):
        countB = Bigrams[bi]
        output_file.write(str(countB) + '\t' + str(bi) + '\n')
        writecount += 1
        if writecount >= writemax:
            break
    output_file.close()
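# Note: sorted() over the whole dictionary is O(n log n); for a top-N
# cut-off, heapq.nlargest(writemax, Unigrams, key=Unigrams.get) would
# avoid sorting entries that are never written.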
#-----------------------------------------------------------
def process(word):
    # Populates the unigram and bigram counts;
    # called for every token emitted by main()
    global prev_word
    # Count the unigram:
    if word in Unigrams:
        Unigrams[word] += 1
    else:
        Unigrams[word] = 1
    #-----------------------------------------------------------
    # Concatenate the previous word and this one to get the bigram:
    bigram = prev_word + ' ' + word
    if bigram in Bigrams:
        Bigrams[bigram] += 1
    else:
        Bigrams[bigram] = 1
    # Remember this word for the next bigram
    prev_word = word
#-----------------------------------------------------------
def usage():
    print("""
    tokenizer.py
    Usage:
        python tokenizer.py mytext.txt
    """)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Expand shell variables, "~" and globs in each argument
        for i in sys.argv[1:]:
            for j in glob.glob(i):
                main(os.path.expanduser(os.path.expandvars(j)))
    else:
        usage()
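For comparison, the counting side of the script (what process() does, plus the writemax cut-off) can be expressed with collections.Counter from the standard library. A minimal sketch under the same conventions, not part of the original gist; the token list is a placeholder:

from collections import Counter

def count_ngrams(tokens):
    # Unigrams: a Counter over the tokens themselves
    unigrams = Counter(tokens)
    # Bigrams: each token paired with its predecessor, with "START"
    # prepended, mirroring prev_word in the gist
    bigrams = Counter(prev + ' ' + cur
                      for prev, cur in zip(["START"] + tokens, tokens))
    return unigrams, bigrams

tokens = ["the", "cat", "sat", "on", "the", "mat"]  # placeholder token list
unigrams, bigrams = count_ngrams(tokens)
for tok, count in unigrams.most_common(100):  # same cut-off as writemax
    print(str(count) + '\t' + tok)

Counter.most_common(n) replaces the sorted()-plus-counter loop above, and Counter itself replaces the if/else increment logic, but the output format (count, tab, token) is the same.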