Pinak-Chakraborty · August 10, 2014 20:07
diff --git a/Tokenizer-2 b/Tokenizer-2
 import codecs
 import glob
 import os
 import os.path
 import re
 import sys

 # Re-wrap standard output in a UTF-8 codecs writer so that everything the
 # script prints is encoded as UTF-8.  detach() is called on the current
 # sys.stdout and its result is what the writer wraps.
 sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())

 # Number of highest-frequency unigrams and bigrams written to each output file.
 writemax = 100

 # Token pattern, compiled once.  Alternatives are tried left to right:
 #   1. a run of two or more capitals that is not followed by a lowercase letter
 #   2. a capital plus lowercase letters, immediately followed by another capital
 #   3. a run of word characters, apostrophes and hyphens
 #   4. a single punctuation character out of  ! ' ( ) * + , - . / : ; ^
 # A raw string keeps the regex escapes (\w, \-) away from Python's own
 # string-escape processing.
 _TOKEN_PATTERN = re.compile(
     r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+|[!'()*+,\-./:;^]"
 )

 def wordTokenizier(line):
    """Split *line* into word and punctuation tokens.

    Args:
        line: the text to tokenize.

    Returns:
        A list of the matched substrings in the order they occur in *line*.
        Whitespace, and any character that none of the pattern alternatives
        accepts, is dropped.  An empty or all-whitespace line yields [].
    """
    return _TOKEN_PATTERN.findall(line)

 if __name__ == '__main__':
    # Input file to analyse.  Raw string: the backslashes are path separators,
    # not escape sequences.
    textfile = r"C:\Python34\Scripts\mytext.txt"

    # Unigrams maps token -> count; Bigrams maps "previous current" -> count.
    Unigrams = {}
    Bigrams = {}
    # Sentinel paired with the first token of the file.  It is not reset at
    # line ends, so bigrams also span consecutive lines.
    prev_word = "START"

    # Read the file line by line; the with-block closes it even on error.
    # NOTE(review): the input is opened with the platform default encoding
    # while both outputs are written as UTF-8 - confirm the input encoding.
    with open(textfile) as input_file:
        for line in input_file:
            line = line.rstrip()
            print ("input = ", line)
            for word in wordTokenizier(line):
                # Count the token itself ...
                Unigrams[word] = Unigrams.get(word, 0) + 1
                # ... and the pair formed with the token before it.
                bigram = prev_word + ' ' + word
                Bigrams[bigram] = Bigrams.get(bigram, 0) + 1
                prev_word = word

    # Write the "writemax" most frequent entries of each table, one
    # "count<TAB>entry" line per entry, highest count first.  sorted() is
    # stable, so entries with equal counts keep their first-seen order.
    for out_name, counts in (("unigram-out.txt", Unigrams),
                             ("bigram-out.txt", Bigrams)):
        ranked = sorted(counts, key=counts.get, reverse=True)
        with open(out_name, "w", encoding="UTF-8") as output_file:
            # max(..., 0): a zero or negative limit writes nothing.
            for entry in ranked[:max(writemax, 0)]:
                output_file.write(str(counts[entry]) + '\t' + str(entry) + '\n')
 #-----------------------------------------------------------
	import sys, os, os.path, glob, codecs

	# Re-wrap standard output in a UTF-8 codecs writer so that everything the
	# script prints is encoded as UTF-8.  detach() is called on the current
	# sys.stdout and its result is what the writer wraps.
	sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())

	# Number of highest-frequency unigrams and bigrams written to each output file.
	writemax = 100

	# Token pattern, compiled once.  Alternatives are tried left to right:
	#   1. a run of two or more capitals that is not followed by a lowercase letter
	#   2. a capital plus lowercase letters, immediately followed by another capital
	#   3. a run of word characters, apostrophes and hyphens
	#   4. a single punctuation character out of  ! ' ( ) * + , - . / : ; ^
	# The alternatives are joined with plain "|"; a backslash-escaped bar would
	# match a literal pipe character instead of separating alternatives.
	_TOKEN_PATTERN = re.compile(
	    r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+|[!'()*+,\-./:;^]"
	)

	def wordTokenizier(line):
	    """Split *line* into word and punctuation tokens.

	    Args:
	        line: the text to tokenize.

	    Returns:
	        A list of the matched substrings in the order they occur in *line*.
	        Whitespace, and any character that none of the pattern alternatives
	        accepts, is dropped.  An empty or all-whitespace line yields [].
	    """
	    return _TOKEN_PATTERN.findall(line)

	if __name__ == '__main__':
	    # Input file to analyse.  Raw string: the backslashes are path
	    # separators, not escape sequences.
	    textfile = r"C:\Python34\Scripts\mytext.txt"

	    # Unigrams maps token -> count; Bigrams maps "previous current" -> count.
	    Unigrams = {}
	    Bigrams = {}
	    # Sentinel paired with the first token of the file.  It is not reset at
	    # line ends, so bigrams also span consecutive lines.
	    prev_word = "START"

	    # Read the file line by line; the with-block closes it even on error.
	    # NOTE(review): the input is opened with the platform default encoding
	    # while both outputs are written as UTF-8 - confirm the input encoding.
	    with open(textfile) as input_file:
	        for line in input_file:
	            line = line.rstrip()
	            print ("input = ", line)
	            for word in wordTokenizier(line):
	                # Count the token itself ...
	                Unigrams[word] = Unigrams.get(word, 0) + 1
	                # ... and the pair formed with the token before it.
	                bigram = prev_word + ' ' + word
	                Bigrams[bigram] = Bigrams.get(bigram, 0) + 1
	                prev_word = word

	    # Write the "writemax" most frequent entries of each table, one
	    # "count<TAB>entry" line per entry, highest count first.  sorted() is
	    # stable, so entries with equal counts keep their first-seen order.
	    for out_name, counts in (("unigram-out.txt", Unigrams),
	                             ("bigram-out.txt", Bigrams)):
	        ranked = sorted(counts, key=counts.get, reverse=True)
	        with open(out_name, "w", encoding="UTF-8") as output_file:
	            # max(..., 0): a zero or negative limit writes nothing.
	            for entry in ranked[:max(writemax, 0)]:
	                output_file.write(str(counts[entry]) + '\t' + str(entry) + '\n')
	#-----------------------------------------------------------