Tokenizer and computation of unigrams and bigrams (without regex)
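In short: the script reads a file character by character, splits on the delimiter set defined at the top, keeps "." and "," inside a token when they sit between two digits or two letters (so "3.14" and "U.S.A" survive as single tokens), counts every token and every adjacent pair in plain dictionaries, and writes the 100 most frequent of each to unigram-out.txt and bigram-out.txt. A hypothetical run (sample.txt and its contents are assumptions, not part of the gist):

python tokenizer.py sample.txt

If sample.txt contained "Pi is 3.14.", the emitted tokens would be Pi, is, 3.14 and a final "." (the last period is not followed by a digit, so it is split off).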
import sys, os, os.path, glob, codecs

# Force UTF-8 output regardless of the terminal's default encoding
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())

# Set the delimiters
delimiterSet = ";.,!?\"()':[]\n/+-—=≤≥{}><*’”“|"
digits = "0123456789"
chars = "abcdefghijklmnopqrstuvwxyz"
chars = "".join( (chars, chars.upper()) )
spaces = " \t\n"
numberdelimiters = ",."

# Declare unigram and bigram dictionaries
Unigrams = {}
Bigrams = {}

# Previous word, used for bigram concatenation; "START" marks the file beginning
prev_word = "START"

# Max number of highest-frequency unigrams and bigrams written to output
writemax = 100
# Main tokenizer starts here
def main(fname):
    print("starting tokenizer")
    global delimiterSet
    global writemax
    if not os.path.isfile(fname):
        print("Error: Not a file", fname, "\n")
        usage()
        return
    try:
        inStream = open(fname, mode="r", encoding="UTF-8")
        token = ""
        ch = inStream.read(1)
        lookahead = inStream.read(1)
        while True:
            if not ch:
                # End of file: flush the last pending token
                if token:
                    print(token)
                    process(token)
                break
            if ch in delimiterSet:
                # Keep "." and "," inside a token when sandwiched between
                # digits ("3.14") or letters ("U.S.A"); otherwise flush the
                # pending token and emit the delimiter as a token of its own.
                # lookahead must be checked for non-emptiness: at end of file
                # it is "", and "" counts as a substring of any string.
                if token and lookahead and token[-1] in digits and lookahead in digits and ch in numberdelimiters:
                    token = "".join( (token, ch) )
                elif token and lookahead and token[-1] in chars and lookahead in chars and ch in numberdelimiters:
                    token = "".join( (token, ch) )
                else:
                    if token:
                        print(token)
                        process(token)
                        token = ""
                    if ch not in spaces:  # "\n" is both a delimiter and a space
                        print(ch)
                        process(ch)
            elif ch in spaces:
                if token:
                    print(token)
                    process(token)
                    token = ""
            else:
                token = "".join( (token, ch) )
            ch = lookahead
            lookahead = inStream.read(1)
        inStream.close()
    except IOError:
        print("Cannot read from file:", fname, file=sys.stderr)
    # At this point, unigrams and bigrams are complete - print from these
    # --------------------------------------------------------------------
    # Write unigrams to output file - the writemax highest frequencies
    output_file = open("unigram-out.txt", "w", encoding="UTF-8")
    writecount = 0
    for uni in sorted(Unigrams, key=Unigrams.get, reverse=True):
        countU = Unigrams[uni]
        output_file.write(str(countU) + '\t' + str(uni) + '\n')
        writecount += 1
        if writecount >= writemax:
            break
    output_file.close()

    # Write bigrams to output file:
    output_file = open("bigram-out.txt", "w", encoding="UTF-8")
    writecount = 0
    for bi in sorted(Bigrams, key=Bigrams.get, reverse=True):
        countB = Bigrams[bi]
        output_file.write(str(countB) + '\t' + str(bi) + '\n')
        writecount += 1
        if writecount >= writemax:
            break
    output_file.close()
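# Note: sorted() over the whole dictionary is O(n log n); for a top-N
# cut-off, heapq.nlargest(writemax, Unigrams, key=Unigrams.get) would
# avoid sorting entries that are never written.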
#-----------------------------------------------------------
def process(word):
    # Populates the unigram and bigram counts;
    # called for every token emitted by main()
    global prev_word
    # Count the unigram:
    if word in Unigrams:
        Unigrams[word] += 1
    else:
        Unigrams[word] = 1
    #-----------------------------------------------------------
    # Concatenate the previous word and this one to get the bigram:
    bigram = prev_word + ' ' + word
    if bigram in Bigrams:
        Bigrams[bigram] += 1
    else:
        Bigrams[bigram] = 1
    # Remember this word for the next bigram
    prev_word = word
#-----------------------------------------------------------
def usage():
    print("""
    tokenizer.py
    Usage:
        python tokenizer.py mytext.txt
    """)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Expand shell variables, "~" and globs in each argument
        for i in sys.argv[1:]:
            for j in glob.glob(i):
                main(os.path.expanduser(os.path.expandvars(j)))
    else:
        usage()
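For comparison, the counting side of the script (what process() does, plus the writemax cut-off) can be expressed with collections.Counter from the standard library. A minimal sketch under the same conventions, not part of the original gist; the token list is a placeholder:

from collections import Counter

def count_ngrams(tokens):
    # Unigrams: a Counter over the tokens themselves
    unigrams = Counter(tokens)
    # Bigrams: each token paired with its predecessor, with "START"
    # prepended, mirroring prev_word in the gist
    bigrams = Counter(prev + ' ' + cur
                      for prev, cur in zip(["START"] + tokens, tokens))
    return unigrams, bigrams

tokens = ["the", "cat", "sat", "on", "the", "mat"]  # placeholder token list
unigrams, bigrams = count_ngrams(tokens)
for tok, count in unigrams.most_common(100):  # same cut-off as writemax
    print(str(count) + '\t' + tok)

Counter.most_common(n) replaces the sorted()-plus-counter loop above, and Counter itself replaces the if/else increment logic, but the output format (count, tab, token) is the same.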