nekopanic · August 30, 2012 19:33
diff --git a/puddlejumper.py b/puddlejumper.py
 #!/usr/bin/env python

 # Try to determine if a pile of text is American English or British English.
 # Give it a list of files on standard input, e.g. find /usr/share/doc/python-doc | ./puddlejumper.py

 import fileinput
 from os import path
 from HTMLParser import HTMLParser

 # Script-wide state: the files to scan, every word read from them, and the
 # two running tallies.
 files = []
 all_words = []
 american_word_count = 0
 british_word_count = 0

 #
 # Load the file list and dictionaries
 #

 # fileinput.input() yields one candidate path per line; only names that
 # exist and are regular files are kept, in the order they were read.
 files.extend(
 	candidate
 	for candidate in (raw_line.strip() for raw_line in fileinput.input())
 	if path.exists(candidate) and path.isfile(candidate)
 )
 print("Examining {0} files".format(len(files)))

 def _load_dictionary(dictionary_path):
 	"""Return a dict keyed by every stripped line of *dictionary_path*.

 	Every value is True, so the result is purely a membership table.
 	"""
 	# A plain open() keeps this independent of fileinput's module-level
 	# state, and the context manager closes the handle even on error.
 	with open(dictionary_path) as handle:
 		return dict.fromkeys((entry.strip() for entry in handle), True)

 american_english = _load_dictionary('/usr/share/dict/american-english')
 print("Loaded {0} words of American english".format(len(american_english)))

 british_english = _load_dictionary('/usr/share/dict/british-english')
 print("Loaded {0} words of British english".format(len(british_english)))

 # 
 # This HTML Parser will help with the HTML docs
 # 

 class HTMLWordParser(HTMLParser):
 	"""HTML parser that gathers the whitespace-separated words of text nodes."""

 	def __init__(self, all_words):
 		"""Initialise the base parser and remember the word collection.

 		all_words -- collection that receives every word found in text data
 		"""
 		HTMLParser.__init__(self)
 		self.all_words = all_words

 	def handle_data(self, data):
 		"""Split a run of text on whitespace and add the pieces to the words."""
 		tokens = data.split()
 		self.all_words += tokens

 # One parser is shared by every HTML file; it appends straight into all_words.
 html_parser = HTMLWordParser(all_words)

 #
 # Now count the words and print the result!
 #
 for filename in files:
 	print("Examining {0}".format(filename))
 	is_html = filename.endswith(('.html', '.htm'))
 	if not (is_html or filename.endswith('.txt')):
 		# Unsupported extension: nothing would be read, so don't open it.
 		continue
 	# The context manager closes the handle even if reading raises.
 	with open(filename) as fd:
 		text = fd.read()
 	if is_html:
 		# Discard any unfinished markup buffered from the previous document
 		# so it cannot swallow or corrupt the text of this one.
 		html_parser.reset()
 		html_parser.feed(text)
 	else:
 		all_words.extend(text.split())

 # A word present in both dictionaries counts for both sides; only the
 # difference between the two tallies decides the winner.
 for word in all_words:
 	if word in american_english:
 		american_word_count += 1
 	if word in british_english:
 		british_word_count += 1

 print("Of {0} words, {1} were American English and {2} were British English".format(len(all_words), american_word_count, british_word_count))
 # Nothing further is printed when the two tallies are equal.
 if american_word_count > british_word_count:
 	print("America wins by {0} words".format(american_word_count - british_word_count))
 elif british_word_count > american_word_count:
 	print("Britain wins by {0} words".format(british_word_count - american_word_count))
	#!/usr/bin/env python

	# Try to determine if a pile of text is American English or British English.
	# Give it a list of files on standard input, e.g. find /usr/share/doc/python-doc | ./puddlejumper.py

	import fileinput
	from os import path
	from HTMLParser import HTMLParser

	# Script-wide state: the files to scan, every word read from them, and the
	# two running tallies.
	files = []
	all_words = []
	american_word_count = 0
	british_word_count = 0

	#
	# Load the file list and dictionaries
	#

	# fileinput.input() yields one candidate path per line; only names that
	# exist and are regular files are kept, in the order they were read.
	files.extend(
		candidate
		for candidate in (raw_line.strip() for raw_line in fileinput.input())
		if path.exists(candidate) and path.isfile(candidate)
	)
	print("Examining {0} files".format(len(files)))

	def _load_dictionary(dictionary_path):
		"""Return a dict keyed by every stripped line of *dictionary_path*.

		Every value is True, so the result is purely a membership table.
		"""
		# A plain open() keeps this independent of fileinput's module-level
		# state, and the context manager closes the handle even on error.
		with open(dictionary_path) as handle:
			return dict.fromkeys((entry.strip() for entry in handle), True)

	american_english = _load_dictionary('/usr/share/dict/american-english')
	print("Loaded {0} words of American english".format(len(american_english)))

	british_english = _load_dictionary('/usr/share/dict/british-english')
	print("Loaded {0} words of British english".format(len(british_english)))

	#
	# This HTML Parser will help with the HTML docs
	#

	class HTMLWordParser(HTMLParser):
		"""HTML parser that gathers the whitespace-separated words of text nodes."""

		def __init__(self, all_words):
			"""Initialise the base parser and remember the word collection.

			all_words -- collection that receives every word found in text data
			"""
			HTMLParser.__init__(self)
			self.all_words = all_words

		def handle_data(self, data):
			"""Split a run of text on whitespace and add the pieces to the words."""
			tokens = data.split()
			self.all_words += tokens

	# One parser is shared by every HTML file; it appends straight into all_words.
	html_parser = HTMLWordParser(all_words)

	#
	# Now count the words and print the result!
	#
	for filename in files:
		print("Examining {0}".format(filename))
		is_html = filename.endswith(('.html', '.htm'))
		if not (is_html or filename.endswith('.txt')):
			# Unsupported extension: nothing would be read, so don't open it.
			continue
		# The context manager closes the handle even if reading raises.
		with open(filename) as fd:
			text = fd.read()
		if is_html:
			# Discard any unfinished markup buffered from the previous document
			# so it cannot swallow or corrupt the text of this one.
			html_parser.reset()
			html_parser.feed(text)
		else:
			all_words.extend(text.split())

	# A word present in both dictionaries counts for both sides; only the
	# difference between the two tallies decides the winner.
	for word in all_words:
		if word in american_english:
			american_word_count += 1
		if word in british_english:
			british_word_count += 1

	print("Of {0} words, {1} were American English and {2} were British English".format(len(all_words), american_word_count, british_word_count))
	# Nothing further is printed when the two tallies are equal.
	if american_word_count > british_word_count:
		print("America wins by {0} words".format(american_word_count - british_word_count))
	elif british_word_count > american_word_count:
		print("Britain wins by {0} words".format(british_word_count - american_word_count))
No results found