tedmiston · November 12, 2012 20:51
diff --git a/prefixes.py b/prefixes.py
 '''
 PROBLEM: 
  How many 3-letter prefixes are commonly used in English?

 MOTIVATION:
  The Lumosity word game constantly tests my vocabulary and ability to 
  remember simple, common words.  I would like to improve my performance.

 SOLUTION:
  Count the n-letter prefixes used in a dictionary.

 USER PARAMS:
  - PREFIX_N - length of prefix in chars
  - PERC_THRESHOLD - output the top x percent of prefixes (range: 0.0-1.0)
  - WORDS_FILE - a list of dictionary words
  - OUTPUT_FILE - output the top prefixes to the given file path; or if set to
      None, output to stdout
 '''
 PREFIX_N = 3
 PERC_THRESHOLD = .10
 WORDS_FILE = '/usr/share/dict/web2'
 OUTPUT_FILE = 'output.txt'

 # 0. load words file
 # `with` closes the handle even if the read raises; splitlines() also
 # handles '\r\n' endings, so a stray '\r' never counts towards word length.
 with open(WORDS_FILE) as f:
 	words = f.read().splitlines()

 # 1. gather common prefixes
 # Only words strictly longer than PREFIX_N are counted, so a word of exactly
 # PREFIX_N letters does not contribute its whole self as a prefix.
 prefixes = {}
 for word in words:
 	if len(word) > PREFIX_N:
 		pre = word[:PREFIX_N].lower()
 		# `in`/get() replace dict.has_key(), which Python 3 removed
 		prefixes[pre] = prefixes.get(pre, 0) + 1

 # 2. sort the prefixes by decreasing frequency
 # (count, prefix) tuples: equal counts fall back to reverse alphabetical
 # order of the prefix.
 prefixes_by_freq = sorted(
 	((count, pre) for pre, count in prefixes.items()), reverse=True)

 # 3. output the top x% of prefixes using threshold
 # for my words file:
 # THRESHOLD  # PREFIXES  % OF WORDS 
 #         3         114       37.14
 #         5         190       47.19
 #        10         381       63.29
 #        25         953       85.27
 #        50        1906       96.83
 #        75        2859       99.43
 # float() keeps the percentage below a true division on Python 2 as well.
 total_counts = float(sum(count for count, _ in prefixes_by_freq))
 lmt = int(len(prefixes_by_freq) * PERC_THRESHOLD)
 output_lines = []
 for count, pre in prefixes_by_freq[:lmt]:
 	perc = str(count / total_counts * 100)
 	# one tab-separated record per prefix: count, prefix, share in percent
 	output_lines.append(str(count) + "\t" + pre + "\t" + perc)
 if OUTPUT_FILE:
 	# text mode: the records are str, which a 'wb' handle rejects on Python 3
 	with open(OUTPUT_FILE, 'w') as out:
 		out.writelines(line + "\n" for line in output_lines)
 else:
 	for line in output_lines:
 		# parenthesised single argument prints identically on Python 2 and 3
 		print(line)
	'''
	PROBLEM:
	How many 3-letter prefixes are commonly used in English?

	MOTIVATION:
	The Lumosity word game constantly tests my vocabulary and ability to
	remember simple, common words. I would like to improve my performance.

	SOLUTION:
	Count the n-letter prefixes used in a dictionary.

	USER PARAMS:
	- PREFIX_N - length of prefix in chars
	- PERC_THRESHOLD - output the top x percent of prefixes (range: 0.0-1.0)
	- WORDS_FILE - a list of dictionary words
	- OUTPUT_FILE - output the top prefixes to the given file path; or if set to
	None, output to stdout
	'''
	PREFIX_N = 3
	PERC_THRESHOLD = .10
	WORDS_FILE = '/usr/share/dict/web2'
	OUTPUT_FILE = 'output.txt'

	# 0. load words file
	# `with` closes the handle even if the read raises; splitlines() also
	# handles '\r\n' endings, so a stray '\r' never counts towards word length.
	with open(WORDS_FILE) as f:
		words = f.read().splitlines()

	# 1. gather common prefixes
	# Only words strictly longer than PREFIX_N are counted, so a word of exactly
	# PREFIX_N letters does not contribute its whole self as a prefix.
	prefixes = {}
	for word in words:
		if len(word) > PREFIX_N:
			pre = word[:PREFIX_N].lower()
			# `in`/get() replace dict.has_key(), which Python 3 removed
			prefixes[pre] = prefixes.get(pre, 0) + 1

	# 2. sort the prefixes by decreasing frequency
	# (count, prefix) tuples: equal counts fall back to reverse alphabetical
	# order of the prefix.
	prefixes_by_freq = sorted(
		((count, pre) for pre, count in prefixes.items()), reverse=True)

	# 3. output the top x% of prefixes using threshold
	# for my words file:
	# THRESHOLD  # PREFIXES  % OF WORDS
	#         3         114       37.14
	#         5         190       47.19
	#        10         381       63.29
	#        25         953       85.27
	#        50        1906       96.83
	#        75        2859       99.43
	# float() keeps the percentage below a true division on Python 2 as well.
	total_counts = float(sum(count for count, _ in prefixes_by_freq))
	lmt = int(len(prefixes_by_freq) * PERC_THRESHOLD)
	output_lines = []
	for count, pre in prefixes_by_freq[:lmt]:
		perc = str(count / total_counts * 100)
		# one tab-separated record per prefix: count, prefix, share in percent
		output_lines.append(str(count) + "\t" + pre + "\t" + perc)
	if OUTPUT_FILE:
		# text mode: the records are str, which a 'wb' handle rejects on Python 3
		with open(OUTPUT_FILE, 'w') as out:
			out.writelines(line + "\n" for line in output_lines)
	else:
		for line in output_lines:
			# parenthesised single argument prints identically on Python 2 and 3
			print(line)