ubershmekel · September 25, 2016 04:35
diff --git a/ibeforeeexceptafterc.py b/ibeforeeexceptafterc.py
 """
 Inspired by this bogus TIL: https://www.reddit.com/r/todayilearned/comments/54c05w/til_there_are_923_words_in_the_english_language/

 Downloaded this corpus: https://sourceforge.net/projects/wordlist/files/SCOWL/2016.06.26/scowl-2016.06.26.zip/download?use_mirror=pilotfiber

 From http://wordlist.aspell.net/
 SCOWL (Spell Checker Oriented Word Lists) 

 The results of running this script are:

 Total words: 643702
 I before E odds: 3.1 to 1
 I before E after C odds: 3.9 to 1
 {'ie': 25800, 'cei': 351, 'ei': 8284, 'cie': 1382, '[not_c]ei': 7933}
 """

 import glob
 from collections import Counter

 # Substrings whose per-word presence is tallied.
 tracking_strings = 'cei', 'cie', 'ie', 'ei'
 # Key for words that contain 'ei' but do not contain 'cei' anywhere.
 ei_not_after_c = '[not_c]ei'


 def count_patterns(lines, word_set=None, counts=None):
     """Tally the tracked letter patterns over an iterable of raw lines.

     Each line is stripped and lower-cased to obtain a word.  A word that
     is already in ``word_set`` is skipped, so a word repeated within or
     across word lists is only counted once.  Blank lines are ignored.

     A word adds one to ``counts[s]`` for every ``s`` in
     ``tracking_strings`` that occurs in it (once per word, however many
     times ``s`` occurs), and one to ``counts[ei_not_after_c]`` when it
     contains 'ei' but not 'cei'.

     lines    -- iterable of strings, one word per item (e.g. an open file).
     word_set -- set of words already seen; updated in place.  A new set
                 is created when omitted.
     counts   -- Counter to update in place.  A new one is created when
                 omitted.

     Returns the ``(word_set, counts)`` pair.
     """
     if word_set is None:
         word_set = set()
     if counts is None:
         counts = Counter()

     for line in lines:
         norm_word = line.strip().lower()
         # A blank line is not a word; do not let it inflate the total.
         if not norm_word or norm_word in word_set:
             continue
         word_set.add(norm_word)

         for str_to_track in tracking_strings:
             if str_to_track in norm_word:
                 counts[str_to_track] += 1

         if 'ei' in norm_word and 'cei' not in norm_word:
             counts[ei_not_after_c] += 1

     return word_set, counts


 def odds_text(numerator, denominator):
     """Format ``numerator / denominator`` as 'X.Y to 1'.

     Returns 'n/a' when ``denominator`` is zero, so an empty or missing
     corpus produces a readable report instead of a traceback.
     """
     if not denominator:
         return 'n/a'
     # "* 1.0" forces true division on interpreters where "/" on two ints
     # would truncate.
     return '%.1f to 1' % (numerator * 1.0 / denominator)


 def main():
     """Scan every file matching 'final/*' and print the pattern report."""
     word_set = set()
     counts = Counter()

     for fn in glob.glob('final/*'):
         # "with" closes each word list promptly instead of leaking the
         # handle until garbage collection.
         # NOTE(review): the files are read with the platform default
         # encoding -- confirm that matches the corpus encoding.
         with open(fn) as word_file:
             count_patterns(word_file, word_set, counts)

     print('Total words: %d' % len(word_set))
     print('I before E odds: %s' % odds_text(counts['ie'], counts['ei']))
     print('I before E after C odds: %s' % odds_text(counts['cie'], counts['cei']))
     # Print as a plain dict so the output format stays "{...}".
     print(dict(counts))


 if __name__ == '__main__':
     main()
	"""
	Inspired by this bogus TIL: https://www.reddit.com/r/todayilearned/comments/54c05w/til_there_are_923_words_in_the_english_language/

	Downloaded this corpus: https://sourceforge.net/projects/wordlist/files/SCOWL/2016.06.26/scowl-2016.06.26.zip/download?use_mirror=pilotfiber

	From http://wordlist.aspell.net/
	SCOWL (Spell Checker Oriented Word Lists)

	The results of running this script are:

	Total words: 643702
	I before E odds: 3.1 to 1
	I before E after C odds: 3.9 to 1
	{'ie': 25800, 'cei': 351, 'ei': 8284, 'cie': 1382, '[not_c]ei': 7933}
	"""

	import glob
	from collections import Counter

	# Substrings whose per-word presence is tallied.
	tracking_strings = 'cei', 'cie', 'ie', 'ei'
	# Key for words that contain 'ei' but do not contain 'cei' anywhere.
	ei_not_after_c = '[not_c]ei'


	def count_patterns(lines, word_set=None, counts=None):
	    """Tally the tracked letter patterns over an iterable of raw lines.

	    Each line is stripped and lower-cased to obtain a word.  A word that
	    is already in ``word_set`` is skipped, so a word repeated within or
	    across word lists is only counted once.  Blank lines are ignored.

	    A word adds one to ``counts[s]`` for every ``s`` in
	    ``tracking_strings`` that occurs in it (once per word, however many
	    times ``s`` occurs), and one to ``counts[ei_not_after_c]`` when it
	    contains 'ei' but not 'cei'.

	    lines    -- iterable of strings, one word per item (e.g. an open file).
	    word_set -- set of words already seen; updated in place.  A new set
	                is created when omitted.
	    counts   -- Counter to update in place.  A new one is created when
	                omitted.

	    Returns the ``(word_set, counts)`` pair.
	    """
	    if word_set is None:
	        word_set = set()
	    if counts is None:
	        counts = Counter()

	    for line in lines:
	        norm_word = line.strip().lower()
	        # A blank line is not a word; do not let it inflate the total.
	        if not norm_word or norm_word in word_set:
	            continue
	        word_set.add(norm_word)

	        for str_to_track in tracking_strings:
	            if str_to_track in norm_word:
	                counts[str_to_track] += 1

	        if 'ei' in norm_word and 'cei' not in norm_word:
	            counts[ei_not_after_c] += 1

	    return word_set, counts


	def odds_text(numerator, denominator):
	    """Format ``numerator / denominator`` as 'X.Y to 1'.

	    Returns 'n/a' when ``denominator`` is zero, so an empty or missing
	    corpus produces a readable report instead of a traceback.
	    """
	    if not denominator:
	        return 'n/a'
	    # "* 1.0" forces true division on interpreters where "/" on two ints
	    # would truncate.
	    return '%.1f to 1' % (numerator * 1.0 / denominator)


	def main():
	    """Scan every file matching 'final/*' and print the pattern report."""
	    word_set = set()
	    counts = Counter()

	    for fn in glob.glob('final/*'):
	        # "with" closes each word list promptly instead of leaking the
	        # handle until garbage collection.
	        # NOTE(review): the files are read with the platform default
	        # encoding -- confirm that matches the corpus encoding.
	        with open(fn) as word_file:
	            count_patterns(word_file, word_set, counts)

	    print('Total words: %d' % len(word_set))
	    print('I before E odds: %s' % odds_text(counts['ie'], counts['ei']))
	    print('I before E after C odds: %s' % odds_text(counts['cie'], counts['cei']))
	    # Print as a plain dict so the output format stays "{...}".
	    print(dict(counts))


	if __name__ == '__main__':
	    main()