Created March 31, 2013 19:29
-
-
Save Xion/5281703 to your computer and use it in GitHub Desktop.
English plurals experiment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Experiment with automatic generation of English plurals. | |
""" | |
import os | |
import random | |
def main():
    """Run an interactive session that scores generated plurals.

    Repeatedly asks for a sample size, then shows that many
    ``singular -> plural`` pairs and reads a verdict for each one;
    an answer of ``y`` (in any case) counts the plural as correct.
    Entering an empty sample size ends the session and prints a summary.
    """
    total_count = 0
    correct_count = 0

    while True:
        count = raw_input("Words in the sample: ")
        if not count:
            break
        try:
            count = int(count)
        except ValueError:
            # Not a number: just ask again.
            continue

        try:
            sample = sample_words(count)
        except ValueError as error:
            # random.sample() rejects negative sizes and sizes larger than
            # the word list; report it and keep the results gathered so far
            # instead of crashing the whole session.
            print("Cannot draw %s words: %s" % (count, error))
            continue

        for pair in sample:
            verdict = None
            # Insist on a non-empty answer for every pair.
            while not verdict:
                verdict = raw_input("%s -> %s ? " % pair)
            if verdict.lower() == 'y':
                correct_count += 1
            total_count += 1

    if not total_count:
        print("No samples")
        return

    # 100.0 forces float division so the percentage is not truncated.
    print("Results: {correct} out of {total} words correct ({percent}%)".format(
        total=total_count, correct=correct_count,
        percent=(correct_count * 100.0 / total_count)))
def get_corpora():
    """Returns the list of single-word nouns found in the local
    WordNet noun index (``~/nltk_data/corpora/wordnet/index.noun``).

    The first whitespace-separated field of every line is taken as the
    word. Lines starting with a space (presumably the file's header) and
    blank lines are skipped, as are entries containing an underscore
    (presumably multi-word entries).

    Any error raised by ``open`` for a missing or unreadable index file
    is propagated to the caller.
    """
    path = os.path.expanduser('~/nltk_data/corpora/wordnet/index.noun')
    words = []
    with open(path) as f:
        # Iterate the file lazily rather than loading every line up front.
        for line in f:
            if line.startswith(' '):
                continue
            fields = line.split()
            if not fields:
                # Blank line: there is no first field to take.
                continue
            word = fields[0]
            if '_' not in word:
                words.append(word)
    return words
def sample_words(count, corpora=None):
    """Returns a list of ``count`` random ``(singular, plural)`` pairs.

    :param count: Number of words to draw, without repetition
    :param corpora: Sequence of nouns to draw from. When omitted,
                    the word list from ``get_corpora()`` is used;
                    it is loaded on first use and then reused.

    Raises ``ValueError`` (from ``random.sample``) if ``count`` is negative
    or larger than the number of available words.
    """
    if corpora is None:
        # Load the default word list lazily and memoize it on the function,
        # so that merely defining/importing this module does no file I/O
        # and the file is still read only once.
        try:
            corpora = sample_words._default_corpora
        except AttributeError:
            corpora = sample_words._default_corpora = get_corpora()

    singulars = random.sample(corpora, count)
    return [(word, pluralize(word)) for word in singulars]
# Pluralization | |
def pluralize(singular):
    """Returns a plural form of given English noun,
    or more specifically, an attempt at something
    that can sometimes pass as a plural form... maybe.

    Rules are tried in this order; the first match wins:

    * fewer than two characters: append ``'s``
    * ends with ``ff``, ``fe`` or ``f``: replace that ending with ``ves``
    * ends with ``s``, ``sh``, ``ch``, ``x`` or ``z``: append ``es``
    * ends with ``y`` not preceded by one of ``aeiouy``:
      replace the ``y`` with ``ies``
    * anything else: append ``s``
    """
    # Must come first: otherwise single letters such as "f" or "x"
    # are caught by the suffix rules below ("f" -> "ves", "x" -> "xes").
    if len(singular) < 2:
        return singular + "'s"

    # Longest ending first, so that "ff" loses both letters.
    for suffix in ("ff", "fe", "f"):
        if singular.endswith(suffix):
            return singular[:-len(suffix)] + "ves"

    # Sibilant endings take "es".
    if singular.endswith(("s", "sh", "ch", "x", "z")):
        return singular + "es"

    # Length is at least 2 here, so singular[-2] is always valid.
    if singular.endswith("y") and singular[-2] not in "aeiouy":
        return singular[:-1] + "ies"

    return singular + "s"
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment