Last active
January 3, 2017 17:56
-
-
Save nschneid/d49de87813bd6499ff7c2d861eba196c to your computer and use it in GitHub Desktop.
Use fine-grained POS tags to avoid problems with WordNet morphy lemmatizer (e.g., synsets with plural lemmas)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import wordnet as wn | |
# Irregular verb forms that are ambiguous with another lexeme unless the tag
# says past tense / past participle (e.g. 'found' = past of 'find').
_PAST_OR_PARTICIPLE_LEMMAS = {
    'found': 'find',
    'ground': 'grind',
    'rent': 'rend',
    'smelt': 'smell',
    'wound': 'wind',
}

# Forms that are only rewritten when tagged as simple past (VBD).
_PAST_ONLY_LEMMAS = {
    'fell': 'fall',
    'lay': 'lie',
    'saw': 'see',
}

# Irregular plurals; checked http://www.esldesk.com/vocabulary/irregular-nouns
# for irregular forms.
_IRREGULAR_PLURAL_LEMMAS = {
    'people': 'person',
    'teeth': 'tooth',
    'men': 'man',
    'brethren': 'brother',
    'dice': 'die',
    'elves': 'elf',
    'fungi': 'fungus',
    'memoranda': 'memorandum',
    'oxen': 'ox',
    'vitae': 'vita',
}

# sibilant + -es: the singular is obtained by dropping the final two letters.
_SIBILANT_ES_PLURALS = frozenset({
    'clutches', 'losses', 'marches', 'masses', 'starches',
})

# -ies words that must not be rewritten to -y.
# TODO: if 'mounties' is tagged as NNPS, could stem to 'mountie'.
#       don't do that for 'alleghenies'
_IES_KEPT_AS_IS = frozenset({
    'alleghenies', 'mounties', 'humanities', 'species',
})

# 'brits' is usually NNPS; 'romans' is NNP or NNPS; others are NNP.
_S_FORMS_USUALLY_PROPER = frozenset({
    'brits', 'romans', 'alps', 'anas', 'hays',
})

# Typically not the plural form of a singular noun.
_S_FORMS_NOT_PLURAL = frozenset({
    'alas', 'amnios', 'corps', 'mores', 'acoustics', 'aquatics', 'auspices',
    'statics', 'pragmatics', 'winnings',
})


@memoize
def lemmatize_liberally(w, p):
    """
    Given an English word and its PTB POS tag (whose fine-grained information
    helps disambiguate some words), produce its lemma/stem.

    Uses WordNet, but is more aggressive than the `morphy` behavior, giving
    less surprising results for words like 'people' and 'eggs'.

    Args:
        w: The word form. It is lowercased before any lookup.
        p: The PTB POS tag, e.g. 'NNS' or 'VBD'. A falsy tag (None or '')
            skips every tag-specific rule and `morphy` is called with
            pos=None.

    Returns:
        The lowercased lemma. When the word is 'cannot' or contains an
        apostrophe it is tokenized first and only the first token is
        lemmatized and returned. When `morphy` finds no lemma, the lowercased
        word (or its first token) is returned unchanged.

    >>> lemmatize_liberally('people', 'NNS')
    'person'
    >>> lemmatize_liberally('men', 'NNS')
    'man'
    >>> lemmatize_liberally('arms', 'NNS')
    'arm'
    >>> lemmatize_liberally('eggs', 'NNS')
    'egg'
    >>> lemmatize_liberally('twenties', 'NNS')
    'twenty'
    >>> lemmatize_liberally('woods', 'NNS')
    'wood'
    >>> lemmatize_liberally('glasses', 'NNS')
    'glasses'
    >>> lemmatize_liberally('moses', 'NNS')
    'moses'
    >>> lemmatize_liberally('alps', 'NNS')
    'alps'
    """
    w = w.lower()

    # --- Hand-written exceptions that are decided by the tag alone. ---
    if p in {'VBD', 'VBN'}:
        if w in _PAST_OR_PARTICIPLE_LEMMAS:
            return _PAST_OR_PARTICIPLE_LEMMAS[w]
        if p == 'VBD' and w in _PAST_ONLY_LEMMAS:
            return _PAST_ONLY_LEMMAS[w]
    elif p and p[0] == 'V' and w == 'stove':
        # WordNet has only the past/ppt form of 'stave', but apparently
        # 'stove' can be a verb.
        return 'stove'
    elif p == 'NNS':
        if w in _IRREGULAR_PLURAL_LEMMAS:
            return _IRREGULAR_PLURAL_LEMMAS[w]
        if w in _SIBILANT_ES_PLURALS:
            return w[:-2]
    # 'ridden' is a past participle of 'rid' and the past participle of
    # 'ride'. The POS is not enough to disambiguate, but 'ride' (which WordNet
    # gives) is probably more common.
    # 'axes' will stem to 'ax', though it could also be 'axis'
    # 'bases' will stem to 'base' though it could also be 'basis'
    # 'glasses' will stem to 'glasses' (i.e. eyewear) though it could be the
    # plural of 'glass'
    # 'breeches', 'riches' will not change when stemmed.
    # TODO: these->this, those->that, an->a?
    # TODO: towards->toward, o'er->over, till->until, outta->out of, etc.?

    # --- Split contractions so that only the first token is lemmatized. ---
    if w == 'cannot' or "'" in w:
        # Imported here because nothing at file level brings it into scope.
        from nltk.tokenize import word_tokenize
        # The period ensures no part of the word is interpreted as
        # sentence-final punctuation; it is dropped again by the slice.
        tokens = word_tokenize(w + ' .')[:-1]
    else:
        tokens = [w]

    # --- WordNet lookup, restricted to the POS implied by the tag. ---
    wn_pos = None
    if p:
        # Tags whose initial is not N/V/J/R map to None (no restriction).
        wn_pos = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}.get(p[0])
    lem = wn.morphy(tokens[0], wn_pos)
    if not lem:
        return tokens[0]

    # --- WordNet lists some plural forms as lemmas in their own right, so
    #     `morphy` hands them back unchanged; strip the plural ourselves. ---
    if lem == w and p == 'NNS':
        if w.endswith('ies') and wn.lemmas(w[:-3] + 'y', wn.NOUN):
            # The WordNet check excludes 'rabies', etc.
            if w not in _IES_KEPT_AS_IS:
                lem = w[:-3] + 'y'  # replace -ies with -y
        elif (len(w) >= 4 and w.endswith('s')
                and not w.endswith(('ss', 'us'))
                and wn.lemmas(w[:-1], wn.NOUN)):
            # Excludes 'yes', 'halitosis', etc. (which are probably not NNS).
            if w == 'abcs':
                # Special-cased ahead of the capitalization test below;
                # presumably WordNet lists it only as 'ABCs' -- TODO confirm.
                lem = 'abc'
            elif all(lemma.name()[0].isupper()
                     for lemma in wn.lemmas(w, wn.NOUN)):
                # Always capitalized in the singular, so probably should be
                # NNP or NNPS. e.g. 'mormons'
                pass
            elif w in _S_FORMS_USUALLY_PROPER:
                pass
            elif w in _S_FORMS_NOT_PLURAL:
                pass
            else:
                # 575 single unhyphenated words are specially listed in their
                # plural forms in WordNet, though they can be regular plurals
                # e.g. 'acres', 'arms', 'basics', 'eggs', 'proverbs',
                # 'shorts', 'woods'
                lem = w[:-1]  # remove -s
    return lem
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NNS--stem to -y:
allies amenities authorities canaries contemporaries
flies follies formalities fries funnies liabilities
hostilities jimmies skivvies
NNS--stem to -y or -ie:
hippies
NNPS--leave:
alleghenies
NNPS--can stem to -ie:
mounties
Not plural of a singular noun--leave:
humanities
species
Plural multiples of 10--stem to -y:
twenties thirties forties fifties sixties seventies eighties nineties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Single-word, unhyphenated nouns in WordNet ending in -s but not -ies, -ss, or -us | |
# that are >=4 chars long and have a corresponding noun stem in WordNet | |
# 575 entries | |
['abcs', 'acoustics', 'acres', 'acts', 'adams', 'adps', 'aesthetics', 'affairs', | |
'aides', 'aids', 'airs', 'alas', 'aleutians', 'alexanders', 'aloes', 'alps', | |
'ambages', 'amnios', 'anagrams', 'anas', 'ancients', 'andrews', 'anklets', | |
'antipodes', 'antitrades', 'appalachians', 'aquatics', 'aras', 'archives', | |
'ares', 'argos', 'arms', 'arts', 'ascomycetes', 'assets', 'assizes', | |
'auspices', 'baas', 'balkans', 'balusters', 'banks', 'baptists', 'barrels', | |
'bars', 'basics', 'basidiomycetes', 'bats', 'beads', 'beatniks', 'beats', | |
'bellows', 'belongings', 'bends', 'berbers', 'berkshires', 'bermudas', | |
'bilges', 'billings', 'bitters', 'blahs', 'bleachers', 'blinks', 'bloomers', | |
'blues', 'boards', 'bobbysocks', 'bollocks', 'bones', 'bounds', 'bowels', | |
'bowls', 'boxcars', 'boxers', 'braces', 'brakes', 'breakers', 'bridges', | |
'briefs', 'brits', 'brooks', 'buckskins', 'buns', 'burns', 'buttocks', | |
'callas', 'candlepins', 'canticles', 'cards', 'carolinas', 'cascades', | |
'ceramics', 'ceres', 'chains', 'chambers', 'channels', 'charades', 'checkers', | |
'cheviots', 'chilblains', 'chips', 'chives', 'circumstances', 'clams', | |
'clappers', 'classics', 'cleaners', 'cleats', 'cleavers', 'clews', 'clocks', | |
'cobblers', 'cocos', 'coevals', 'collards', 'colors', 'colossians', 'colours', | |
'comforts', 'commons', 'communications', 'compliments', 'conditions', | |
'congratulations', 'conserves', 'contents', 'contras', 'conveniences', 'cords', | |
'corduroys', 'corps', 'corrections', 'costs', 'cotswolds', 'cows', 'crabs', | |
'crackers', 'cracklings', 'craps', 'credentials', 'credits', 'creeps', | |
'crossroads', 'crossways', 'customs', 'damages', 'darts', 'davys', 'days', | |
'dealings', 'debs', 'deeds', 'dermatoglyphics', 'descendants', 'deserts', | |
'details', 'devices', 'dialectics', 'dias', 'diggings', 'digs', 'dippers', | |
'discomycetes', 'divers', 'dominos', 'doubles', 'dozens', 'draughts', | |
'drawers', 'dregs', 'drippings', 'duckpins', 'duds', 'dumas', 'dumplings', | |
'dumps', 'dunkers', 'dynamics', 'eastwards', 'edwards', 'effects', 'eggs', | |
'elements', 'ephesians', 'esthetics', 'ethics', 'eyes', 'falls', 'fatigues', | |
'feelings', 'fields', 'fijis', 'finances', 'findings', 'fives', 'fixings', | |
'flaps', 'flats', 'folks', 'followers', 'forwards', 'fumes', 'fundamentals', | |
'funds', 'galatians', 'gasteromycetes', 'gastromycetes', 'gates', 'gens', | |
'giblets', 'glassworks', 'gobs', 'goldfields', 'gospels', 'graphics', 'graves', | |
'greaves', 'greens', 'gripes', 'grits', 'groats', 'grounds', 'groves', 'guts', | |
'gyps', 'hackles', 'hands', 'hanks', 'harmonics', 'hays', 'heaps', 'hearts', | |
'heavens', 'heaves', 'hebrews', 'heights', 'heroics', 'highlands', 'himalayas', | |
'hindquarters', 'hipsters', 'hives', 'hollands', 'honours', 'hooks', 'hoops', | |
'hops', 'horseshoes', 'hours', 'houyhnhnms', 'huaraches', 'humans', 'hurdles', | |
'hymeneals', 'hysterics', 'indris', 'innings', 'instructions', 'irons', | |
'ironsides', 'ironworks', 'isis', 'isometrics', 'israelites', 'jacks', | |
'jackstraws', 'jacobs', 'jaspers', 'jitters', 'jodhpurs', 'johns', 'judas', | |
'judges', 'junkers', 'kansas', 'khakis', 'knuckles', 'lamentations', 'lancers', | |
'laos', 'lashings', 'lats', 'laurels', 'laws', 'leaders', 'lees', 'leftovers', | |
'legs', 'leotards', 'letters', 'levis', 'limbers', 'links', 'loads', | |
'lodgings', 'logos', 'loins', 'loos', 'lots', 'lowlands', 'lyons', 'madeiras', | |
'majors', 'manes', 'manners', 'marbles', 'marches', 'marines', 'marks', 'mars', | |
'marseilles', 'marshals', 'masses', 'masters', 'mates', 'maths', 'maulers', | |
'mayas', 'mays', 'means', 'mechanics', 'medlars', 'megrims', 'metalworks', | |
'methodists', 'metrics', 'mews', 'mills', 'minors', 'minutes', 'mithras', | |
'mnemonics', 'mods', 'monas', 'mons', 'mopes', 'morals', 'mores', 'mormons', | |
'myxomycetes', 'najas', 'names', 'needs', 'nerves', 'ninepins', 'nones', | |
'northwards', 'nothings', 'numbers', 'nuts', 'nylons', 'occasions', 'oddments', | |
'offsides', 'operations', 'optics', 'organs', 'oscines', 'outskirts', 'owens', | |
'pains', 'pants', 'papers', 'parks', 'parsons', 'parts', 'payables', 'peanuts', | |
'peoples', 'pharmaceutics', 'philippians', 'philippines', 'physics', | |
'pickings', 'piles', 'plaudits', 'pliers', 'plyers', 'poitiers', 'polemics', | |
'polls', 'porcupines', 'pragmatics', 'prelims', 'premises', 'preserves', | |
'primates', 'privates', 'proceedings', 'profits', 'propaedeutics', 'prophets', | |
'props', 'proverbs', 'provisions', 'provos', 'psalms', 'punks', 'pyrites', | |
'pyrotechnics', 'pythias', 'quadratics', 'quakers', 'quarters', 'queens', | |
'quoits', 'raffles', 'rafts', 'rails', 'rastas', 'rates', 'rayons', | |
'rearwards', 'receipts', 'regrets', 'relations', 'reserves', 'respects', | |
'rings', 'roads', 'roberts', 'rockers', 'romans', 'rooms', 'roots', 'rounders', | |
'ruddles', 'rudiments', 'sales', 'sands', 'sauternes', 'savings', 'scads', | |
'scopes', 'scores', 'scots', 'scours', 'scraps', 'scrubs', 'scruples', 'seats', | |
'seawards', 'sebs', 'sellers', 'sens', 'services', 'sessions', 'settlings', | |
'sevens', 'shades', 'shakers', 'shambles', 'shears', 'shekels', 'shingles', | |
'shirtsleeves', 'shoes', 'shorts', 'shucks', 'sights', 'silks', 'sills', | |
'silversides', 'singles', 'skinheads', 'skittles', 'slacks', 'slews', 'slops', | |
'smuts', 'snips', 'snuffers', 'solomons', 'sops', 'southwards', 'soviets', | |
'spareribs', 'specs', 'spectacles', 'spillikins', 'spirits', 'splinters', | |
'spots', 'sprinkles', 'sprites', 'stacks', 'staggers', 'stairs', 'stakes', | |
'stalls', 'statics', 'stations', 'statistics', 'stays', 'steps', | |
'stockholdings', 'stocks', 'stops', 'straits', 'strings', 'stripes', | |
'summercaters', 'sweats', 'sweetbreads', 'tabernacles', 'tabis', 'tactics', | |
'tails', 'takings', 'talks', 'tangiers', 'taos', 'taps', 'taxis', 'tears', | |
'teens', 'tenpins', 'teras', 'terms', 'thebes', 'therapeutics', 'things', | |
'threads', 'thrips', 'throes', 'tigers', 'times', 'tons', 'tops', 'tours', | |
'transactions', 'trappings', 'trembles', 'trimmings', 'troglodytes', 'troops', | |
'tropics', 'trumpets', 'trunks', 'tums', 'turps', 'twins', 'ulfilas', 'values', | |
'vapors', 'vapours', 'velours', 'vespers', 'viands', 'vibes', 'victuals', | |
'viewers', 'waders', 'wads', 'wages', 'wales', 'walloons', 'waters', 'watts', | |
'ways', 'weeds', 'wells', 'westwards', 'whiskers', 'windows', 'wings', | |
'winnings', 'wits', 'woods', 'words', 'workings', 'works', 'writings', 'yaws', | |
'years', 'yips'] | |
# Decisions: | |
""" | |
Always capitalized in singular--NNP or NNPS: | |
adams adps aleutians alexanders andrews appalachians aras argos balkans baptists | |
berbers berkshires bermudas carolinas cheviots colossians contras cotswolds davys dias | |
dumas edwards ephesians fijis galatians hebrews himalayas hollands houyhnhnms isis | |
israelites jacobs judas junkers kansas laos levis lyons mayas madeiras methodists mithras | |
monas mons mormons najas owens philippians philippines poitiers provos pythias rastas roberts | |
sauternes scots sebs solomons tangiers taos teras turps ulfilas walloons | |
Usually NNPS: | |
brits | |
NNP or NNPS: | |
romans | |
NNP: | |
alps anas hays | |
Typically not the plural form of a singular noun: | |
alas | |
amnios | |
corps | |
mores | |
acoustics aquatics auspices statics pragmatics | |
winnings | |
Others--remove the 's': | |
acres acts aesthetics affairs aides aids airs aloes | |
ambages anagrams ancients anklets antipodes antitrades | |
archives ares arms arts ascomycetes assets assizes baas balusters | |
banks barrels bars basics basidiomycetes bats beads beatniks beats bellows | |
belongings bends bilges billings bitters blahs bleachers blinks bloomers | |
blues boards bobbysocks bollocks bones bounds bowels bowls boxcars boxers | |
braces brakes breakers bridges briefs brooks buckskins buns burns | |
buttocks callas candlepins canticles cards cascades ceramics ceres chains | |
chambers channels charades checkers chilblains chips chives circumstances | |
clams clappers classics cleaners cleats cleavers clews clocks cobblers cocos | |
coevals collards colors colours comforts commons communications compliments | |
conditions congratulations conserves contents conveniences cords corduroys | |
corrections costs cows crabs crackers cracklings craps credentials | |
credits creeps crossroads crossways customs damages darts days dealings debs | |
deeds dermatoglyphics descendants deserts details devices dialectics | |
diggings digs dippers discomycetes divers dominos doubles dozens draughts | |
drawers dregs drippings duckpins duds dumplings dumps dunkers dynamics | |
eastwards effects eggs elements esthetics ethics eyes falls fatigues | |
feelings fields finances findings fives fixings flaps flats folks followers | |
forwards fumes fundamentals funds gasteromycetes gastromycetes gates gens | |
giblets glassworks gobs goldfields gospels graphics graves greaves greens | |
gripes grits groats grounds groves guts gyps hackles hands hanks harmonics | |
heaps hearts heavens heaves heights heroics highlands hindquarters | |
hipsters hives honours hooks hoops hops horseshoes hours huaraches humans | |
hurdles hymeneals hysterics indris innings instructions irons ironsides | |
ironworks isometrics jacks jackstraws jaspers jitters jodhpurs johns judges | |
khakis knuckles lamentations lancers lashings lats laurels laws leaders lees | |
leftovers legs leotards letters limbers links loads lodgings logos loins | |
loos lots lowlands majors manes manners marbles marines marks mars | |
marseilles marshals masters mates maths maulers mays means mechanics | |
medlars megrims metalworks metrics mews mills minors minutes mnemonics mods | |
mopes morals myxomycetes names needs nerves ninepins nones northwards | |
nothings numbers nuts nylons occasions oddments offsides operations optics | |
organs oscines outskirts pains pants papers parks parsons parts payables | |
peanuts peoples pharmaceutics physics pickings piles plaudits pliers plyers | |
polemics polls porcupines prelims premises preserves primates | |
privates proceedings profits propaedeutics prophets props proverbs | |
provisions psalms punks pyrites pyrotechnics quadratics quakers quarters | |
queens quoits raffles rafts rails rates rayons rearwards receipts regrets | |
relations reserves respects rings roads rockers rooms roots rounders | |
ruddles rudiments sales sands savings scads scopes scores scours scraps | |
scrubs scruples seats seawards sellers sens services sessions settlings | |
sevens shades shakers shambles shears shekels shingles shirtsleeves shoes | |
shorts shucks sights silks sills silversides singles skinheads skittles | |
slacks slews slops smuts snips snuffers sops southwards soviets spareribs | |
specs spectacles spillikins spirits splinters spots sprinkles sprites stacks | |
staggers stairs stakes stalls stations statistics stays steps | |
stockholdings stocks stops straits strings stripes summercaters sweats | |
sweetbreads tabernacles tabis tactics tails takings talks taps taxis tears | |
teens tenpins terms thebes therapeutics things threads thrips throes tigers | |
times tons tops tours transactions trappings trembles trimmings troglodytes | |
troops tropics trumpets trunks tums twins values vapors vapours velours | |
vespers viands vibes victuals viewers waders wads wages wales waters watts | |
ways weeds wells westwards whiskers windows wings wits woods words | |
workings works writings yaws years yips | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment