Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save nschneid/d49de87813bd6499ff7c2d861eba196c to your computer and use it in GitHub Desktop.
Save nschneid/d49de87813bd6499ff7c2d861eba196c to your computer and use it in GitHub Desktop.
Use fine-grained POS tags to avoid problems with WordNet morphy lemmatizer (e.g., synsets with plural lemmas)
from nltk.corpus import wordnet as wn
# Hand-picked lemma overrides that are applied before WordNet is consulted.

# Forms overridden when the tag is VBD or VBN.
_PAST_FORM_LEMMAS = {
    'found': 'find',
    'ground': 'grind',
    'rent': 'rend',
    'smelt': 'smell',
    'wound': 'wind',
}

# Forms overridden only when the tag is VBD.
_VBD_ONLY_LEMMAS = {
    'fell': 'fall',
    'lay': 'lie',
    'saw': 'see',
}

# Irregular plurals; checked http://www.esldesk.com/vocabulary/irregular-nouns for irregular forms
_IRREGULAR_PLURAL_LEMMAS = {
    'people': 'person',
    'teeth': 'tooth',
    'men': 'man',
    'brethren': 'brother',
    'dice': 'die',
    'elves': 'elf',
    'fungi': 'fungus',
    'memoranda': 'memorandum',
    'oxen': 'ox',
    'vitae': 'vita',
}

# sibilant + -es: strip the final two characters
_SIBILANT_ES_PLURALS = frozenset({'clutches', 'losses', 'marches', 'masses', 'starches'})

# -ies words that must NOT be rewritten to -y
# TODO: if 'mounties' is tagged as NNPS, could stem to 'mountie'. don't do that for 'alleghenies'
_IES_KEPT_AS_IS = frozenset({'alleghenies', 'mounties', 'humanities', 'species'})

# 'brits' is usually NNPS; 'romans' is NNP or NNPS; others are NNP
_S_KEPT_PROPER = frozenset({'brits', 'romans', 'alps', 'anas', 'hays'})

# typically not the plural form of a singular noun
_S_KEPT_NOT_PLURAL = frozenset({
    'alas', 'amnios', 'corps', 'mores', 'acoustics', 'aquatics',
    'auspices', 'statics', 'pragmatics', 'winnings',
})


@memoize
def lemmatize_liberally(w, p):
    """
    Given an English word and its PTB POS tag (whose fine-grained information helps disambiguate some words),
    produce its lemma/stem. Uses WordNet, but is more aggressive than the `morphy` behavior, giving less surprising
    results for words like 'people' and 'eggs'.

    Args:
        w: the word form; it is lowercased before any lookup.
        p: the PTB POS tag (e.g. 'NNS', 'VBD'). May be None or empty, in which
            case no tag-specific override applies and `morphy` is called
            without a part of speech.

    Returns:
        The lemma as a lowercase string. If WordNet's `morphy` finds nothing,
        the lowercased input is returned. For 'cannot' and words containing an
        apostrophe, the word is first split with `word_tokenize` and only the
        first resulting token is lemmatized and returned.

    >>> lemmatize_liberally('people', 'NNS')
    'person'
    >>> lemmatize_liberally('men', 'NNS')
    'man'
    >>> lemmatize_liberally('arms', 'NNS')
    'arm'
    >>> lemmatize_liberally('eggs', 'NNS')
    'egg'
    >>> lemmatize_liberally('twenties', 'NNS')
    'twenty'
    >>> lemmatize_liberally('woods', 'NNS')
    'wood'
    >>> lemmatize_liberally('glasses', 'NNS')
    'glasses'
    >>> lemmatize_liberally('moses', 'NNS')
    'moses'
    >>> lemmatize_liberally('alps', 'NNS')
    'alps'
    >>> lemmatize_liberally('saw', 'VBD')
    'see'
    """
    w = w.lower()

    # --- Tag-specific overrides that bypass WordNet entirely ---
    if p in {'VBD', 'VBN'}:
        if w in _PAST_FORM_LEMMAS:
            return _PAST_FORM_LEMMAS[w]
        # The VBD-only overrides must be tested inside this branch: as a
        # sibling `elif p=='VBD'` they would never be reached, because 'VBD'
        # is already matched by the condition above.
        if p == 'VBD' and w in _VBD_ONLY_LEMMAS:
            return _VBD_ONLY_LEMMAS[w]
        # 'ridden' is a past participle of 'rid' and the past participle of 'ride'. The POS is not enough
        # to disambiguate, but 'ride' (which WordNet gives) is probably more common.
    elif p and p[0] == 'V' and w == 'stove':
        # WordNet has only the past/ppt form of 'stave', but apparently 'stove' can be a verb
        return 'stove'
    elif p == 'NNS':
        if w in _IRREGULAR_PLURAL_LEMMAS:
            return _IRREGULAR_PLURAL_LEMMAS[w]
        if w in _SIBILANT_ES_PLURALS:
            return w[:-2]
        # 'axes' will stem to 'ax', though it could also be 'axis'
        # 'bases' will stem to 'base' though it could also be 'basis'
        # 'glasses' will stem to 'glasses' (i.e. eyewear) though it could be the plural of 'glass'
        # 'breeches', 'riches' will not change when stemmed.

    # TODO: these->this, those->that, an->a?
    # TODO: towards->toward, o'er->over, till->until, outta->out of, etc.?

    if w == 'cannot' or "'" in w:
        # Imported here because the tokenizer is only needed on this path.
        from nltk.tokenize import word_tokenize
        # period ensures no part of the word is interpreted as sentence-final punctuation
        tt = word_tokenize(w + ' .')[:-1]
    else:
        tt = [w]

    # Map the first letter of the tag to a WordNet part of speech. A missing
    # or empty tag, or an unmapped initial, yields None.
    wn_pos = None
    if p:
        wn_pos = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}.get(p[0])

    lem = wn.morphy(tt[0], wn_pos)
    if lem:
        # When morphy hands back the plural unchanged, try harder for NNS.
        if lem == w and p == 'NNS':
            if w.endswith('ies') and wn.lemmas(w[:-3] + 'y', wn.NOUN):  # exclude: 'rabies', etc.
                if w not in _IES_KEPT_AS_IS:
                    lem = w[:-3] + 'y'  # replace -ies with -y
            elif (len(w) >= 4 and w.endswith('s') and not w.endswith(('ss', 'us'))
                    and wn.lemmas(w[:-1], wn.NOUN)):
                # excludes 'yes', 'halitosis', etc. (which are probably not NNS)
                if w == 'abcs':
                    lem = 'abc'
                else:
                    # always capitalized in the singular, so probably should be NNP or NNPS. e.g. 'mormons'
                    only_capitalized = all(
                        lemma.name()[0].isupper() for lemma in wn.lemmas(w, wn.NOUN)
                    )
                    keep = only_capitalized or w in _S_KEPT_PROPER or w in _S_KEPT_NOT_PLURAL
                    if not keep:
                        # 575 single unhyphenated words are specially listed in their plural forms in WordNet,
                        # though they can be regular plurals
                        # e.g. 'acres', 'arms', 'basics', 'eggs', 'proverbs', 'shorts', 'woods'
                        lem = w[:-1]  # remove -s
        tt[0] = lem
    return tt[0]
NNS--stem to -y:
allies amenities authorities canaries contemporaries
flies follies formalities fries funnies liabilities
hostilities jimmies skivvies
NNS--stem to -y or -ie:
hippies
NNPS--leave:
alleghenies
NNPS--can stem to -ie:
mounties
Not plural of a singular noun--leave:
humanities
species
Plural multiples of 10--stem to -y:
twenties thirties forties fifties sixties seventies eighties nineties
# Single-word, unhyphenated nouns in WordNet ending in -s but not -ies, -ss, or -us
# that are >=4 chars long and have a corresponding noun stem in WordNet
# 575 entries
['abcs', 'acoustics', 'acres', 'acts', 'adams', 'adps', 'aesthetics', 'affairs',
'aides', 'aids', 'airs', 'alas', 'aleutians', 'alexanders', 'aloes', 'alps',
'ambages', 'amnios', 'anagrams', 'anas', 'ancients', 'andrews', 'anklets',
'antipodes', 'antitrades', 'appalachians', 'aquatics', 'aras', 'archives',
'ares', 'argos', 'arms', 'arts', 'ascomycetes', 'assets', 'assizes',
'auspices', 'baas', 'balkans', 'balusters', 'banks', 'baptists', 'barrels',
'bars', 'basics', 'basidiomycetes', 'bats', 'beads', 'beatniks', 'beats',
'bellows', 'belongings', 'bends', 'berbers', 'berkshires', 'bermudas',
'bilges', 'billings', 'bitters', 'blahs', 'bleachers', 'blinks', 'bloomers',
'blues', 'boards', 'bobbysocks', 'bollocks', 'bones', 'bounds', 'bowels',
'bowls', 'boxcars', 'boxers', 'braces', 'brakes', 'breakers', 'bridges',
'briefs', 'brits', 'brooks', 'buckskins', 'buns', 'burns', 'buttocks',
'callas', 'candlepins', 'canticles', 'cards', 'carolinas', 'cascades',
'ceramics', 'ceres', 'chains', 'chambers', 'channels', 'charades', 'checkers',
'cheviots', 'chilblains', 'chips', 'chives', 'circumstances', 'clams',
'clappers', 'classics', 'cleaners', 'cleats', 'cleavers', 'clews', 'clocks',
'cobblers', 'cocos', 'coevals', 'collards', 'colors', 'colossians', 'colours',
'comforts', 'commons', 'communications', 'compliments', 'conditions',
'congratulations', 'conserves', 'contents', 'contras', 'conveniences', 'cords',
'corduroys', 'corps', 'corrections', 'costs', 'cotswolds', 'cows', 'crabs',
'crackers', 'cracklings', 'craps', 'credentials', 'credits', 'creeps',
'crossroads', 'crossways', 'customs', 'damages', 'darts', 'davys', 'days',
'dealings', 'debs', 'deeds', 'dermatoglyphics', 'descendants', 'deserts',
'details', 'devices', 'dialectics', 'dias', 'diggings', 'digs', 'dippers',
'discomycetes', 'divers', 'dominos', 'doubles', 'dozens', 'draughts',
'drawers', 'dregs', 'drippings', 'duckpins', 'duds', 'dumas', 'dumplings',
'dumps', 'dunkers', 'dynamics', 'eastwards', 'edwards', 'effects', 'eggs',
'elements', 'ephesians', 'esthetics', 'ethics', 'eyes', 'falls', 'fatigues',
'feelings', 'fields', 'fijis', 'finances', 'findings', 'fives', 'fixings',
'flaps', 'flats', 'folks', 'followers', 'forwards', 'fumes', 'fundamentals',
'funds', 'galatians', 'gasteromycetes', 'gastromycetes', 'gates', 'gens',
'giblets', 'glassworks', 'gobs', 'goldfields', 'gospels', 'graphics', 'graves',
'greaves', 'greens', 'gripes', 'grits', 'groats', 'grounds', 'groves', 'guts',
'gyps', 'hackles', 'hands', 'hanks', 'harmonics', 'hays', 'heaps', 'hearts',
'heavens', 'heaves', 'hebrews', 'heights', 'heroics', 'highlands', 'himalayas',
'hindquarters', 'hipsters', 'hives', 'hollands', 'honours', 'hooks', 'hoops',
'hops', 'horseshoes', 'hours', 'houyhnhnms', 'huaraches', 'humans', 'hurdles',
'hymeneals', 'hysterics', 'indris', 'innings', 'instructions', 'irons',
'ironsides', 'ironworks', 'isis', 'isometrics', 'israelites', 'jacks',
'jackstraws', 'jacobs', 'jaspers', 'jitters', 'jodhpurs', 'johns', 'judas',
'judges', 'junkers', 'kansas', 'khakis', 'knuckles', 'lamentations', 'lancers',
'laos', 'lashings', 'lats', 'laurels', 'laws', 'leaders', 'lees', 'leftovers',
'legs', 'leotards', 'letters', 'levis', 'limbers', 'links', 'loads',
'lodgings', 'logos', 'loins', 'loos', 'lots', 'lowlands', 'lyons', 'madeiras',
'majors', 'manes', 'manners', 'marbles', 'marches', 'marines', 'marks', 'mars',
'marseilles', 'marshals', 'masses', 'masters', 'mates', 'maths', 'maulers',
'mayas', 'mays', 'means', 'mechanics', 'medlars', 'megrims', 'metalworks',
'methodists', 'metrics', 'mews', 'mills', 'minors', 'minutes', 'mithras',
'mnemonics', 'mods', 'monas', 'mons', 'mopes', 'morals', 'mores', 'mormons',
'myxomycetes', 'najas', 'names', 'needs', 'nerves', 'ninepins', 'nones',
'northwards', 'nothings', 'numbers', 'nuts', 'nylons', 'occasions', 'oddments',
'offsides', 'operations', 'optics', 'organs', 'oscines', 'outskirts', 'owens',
'pains', 'pants', 'papers', 'parks', 'parsons', 'parts', 'payables', 'peanuts',
'peoples', 'pharmaceutics', 'philippians', 'philippines', 'physics',
'pickings', 'piles', 'plaudits', 'pliers', 'plyers', 'poitiers', 'polemics',
'polls', 'porcupines', 'pragmatics', 'prelims', 'premises', 'preserves',
'primates', 'privates', 'proceedings', 'profits', 'propaedeutics', 'prophets',
'props', 'proverbs', 'provisions', 'provos', 'psalms', 'punks', 'pyrites',
'pyrotechnics', 'pythias', 'quadratics', 'quakers', 'quarters', 'queens',
'quoits', 'raffles', 'rafts', 'rails', 'rastas', 'rates', 'rayons',
'rearwards', 'receipts', 'regrets', 'relations', 'reserves', 'respects',
'rings', 'roads', 'roberts', 'rockers', 'romans', 'rooms', 'roots', 'rounders',
'ruddles', 'rudiments', 'sales', 'sands', 'sauternes', 'savings', 'scads',
'scopes', 'scores', 'scots', 'scours', 'scraps', 'scrubs', 'scruples', 'seats',
'seawards', 'sebs', 'sellers', 'sens', 'services', 'sessions', 'settlings',
'sevens', 'shades', 'shakers', 'shambles', 'shears', 'shekels', 'shingles',
'shirtsleeves', 'shoes', 'shorts', 'shucks', 'sights', 'silks', 'sills',
'silversides', 'singles', 'skinheads', 'skittles', 'slacks', 'slews', 'slops',
'smuts', 'snips', 'snuffers', 'solomons', 'sops', 'southwards', 'soviets',
'spareribs', 'specs', 'spectacles', 'spillikins', 'spirits', 'splinters',
'spots', 'sprinkles', 'sprites', 'stacks', 'staggers', 'stairs', 'stakes',
'stalls', 'statics', 'stations', 'statistics', 'stays', 'steps',
'stockholdings', 'stocks', 'stops', 'straits', 'strings', 'stripes',
'summercaters', 'sweats', 'sweetbreads', 'tabernacles', 'tabis', 'tactics',
'tails', 'takings', 'talks', 'tangiers', 'taos', 'taps', 'taxis', 'tears',
'teens', 'tenpins', 'teras', 'terms', 'thebes', 'therapeutics', 'things',
'threads', 'thrips', 'throes', 'tigers', 'times', 'tons', 'tops', 'tours',
'transactions', 'trappings', 'trembles', 'trimmings', 'troglodytes', 'troops',
'tropics', 'trumpets', 'trunks', 'tums', 'turps', 'twins', 'ulfilas', 'values',
'vapors', 'vapours', 'velours', 'vespers', 'viands', 'vibes', 'victuals',
'viewers', 'waders', 'wads', 'wages', 'wales', 'walloons', 'waters', 'watts',
'ways', 'weeds', 'wells', 'westwards', 'whiskers', 'windows', 'wings',
'winnings', 'wits', 'woods', 'words', 'workings', 'works', 'writings', 'yaws',
'years', 'yips']
# Decisions:
"""
Always capitalized in singular--NNP or NNPS:
adams adps aleutians alexanders andrews appalachians aras argos balkans baptists
berbers berkshires bermudas carolinas cheviots colossians contras cotswolds davys dias
dumas edwards ephesians fijis galatians hebrews himalayas hollands houyhnhnms isis
israelites jacobs judas junkers kansas laos levis lyons mayas madeiras methodists mithras
monas mons mormons najas owens philippians philippines poitiers provos pythias rastas roberts
sauternes scots sebs solomons tangiers taos teras turps ulfilas walloons
Usually NNPS:
brits
NNP or NNPS:
romans
NNP:
alps anas hays
Typically not the plural form of a singular noun:
alas
amnios
corps
mores
acoustics aquatics auspices statics pragmatics
winnings
Others--remove the 's':
acres acts aesthetics affairs aides aids airs aloes
ambages anagrams ancients anklets antipodes antitrades
archives ares arms arts ascomycetes assets assizes baas balusters
banks barrels bars basics basidiomycetes bats beads beatniks beats bellows
belongings bends bilges billings bitters blahs bleachers blinks bloomers
blues boards bobbysocks bollocks bones bounds bowels bowls boxcars boxers
braces brakes breakers bridges briefs brooks buckskins buns burns
buttocks callas candlepins canticles cards cascades ceramics ceres chains
chambers channels charades checkers chilblains chips chives circumstances
clams clappers classics cleaners cleats cleavers clews clocks cobblers cocos
coevals collards colors colours comforts commons communications compliments
conditions congratulations conserves contents conveniences cords corduroys
corrections costs cows crabs crackers cracklings craps credentials
credits creeps crossroads crossways customs damages darts days dealings debs
deeds dermatoglyphics descendants deserts details devices dialectics
diggings digs dippers discomycetes divers dominos doubles dozens draughts
drawers dregs drippings duckpins duds dumplings dumps dunkers dynamics
eastwards effects eggs elements esthetics ethics eyes falls fatigues
feelings fields finances findings fives fixings flaps flats folks followers
forwards fumes fundamentals funds gasteromycetes gastromycetes gates gens
giblets glassworks gobs goldfields gospels graphics graves greaves greens
gripes grits groats grounds groves guts gyps hackles hands hanks harmonics
heaps hearts heavens heaves heights heroics highlands hindquarters
hipsters hives honours hooks hoops hops horseshoes hours huaraches humans
hurdles hymeneals hysterics indris innings instructions irons ironsides
ironworks isometrics jacks jackstraws jaspers jitters jodhpurs johns judges
khakis knuckles lamentations lancers lashings lats laurels laws leaders lees
leftovers legs leotards letters limbers links loads lodgings logos loins
loos lots lowlands majors manes manners marbles marines marks mars
marseilles marshals masters mates maths maulers mays means mechanics
medlars megrims metalworks metrics mews mills minors minutes mnemonics mods
mopes morals myxomycetes names needs nerves ninepins nones northwards
nothings numbers nuts nylons occasions oddments offsides operations optics
organs oscines outskirts pains pants papers parks parsons parts payables
peanuts peoples pharmaceutics physics pickings piles plaudits pliers plyers
polemics polls porcupines prelims premises preserves primates
privates proceedings profits propaedeutics prophets props proverbs
provisions psalms punks pyrites pyrotechnics quadratics quakers quarters
queens quoits raffles rafts rails rates rayons rearwards receipts regrets
relations reserves respects rings roads rockers rooms roots rounders
ruddles rudiments sales sands savings scads scopes scores scours scraps
scrubs scruples seats seawards sellers sens services sessions settlings
sevens shades shakers shambles shears shekels shingles shirtsleeves shoes
shorts shucks sights silks sills silversides singles skinheads skittles
slacks slews slops smuts snips snuffers sops southwards soviets spareribs
specs spectacles spillikins spirits splinters spots sprinkles sprites stacks
staggers stairs stakes stalls stations statistics stays steps
stockholdings stocks stops straits strings stripes summercaters sweats
sweetbreads tabernacles tabis tactics tails takings talks taps taxis tears
teens tenpins terms thebes therapeutics things threads thrips throes tigers
times tons tops tours transactions trappings trembles trimmings troglodytes
troops tropics trumpets trunks tums twins values vapors vapours velours
vespers viands vibes victuals viewers waders wads wages wales waters watts
ways weeds wells westwards whiskers windows wings wits woods words
workings works writings yaws years yips
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment