Skip to content

Instantly share code, notes, and snippets.

@nschneid
Created September 7, 2013 15:50
Show Gist options
  • Save nschneid/6476715 to your computer and use it in GitHub Desktop.
Save nschneid/6476715 to your computer and use it in GitHub Desktop.
Scripts for working with part-of-speech tagsets: describing the morphosyntactic attributes encoded by tags, and converting between different tagsets. Cf. https://gist.github.com/nschneid/4231292
# Some compound tags from full Brown tag list at http://www.scs.leeds.ac.uk/ccalas/tagsets/brown.html
# (plus a few missing from the other POS mapping).
# Possessive/genitive tags (ending in '$') are included if they don't correspond to a single PTB tag due to tokenization differences.
# For the purposes of converting to Twitter tags, strip '*' from the tag. Foreign word tags begin with "FW-"; convert these to Twitter POS "G". For any compound tag not in this list, use the first part of the tag.
Brown Twitter PTB
QLP R RB
NRS ^ NNPS
NIL G ??
' , ??
AP$ X JJ+POS
CD$ S CD+POS
DO+PPSS G VB+PRP
DO+PPSS G VBP+PRP
DT$ D DET+POS
DT+BEZ L DT+VBZ
DT+MD L DT+MD
DTS+BEZ L DT+VBZ
EX+BEZ Y EX+VBZ
EX+HVD Y EX+VBD
EX+HVZ Y EX+VBZ
EX+MD Y EX+MD
JJ$ X JJ+POS
NN$ S NN+POS
NN+BEZ L NN+VBZ
NN+HVD L NN+VBD
NN+HVZ L NN+VBZ
NN+MD L NN+MD
NNS$ S NNS+POS
NNS+MD L NNS+MD
NP$ Z NNP+POS
NP+BEZ M NNP+VBZ
NP+HVZ M NNP+VBZ
NP+MD M NNP+MD
NPS$ Z NNPS+POS
NR$ S NN+POS
NR+MD L NN+MD
PN$ S PRP+POS
PN+BEZ L NN+VBZ
PN+HVD L NN+VBD
PN+HVZ L NN+VBZ
PN+MD L NN+MD
PPS+BEZ L PRP+VBZ
PPS+HVD L PRP+VBD
PPS+HVZ L PRP+VBZ
PPS+MD L PRP+MD
PPSS+BEM L PRP+VBP
PPSS+BER L PRP+VBP
PPSS+BEZ L PRP+VBZ
PPSS+HV L PRP+VB
PPSS+HV L PRP+VBP
PPSS+HVD L PRP+VBD
PPSS+MD L PRP+MD
PPSS+VB L PRP+VB
PPSS+VB L PRP+VBP
RB$ X RB+POS
RB+BEZ X RB+VBZ
WDT+BER L WDT+VBP
WDT+BER+PP G ??
WDT+BEZ L WDT+VBZ
WDT+DO+PPS G ??
WDT+DOD L WDT+VBD
WDT+HVZ L WDT+VBZ
WPS+BEZ L WP+VBZ
WPS+HVD L WP+VBD
WPS+HVZ L WP+VBZ
WPS+MD L WP+MD
WRB+BER X WRB+VBP
WRB+BEZ X WRB+VBZ
WRB+DO X WRB+VBP
WRB+DOD X WRB+VBD
WRB+DOZ X WRB+VBZ
WRB+IN X WRB+IN
WRB+MD X WRB+MD
?? , #
# Due to PTB bugs
WPS+BEZ L WP+POS
?? L WDT+POS
?? Z VBP+POS
?? Y EX+POS
# that's (demonstrative) - should be DT+VBZ
?? L IN+POS
?? L IN+VBZ
?? L DT+POS
# let's
?? V VB+POS
# Boeing's
?? Z VBG+POS
# United's
?? Z VBN+POS
?? G FW+POS
# Not sure if WPS~WP and WDT~WDT are exact Brown~PTB correspondences.
# Does WDT+MD (that'll) ever occur in Brown? WPS+HV (who've)?
WDT+MD L WDT+MD
WPS+HV L WP+VBP
'''
Map between POSes from different tagsets.
Currently supports PTB, Brown, and the ARK Twitter tagset, though a
few conventions of the latter are not yet finalized.
Depends on two map files, POSMappings.txt and browncompound2twitter.txt
(the latter is a supplement to the former). Some mappings are unclear
or not yet finalized.
Coverage has been tested for Brown -> Twitter.
@author: Nathan Schneider ([email protected])
@since: 2011-02-18
'''
from __future__ import print_function, unicode_literals, division
from future_builtins import map, filter
from collections import defaultdict, Counter
MAP_FILE = 'POSMappings.txt'
BROWN_CMPD_MAP_FILE = 'browncompound2twitter.txt'
def read_tabbed_file(filename):
    '''
    Iterate over the content lines of a map file.

    Lines beginning with '#' are treated as comments and empty lines are
    skipped; neither counts as a content line.

    Yields tuples (lineNum, clineNum, ln) where
      lineNum  is the raw line number in the file (1-based),
      clineNum is the index of the line among content lines (0-based),
      ln       is the line text without its line terminator.
    '''
    clineNum = 0 # content line (not counting comment or blank lines), 0-based
    with open(filename) as f:
        for lineNum, ln in enumerate(f, 1): # raw line, 1-based
            if ln.startswith('#'):
                continue
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would eat a real character when the final line
            # of the file has no trailing newline.
            ln = ln.rstrip('\r\n')
            if ln=='':
                continue
            yield (lineNum, clineNum, ln)
            clineNum += 1
# Tag mapping table: one list of column values per content row of the map files
tbl = []
tags = defaultdict(list) # map from a tagset-bound tag (such as 'Brown::VBZ') to a list of indices of rows in the table

# Load main table
header = None
for lnum,clnum,ln in read_tabbed_file(MAP_FILE):
    if clnum==0:
        # first content line holds the column names
        header = ln.split('\t')
        continue
    parts = ln.split('\t')
    tbl.append(parts)
    # The first two columns are skipped; every remaining column is indexed
    # under its column name.
    for tb,tag in zip(header[2:],parts[2:]):
        if tag!='' and tag!='not':
            tags['{}::{}'.format(tb, tag)].append(len(tbl)-1)

# Load supplementary table for converting compound Brown tags to Twitter tags
for lnum,clnum,ln in read_tabbed_file(BROWN_CMPD_MAP_FILE):
    if clnum==0:
        assert ln.split('\t')==['Brown','Twitter','PTB']
        continue
    try:
        brownTag, twitTag, ptbTag = ln.split('\t')
    except ValueError:
        # wrong number of columns: show the offending line before failing
        print(ln)
        raise
    if brownTag=='' or twitTag=='' or ptbTag=='':
        print('Incomplete entry in tag map:',ln)
    newEntry = [None]*len(header)
    newEntry[header.index('Brown')] = brownTag
    newEntry[header.index('Twitter')] = twitTag
    # The row is indexed under its PTB tag below, so the PTB column must be
    # filled in as well; otherwise a Brown -> PTB lookup on this row yields None.
    newEntry[header.index('PTB')] = ptbTag
    tbl.append(newEntry)
    tags['Brown::{}'.format(brownTag)].append(len(tbl)-1)
    tags['Twitter::{}'.format(twitTag)].append(len(tbl)-1)
    tags['PTB::{}'.format(ptbTag)].append(len(tbl)-1)
def remove_duplicates(l):
    '''
    Returns a new list holding the items of l with repeats dropped; each
    item keeps the position of its first occurrence. Items are compared
    with ==, so they need not be hashable.

    >>> remove_duplicates([4,4,6,3,6,9])
    [4, 6, 3, 9]
    '''
    kept = []
    for candidate in l:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
def ptb2brown(ptbTag, token=None):
    '''
    Returns a list of Brown tags corresponding to this PTB tag.

    For the PTB tag ':' the token, if it is ':' or '--', decides the
    result directly; everything else is looked up in the mapping table.

    >>> ptb2brown('VBZ')
    [u'VBZ', u'DOZ', u'HVZ', u'BEZ']
    >>> ptb2brown('IN')
    [u'CS', u'IN']
    >>> ptb2brown(':', token='--')
    [u'--']
    '''
    if ptbTag==':':
        for literal in (':', '--'):
            if token==literal:
                return [literal]
    brownCol = header.index('Brown')
    rowIds = tags['PTB::{}'.format(ptbTag)]
    brownTags = remove_duplicates([tbl[r][brownCol] for r in rowIds])
    assert 'not' not in brownTags,('May be ignored, untagged, or tokenized differently',ptbTag,brownTags)
    # TODO: Deal with issue of NR, NRS, NR$
    assert len(brownTags)>0, ptbTag
    return brownTags
def ptb2twit(ptbTag):
    '''
    Returns the Twitter tag corresponding to this PTB tag. The mapping
    table must give exactly one distinct Twitter tag for it, otherwise an
    assertion fails.

    >>> ptb2twit('VBZ')
    u'V'
    >>> ptb2twit('IN')
    u'P'
    >>> ptb2twit('.')
    u','
    >>> ptb2twit('PRP')
    u'O'
    >>> ptb2twit('NN+VBZ')
    u'L'
    >>> ptb2twit('PRP+VBP')
    u'L'
    >>> ptb2twit('PRP+VB')
    u'L'
    >>> ptb2twit('NNP+MD')
    u'M'
    >>> ptb2twit('NNP+POS')
    u'Z'
    >>> ptb2twit('EX+VBD')
    u'Y'

    # TODO: ptb2twit('??')
    '''
    twitterCol = header.index('Twitter')
    rowIds = tags['PTB::{}'.format(ptbTag)]
    candidates = remove_duplicates([tbl[r][twitterCol] for r in rowIds])
    assert len(candidates)==1,(ptbTag,candidates)
    return candidates[0]
# TODO: figure out '??' cases
def brown2ptb(brownTag, token=None, infinitive=None):
    '''
    Returns the PTB tag corresponding to this Brown tag.

    A few Brown tags cannot be resolved from the tag alone: IN, '.' and NR
    need the token, and VB/HV/DO need the infinitive flag. An assertion
    fails when the required extra information is missing.

    >>> brown2ptb('VBZ')
    u'VBZ'
    >>> brown2ptb('BEZ')
    u'VBZ'
    >>> brown2ptb('IN', 'to')
    u'TO'
    >>> brown2ptb('IN', 'for')
    u'IN'
    >>> brown2ptb('VB', infinitive=False)
    u'VBP'
    >>> brown2ptb('VB', infinitive=True)
    u'VB'
    '''
    # Special cases that are decided without consulting the table
    if brownTag=='IN':
        assert token is not None,'Transformation from Brown IN to PTB is indeterminate without token (could be TO or IN)'
        if token.lower()=='to':
            return 'TO'
        return 'IN'
    if brownTag=='.':
        assert token is not None,'Transformation from Brown . to PTB is indeterminate without token (could be . or :)'
        if token in (';','...'):
            return ':'
        return '.'
    if brownTag=='--':
        return ':'
    if brownTag in ('VB', 'HV', 'DO'):
        assert infinitive is not None,'Transformation from Brown {} to PTB is indeterminate without infinitive flag (could be VB or VBP)'.format(brownTag)
        if infinitive:
            return 'VB'
        return 'VBP'
    if brownTag=='NR':
        assert token is not None,'Transformation from Brown NR to PTB is indeterminate without token (could be NN or NNP or RB)'
        if token[0]==token[0].upper(): # capitalized
            return 'NNP'
        assert False,'TODO: NN vs. RB'
        # TODO: same as above, but for NR$ (= NN+POS vs. NNP+POS vs. RB+POS)
    if brownTag=='NRS':
        return 'NNPS'

    # General case: look the tag up in the mapping table
    ptbCol = header.index('PTB')
    rowIds = tags['Brown::{}'.format(brownTag)]
    ptbTags = remove_duplicates([tbl[r][ptbCol] for r in rowIds])
    assert 'not' not in ptbTags, ('May be ignored, untagged, or tokenized differently',brownTag,ptbTags)
    # TODO: "VBG, JJ" and "VBN, JJ" entries in map file present problems. For now:
    for ambiguous in ('VBG, JJ', 'VBN, JJ'):
        if ambiguous in ptbTags:
            ptbTags.remove(ambiguous)
    assert len(ptbTags)==1,(brownTag,ptbTags)
    return ptbTags[0]
def brown2twit(brownTag, token=None):
    '''
    Returns the Twitter tag corresponding to this Brown tag.

    The Brown tag is normalized first: a hyphenated suffix is dropped
    (e.g. 'NP-TL' is treated as 'NP'), '*' is removed from multi-character
    tags, and a compound tag that is not listed in the table is reduced to
    its first part. Tags beginning with 'FW-' always map to 'G'. The token
    is only consulted for the NR tag.

    >>> brown2twit('VBZ')
    u'V'
    >>> brown2twit('BEZ')
    u'V'
    >>> brown2twit('IN')
    u'P'
    >>> brown2twit('VB')
    u'V'
    >>> brown2twit('.')
    u','
    >>> brown2twit('NP+BEZ')
    u'M'
    >>> brown2twit('EX+HVD')
    u'Y'
    >>> brown2twit('FW-*')
    u'G'
    >>> brown2twit('WPS+BEZ')
    u'L'
    >>> brown2twit('NNS$')
    u'S'
    >>> brown2twit('NP$')
    u'Z'
    >>> brown2twit('WDT+BER+PP')
    u'G'
    >>> brown2twit('NP-TL')
    u'^'
    '''
    if brownTag.startswith('FW-'):
        return 'G' # foreign word

    # Cut off a hyphenated suffix. A tag that itself starts with '--' keeps
    # those two characters; only a later hyphen starts the suffix.
    if brownTag.startswith('--'):
        suffixAt = brownTag.find('-', 2)
        if suffixAt!=-1:
            brownTag = brownTag[:suffixAt]
    elif '-' in brownTag:
        brownTag = brownTag.split('-', 1)[0]

    if len(brownTag)>1:
        brownTag = brownTag.replace('*','') # negative marker, doesn't affect Twitter tag category

    # Unknown compound tag: fall back to its first component
    if '+' in brownTag and 'Brown::{}'.format(brownTag) not in tags:
        brownTag = brownTag.split('+', 1)[0]

    if brownTag=='NR': # Adverbial nouns. TODO: Revisit this
        capitalized = token is not None and token[0].lower()!=token[0]
        return '^' if capitalized else 'N'

    twitterCol = header.index('Twitter')
    rowIds = tags['Brown::{}'.format(brownTag)]
    candidates = remove_duplicates([tbl[r][twitterCol] for r in rowIds])
    assert len(candidates)==1,(brownTag,candidates)
    return candidates[0]
def brownSequence2twit(tagged):
    '''
    Converts a sequence of (token, Brown tag) pairs into a list of
    (token, Twitter tag) pairs, converting each tag with brown2twit()
    and passing the token along with it.

    >>> brownSequence2twit([('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')])
    [(u'The', u'D'), (u'Fulton', u'^'), (u'County', u'N'), (u'Grand', u'A'), (u'Jury', u'N'), (u'said', u'V'), (u'Friday', u'^'), (u'an', u'D'), (u'investigation', u'N'), (u'of', u'P'), (u"Atlanta's", u'Z'), (u'recent', u'A'), (u'primary', u'N'), (u'election', u'N'), (u'produced', u'V'), (u'``', u','), (u'no', u'D'), (u'evidence', u'N'), (u"''", u','), (u'that', u'P'), (u'any', u'D'), (u'irregularities', u'N'), (u'took', u'V'), (u'place', u'N'), (u'.', u',')]
    '''
    converted = []
    for tkn, brownTag in tagged:
        twitTag = brown2twit(brownTag, token=tkn)
        converted.append((tkn, twitTag))
    return converted
def brownSequence2ptb(tagged):
    '''
    Converts a sequence of (token, Brown tag) pairs into a list of
    (token, PTB tag) pairs using brown2ptb().

    Heuristic for deciding whether a verb is bare (VB) or present tense
    but not 3rd person singular (VBP): it is taken to be an infinitive if
    the previous tag starts with MD, TO or VB, or if the previous tag is
    RB and the one before that starts with MD, TO or VB.
    Can predict VBP where it should be VB due to coordination of bare
    verbs, e.g. "will/MD [ go/VB ... and take/VB ... ]"
    '''
    triggers = ('MD','TO','VB')
    result = []
    for pos, (word, brownTag) in enumerate(tagged):
        directlyAfter = pos>=1 and tagged[pos-1][1][:2] in triggers
        afterAdverb = (pos>=2
                       and tagged[pos-2][1][:2] in triggers
                       and tagged[pos-1][1]=='RB')
        infin = directlyAfter or afterAdverb
        result.append((word, brown2ptb(brownTag, token=word, infinitive=infin)))
    return result
def ptbSequence2twit(tagged):
    '''
    Converts a sequence of (token, PTB tag) pairs into a list of
    (token, Twitter tag) pairs.

    Tokens tagged -NONE- and the negative clitic n't/RB are dropped.
    Tokens starting with an apostrophe and tagged POS, MD, VBZ, VBP or VBD
    are merged into the preceding token, giving a compound tag such as
    'NN+POS', before the Twitter tag is looked up with ptb2twit().

    >>> ptbSequence2twit([('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')])
    [(u'Pierre', u'^'), (u'Vinken', u'^'), (u',', u','), (u'61', u'$'), (u'years', u'N'), (u'old', u'A'), (u',', u','), (u'will', u'V'), (u'join', u'V'), (u'the', u'D'), (u'board', u'N'), (u'as', u'P'), (u'a', u'D'), (u'nonexecutive', u'A'), (u'director', u'N'), (u'Nov.', u'^'), (u'29', u'$'), (u'.', u',')]
    >>> ptbSequence2twit([('Although', 'IN'), ('preliminary', 'JJ'), ('findings', 'NNS'), ('were', 'VBD'), ('reported', 'VBN'), ('*-2', '-NONE-'), ('more', 'RBR'), ('than', 'IN'), ('a', 'DT'), ('year', 'NN'), ('ago', 'IN'), (',', ','), ('the', 'DT'), ('latest', 'JJS'), ('results', 'NNS'), ('appear', 'VBP'), ('in', 'IN'), ('today', 'NN'), ("'s", 'POS'), ('New', 'NNP'), ('England', 'NNP'), ('Journal', 'NNP'), ('of', 'IN'), ('Medicine', 'NNP'), (',', ','), ('a', 'DT'), ('forum', 'NN'), ('likely', 'JJ'), ('*', '-NONE-'), ('to', 'TO'), ('bring', 'VB'), ('new', 'JJ'), ('attention', 'NN'), ('to', 'TO'), ('the', 'DT'), ('problem', 'NN'), ('.', '.')])
    [(u'Although', u'P'), (u'preliminary', u'A'), (u'findings', u'N'), (u'were', u'V'), (u'reported', u'V'), (u'more', u'R'), (u'than', u'P'), (u'a', u'D'), (u'year', u'N'), (u'ago', u'P'), (u',', u','), (u'the', u'D'), (u'latest', u'A'), (u'results', u'N'), (u'appear', u'V'), (u'in', u'P'), (u"today's", u'S'), (u'New', u'^'), (u'England', u'^'), (u'Journal', u'^'), (u'of', u'P'), (u'Medicine', u'^'), (u',', u','), (u'a', u'D'), (u'forum', u'N'), (u'likely', u'A'), (u'to', u'P'), (u'bring', u'V'), (u'new', u'A'), (u'attention', u'N'), (u'to', u'P'), (u'the', u'D'), (u'problem', u'N'), (u'.', u',')]
    '''
    cliticTags = ('POS','MD','VBZ','VBP','VBD')
    merged = []
    for tkn, tag in tagged:
        if tag=='-NONE-':
            continue
        if tkn=="n't" and tag=='RB': # negative clitic doesn't matter in terms of the base tag
            continue
        if len(tkn)>2 and tag[0]=='-' and tag[-1]=='-':
            tag = {'-LRB-': '(', '-RRB-': ')'}[tag]
        if tkn[0]=="'" and tag in cliticTags:
            # join contracted/possessive forms to the previous token
            assert len(merged)>0
            prevTkn, prevTag = merged[-1]
            merged[-1] = (prevTkn+tkn, prevTag+'+'+tag)
            continue
        if "'" in tkn and tag not in ("''", 'CD', 'CC'):
            # report an unmerged token containing an apostrophe, with its left neighbour
            print(merged[-1] if merged else '',tkn,tag)
        merged.append((tkn,tag))
    return [(tkn, ptb2twit(tag)) for tkn,tag in merged]
def describe(tag):
    '''
    Placeholder: not implemented yet, so every call currently returns None.

    Intended contract: return a tuple of 2 strings describing the tag, the
    first being the full name and the second a list of examples, e.g.
    describe('JJ') would give (u'Adjective', u'happy, bad').
    '''
    return None
'''
List of NR ("adverbial noun") words in Brown:
downtown
home
today
to-day
tomorrow
to-morrow
tonight
yesterday
left
right
east
north(west|east)?
south(west|east)?
west
nawth
Sunday
Monday
Tuesday
Wednesday
Thursday
Friday
Saturday
Sundays
>> 'home' and 'right' are also very frequent as NN
('home', 'NR'): 301,
('home', 'NR-NC'): 1,
('home', 'NR-HL'): 1,
('Home', 'NR'): 2,
('Home', 'NR-TL'): 2,
('Home', 'NR-HL'): 1
('home', 'NN'): 217
('home', 'NN-HL'): 1,
('home', 'NN-NC'): 4
('Home', 'NN'): 2,
('Home', 'NN-TL'): 13
('Home', 'NP'): 1
For 'home', PTB appears to be inconsistent, with a tag of either RB or NN.
('right', 'RB'): 75, e.g. all right, treated right. PTB uses JJ, at least for 'all right'.
('Right', 'RB'): 4
('right', 'NR'): 56 spatial location/direction
('right', 'NN'): 122, e.g. right to bear arms, the political right. PTB uses NN.
('Right', 'NN'): 1,
('Right', 'NN-TL'): 3,
('right', 'NN-HL'): 1
('right', 'QL'): 121, e.g. right away, right now. PTB uses RB.
('Right', 'QL'): 6
Souths, Wednesdays = NRS
yesterday's, Wednesday's = NR$
today'll = NR+MD
'''
if __name__=='__main__':
    # List NR... tags/tokens in Brown
    #import doctest
    #doctest.testmod()

    # Check robustness of Brown-to-PTB conversion: map each PTB-tagged
    # treebank sentence to Brown tags and back, and require a lossless
    # round trip.
    from nltk.corpus import treebank
    for i,s in enumerate(treebank.tagged_sents()):
        if i in (44,54,147): continue # CC before VB
        #if i in (23,99): continue # CC before VBP
        s1 = [(w,pt) for w,pt in s if pt!='-NONE-']
        s2, s3 = None, None
        try:
            s2 = [(w, ptb2brown(pt, token=w)[0]) for w,pt in s1]
            s3 = brownSequence2ptb(s2)
        except Exception:
            # Conversion failures (including failed assertions) are skipped,
            # but Ctrl-C and interpreter exit are no longer swallowed.
            print('skipping',i)
            continue
        assert s1==s3,(s1,s2,s3)
        print('OK',i)

    # NOTE(review): this halts the script here, so the export below only
    # runs if this line is removed (or assertions are disabled with -O).
    assert False

    # Create PTB-tagged file for Brown corpus
    from nltk.corpus import brown
    # The file is written to, so it must be opened in write mode.
    with open('brown-converted-ptb-tags.txt', 'w') as f:
        for s in brown.tagged_sents():
            # Convert the sentence once; the conversion needs the whole
            # sentence as context and returns one (word, PTB tag) per token.
            try:
                converted = brownSequence2ptb(s)
            except Exception:
                print(s)
                raise
            for (w,bt),(_,pt) in zip(s, converted):
                f.write('{}\t{}\t{}\t-\n'.format(w, pt, bt))
            f.write('\n')
    '''
    twitter = defaultdict(set)
    for t in tags.keys():
        if t.startswith('PTB::'):
            tt = ptb2twit(t[5:])
            twitter[tt].add(t)
        elif t.startswith('Brown::'):
            tt = brown2twit(t[7:])
            twitter[tt].add(t)
            continue
            if t=='Brown::IN' or t=='Brown::HV' or t=='Brown::DO' or t=='Brown::.' or t=='Brown::??' or t=='Brown::VB': continue
            print(t)
            print(brown2ptb(t[7:]))
    for t in sorted(twitter.keys()):
        print(t,' '.join(iter(twitter[t])))
    '''
# http://nlp.cs.nyu.edu/wiki/corpuswg/AnnotationCompatibilityReport
# Table 1: Part of Speech Compatibility
# (Initial Version from Manning and Schutz 1998, pp. 141-142)
# Extended to cover Claws1 and ICE
# cf. http://www.scs.leeds.ac.uk/ccalas/tagsets/brown.html
# Nathan Schneider, 2011-02-19:
# * Fixed some errors in brown column, e.g.: DT1 => DTI, PP0 => PPO, NRS => NPS
# * Added last column (Twitter tagset) and several special tags at the end
Category Examples Claws c5, Claws1 Brown PTB ICE Twitter
Adjective happy, bad AJ0 JJ JJ ADJ.ge A
Adjective, ordinal number sixth, 72nd, last ORD, OD OD JJ NUM.od A
Adjective, comparative happier, worse AJC JJR JJR ADJ.comp A
Adjective, superlative happiest, worst AJS JJT JJS ADJ.sup A
Adjective, superlative, semantically chief, top AJ0 JJS JJ ADJ.ge A
Adjective, cardinal number 3, fifteen CRD, CD CD CD NUM.cd $
Adjective, cardinal number, one one PNI, CD1 CD CD NUM.cd $
Adjective, past-part of verb surprised JJ VBN VBN, JJ ADJ.edp V
Adjective, pres-part of verb refreshing JJ VBG VBG, JJ ADJ.ingp V
Adverb slowly, sweetly AV0 RB RB ADV.ge R
Adverb, negative not, n't XX0 * RB ADV.ge R
Adverb, comparative faster AV0 RBR RBR ADV.comp R
Adverb, superlative fastest AV0 RBT RBS ADV.sup R
Adverb, particle up, off, out AVP, RP, RI RP RP ADV.phras or ADV.ge T
Adverb, question when, how, why AVQ WRB WRB ADV.wh R
Adverb, degree & question how, however AVQ WQL WRB ADV.wh R
Adverb, degree very, so, too AV0, QL QL RB ADV.intens R
Adverb, degree, postposed enough, indeed AV0 RN RB ADV.intens, ADV.ge R
Adverb, nominal here, there, now AV0, RB RN RB ADV.ge, EXTHERE R
Adverb, conjunctive therefore, however AV0,RB RN RB CONNEC.ge R
Conjunction, coordination and, or CJC, CC CC CC CONJUNC.coord &
Conjunction, subordinating although, when CJS, CS CS IN CONJUNC.subord, P
Conjunction, complementizer 'that' that, WP, WPA, WPO CJT CS IN PRON.rel, CONJUNC.subord P
Determiner this, each, another DT0, DT DT DT PRON.dem.sing, PRON(recip) D
Determiner, pronoun any, some DT0, DTI DTI DT PRON.nonass, PRON.ass D
Determiner, pronoun, plural these, those DT0, DTS DTS DT PRON.dem.plu D
Determiner, prequalifier quite DT0, aBL ABL PDT ADV.intens X
Determiner, prequalifier all, half DT0, ABN ABN PDT PRON.univ, PRON.quant X
Determiner, pronoun or double conj. both DT0, ABX ABX DT (CC) PRON.univ.plu D
Determiner, pronoun or double conj. either, neither DT0, DTX DTX DT (CC) PRON.neg PRON.nonass.sing D
Determiner, article the, no AT0, ATI, DTX AT DT ART.def, PRON.neg D
Determiner, article a, an AT0, AT AT DT ART.indef D
Determiner, postdeterminer many, same DT0, AP,APS AP JJ PRON.quant.{sing,plu}, ADJ.ge A
Determiner, possessive their, your DPS, PP$, PP$$ PP$ PRP$ PRON.poss D
Determiner, possessive, second mine, yours DPS, PP$ PP$$ PRP PRON.poss O
Determiner, question which, whatever DTQ, WDT WDT WDT PRON.{inter, rel} D
Determiner, possessive & question whose DTQ, WP$ WP$ WP$ PRON.rel D
Noun aircraft, data NN0 NN NN N.com.sing N
Noun, singular woman, book NN1 NN NN N.com.sing N
Noun, plural women, books NN2 NNS NNS N.com.plu N
Noun, proper, singular London, Michael NP0 NP NNP N.prop.sing ^
Noun, proper, plural Australians, Methodists NP2 NPS NNPS N.prop.plu ^
Noun, adverbial tomorrow, home NN0 NR NN, NNP, RB N.com.sing N, ^, R
Noun, plural from post-determiner others NN2, APS NNS NNS PRON.quant.plu N
Pronoun, nominal (indefinite) none, everything, one PNI,PN PN NN PRON(neg), PRON.univ.{sing,plu}, NUM(card,sing) N
Pronoun, personal, subject you, we PNP, PP2 PPSS PRP PRON.pers.{sing,plu} O
Pronoun, personal, subject, 3SG she, he, it PNP, PP1A, PP1AS, PP3, PP3A, PP3AS PPS PRP PRON.{antit,cleftit}, O
Pronoun, personal, object you, them, me PNP, PP1O, PP1OS, PP3O, PP3OS PPO PRP PRON.pers.{sing,plu} O
Pronoun, reflexive herself, myself PNX, PPL PPL PRP PRON.ref.sing O
Pronoun, reflexive, plural themselves, ourselves PNX, PPLS PPLS PRP PRON.ref.plu O
Pronoun, question, subject who, whoever PNQ WPS WP PRON.inter, PRON(nom) O
Pronoun, question, object who, whoever PNQ WPO WP PRON.inter, PRON(nom) O
Pronoun, question, existential there there EX0 EX EX EXTHERE X
Verb. base present form (not infinitive) take, live VVB VB VBP V.X.{pres,imp} V
Verb, infinitive take, live VVI VB VB V.X.infin V
Verb, past tense took, lived VVD VBD VBD V.X.past V
Verb, present participle taking, living VVG VBG VBG V.X.ingp V
Verb, past/passive participle taken, lived VVN VBN VBN V.X.edp V
Verb, present 3SG -s form takes, lives VVZ VBZ VBZ V.X.pres V
Verb, auxilliary do, base do VDB, DO DO VBP AUX.do.{pres,imp} V
Verb, auxilliary do, infinitive do, DO VDB DO VB AUX.do.infin V
Verb, auxilliary do, past did VDD, DOD DOD VBD AUX.do.past V
Verb, auxilliary do, present part. doing VDG, VBG VBG V.X.ingp V
Verb, auxilliary do, past part. done VDN VBN VBN V.X.edp V
Verb, auxilliary do, present 3SG does VDZ, DOZ DOZ VBZ AUX.do.pres V
Verb, auxilliary have, base have VHB, HV HV VBP V.X.pres, AUX.{perf,semi}.pres V
Verb, auxilliary have, infinitive have VHI, HV HV VB V.X.infin, AUX.{perf,semi}.{imp,infin} V
Verb, auxilliary have, past had VHD, HVD HVD VBD V.X.past, AUX.{perf,semi}.past V
Verb, auxilliary have, present part. having VHG, HVG HVG VBG V.X.ingp, AUX.perf.ingp V
Verb, auxilliary have, past part. had VHN, HVN HVN VBN V.X.edp, AUX.perf.past V
Verb, auxilliary have, present 3SG has VHZ, HVZ HVZ VBZ V.X.pres, AUX.{perf,semi}.{pres} V
Verb, auxilliary be, infinitive be VBI, BE BE VB V.cop.{infin,imp} AUX.prog.infin V
Verb, auxilliary be, past were VBD, BED BED VBD V.cop.{past.subjun}, AUX.prog.past V
Verb, auxilliary be, past, 3SG was VBD, BEDZ BEDZ VBD V.cop.past AUX.prog.past V
Verb, auxilliary be, present part. being VBG, BEG BEG VBG V.cop.ingp AUX.prog.ingp V
Verb, auxilliary be, past part. been VBN, BEN BEN VBN V.cop.edp AUX.prog.edp V
Verb, auxilliary be, present, 3SG is, 's VBZ, BEZ BEZ VBZ V.cop.pres AUX.prog.pres V
Verb, auxilliary be, present, 1SG am, 'm VBB, BEM BEM VBP V.cop.pres AUX.prog.pres V
Verb, auxilliary be, present are, 're VBB, BER BER VBP V.cop.pres AUX.prog.pres V
Verb, modal can, could, 'll VMG, MD MD MD AUX.mod.{past,pres} V
Infinitive marker to TO0, TO TO TO PRTCL.to P
Preposition, to to PRP IN TO PREP.ge P
Preposition, for, above PRP IN IN PREP.ge, PRTCL(for) P
Preposition, of of PRF IN IN PREP.ge P
Possessive 's, ' POS, not $ POS ADV(ge) not
Interjection (or other isolate) oh, yes, mmm ITJ, UH UH UH INTERJEC !
Punctuation, sentence ender . ! ? PUN, . . . PUNC.{qm,per,exm} ,
Punctuation, semicolon ; PUN, ; . : PUNC.scol ,
Punctuation, colon or ellipsis : ... PUN, ... : : PUNC.{ellip,col} ,
Punctuation, comma , PUN, , , , PUNC.com ,
Punctuation, dash - PUN, not -- - PUNC.dash ,
Punctuation, dollar sign $ PUN not $ not ,
Punctuation, left bracket ( [ { PUL, ( ( ( PUNC.obrack ,
Punctuation, right bracket ) ] } PUR, ) ) ) PUNC.cbrack ,
Punctuation, left quotation PUQ, `` `` PUNC.oquo ,
Punctuation, right quotation PUQ, '' '' PUNC.cquo ,
Foreign words (not in English lexicon) UNC, &FW (FW-) FW not G
Symbol [fj] * not SYM not G
Symbol, alphabetical A, B, c, d ZZ0 not G
Symbol, list item A, A, First LS not G
URL or email address http://www.twitter.com ?? ?? U
Emoticon :), <3 ?? ?? E
Online discourse marker RT ?? ?? ~
Possessive nominal his, book's ?? ?? S
Possessive proper noun Mark's ?? ?? Z
Nominal combined with verbal he's, you're, book'll ?? ?? L
Proper noun combined with verbal Mark'll ?? ?? M
Miscellaneous function word combined with verbal there's ?? ?? Y
'''
Maps Penn Treebank POS tags to morphosyntactic attributes. Excerpted from
wsj.py,
Utilities for working with Penn Treebank Wall Street Journal data.
@author: Nathan Schneider ([email protected])
@since: 2011-05-01
'''
from __future__ import print_function, division
from future_builtins import map, filter, zip
from collections import defaultdict
def posinfo():
    '''
    Builds and returns a fresh dict mapping each PTB tag to an entry
    describing it. Every entry is a defaultdict whose missing keys read
    as False, so boolean attributes can be tested without checking for
    their presence first.

    Word-class entries carry 'tag', 'description', 'coarse', optionally
    'extra' (the remark after '>>' in the table), and True flags taken from
    the attribute letters. Punctuation and bracket entries carry 'tag',
    'punct' and 'symbol', plus 'bracket' or 'quote' where applicable.

    >>> sorted(posinfo()['VBP'].items())
    [('coarse', 'v'), ('description', 'Verb, non-3rd person singular present'), ('finite', True), ('tag', 'VBP'), ('verbal', True)]
    '''
    # http://www.computing.dcu.ie/~acahill/tagset.html
    # tag groups: j = adjective, n = nominal, d = determiner, v = verbal, f = finite, r = adverbial, x = function/closed-class
    # Aside from f and x, these are mutually exclusive and can be interpreted as coarse tags.
    TABLE = '''
CC x Coordinating conjunction >> and, but, or...
CD Cardinal Number
DT dx Determiner
EX x Existential there
FW Foreign Word
IN x Preposision or subordinating conjunction
JJ j Adjective
JJR j Adjective, comparative
JJS j Adjective, superlative
LS List Item Marker
MD vx Modal >> can, could, might, may...
NN n Noun, singular or mass
NNP n Proper Noun, singular
NNPS n Proper Noun, plural
NNS n Noun, plural
PDT dx Predeterminer >> all, both ... when they precede an article
POS x Possessive Ending >> 's
PRP nx Personal Pronoun >> I, me, you, he...
PRP$ nx Possessive Pronoun >> my, your, mine, yours...
RB r Adverb >> Most words that end in -ly as well as degree words like quite, too and very
RBR r Adverb, comparative >> Adverbs with the comparative ending -er, with a strictly comparative meaning
RBS r Adverb, superlative
RP x Particle
SYM Symbol >> Should be used for mathematical, scientific or technical symbols
TO x to
UH Interjection >> e.g. uh, well, yes, my...
VB v Verb, base form >> subsumes imperatives, infinitives and subjunctives
VBD fv Verb, past tense >> includes the conditional form of the verb to be
VBG v Verb, gerund or persent participle
VBN v Verb, past participle
VBP fv Verb, non-3rd person singular present
VBZ fv Verb, 3rd person singular present
WDT dx Wh-determiner >> e.g. which, and that when it is used as a relative pronoun
WP nx Wh-pronoun >> e.g. what, who, whom...
WP$ nx Possessive wh-pronoun
WRB rx Wh-adverb >> how, where why
#
$
''
(
)
-LRB- (
-RRB- )
-LSB- [
-RSB- ]
-LCB- {
-RCB- }
,
.
:
``
'''
    BRACKET_STARTS = '()[]{}-'
    # attribute marker in the table -> name of the flag it switches on
    FLAG_NAMES = (
        ('x', 'functional'),
        ('n', 'nominal'),
        ('nx', 'pronominal'),
        ('j', 'adjectival'),
        ('v', 'verbal'),
        ('r', 'adverbial'),
        ('d', 'determiner'),
        ('f', 'finite'),
    )
    COARSE_LETTERS = 'nvdrj'

    info = {}
    for row in TABLE.strip().splitlines():
        row = row.strip()
        entry = defaultdict(bool)
        startsLikeBracket = row[0] in BRACKET_STARTS
        if startsLikeBracket or ' ' not in row:
            # punctuation / bracket row: only the first field matters
            if startsLikeBracket:
                entry['bracket'] = True
            entry['tag'] = row.split()[0]
            entry['punct'] = True
            entry['symbol'] = True
            if row in ("''", "``"):
                entry['quote'] = True
        else:
            # TODO: traces?
            if '>>' in row:
                row, remark = row.split('>> ', 1)
                entry['extra'] = remark
                row = row.strip()
            fields = row.split()
            entry['tag'] = fields[0]
            # An all-lowercase second field is the attribute letters, not
            # the start of the description.
            atts = ''
            if fields[1][0].islower():
                atts = fields.pop(1)
            entry['description'] = ' '.join(fields[1:])
            if 'Possessive' in entry['description']:
                entry['possessive'] = True
            for marker, flag in FLAG_NAMES:
                if atts and marker in atts:
                    entry[flag] = True
            # The coarse tag is the single word-class letter if there is
            # one, otherwise the tag itself.
            classes = [c for c in COARSE_LETTERS if c in atts]
            if len(classes)==1:
                entry['coarse'] = classes[0]
            else:
                assert len(classes)==0
                entry['coarse'] = entry['tag']
        if entry['tag'].startswith('W'):
            entry['wh'] = True
        info[entry['tag']] = entry

    # Attributes that the table itself does not encode
    for tag, flag in (('LS', 'symbol'), ('SYM', 'symbol'),
                      ('NNP', 'proper'), ('NNPS', 'proper'),
                      ('FW', 'foreign')):
        info[tag][flag] = True
    return info
def poses(**criteria):
    '''
    Retrieves the set of all tags meeting the specified criteria, where
    each criterion names an attribute in the entries of the object returned
    by posinfo() and gives the value that attribute must equal. An attribute
    an entry does not have reads as False. With no criteria, every tag is
    returned.

    >>> poses(finite=True)
    set(['VBZ', 'VBP', 'VBD'])
    >>> poses(nominal=True, pronominal=False)
    set(['NNPS', 'NNS', 'NN', 'NNP'])
    >>> poses(functional=True, nominal=False, verbal=False, adverbial=False, determiner=False, possessive=False)
    set(['CC', 'TO', 'RP', 'EX', 'IN'])
    '''
    matching = set()
    for entry in posinfo().values():
        if all(entry[attn]==wanted for attn,wanted in criteria.items()):
            matching.add(entry['tag'])
    return matching
def posAttributes(tag):
    '''
    Returns the attribute entry that posinfo() holds for the given tag.
    Raises KeyError if the tag is not in the table.
    '''
    table = posinfo()
    return table[tag]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment