Created
September 7, 2013 15:50
-
-
Save nschneid/6476715 to your computer and use it in GitHub Desktop.
Scripts for working with part-of-speech tagsets: describing the morphosyntactic attributes encoded by tags, and converting between different tagsets. Cf. https://gist.github.com/nschneid/4231292
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Some compound tags from full Brown tag list at http://www.scs.leeds.ac.uk/ccalas/tagsets/brown.html | |
# (plus a few missing from the other POS mapping). | |
# Possessive/genitive tags (ending in '$') are included if they don't correspond to a single PTB tag due to tokenization differences. | |
# For the purposes of converting to Twitter tags, strip '*' from the tag. Foreign word tags begin with "FW-"; convert these to Twitter POS "G". For any compound tag not in this list, use the first part of the tag. | |
Brown Twitter PTB | |
QLP R RB | |
NRS ^ NNPS | |
NIL G ?? | |
' , ?? | |
AP$ X JJ+POS | |
CD$ S CD+POS | |
DO+PPSS G VB+PRP | |
DO+PPSS G VBP+PRP | |
DT$ D DET+POS | |
DT+BEZ L DT+VBZ | |
DT+MD L DT+MD | |
DTS+BEZ L DT+VBZ | |
EX+BEZ Y EX+VBZ | |
EX+HVD Y EX+VBD | |
EX+HVZ Y EX+VBZ | |
EX+MD Y EX+MD | |
JJ$ X JJ+POS | |
NN$ S NN+POS | |
NN+BEZ L NN+VBZ | |
NN+HVD L NN+VBD | |
NN+HVZ L NN+VBZ | |
NN+MD L NN+MD | |
NNS$ S NNS+POS | |
NNS+MD L NNS+MD | |
NP$ Z NNP+POS | |
NP+BEZ M NNP+VBZ | |
NP+HVZ M NNP+VBZ | |
NP+MD M NNP+MD | |
NPS$ Z NNPS+POS | |
NR$ S NN+POS | |
NR+MD L NN+MD | |
PN$ S PRP+POS | |
PN+BEZ L NN+VBZ | |
PN+HVD L NN+VBD | |
PN+HVZ L NN+VBZ | |
PN+MD L NN+MD | |
PPS+BEZ L PRP+VBZ | |
PPS+HVD L PRP+VBD | |
PPS+HVZ L PRP+VBZ | |
PPS+MD L PRP+MD | |
PPSS+BEM L PRP+VBP | |
PPSS+BER L PRP+VBP | |
PPSS+BEZ L PRP+VBZ | |
PPSS+HV L PRP+VB | |
PPSS+HV L PRP+VBP | |
PPSS+HVD L PRP+VBD | |
PPSS+MD L PRP+MD | |
PPSS+VB L PRP+VB | |
PPSS+VB L PRP+VBP | |
RB$ X RB+POS | |
RB+BEZ X RB+VBZ | |
WDT+BER L WDT+VBP | |
WDT+BER+PP G ?? | |
WDT+BEZ L WDT+VBZ | |
WDT+DO+PPS G ?? | |
WDT+DOD L WDT+VBD | |
WDT+HVZ L WDT+VBZ | |
WPS+BEZ L WP+VBZ | |
WPS+HVD L WP+VBD | |
WPS+HVZ L WP+VBZ | |
WPS+MD L WP+MD | |
WRB+BER X WRB+VBP | |
WRB+BEZ X WRB+VBZ | |
WRB+DO X WRB+VBP | |
WRB+DOD X WRB+VBD | |
WRB+DOZ X WRB+VBZ | |
WRB+IN X WRB+IN | |
WRB+MD X WRB+MD | |
?? , # | |
# Due to PTB bugs | |
WPS+BEZ L WP+POS | |
?? L WDT+POS | |
?? Z VBP+POS | |
?? Y EX+POS | |
# that's (demonstrative) - should be DT+VBZ | |
?? L IN+POS | |
?? L IN+VBZ | |
?? L DT+POS | |
# let's | |
?? V VB+POS | |
# Boeing's | |
?? Z VBG+POS | |
# United's | |
?? Z VBN+POS | |
?? G FW+POS | |
# Not sure if WPS~WP and WDT~WDT are exact Brown~PTB correspondences. | |
# Does WDT+MD (that'll) ever occur in Brown? WPS+HV (who've)? | |
WDT+MD L WDT+MD | |
WPS+HV L WP+VBP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Map between POSes from different tagsets. | |
Currently supports PTB, Brown, and the ARK Twitter tagset, though a | |
few conventions of the latter are not yet finalized. | |
Depends on two map files, POSMappings.txt and browncompound2twitter.txt | |
(the latter is a supplement to the former). Some mappings are unclear | |
or not yet finalized. | |
Coverage has been tested for Brown -> Twitter. | |
@author: Nathan Schneider ([email protected]) | |
@since: 2011-02-18 | |
''' | |
from __future__ import print_function, unicode_literals, division | |
from future_builtins import map, filter | |
from collections import defaultdict, Counter | |
MAP_FILE = 'POSMappings.txt' | |
BROWN_CMPD_MAP_FILE = 'browncompound2twitter.txt' | |
def read_tabbed_file(filename):
    '''
    Yield (lineNum, clineNum, line) triples for each content line of a
    tab-separated file, skipping comment lines (those starting with '#')
    and blank lines.

    lineNum  -- 1-based raw line number in the file
    clineNum -- 0-based index counting only the yielded content lines
    '''
    clineNum = 0  # content line (not counting comment/blank lines), 0-based
    with open(filename) as f:
        for lineNum, ln in enumerate(f, start=1):
            if ln.startswith('#'):
                continue
            # rstrip('\n') rather than ln[:-1]: the latter chops the final
            # character of a last line that has no trailing newline
            ln = ln.rstrip('\n')
            if ln == '':
                continue
            yield (lineNum, clineNum, ln)
            clineNum += 1
# Tag mapping table: one row per tag correspondence; columns are given by `header`
tbl = []
tags = defaultdict(list) # map from a tagset-bound tag (such as 'Brown::VBZ') to a list of indices of rows in the table
# Load main table
header = None
for lnum,clnum,ln in read_tabbed_file(MAP_FILE):
    if clnum==0:
        # First content line is the column header row
        # (Category, Examples, then one column per tagset)
        header = ln.split('\t')
        continue
    parts = ln.split('\t')
    tbl.append(parts)
    # Index this row under every tagset::tag pair it mentions.  The first two
    # columns (Category, Examples) are descriptive, not tags, hence the [2:].
    for tb,tag in zip(header[2:],parts[2:]):
        if tag!='' and tag!='not':  # 'not' marks "no corresponding tag in this tagset"
            tags['{}::{}'.format(tb, tag)].append(len(tbl)-1)
# Load supplementary table for converting compound Brown tags to Twitter tags
for lnum,clnum,ln in read_tabbed_file(BROWN_CMPD_MAP_FILE):
    if clnum==0:
        assert ln.split('\t')==['Brown','Twitter','PTB']
        continue
    try:
        brownTag, twitTag, ptbTag = ln.split('\t')
    except:
        # Report the malformed line before propagating the error
        print(ln)
        raise
    if brownTag=='' or twitTag=='' or ptbTag=='':
        print('Incomplete entry in tag map:',ln)
    # Build a row shaped like the main table, with only the Brown and Twitter
    # columns filled in.  NOTE(review): the PTB column is left as None even
    # though the row is indexed under 'PTB::<tag>' below — confirm that PTB
    # lookups never read this column from supplementary rows.
    newEntry = [None]*len(header)
    newEntry[header.index('Brown')] = brownTag
    newEntry[header.index('Twitter')] = twitTag
    tbl.append(newEntry)
    tags['Brown::{}'.format(brownTag)].append(len(tbl)-1)
    tags['Twitter::{}'.format(twitTag)].append(len(tbl)-1)
    tags['PTB::{}'.format(ptbTag)].append(len(tbl)-1)
def remove_duplicates(l):
    '''
    Return a copy of the list with duplicates dropped, keeping only the
    first occurrence of each item in its original order.

    >>> remove_duplicates([4,4,6,3,6,9])
    [4, 6, 3, 9]
    '''
    unique = []
    for item in l:
        if item in unique:
            continue
        unique.append(item)
    return unique
def ptb2brown(ptbTag, token=None):
    '''
    Return the list of Brown tags that can correspond to this PTB tag.

    >>> ptb2brown('VBZ')
    [u'VBZ', u'DOZ', u'HVZ', u'BEZ']
    >>> ptb2brown('IN')
    [u'CS', u'IN']
    >>> ptb2brown(':', token='--')
    [u'--']
    '''
    # PTB ':' covers several Brown punctuation tags; disambiguate by token
    if ptbTag == ':' and token in (':', '--'):
        return [token]
    brownCol = header.index('Brown')
    candidates = remove_duplicates([tbl[i][brownCol] for i in tags['PTB::{}'.format(ptbTag)]])
    assert 'not' not in candidates,('May be ignored, untagged, or tokenized differently',ptbTag,candidates)
    # TODO: Deal with issue of NR, NRS, NR$
    assert candidates, ptbTag
    return candidates
def ptb2twit(ptbTag):
    '''
    Return the (unique) Twitter tag corresponding to this PTB tag.

    >>> ptb2twit('VBZ')
    u'V'
    >>> ptb2twit('IN')
    u'P'
    >>> ptb2twit('.')
    u','
    >>> ptb2twit('PRP')
    u'O'
    >>> ptb2twit('NN+VBZ')
    u'L'
    >>> ptb2twit('PRP+VBP')
    u'L'
    >>> ptb2twit('PRP+VB')
    u'L'
    >>> ptb2twit('NNP+MD')
    u'M'
    >>> ptb2twit('NNP+POS')
    u'Z'
    >>> ptb2twit('EX+VBD')
    u'Y'

    # TODO: ptb2twit('??')
    '''
    twitCol = header.index('Twitter')
    candidates = remove_duplicates([tbl[i][twitCol] for i in tags['PTB::{}'.format(ptbTag)]])
    # The mapping from PTB to Twitter is expected to be one-to-one
    assert len(candidates)==1,(ptbTag,candidates)
    return candidates[0]
# TODO: figure out '??' cases
def brown2ptb(brownTag, token=None, infinitive=None):
    '''
    Return the PTB tag corresponding to this Brown tag.

    Several Brown tags are ambiguous; those require the token itself
    and/or a flag saying whether a verb is an infinitive.

    >>> brown2ptb('VBZ')
    u'VBZ'
    >>> brown2ptb('BEZ')
    u'VBZ'
    >>> brown2ptb('IN', 'to')
    u'TO'
    >>> brown2ptb('IN', 'for')
    u'IN'
    >>> brown2ptb('VB', infinitive=False)
    u'VBP'
    >>> brown2ptb('VB', infinitive=True)
    u'VB'
    '''
    if brownTag=='IN':
        assert token is not None,'Transformation from Brown IN to PTB is indeterminate without token (could be TO or IN)'
        if token.lower()=='to':
            return 'TO'
        return 'IN'
    if brownTag=='.':
        assert token is not None,'Transformation from Brown . to PTB is indeterminate without token (could be . or :)'
        if token in (';','...'):
            return ':'
        return '.'
    if brownTag=='--':
        return ':'
    if brownTag in ('VB', 'HV', 'DO'):
        assert infinitive is not None,'Transformation from Brown {} to PTB is indeterminate without infinitive flag (could be VB or VBP)'.format(brownTag)
        if infinitive:
            return 'VB'
        return 'VBP'
    if brownTag=='NR':
        assert token is not None,'Transformation from Brown NR to PTB is indeterminate without token (could be NN or NNP or RB)'
        if token[0]==token[0].upper(): # capitalized
            return 'NNP'
        assert False,'TODO: NN vs. RB'
        # TODO: same as above, but for NR$ (= NN+POS vs. NNP+POS vs. RB+POS)
    if brownTag=='NRS':
        return 'NNPS'
    ptbCol = header.index('PTB')
    candidates = remove_duplicates([tbl[i][ptbCol] for i in tags['Brown::{}'.format(brownTag)]])
    assert 'not' not in candidates, ('May be ignored, untagged, or tokenized differently',brownTag,candidates)
    # TODO: "VBG, JJ" and "VBN, JJ" entries in map file present problems. For now:
    for problem in ('VBG, JJ', 'VBN, JJ'):
        if problem in candidates:
            candidates.remove(problem)
    assert len(candidates)==1,(brownTag,candidates)
    return candidates[0]
def brown2twit(brownTag, token=None):
    '''
    Returns the Twitter tag corresponding to this Brown tag.

    >>> brown2twit('VBZ')
    u'V'
    >>> brown2twit('BEZ')
    u'V'
    >>> brown2twit('IN')
    u'P'
    >>> brown2twit('VB')
    u'V'
    >>> brown2twit('.')
    u','
    >>> brown2twit('NP+BEZ')
    u'M'
    >>> brown2twit('EX+HVD')
    u'Y'
    >>> brown2twit('FW-*')
    u'G'
    >>> brown2twit('WPS+BEZ')
    u'L'
    >>> brown2twit('NNS$')
    u'S'
    >>> brown2twit('NP$')
    u'Z'
    >>> brown2twit('WDT+BER+PP')
    u'G'
    >>> brown2twit('NP-TL')
    u'^'
    '''
    if brownTag.startswith('FW-'):
        return 'G' # foreign word
    # Strip a hyphenated suffix (e.g. -TL, -HL), taking care not to clobber
    # the '--' (dash) tag itself
    if '-' in brownTag:
        if brownTag.startswith('--'):
            if '-' in brownTag[2:]:
                brownTag = brownTag[:brownTag.index('-',2)]
        else:
            brownTag = brownTag[:brownTag.index('-')]
    if len(brownTag)>1:
        brownTag = brownTag.replace('*','') # negative marker, doesn't affect Twitter tag category
    # Compound tags with no entry of their own fall back to their first part
    if '+' in brownTag and 'Brown::{}'.format(brownTag) not in tags:
        brownTag = brownTag[:brownTag.index('+')]
    if brownTag=='NR': # Adverbial nouns. TODO: Revisit this
        # Capitalized adverbial nouns (e.g. 'Friday') count as proper nouns
        if token is not None and token[0].lower()!=token[0]:
            return '^'
        return 'N'
    x = [tbl[i][header.index('Twitter')] for i in tags['Brown::{}'.format(brownTag)]]
    x = remove_duplicates(x)
    assert len(x)==1,(brownTag,x)  # the mapping to Twitter must be unique
    return x[0]
def brownSequence2twit(tagged):
    '''
    Convert a Brown-tagged sequence of (token, tag) pairs into the
    corresponding sequence of (token, Twitter tag) pairs.

    >>> brownSequence2twit([('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')])
    [(u'The', u'D'), (u'Fulton', u'^'), (u'County', u'N'), (u'Grand', u'A'), (u'Jury', u'N'), (u'said', u'V'), (u'Friday', u'^'), (u'an', u'D'), (u'investigation', u'N'), (u'of', u'P'), (u"Atlanta's", u'Z'), (u'recent', u'A'), (u'primary', u'N'), (u'election', u'N'), (u'produced', u'V'), (u'``', u','), (u'no', u'D'), (u'evidence', u'N'), (u"''", u','), (u'that', u'P'), (u'any', u'D'), (u'irregularities', u'N'), (u'took', u'V'), (u'place', u'N'), (u'.', u',')]
    '''
    converted = []
    for word, brownTag in tagged:
        converted.append((word, brown2twit(brownTag, token=word)))
    return converted
def brownSequence2ptb(tagged):
    '''
    Convert a Brown-tagged sequence of (token, tag) pairs into the
    corresponding sequence of (token, PTB tag) pairs, using context
    heuristics to decide between bare-verb (VB) and present-tense (VBP) tags.
    '''
    converted = []
    for i, (word, brownTag) in enumerate(tagged):
        # Heuristic: a verb is an infinitive (VB) when it follows a modal,
        # TO, or another verb, optionally with one intervening adverb (RB).
        # Can wrongly predict VBP where it should be VB due to coordination of
        # bare verbs, e.g. "will/MD [ go/VB ... and take/VB ... ]"
        prev = tagged[i-1][1] if i > 0 else None
        prev2 = tagged[i-2][1] if i > 1 else None
        infin = prev is not None and prev[:2] in ('MD', 'TO', 'VB')
        if not infin and prev2 is not None:
            infin = prev2[:2] in ('MD', 'TO', 'VB') and prev == 'RB'
        converted.append((word, brown2ptb(brownTag, token=word, infinitive=infin)))
    return converted
def ptbSequence2twit(tagged):
    '''
    Convert a PTB-tagged sequence of (token, tag) pairs into (token,
    Twitter tag) pairs, re-joining contracted/possessive clitics to the
    previous token and dropping traces.

    >>> ptbSequence2twit([('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')])
    [(u'Pierre', u'^'), (u'Vinken', u'^'), (u',', u','), (u'61', u'$'), (u'years', u'N'), (u'old', u'A'), (u',', u','), (u'will', u'V'), (u'join', u'V'), (u'the', u'D'), (u'board', u'N'), (u'as', u'P'), (u'a', u'D'), (u'nonexecutive', u'A'), (u'director', u'N'), (u'Nov.', u'^'), (u'29', u'$'), (u'.', u',')]
    >>> ptbSequence2twit([('Although', 'IN'), ('preliminary', 'JJ'), ('findings', 'NNS'), ('were', 'VBD'), ('reported', 'VBN'), ('*-2', '-NONE-'), ('more', 'RBR'), ('than', 'IN'), ('a', 'DT'), ('year', 'NN'), ('ago', 'IN'), (',', ','), ('the', 'DT'), ('latest', 'JJS'), ('results', 'NNS'), ('appear', 'VBP'), ('in', 'IN'), ('today', 'NN'), ("'s", 'POS'), ('New', 'NNP'), ('England', 'NNP'), ('Journal', 'NNP'), ('of', 'IN'), ('Medicine', 'NNP'), (',', ','), ('a', 'DT'), ('forum', 'NN'), ('likely', 'JJ'), ('*', '-NONE-'), ('to', 'TO'), ('bring', 'VB'), ('new', 'JJ'), ('attention', 'NN'), ('to', 'TO'), ('the', 'DT'), ('problem', 'NN'), ('.', '.')])
    [(u'Although', u'P'), (u'preliminary', u'A'), (u'findings', u'N'), (u'were', u'V'), (u'reported', u'V'), (u'more', u'R'), (u'than', u'P'), (u'a', u'D'), (u'year', u'N'), (u'ago', u'P'), (u',', u','), (u'the', u'D'), (u'latest', u'A'), (u'results', u'N'), (u'appear', u'V'), (u'in', u'P'), (u"today's", u'S'), (u'New', u'^'), (u'England', u'^'), (u'Journal', u'^'), (u'of', u'P'), (u'Medicine', u'^'), (u',', u','), (u'a', u'D'), (u'forum', u'N'), (u'likely', u'A'), (u'to', u'P'), (u'bring', u'V'), (u'new', u'A'), (u'attention', u'N'), (u'to', u'P'), (u'the', u'D'), (u'problem', u'N'), (u'.', u',')]
    '''
    merged = []
    for tkn, tag in tagged:
        # Drop traces and the negative clitic (doesn't affect the base tag)
        if tag=='-NONE-' or (tkn=="n't" and tag=='RB'):
            continue
        if len(tkn)>2 and tag[0]=='-' and tag[-1]=='-':
            tag = {'-LRB-': '(', '-RRB-': ')'}[tag]
        if tkn[0]=="'" and tag in ('POS','MD','VBZ','VBP','VBD'):
            # Contracted/possessive form: glue it onto the previous token
            assert len(merged)>0
            prevTkn, prevTag = merged[-1]
            merged[-1] = (prevTkn+tkn, prevTag+'+'+tag)
            continue
        if "'" in tkn and tag not in ("''", 'CD', 'CC'):
            # Debug output for apostrophes that were not re-joined
            print((['']+merged)[-1],tkn,tag)
        merged.append((tkn,tag))
    return [(tkn, ptb2twit(tag)) for tkn,tag in merged]
def describe(tag, tagset='PTB'):
    '''
    Returns a tuple of 2 strings describing the tag: the first is the full name, and the second is a list of examples.
    The optional `tagset` argument (default 'PTB') says which tagset the tag belongs to.
    Returns None if the tag is unknown or has no descriptive entry.

    >>> describe('JJ')
    (u'Adjective', u'happy, bad')
    '''
    for i in tags['{}::{}'.format(tagset, tag)]:
        row = tbl[i]
        category = row[header.index('Category')]
        examples = row[header.index('Examples')]
        # Rows from the supplementary Brown-compound table have no
        # Category/Examples columns (filled with None); skip those
        if category is not None:
            return (category, examples)
    return None
''' | |
List of NR ("adverbial noun") words in Brown: | |
downtown | |
home | |
today | |
to-day | |
tomorrow | |
to-morrow | |
tonight | |
yesterday | |
left | |
right | |
east | |
north(west|east)? | |
south(west|east)? | |
west | |
nawth | |
Sunday | |
Monday | |
Tuesday | |
Wednesday | |
Thursday | |
Friday | |
Saturday | |
Sundays | |
>> 'home' and 'right' are also very frequent as NN | |
('home', 'NR'): 301, | |
('home', 'NR-NC'): 1, | |
('home', 'NR-HL'): 1, | |
('Home', 'NR'): 2, | |
('Home', 'NR-TL'): 2, | |
('Home', 'NR-HL'): 1 | |
('home', 'NN'): 217 | |
('home', 'NN-HL'): 1, | |
('home', 'NN-NC'): 4 | |
('Home', 'NN'): 2, | |
('Home', 'NN-TL'): 13 | |
('Home', 'NP'): 1 | |
For 'home', PTB appears to be inconsistent, with a tag of either RB or NN. | |
('right', 'RB'): 75, e.g. all right, treated right. PTB uses JJ, at least for 'all right'. | |
('Right', 'RB'): 4 | |
('right', 'NR'): 56 spatial location/direction | |
('right', 'NN'): 122, e.g. right to bear arms, the political right. PTB uses NN. | |
('Right', 'NN'): 1, | |
('Right', 'NN-TL'): 3, | |
('right', 'NN-HL'): 1 | |
('right', 'QL'): 121, e.g. right away, right now. PTB uses RB. | |
('Right', 'QL'): 6 | |
Souths, Wednesdays = NRS | |
yesterday's, Wednesday's = NR$ | |
today'll = NR+MD | |
''' | |
if __name__=='__main__':
    #import doctest
    #doctest.testmod()

    # Check robustness of Brown-to-PTB conversion by round-tripping
    # PTB -> Brown -> PTB over the treebank sample
    from nltk.corpus import treebank
    for i,s in enumerate(treebank.tagged_sents()):
        if i in (44,54,147): continue # CC before VB
        #if i in (23,99): continue # CC before VBP
        s1 = [(w,pt) for w,pt in s if pt!='-NONE-']
        s2, s3 = None, None
        try:
            s2 = [(w, ptb2brown(pt, token=w)[0]) for w,pt in s1]
            s3 = brownSequence2ptb(s2)
        except Exception:  # report and keep going (was a bare except, which also caught KeyboardInterrupt)
            print('skipping',i)
            continue
        assert s1==s3,(s1,s2,s3)
        print('OK',i)
    assert False  # deliberate stop: comment this out to run the conversion below

    # Create PTB-tagged file for Brown corpus
    from nltk.corpus import brown
    # Open in write mode ('w'): the file is written below (was opened
    # read-only, so the f.write() calls would have failed)
    with open('brown-converted-ptb-tags.txt', 'w') as f:
        for s in brown.tagged_sents():
            try:
                # Convert the whole sentence once (was recomputed per word,
                # and the entire list was written on every line)
                ptbtagged = brownSequence2ptb(s)
            except Exception:
                print(s)
                raise
            for (w, bt), (_, pt) in zip(s, ptbtagged):
                f.write('{}\t{}\t{}\t-\n'.format(w, pt, bt))
            f.write('\n')
    '''
    twitter = defaultdict(set)
    for t in tags.keys():
        if t.startswith('PTB::'):
            tt = ptb2twit(t[5:])
            twitter[tt].add(t)
        elif t.startswith('Brown::'):
            tt = brown2twit(t[7:])
            twitter[tt].add(t)
            continue
            if t=='Brown::IN' or t=='Brown::HV' or t=='Brown::DO' or t=='Brown::.' or t=='Brown::??' or t=='Brown::VB': continue
            print(t)
            print(brown2ptb(t[7:]))
    for t in sorted(twitter.keys()):
        print(t,' '.join(iter(twitter[t])))
    '''
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://nlp.cs.nyu.edu/wiki/corpuswg/AnnotationCompatibilityReport | |
# Table 1: Part of Speech Compatibility | |
# (Initial Version from Manning and Schutz 1998, pp. 141-142) | |
# Extended to cover Claws1 and ICE | |
# cf. http://www.scs.leeds.ac.uk/ccalas/tagsets/brown.html | |
# Nathan Schneider, 2011-02-19: | |
# * Fixed some errors in brown column, e.g.: DT1 => DTI, PP0 => PPO, NRS => NPS | |
# * Added last column (Twitter tagset) and several special tags at the end | |
Category Examples Claws c5, Claws1 Brown PTB ICE Twitter | |
Adjective happy, bad AJ0 JJ JJ ADJ.ge A | |
Adjective, ordinal number sixth, 72nd, last ORD, OD OD JJ NUM.od A | |
Adjective, comparative happier, worse AJC JJR JJR ADJ.comp A | |
Adjective, superlative happiest, worst AJS JJT JJS ADJ.sup A | |
Adjective, superlative, semantically chief, top AJ0 JJS JJ ADJ.ge A | |
Adjective, cardinal number 3, fifteen CRD, CD CD CD NUM.cd $ | |
Adjective, cardinal number, one one PNI, CD1 CD CD NUM.cd $ | |
Adjective, past-part of verb surprised JJ VBN VBN, JJ ADJ.edp V | |
Adjective, pres-part of verb refreshing JJ VBG VBG, JJ ADJ.ingp V | |
Adverb slowly, sweetly AV0 RB RB ADV.ge R | |
Adverb, negative not, n't XX0 * RB ADV.ge R | |
Adverb, comparative faster AV0 RBR RBR ADV.comp R | |
Adverb, superlative fastest AV0 RBT RBS ADV.sup R | |
Adverb, particle up, off, out AVP, RP, RI RP RP ADV.phras or ADV.ge T | |
Adverb, question when, how, why AVQ WRB WRB ADV.wh R | |
Adverb, degree & question how, however AVQ WQL WRB ADV.wh R | |
Adverb, degree very, so, too AV0, QL QL RB ADV.intens R | |
Adverb, degree, postposed enough, indeed AV0 RN RB ADV.intens, ADV.ge R | |
Adverb, nominal here, there, now AV0, RB RN RB ADV.ge, EXTHERE R | |
Adverb, conjunctive therefore, however AV0,RB RN RB CONNEC.ge R | |
Conjunction, coordination and, or CJC, CC CC CC CONJUNC.coord & | |
Conjunction, subordinating although, when CJS, CS CS IN CONJUNC.subord, P | |
Conjunction, complementizer 'that' that, WP, WPA, WPO CJT CS IN PRON.rel, CONJUNC.subord P | |
Determiner this, each, another DT0, DT DT DT PRON.dem.sing, PRON(recip) D | |
Determiner, pronoun any, some DT0, DTI DTI DT PRON.nonass, PRON.ass D | |
Determiner, pronoun, plural these, those DT0, DTS DTS DT PRON.dem.plu D | |
Determiner, prequalifier quite DT0, aBL ABL PDT ADV.intens X | |
Determiner, prequalifier all, half DT0, ABN ABN PDT PRON.univ, PRON.quant X | |
Determiner, pronoun or double conj. both DT0, ABX ABX DT (CC) PRON.univ.plu D | |
Determiner, pronoun or double conj. either, neither DT0, DTX DTX DT (CC) PRON.neg PRON.nonass.sing D | |
Determiner, article the, no AT0, ATI, DTX AT DT ART.def, PRON.neg D | |
Determiner, article a, an AT0, AT AT DT ART.indef D | |
Determiner, postdeterminer many, same DT0, AP,APS AP JJ PRON.quant.{sing,plu}, ADJ.ge A | |
Determiner, possessive their, your DPS, PP$, PP$$ PP$ PRP$ PRON.poss D | |
Determiner, possessive, second mine, yours DPS, PP$ PP$$ PRP PRON.poss O | |
Determiner, question which, whatever DTQ, WDT WDT WDT PRON.{inter, rel} D | |
Determiner, possessive & question whose DTQ, WP$ WP$ WP$ PRON.rel D | |
Noun aircraft, data NN0 NN NN N.com.sing N | |
Noun, singular woman, book NN1 NN NN N.com.sing N | |
Noun, plural women, books NN2 NNS NNS N.com.plu N | |
Noun, proper, singular London, Michael NP0 NP NNP N.prop.sing ^ | |
Noun, proper, plural Australians, Methodists NP2 NPS NNPS N.prop.plu ^ | |
Noun, adverbial tomorrow, home NN0 NR NN, NNP, RB N.com.sing N, ^, R | |
Noun, plural from post-determiner others NN2, APS NNS NNS PRON.quant.plu N | |
Pronoun, nominal (indefinite) none, everything, one PNI,PN PN NN PRON(neg), PRON.univ.{sing,plu}, NUM(card,sing) N | |
Pronoun, personal, subject you, we PNP, PP2 PPSS PRP PRON.pers.{sing,plu} O | |
Pronoun, personal, subject, 3SG she, he, it PNP, PP1A, PP1AS, PP3, PP3A, PP3AS PPS PRP PRON.{antit,cleftit}, O | |
Pronoun, personal, object you, them, me PNP, PP1O, PP1OS, PP3O, PP3OS PPO PRP PRON.pers.{sing,plu} O | |
Pronoun, reflexive herself, myself PNX, PPL PPL PRP PRON.ref.sing O | |
Pronoun, reflexive, plural themselves, ourselves PNX, PPLS PPLS PRP PRON.ref.plu O | |
Pronoun, question, subject who, whoever PNQ WPS WP PRON.inter, PRON(nom) O | |
Pronoun, question, object who, whoever PNQ WPO WP PRON.inter, PRON(nom) O | |
Pronoun, question, existential there there EX0 EX EX EXTHERE X | |
Verb. base present form (not infinitive) take, live VVB VB VBP V.X.{pres,imp} V | |
Verb, infinitive take, live VVI VB VB V.X.infin V | |
Verb, past tense took, lived VVD VBD VBD V.X.past V | |
Verb, present participle taking, living VVG VBG VBG V.X.ingp V | |
Verb, past/passive participle taken, lived VVN VBN VBN V.X.edp V | |
Verb, present 3SG -s form takes, lives VVZ VBZ VBZ V.X.pres V | |
Verb, auxilliary do, base do VDB, DO DO VBP AUX.do.{pres,imp} V | |
Verb, auxilliary do, infinitive do, DO VDB DO VB AUX.do.infin V | |
Verb, auxilliary do, past did VDD, DOD DOD VBD AUX.do.past V | |
Verb, auxilliary do, present part. doing VDG, VBG VBG V.X.ingp V | |
Verb, auxilliary do, past part. done VDN VBN VBN V.X.edp V | |
Verb, auxilliary do, present 3SG does VDZ, DOZ DOZ VBZ AUX.do.pres V | |
Verb, auxilliary have, base have VHB, HV HV VBP V.X.pres, AUX.{perf,semi}.pres V | |
Verb, auxilliary have, infinitive have VHI, HV HV VB V.X.infin, AUX.{perf,semi}.{imp,infin} V | |
Verb, auxilliary have, past had VHD, HVD HVD VBD V.X.past, AUX.{perf,semi}.past V | |
Verb, auxilliary have, present part. having VHG, HVG HVG VBG V.X.ingp, AUX.perf.ingp V | |
Verb, auxilliary have, past part. had VHN, HVN HVN VBN V.X.edp, AUX.perf.past V | |
Verb, auxilliary have, present 3SG has VHZ, HVZ HVZ VBZ V.X.pres, AUX.{perf,semi}.{pres} V | |
Verb, auxilliary be, infinitive be VBI, BE BE VB V.cop.{infin,imp} AUX.prog.infin V | |
Verb, auxilliary be, past were VBD, BED BED VBD V.cop.{past.subjun}, AUX.prog.past V | |
Verb, auxilliary be, past, 3SG was VBD, BEDZ BEDZ VBD V.cop.past AUX.prog.past V | |
Verb, auxilliary be, present part. being VBG, BEG BEG VBG V.cop.ingp AUX.prog.ingp V | |
Verb, auxilliary be, past part. been VBN, BEN BEN VBN V.cop.edp AUX.prog.edp V | |
Verb, auxilliary be, present, 3SG is, 's VBZ, BEZ BEZ VBZ V.cop.pres AUX.prog.pres V | |
Verb, auxilliary be, present, 1SG am, 'm VBB, BEM BEM VBP V.cop.pres AUX.prog.pres V | |
Verb, auxilliary be, present are, 're VBB, BER BER VBP V.cop.pres AUX.prog.pres V | |
Verb, modal can, could, 'll VMG, MD MD MD AUX.mod.{past,pres} V | |
Infinitive marker to TO0, TO TO TO PRTCL.to P | |
Preposition, to to PRP IN TO PREP.ge P | |
Preposition, for, above PRP IN IN PREP.ge, PRTCL(for) P | |
Preposition, of of PRF IN IN PREP.ge P | |
Possessive 's, ' POS, not $ POS ADV(ge) not | |
Interjection (or other isolate) oh, yes, mmm ITJ, UH UH UH INTERJEC ! | |
Punctuation, sentence ender . ! ? PUN, . . . PUNC.{qm,per,exm} , | |
Punctuation, semicolon ; PUN, ; . : PUNC.scol , | |
Punctuation, colon or ellipsis : ... PUN, ... : : PUNC.{ellip,col} , | |
Punctuation, comma , PUN, , , , PUNC.com , | |
Punctuation, dash - PUN, not -- - PUNC.dash , | |
Punctuation, dollar sign $ PUN not $ not , | |
Punctuation, left bracket ( [ { PUL, ( ( ( PUNC.obrack , | |
Punctuation, right bracket ) ] } PUR, ) ) ) PUNC.cbrack , | |
Punctuation, left quotation PUQ, `` `` PUNC.oquo , | |
Punctuation, right quotation PUQ, '' '' PUNC.cquo , | |
Foreign words (not in English lexicon) UNC, &FW (FW-) FW not G | |
Symbol [fj] * not SYM not G | |
Symbol, alphabetical A, B, c, d ZZ0 not G | |
Symbol, list item A, A, First LS not G | |
URL or email address http://www.twitter.com ?? ?? U | |
Emoticon :), <3 ?? ?? E | |
Online discourse marker RT ?? ?? ~ | |
Possessive nominal his, book's ?? ?? S | |
Possessive proper noun Mark's ?? ?? Z | |
Nominal combined with verbal he's, you're, book'll ?? ?? L | |
Proper noun combined with verbal Mark'll ?? ?? M | |
Miscellaneous function word combined with verbal there's ?? ?? Y |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Maps Penn Treebank POS tags to morphosyntactic attributes. Excerpted from | |
wsj.py, | |
Utilities for working with Penn Treebank Wall Street Journal data. | |
@author: Nathan Schneider ([email protected]) | |
@since: 2011-05-01 | |
''' | |
from __future__ import print_function, division | |
from future_builtins import map, filter, zip | |
from collections import defaultdict | |
def posinfo():
    '''
    Build and return a dict mapping each PTB tag to a defaultdict of its
    morphosyntactic attributes. Attributes not set for a tag read as False.

    >>> info = posinfo()['VBP']
    >>> (info['finite'], info['verbal'], info['description'])
    (True, True, 'Verb, non-3rd person singular present')
    '''
    # http://www.computing.dcu.ie/~acahill/tagset.html
    # tag groups: j = adjective, n = nominal, d = determiner, v = verbal, f = finite, r = adverbial, x = function/closed-class
    # Aside from f and x, these are mutually exclusive and can be interpreted as coarse tags.
    # (Typos fixed vs. the original source: "Preposision" -> "Preposition",
    # "persent" -> "present", "how, where why" -> "how, where, why".)
    TABLE = '''
    CC x Coordinating conjunction >> and, but, or...
    CD Cardinal Number
    DT dx Determiner
    EX x Existential there
    FW Foreign Word
    IN x Preposition or subordinating conjunction
    JJ j Adjective
    JJR j Adjective, comparative
    JJS j Adjective, superlative
    LS List Item Marker
    MD vx Modal >> can, could, might, may...
    NN n Noun, singular or mass
    NNP n Proper Noun, singular
    NNPS n Proper Noun, plural
    NNS n Noun, plural
    PDT dx Predeterminer >> all, both ... when they precede an article
    POS x Possessive Ending >> 's
    PRP nx Personal Pronoun >> I, me, you, he...
    PRP$ nx Possessive Pronoun >> my, your, mine, yours...
    RB r Adverb >> Most words that end in -ly as well as degree words like quite, too and very
    RBR r Adverb, comparative >> Adverbs with the comparative ending -er, with a strictly comparative meaning
    RBS r Adverb, superlative
    RP x Particle
    SYM Symbol >> Should be used for mathematical, scientific or technical symbols
    TO x to
    UH Interjection >> e.g. uh, well, yes, my...
    VB v Verb, base form >> subsumes imperatives, infinitives and subjunctives
    VBD fv Verb, past tense >> includes the conditional form of the verb to be
    VBG v Verb, gerund or present participle
    VBN v Verb, past participle
    VBP fv Verb, non-3rd person singular present
    VBZ fv Verb, 3rd person singular present
    WDT dx Wh-determiner >> e.g. which, and that when it is used as a relative pronoun
    WP nx Wh-pronoun >> e.g. what, who, whom...
    WP$ nx Possessive wh-pronoun
    WRB rx Wh-adverb >> how, where, why
    #
    $
    ''
    (
    )
    -LRB- (
    -RRB- )
    -LSB- [
    -RSB- ]
    -LCB- {
    -RCB- }
    ,
    .
    :
    ``
    '''
    info = {}
    for ln in TABLE.strip().splitlines():
        ln = ln.strip()
        entry = defaultdict(lambda: False)  # unset attributes read as False
        if ln[0] in '()[]{}-' or ' ' not in ln:
            # Punctuation/symbol tag, possibly followed by the plain
            # character it renders as (e.g. "-LRB- (")
            if ln[0] in '()[]{}-':
                entry['bracket'] = True
            entry['tag'] = ln.split()[0]
            entry['punct'] = True
            entry['symbol'] = True
            if ln in ("''", "``"): entry['quote'] = True
        else:
            # TODO: traces?
            # Anything after '>> ' is example/clarification text
            if '>>' in ln:
                entry['extra'] = ln[ln.index('>> ')+3:]
                ln = ln[:ln.index('>> ')].strip()
            parts = ln.split()
            entry['tag'] = parts[0]
            atts = ''
            # A lowercase second field is the attribute-group string
            if parts[1][0].islower():
                atts = parts[1]
                del parts[1]
            entry['description'] = ' '.join(parts[1:])
            if 'Possessive' in entry['description']:
                entry['possessive'] = True
            if atts:
                if 'x' in atts: entry['functional'] = True
                if 'n' in atts: entry['nominal'] = True
                if 'nx' in atts: entry['pronominal'] = True
                if 'j' in atts: entry['adjectival'] = True
                if 'v' in atts: entry['verbal'] = True
                if 'r' in atts: entry['adverbial'] = True
                if 'd' in atts: entry['determiner'] = True
                if 'f' in atts: entry['finite'] = True
                # n/v/d/r/j are mutually exclusive, so at most one can serve
                # as the coarse tag; otherwise fall back to the tag itself
                coarse = ['n' in atts, 'v' in atts, 'd' in atts, 'r' in atts, 'j' in atts]
                if sum(coarse)==1:
                    entry['coarse'] = 'nvdrj'[coarse.index(True)]
                else:
                    assert sum(coarse)==0
                    entry['coarse'] = entry['tag']
            else:
                entry['coarse'] = entry['tag']
        if entry['tag'].startswith('W'): entry['wh'] = True
        info[entry['tag']] = entry
    # Attributes not derivable from the table rows
    info['LS']['symbol'] = True
    info['SYM']['symbol'] = True
    info['NNP']['proper'] = True
    info['NNPS']['proper'] = True
    info['FW']['foreign'] = True
    return info
def poses(**criteria):
    '''
    Return the set of all tags meeting the specified criteria, where criteria
    are boolean attribute names in the entries returned by posinfo().

    >>> poses(finite=True)
    set(['VBZ', 'VBP', 'VBD'])
    >>> poses(nominal=True, pronominal=False)
    set(['NNPS', 'NNS', 'NN', 'NNP'])
    >>> poses(functional=True, nominal=False, verbal=False, adverbial=False, determiner=False, possessive=False)
    set(['CC', 'TO', 'RP', 'EX', 'IN'])
    '''
    matching = set()
    for entry in posinfo().values():
        if all(entry[attName]==wanted for attName,wanted in criteria.items()):
            matching.add(entry['tag'])
    return matching
def posAttributes(tag):
    '''Return the attribute entry (a defaultdict; see posinfo()) for the given PTB tag. Raises KeyError for unknown tags.'''
    return posinfo()[tag]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment