Created
September 7, 2013 15:50
-
-
Save nschneid/6476715 to your computer and use it in GitHub Desktop.
Scripts for working with part-of-speech tagsets: describing the morphosyntactic attributes encoded by tags, and converting between different tagsets. Cf. https://gist.github.com/nschneid/4231292
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Some compound tags from full Brown tag list at http://www.scs.leeds.ac.uk/ccalas/tagsets/brown.html | |
# (plus a few missing from the other POS mapping). | |
# Possessive/genitive tags (ending in '$') are included if they don't correspond to a single PTB tag due to tokenization differences. | |
# For the purposes of converting to Twitter tags, strip '*' from the tag. Foreign word tags begin with "FW-"; convert these to Twitter POS "G". For any compound tag not in this list, use the first part of the tag. | |
Brown Twitter PTB | |
QLP R RB | |
NRS ^ NNPS | |
NIL G ?? | |
' , ?? | |
AP$ X JJ+POS | |
CD$ S CD+POS | |
DO+PPSS G VB+PRP | |
DO+PPSS G VBP+PRP | |
DT$ D DET+POS | |
DT+BEZ L DT+VBZ | |
DT+MD L DT+MD | |
DTS+BEZ L DT+VBZ | |
EX+BEZ Y EX+VBZ | |
EX+HVD Y EX+VBD | |
EX+HVZ Y EX+VBZ | |
EX+MD Y EX+MD | |
JJ$ X JJ+POS | |
NN$ S NN+POS | |
NN+BEZ L NN+VBZ | |
NN+HVD L NN+VBD | |
NN+HVZ L NN+VBZ | |
NN+MD L NN+MD | |
NNS$ S NNS+POS | |
NNS+MD L NNS+MD | |
NP$ Z NNP+POS | |
NP+BEZ M NNP+VBZ | |
NP+HVZ M NNP+VBZ | |
NP+MD M NNP+MD | |
NPS$ Z NNPS+POS | |
NR$ S NN+POS | |
NR+MD L NN+MD | |
PN$ S PRP+POS | |
PN+BEZ L NN+VBZ | |
PN+HVD L NN+VBD | |
PN+HVZ L NN+VBZ | |
PN+MD L NN+MD | |
PPS+BEZ L PRP+VBZ | |
PPS+HVD L PRP+VBD | |
PPS+HVZ L PRP+VBZ | |
PPS+MD L PRP+MD | |
PPSS+BEM L PRP+VBP | |
PPSS+BER L PRP+VBP | |
PPSS+BEZ L PRP+VBZ | |
PPSS+HV L PRP+VB | |
PPSS+HV L PRP+VBP | |
PPSS+HVD L PRP+VBD | |
PPSS+MD L PRP+MD | |
PPSS+VB L PRP+VB | |
PPSS+VB L PRP+VBP | |
RB$ X RB+POS | |
RB+BEZ X RB+VBZ | |
WDT+BER L WDT+VBP | |
WDT+BER+PP G ?? | |
WDT+BEZ L WDT+VBZ | |
WDT+DO+PPS G ?? | |
WDT+DOD L WDT+VBD | |
WDT+HVZ L WDT+VBZ | |
WPS+BEZ L WP+VBZ | |
WPS+HVD L WP+VBD | |
WPS+HVZ L WP+VBZ | |
WPS+MD L WP+MD | |
WRB+BER X WRB+VBP | |
WRB+BEZ X WRB+VBZ | |
WRB+DO X WRB+VBP | |
WRB+DOD X WRB+VBD | |
WRB+DOZ X WRB+VBZ | |
WRB+IN X WRB+IN | |
WRB+MD X WRB+MD | |
?? , # | |
# Due to PTB bugs | |
WPS+BEZ L WP+POS | |
?? L WDT+POS | |
?? Z VBP+POS | |
?? Y EX+POS | |
# that's (demonstrative) - should be DT+VBZ | |
?? L IN+POS | |
?? L IN+VBZ | |
?? L DT+POS | |
# let's | |
?? V VB+POS | |
# Boeing's | |
?? Z VBG+POS | |
# United's | |
?? Z VBN+POS | |
?? G FW+POS | |
# Not sure if WPS~WP and WDT~WDT are exact Brown~PTB correspondences. | |
# Does WDT+MD (that'll) ever occur in Brown? WPS+HV (who've)? | |
WDT+MD L WDT+MD | |
WPS+HV L WP+VBP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Map between POSes from different tagsets. | |
Currently supports PTB, Brown, and the ARK Twitter tagset, though a | |
few conventions of the latter are not yet finalized. | |
Depends on two map files, POSMappings.txt and browncompound2twitter.txt | |
(the latter is a supplement to the former). Some mappings are unclear | |
or not yet finalized. | |
Coverage has been tested for Brown -> Twitter. | |
@author: Nathan Schneider ([email protected]) | |
@since: 2011-02-18 | |
''' | |
from __future__ import print_function, unicode_literals, division | |
from future_builtins import map, filter | |
from collections import defaultdict, Counter | |
MAP_FILE = 'POSMappings.txt' | |
BROWN_CMPD_MAP_FILE = 'browncompound2twitter.txt' | |
def read_tabbed_file(filename):
    '''
    Yield (lineNum, clineNum, line) triples for each content line of a
    tab-separated file, skipping comment lines (those starting with '#')
    and blank lines.

    lineNum  -- 1-based raw line number in the file
    clineNum -- 0-based index counting only the yielded content lines
    '''
    clineNum = 0  # content line (not counting comment/blank lines), 0-based
    with open(filename) as f:
        for lineNum, ln in enumerate(f, start=1):
            if ln.startswith('#'):
                continue
            # rstrip('\n') rather than ln[:-1]: the latter chops the final
            # character of a last line that has no trailing newline
            ln = ln.rstrip('\n')
            if ln == '':
                continue
            yield (lineNum, clineNum, ln)
            clineNum += 1
# Tag mapping table: one row per tag correspondence; columns are given by `header`
tbl = []
tags = defaultdict(list) # map from a tagset-bound tag (such as 'Brown::VBZ') to a list of indices of rows in the table
# Load main table
header = None
for lnum,clnum,ln in read_tabbed_file(MAP_FILE):
    if clnum==0:
        # First content line is the column header row
        # (Category, Examples, then one column per tagset)
        header = ln.split('\t')
        continue
    parts = ln.split('\t')
    tbl.append(parts)
    # Index this row under every tagset::tag pair it mentions.  The first two
    # columns (Category, Examples) are descriptive, not tags, hence the [2:].
    for tb,tag in zip(header[2:],parts[2:]):
        if tag!='' and tag!='not':  # 'not' marks "no corresponding tag in this tagset"
            tags['{}::{}'.format(tb, tag)].append(len(tbl)-1)
# Load supplementary table for converting compound Brown tags to Twitter tags
for lnum,clnum,ln in read_tabbed_file(BROWN_CMPD_MAP_FILE):
    if clnum==0:
        assert ln.split('\t')==['Brown','Twitter','PTB']
        continue
    try:
        brownTag, twitTag, ptbTag = ln.split('\t')
    except:
        # Report the malformed line before propagating the error
        print(ln)
        raise
    if brownTag=='' or twitTag=='' or ptbTag=='':
        print('Incomplete entry in tag map:',ln)
    # Build a row shaped like the main table, with only the Brown and Twitter
    # columns filled in.  NOTE(review): the PTB column is left as None even
    # though the row is indexed under 'PTB::<tag>' below — confirm that PTB
    # lookups never read this column from supplementary rows.
    newEntry = [None]*len(header)
    newEntry[header.index('Brown')] = brownTag
    newEntry[header.index('Twitter')] = twitTag
    tbl.append(newEntry)
    tags['Brown::{}'.format(brownTag)].append(len(tbl)-1)
    tags['Twitter::{}'.format(twitTag)].append(len(tbl)-1)
    tags['PTB::{}'.format(ptbTag)].append(len(tbl)-1)
def remove_duplicates(l):
    '''
    Return a copy of the list with duplicates dropped, keeping only the
    first occurrence of each item in its original order.

    >>> remove_duplicates([4,4,6,3,6,9])
    [4, 6, 3, 9]
    '''
    unique = []
    for item in l:
        if item in unique:
            continue
        unique.append(item)
    return unique
def ptb2brown(ptbTag, token=None):
    '''
    Return the list of Brown tags that can correspond to this PTB tag.

    >>> ptb2brown('VBZ')
    [u'VBZ', u'DOZ', u'HVZ', u'BEZ']
    >>> ptb2brown('IN')
    [u'CS', u'IN']
    >>> ptb2brown(':', token='--')
    [u'--']
    '''
    # PTB ':' covers several Brown punctuation tags; disambiguate by token
    if ptbTag == ':' and token in (':', '--'):
        return [token]
    brownCol = header.index('Brown')
    candidates = remove_duplicates([tbl[i][brownCol] for i in tags['PTB::{}'.format(ptbTag)]])
    assert 'not' not in candidates,('May be ignored, untagged, or tokenized differently',ptbTag,candidates)
    # TODO: Deal with issue of NR, NRS, NR$
    assert candidates, ptbTag
    return candidates
def ptb2twit(ptbTag):
    '''
    Return the (unique) Twitter tag corresponding to this PTB tag.

    >>> ptb2twit('VBZ')
    u'V'
    >>> ptb2twit('IN')
    u'P'
    >>> ptb2twit('.')
    u','
    >>> ptb2twit('PRP')
    u'O'
    >>> ptb2twit('NN+VBZ')
    u'L'
    >>> ptb2twit('PRP+VBP')
    u'L'
    >>> ptb2twit('PRP+VB')
    u'L'
    >>> ptb2twit('NNP+MD')
    u'M'
    >>> ptb2twit('NNP+POS')
    u'Z'
    >>> ptb2twit('EX+VBD')
    u'Y'

    # TODO: ptb2twit('??')
    '''
    twitCol = header.index('Twitter')
    candidates = remove_duplicates([tbl[i][twitCol] for i in tags['PTB::{}'.format(ptbTag)]])
    # The mapping from PTB to Twitter is expected to be one-to-one
    assert len(candidates)==1,(ptbTag,candidates)
    return candidates[0]
# TODO: figure out '??' cases
def brown2ptb(brownTag, token=None, infinitive=None):
    '''
    Return the PTB tag corresponding to this Brown tag.

    Several Brown tags are ambiguous; those require the token itself
    and/or a flag saying whether a verb is an infinitive.

    >>> brown2ptb('VBZ')
    u'VBZ'
    >>> brown2ptb('BEZ')
    u'VBZ'
    >>> brown2ptb('IN', 'to')
    u'TO'
    >>> brown2ptb('IN', 'for')
    u'IN'
    >>> brown2ptb('VB', infinitive=False)
    u'VBP'
    >>> brown2ptb('VB', infinitive=True)
    u'VB'
    '''
    if brownTag=='IN':
        assert token is not None,'Transformation from Brown IN to PTB is indeterminate without token (could be TO or IN)'
        if token.lower()=='to':
            return 'TO'
        return 'IN'
    if brownTag=='.':
        assert token is not None,'Transformation from Brown . to PTB is indeterminate without token (could be . or :)'
        if token in (';','...'):
            return ':'
        return '.'
    if brownTag=='--':
        return ':'
    if brownTag in ('VB', 'HV', 'DO'):
        assert infinitive is not None,'Transformation from Brown {} to PTB is indeterminate without infinitive flag (could be VB or VBP)'.format(brownTag)
        if infinitive:
            return 'VB'
        return 'VBP'
    if brownTag=='NR':
        assert token is not None,'Transformation from Brown NR to PTB is indeterminate without token (could be NN or NNP or RB)'
        if token[0]==token[0].upper(): # capitalized
            return 'NNP'
        assert False,'TODO: NN vs. RB'
        # TODO: same as above, but for NR$ (= NN+POS vs. NNP+POS vs. RB+POS)
    if brownTag=='NRS':
        return 'NNPS'
    ptbCol = header.index('PTB')
    candidates = remove_duplicates([tbl[i][ptbCol] for i in tags['Brown::{}'.format(brownTag)]])
    assert 'not' not in candidates, ('May be ignored, untagged, or tokenized differently',brownTag,candidates)
    # TODO: "VBG, JJ" and "VBN, JJ" entries in map file present problems. For now:
    for problem in ('VBG, JJ', 'VBN, JJ'):
        if problem in candidates:
            candidates.remove(problem)
    assert len(candidates)==1,(brownTag,candidates)
    return candidates[0]
def brown2twit(brownTag, token=None):
    '''
    Returns the Twitter tag corresponding to this Brown tag.

    >>> brown2twit('VBZ')
    u'V'
    >>> brown2twit('BEZ')
    u'V'
    >>> brown2twit('IN')
    u'P'
    >>> brown2twit('VB')
    u'V'
    >>> brown2twit('.')
    u','
    >>> brown2twit('NP+BEZ')
    u'M'
    >>> brown2twit('EX+HVD')
    u'Y'
    >>> brown2twit('FW-*')
    u'G'
    >>> brown2twit('WPS+BEZ')
    u'L'
    >>> brown2twit('NNS$')
    u'S'
    >>> brown2twit('NP$')
    u'Z'
    >>> brown2twit('WDT+BER+PP')
    u'G'
    >>> brown2twit('NP-TL')
    u'^'
    '''
    if brownTag.startswith('FW-'):
        return 'G' # foreign word
    # Strip a hyphenated suffix (e.g. -TL, -HL), taking care not to clobber
    # the '--' (dash) tag itself
    if '-' in brownTag:
        if brownTag.startswith('--'):
            if '-' in brownTag[2:]:
                brownTag = brownTag[:brownTag.index('-',2)]
        else:
            brownTag = brownTag[:brownTag.index('-')]
    if len(brownTag)>1:
        brownTag = brownTag.replace('*','') # negative marker, doesn't affect Twitter tag category
    # Compound tags with no entry of their own fall back to their first part
    if '+' in brownTag and 'Brown::{}'.format(brownTag) not in tags:
        brownTag = brownTag[:brownTag.index('+')]
    if brownTag=='NR': # Adverbial nouns. TODO: Revisit this
        # Capitalized adverbial nouns (e.g. 'Friday') count as proper nouns
        if token is not None and token[0].lower()!=token[0]:
            return '^'
        return 'N'
    x = [tbl[i][header.index('Twitter')] for i in tags['Brown::{}'.format(brownTag)]]
    x = remove_duplicates(x)
    assert len(x)==1,(brownTag,x)  # the mapping to Twitter must be unique
    return x[0]
def brownSequence2twit(tagged):
    '''
    Convert a Brown-tagged sequence of (token, tag) pairs into the
    corresponding sequence of (token, Twitter tag) pairs.

    >>> brownSequence2twit([('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')])
    [(u'The', u'D'), (u'Fulton', u'^'), (u'County', u'N'), (u'Grand', u'A'), (u'Jury', u'N'), (u'said', u'V'), (u'Friday', u'^'), (u'an', u'D'), (u'investigation', u'N'), (u'of', u'P'), (u"Atlanta's", u'Z'), (u'recent', u'A'), (u'primary', u'N'), (u'election', u'N'), (u'produced', u'V'), (u'``', u','), (u'no', u'D'), (u'evidence', u'N'), (u"''", u','), (u'that', u'P'), (u'any', u'D'), (u'irregularities', u'N'), (u'took', u'V'), (u'place', u'N'), (u'.', u',')]
    '''
    converted = []
    for word, brownTag in tagged:
        converted.append((word, brown2twit(brownTag, token=word)))
    return converted
def brownSequence2ptb(tagged):
    '''
    Convert a Brown-tagged sequence of (token, tag) pairs into the
    corresponding sequence of (token, PTB tag) pairs, using context
    heuristics to decide between bare-verb (VB) and present-tense (VBP) tags.
    '''
    converted = []
    for i, (word, brownTag) in enumerate(tagged):
        # Heuristic: a verb is an infinitive (VB) when it follows a modal,
        # TO, or another verb, optionally with one intervening adverb (RB).
        # Can wrongly predict VBP where it should be VB due to coordination of
        # bare verbs, e.g. "will/MD [ go/VB ... and take/VB ... ]"
        prev = tagged[i-1][1] if i > 0 else None
        prev2 = tagged[i-2][1] if i > 1 else None
        infin = prev is not None and prev[:2] in ('MD', 'TO', 'VB')
        if not infin and prev2 is not None:
            infin = prev2[:2] in ('MD', 'TO', 'VB') and prev == 'RB'
        converted.append((word, brown2ptb(brownTag, token=word, infinitive=infin)))
    return converted
def ptbSequence2twit(tagged):
    '''
    Convert a PTB-tagged sequence of (token, tag) pairs into (token,
    Twitter tag) pairs, re-joining contracted/possessive clitics to the
    previous token and dropping traces.

    >>> ptbSequence2twit([('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')])
    [(u'Pierre', u'^'), (u'Vinken', u'^'), (u',', u','), (u'61', u'$'), (u'years', u'N'), (u'old', u'A'), (u',', u','), (u'will', u'V'), (u'join', u'V'), (u'the', u'D'), (u'board', u'N'), (u'as', u'P'), (u'a', u'D'), (u'nonexecutive', u'A'), (u'director', u'N'), (u'Nov.', u'^'), (u'29', u'$'), (u'.', u',')]
    >>> ptbSequence2twit([('Although', 'IN'), ('preliminary', 'JJ'), ('findings', 'NNS'), ('were', 'VBD'), ('reported', 'VBN'), ('*-2', '-NONE-'), ('more', 'RBR'), ('than', 'IN'), ('a', 'DT'), ('year', 'NN'), ('ago', 'IN'), (',', ','), ('the', 'DT'), ('latest', 'JJS'), ('results', 'NNS'), ('appear', 'VBP'), ('in', 'IN'), ('today', 'NN'), ("'s", 'POS'), ('New', 'NNP'), ('England', 'NNP'), ('Journal', 'NNP'), ('of', 'IN'), ('Medicine', 'NNP'), (',', ','), ('a', 'DT'), ('forum', 'NN'), ('likely', 'JJ'), ('*', '-NONE-'), ('to', 'TO'), ('bring', 'VB'), ('new', 'JJ'), ('attention', 'NN'), ('to', 'TO'), ('the', 'DT'), ('problem', 'NN'), ('.', '.')])
    [(u'Although', u'P'), (u'preliminary', u'A'), (u'findings', u'N'), (u'were', u'V'), (u'reported', u'V'), (u'more', u'R'), (u'than', u'P'), (u'a', u'D'), (u'year', u'N'), (u'ago', u'P'), (u',', u','), (u'the', u'D'), (u'latest', u'A'), (u'results', u'N'), (u'appear', u'V'), (u'in', u'P'), (u"today's", u'S'), (u'New', u'^'), (u'England', u'^'), (u'Journal', u'^'), (u'of', u'P'), (u'Medicine', u'^'), (u',', u','), (u'a', u'D'), (u'forum', u'N'), (u'likely', u'A'), (u'to', u'P'), (u'bring', u'V'), (u'new', u'A'), (u'attention', u'N'), (u'to', u'P'), (u'the', u'D'), (u'problem', u'N'), (u'.', u',')]
    '''
    merged = []
    for tkn, tag in tagged:
        # Drop traces and the negative clitic (doesn't affect the base tag)
        if tag=='-NONE-' or (tkn=="n't" and tag=='RB'):
            continue
        if len(tkn)>2 and tag[0]=='-' and tag[-1]=='-':
            tag = {'-LRB-': '(', '-RRB-': ')'}[tag]
        if tkn[0]=="'" and tag in ('POS','MD','VBZ','VBP','VBD'):
            # Contracted/possessive form: glue it onto the previous token
            assert len(merged)>0
            prevTkn, prevTag = merged[-1]
            merged[-1] = (prevTkn+tkn, prevTag+'+'+tag)
            continue
        if "'" in tkn and tag not in ("''", 'CD', 'CC'):
            # Debug output for apostrophes that were not re-joined
            print((['']+merged)[-1],tkn,tag)
        merged.append((tkn,tag))
    return [(tkn, ptb2twit(tag)) for tkn,tag in merged]
def describe(tag, tagset='PTB'):
    '''
    Returns a tuple of 2 strings describing the tag: the first is the full name, and the second is a list of examples.
    The optional `tagset` argument (default 'PTB') says which tagset the tag belongs to.
    Returns None if the tag is unknown or has no descriptive entry.

    >>> describe('JJ')
    (u'Adjective', u'happy, bad')
    '''
    for i in tags['{}::{}'.format(tagset, tag)]:
        row = tbl[i]
        category = row[header.index('Category')]
        examples = row[header.index('Examples')]
        # Rows from the supplementary Brown-compound table have no
        # Category/Examples columns (filled with None); skip those
        if category is not None:
            return (category, examples)
    return None
''' | |
List of NR ("adverbial noun") words in Brown: | |
downtown | |
home | |
today | |
to-day | |
tomorrow | |
to-morrow | |
tonight | |
yesterday | |
left | |
right | |
east | |
north(west|east)? | |
south(west|east)? | |
west | |
nawth | |
Sunday | |
Monday | |
Tuesday | |
Wednesday | |
Thursday | |
Friday | |
Saturday | |
Sundays | |
>> 'home' and 'right' are also very frequent as NN | |
('home', 'NR'): 301, | |
('home', 'NR-NC'): 1, | |
('home', 'NR-HL'): 1, | |
('Home', 'NR'): 2, | |
('Home', 'NR-TL'): 2, | |
('Home', 'NR-HL'): 1 | |
('home', 'NN'): 217 | |
('home', 'NN-HL'): 1, | |
('home', 'NN-NC'): 4 | |
('Home', 'NN'): 2, | |
('Home', 'NN-TL'): 13 | |
('Home', 'NP'): 1 | |
For 'home', PTB appears to be inconsistent, with a tag of either RB or NN. | |
('right', 'RB'): 75, e.g. all right, treated right. PTB uses JJ, at least for 'all right'. | |
('Right', 'RB'): 4 | |
('right', 'NR'): 56 spatial location/direction | |
('right', 'NN'): 122, e.g. right to bear arms, the political right. PTB uses NN. | |
('Right', 'NN'): 1, | |
('Right', 'NN-TL'): 3, | |
('right', 'NN-HL'): 1 | |
('right', 'QL'): 121, e.g. right away, right now. PTB uses RB. | |
('Right', 'QL'): 6 | |
Souths, Wednesdays = NRS | |
yesterday's, Wednesday's = NR$ | |
today'll = NR+MD | |
''' | |
if __name__=='__main__':
    #import doctest
    #doctest.testmod()

    # Check robustness of Brown-to-PTB conversion by round-tripping
    # PTB -> Brown -> PTB over the treebank sample
    from nltk.corpus import treebank
    for i,s in enumerate(treebank.tagged_sents()):
        if i in (44,54,147): continue # CC before VB
        #if i in (23,99): continue # CC before VBP
        s1 = [(w,pt) for w,pt in s if pt!='-NONE-']
        s2, s3 = None, None
        try:
            s2 = [(w, ptb2brown(pt, token=w)[0]) for w,pt in s1]
            s3 = brownSequence2ptb(s2)
        except Exception:  # report and keep going (was a bare except, which also caught KeyboardInterrupt)
            print('skipping',i)
            continue
        assert s1==s3,(s1,s2,s3)
        print('OK',i)
    assert False  # deliberate stop: comment this out to run the conversion below

    # Create PTB-tagged file for Brown corpus
    from nltk.corpus import brown
    # Open in write mode ('w'): the file is written below (was opened
    # read-only, so the f.write() calls would have failed)
    with open('brown-converted-ptb-tags.txt', 'w') as f:
        for s in brown.tagged_sents():
            try:
                # Convert the whole sentence once (was recomputed per word,
                # and the entire list was written on every line)
                ptbtagged = brownSequence2ptb(s)
            except Exception:
                print(s)
                raise
            for (w, bt), (_, pt) in zip(s, ptbtagged):
                f.write('{}\t{}\t{}\t-\n'.format(w, pt, bt))
            f.write('\n')
    '''
    twitter = defaultdict(set)
    for t in tags.keys():
        if t.startswith('PTB::'):
            tt = ptb2twit(t[5:])
            twitter[tt].add(t)
        elif t.startswith('Brown::'):
            tt = brown2twit(t[7:])
            twitter[tt].add(t)
            continue
            if t=='Brown::IN' or t=='Brown::HV' or t=='Brown::DO' or t=='Brown::.' or t=='Brown::??' or t=='Brown::VB': continue
            print(t)
            print(brown2ptb(t[7:]))
    for t in sorted(twitter.keys()):
        print(t,' '.join(iter(twitter[t])))
    '''
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://nlp.cs.nyu.edu/wiki/corpuswg/AnnotationCompatibilityReport | |
# Table 1: Part of Speech Compatibility | |
# (Initial Version from Manning and Schutz 1998, pp. 141-142) | |
# Extended to cover Claws1 and ICE | |
# cf. http://www.scs.leeds.ac.uk/ccalas/tagsets/brown.html | |
# Nathan Schneider, 2011-02-19: | |
# * Fixed some errors in brown column, e.g.: DT1 => DTI, PP0 => PPO, NRS => NPS | |
# * Added last column (Twitter tagset) and several special tags at the end | |
Category Examples Claws c5, Claws1 Brown PTB ICE Twitter | |
Adjective happy, bad AJ0 JJ JJ ADJ.ge A | |
Adjective, ordinal number sixth, 72nd, last ORD, OD OD JJ NUM.od A | |
Adjective, comparative happier, worse AJC JJR JJR ADJ.comp A | |
Adjective, superlative happiest, worst AJS JJT JJS ADJ.sup A | |
Adjective, superlative, semantically chief, top AJ0 JJS JJ ADJ.ge A | |
Adjective, cardinal number 3, fifteen CRD, CD CD CD NUM.cd $ | |
Adjective, cardinal number, one one PNI, CD1 CD CD NUM.cd $ | |
Adjective, past-part of verb surprised JJ VBN VBN, JJ ADJ.edp V | |
Adjective, pres-part of verb refreshing JJ VBG VBG, JJ ADJ.ingp V | |
Adverb slowly, sweetly AV0 RB RB ADV.ge R | |
Adverb, negative not, n't XX0 * RB ADV.ge R | |
Adverb, comparative faster AV0 RBR RBR ADV.comp R | |
Adverb, superlative fastest AV0 RBT RBS ADV.sup R | |
Adverb, particle up, off, out AVP, RP, RI RP RP ADV.phras or ADV.ge T | |
Adverb, question when, how, why AVQ WRB WRB ADV.wh R | |
Adverb, degree & question how, however AVQ WQL WRB ADV.wh R | |
Adverb, degree very, so, too AV0, QL QL RB ADV.intens R | |
Adverb, degree, postposed enough, indeed AV0 RN RB ADV.intens, ADV.ge R | |
Adverb, nominal here, there, now AV0, RB RN RB ADV.ge, EXTHERE R | |
Adverb, conjunctive therefore, however AV0,RB RN RB CONNEC.ge R | |
Conjunction, coordination and, or CJC, CC CC CC CONJUNC.coord & | |
Conjunction, subordinating although, when CJS, CS CS IN CONJUNC.subord, P | |
Conjunction, complementizer 'that' that, WP, WPA, WPO CJT CS IN PRON.rel, CONJUNC.subord P | |
Determiner this, each, another DT0, DT DT DT PRON.dem.sing, PRON(recip) D | |
Determiner, pronoun any, some DT0, DTI DTI DT PRON.nonass, PRON.ass D | |
Determiner, pronoun, plural these, those DT0, DTS DTS DT PRON.dem.plu D | |
Determiner, prequalifier quite DT0, aBL ABL PDT ADV.intens X | |
Determiner, prequalifier all, half DT0, ABN ABN PDT PRON.univ, PRON.quant X | |
Determiner, pronoun or double conj. both DT0, ABX ABX DT (CC) PRON.univ.plu D | |
Determiner, pronoun or double conj. either, neither DT0, DTX DTX DT (CC) PRON.neg PRON.nonass.sing D | |
Determiner, article the, no AT0, ATI, DTX AT DT ART.def, PRON.neg D | |
Determiner, article a, an AT0, AT AT DT ART.indef D | |
Determiner, postdeterminer many, same DT0, AP,APS AP JJ PRON.quant.{sing,plu}, ADJ.ge A | |
Determiner, possessive their, your DPS, PP$, PP$$ PP$ PRP$ PRON.poss D | |
Determiner, possessive, second mine, yours DPS, PP$ PP$$ PRP PRON.poss O | |
Determiner, question which, whatever DTQ, WDT WDT WDT PRON.{inter, rel} D | |
Determiner, possessive & question whose DTQ, WP$ WP$ WP$ PRON.rel D | |
Noun aircraft, data NN0 NN NN N.com.sing N | |
Noun, singular woman, book NN1 NN NN N.com.sing N | |
Noun, plural women, books NN2 NNS NNS N.com.plu N | |
Noun, proper, singular London, Michael NP0 NP NNP N.prop.sing ^ | |
Noun, proper, plural Australians, Methodists NP2 NPS NNPS N.prop.plu ^ | |
Noun, adverbial tomorrow, home NN0 NR NN, NNP, RB N.com.sing N, ^, R | |
Noun, plural from post-determiner others NN2, APS NNS NNS PRON.quant.plu N | |
Pronoun, nominal (indefinite) none, everything, one PNI,PN PN NN PRON(neg), PRON.univ.{sing,plu}, NUM(card,sing) N | |
Pronoun, personal, subject you, we PNP, PP2 PPSS PRP PRON.pers.{sing,plu} O | |
Pronoun, personal, subject, 3SG she, he, it PNP, PP1A, PP1AS, PP3, PP3A, PP3AS PPS PRP PRON.{antit,cleftit}, O | |
Pronoun, personal, object you, them, me PNP, PP1O, PP1OS, PP3O, PP3OS PPO PRP PRON.pers.{sing,plu} O | |
Pronoun, reflexive herself, myself PNX, PPL PPL PRP PRON.ref.sing O | |
Pronoun, reflexive, plural themselves, ourselves PNX, PPLS PPLS PRP PRON.ref.plu O | |
Pronoun, question, subject who, whoever PNQ WPS WP PRON.inter, PRON(nom) O | |
Pronoun, question, object who, whoever PNQ WPO WP PRON.inter, PRON(nom) O | |
Pronoun, question, existential there there EX0 EX EX EXTHERE X | |
Verb. base present form (not infinitive) take, live VVB VB VBP V.X.{pres,imp} V | |
Verb, infinitive take, live VVI VB VB V.X.infin V | |
Verb, past tense took, lived VVD VBD VBD V.X.past V | |
Verb, present participle taking, living VVG VBG VBG V.X.ingp V | |
Verb, past/passive participle taken, lived VVN VBN VBN V.X.edp V | |
Verb, present 3SG -s form takes, lives VVZ VBZ VBZ V.X.pres V | |
Verb, auxilliary do, base do VDB, DO DO VBP AUX.do.{pres,imp} V | |
Verb, auxilliary do, infinitive do, DO VDB DO VB AUX.do.infin V | |
Verb, auxilliary do, past did VDD, DOD DOD VBD AUX.do.past V | |
Verb, auxilliary do, present part. doing VDG, VBG VBG V.X.ingp V | |
Verb, auxilliary do, past part. done VDN VBN VBN V.X.edp V | |
Verb, auxilliary do, present 3SG does VDZ, DOZ DOZ VBZ AUX.do.pres V | |
Verb, auxilliary have, base have VHB, HV HV VBP V.X.pres, AUX.{perf,semi}.pres V | |
Verb, auxilliary have, infinitive have VHI, HV HV VB V.X.infin, AUX.{perf,semi}.{imp,infin} V | |
Verb, auxilliary have, past had VHD, HVD HVD VBD V.X.past, AUX.{perf,semi}.past V | |
Verb, auxilliary have, present part. having VHG, HVG HVG VBG V.X.ingp, AUX.perf.ingp V | |
Verb, auxilliary have, past part. had VHN, HVN HVN VBN V.X.edp, AUX.perf.past V | |
Verb, auxilliary have, present 3SG has VHZ, HVZ HVZ VBZ V.X.pres, AUX.{perf,semi}.{pres} V | |
Verb, auxilliary be, infinitive be VBI, BE BE VB V.cop.{infin,imp} AUX.prog.infin V | |
Verb, auxilliary be, past were VBD, BED BED VBD V.cop.{past.subjun}, AUX.prog.past V | |
Verb, auxilliary be, past, 3SG was VBD, BEDZ BEDZ VBD V.cop.past AUX.prog.past V | |
Verb, auxilliary be, present part. being VBG, BEG BEG VBG V.cop.ingp AUX.prog.ingp V | |
Verb, auxilliary be, past part. been VBN, BEN BEN VBN V.cop.edp AUX.prog.edp V | |
Verb, auxilliary be, present, 3SG is, 's VBZ, BEZ BEZ VBZ V.cop.pres AUX.prog.pres V | |
Verb, auxilliary be, present, 1SG am, 'm VBB, BEM BEM VBP V.cop.pres AUX.prog.pres V | |
Verb, auxilliary be, present are, 're VBB, BER BER VBP V.cop.pres AUX.prog.pres V | |
Verb, modal can, could, 'll VMG, MD MD MD AUX.mod.{past,pres} V | |
Infinitive marker to TO0, TO TO TO PRTCL.to P | |
Preposition, to to PRP IN TO PREP.ge P | |
Preposition, for, above PRP IN IN PREP.ge, PRTCL(for) P | |
Preposition, of of PRF IN IN PREP.ge P | |
Possessive 's, ' POS, not $ POS ADV(ge) not | |
Interjection (or other isolate) oh, yes, mmm ITJ, UH UH UH INTERJEC ! | |
Punctuation, sentence ender . ! ? PUN, . . . PUNC.{qm,per,exm} , | |
Punctuation, semicolon ; PUN, ; . : PUNC.scol , | |
Punctuation, colon or ellipsis : ... PUN, ... : : PUNC.{ellip,col} , | |
Punctuation, comma , PUN, , , , PUNC.com , | |
Punctuation, dash - PUN, not -- - PUNC.dash , | |
Punctuation, dollar sign $ PUN not $ not , | |
Punctuation, left bracket ( [ { PUL, ( ( ( PUNC.obrack , | |
Punctuation, right bracket ) ] } PUR, ) ) ) PUNC.cbrack , | |
Punctuation, left quotation PUQ, `` `` PUNC.oquo , | |
Punctuation, right quotation PUQ, '' '' PUNC.cquo , | |
Foreign words (not in English lexicon) UNC, &FW (FW-) FW not G | |
Symbol [fj] * not SYM not G | |
Symbol, alphabetical A, B, c, d ZZ0 not G | |
Symbol, list item A, A, First LS not G | |
URL or email address http://www.twitter.com ?? ?? U | |
Emoticon :), <3 ?? ?? E | |
Online discourse marker RT ?? ?? ~ | |
Possessive nominal his, book's ?? ?? S | |
Possessive proper noun Mark's ?? ?? Z | |
Nominal combined with verbal he's, you're, book'll ?? ?? L | |
Proper noun combined with verbal Mark'll ?? ?? M | |
Miscellaneous function word combined with verbal there's ?? ?? Y |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Maps Penn Treebank POS tags to morphosyntactic attributes. Excerpted from | |
wsj.py, | |
Utilities for working with Penn Treebank Wall Street Journal data. | |
@author: Nathan Schneider ([email protected]) | |
@since: 2011-05-01 | |
''' | |
from __future__ import print_function, division | |
from future_builtins import map, filter, zip | |
from collections import defaultdict | |
def posinfo():
    '''
    Build and return a dict mapping each PTB tag to a defaultdict of its
    morphosyntactic attributes. Attributes not set for a tag read as False.

    >>> info = posinfo()['VBP']
    >>> (info['finite'], info['verbal'], info['description'])
    (True, True, 'Verb, non-3rd person singular present')
    '''
    # http://www.computing.dcu.ie/~acahill/tagset.html
    # tag groups: j = adjective, n = nominal, d = determiner, v = verbal, f = finite, r = adverbial, x = function/closed-class
    # Aside from f and x, these are mutually exclusive and can be interpreted as coarse tags.
    # (Typos fixed vs. the original source: "Preposision" -> "Preposition",
    # "persent" -> "present", "how, where why" -> "how, where, why".)
    TABLE = '''
    CC x Coordinating conjunction >> and, but, or...
    CD Cardinal Number
    DT dx Determiner
    EX x Existential there
    FW Foreign Word
    IN x Preposition or subordinating conjunction
    JJ j Adjective
    JJR j Adjective, comparative
    JJS j Adjective, superlative
    LS List Item Marker
    MD vx Modal >> can, could, might, may...
    NN n Noun, singular or mass
    NNP n Proper Noun, singular
    NNPS n Proper Noun, plural
    NNS n Noun, plural
    PDT dx Predeterminer >> all, both ... when they precede an article
    POS x Possessive Ending >> 's
    PRP nx Personal Pronoun >> I, me, you, he...
    PRP$ nx Possessive Pronoun >> my, your, mine, yours...
    RB r Adverb >> Most words that end in -ly as well as degree words like quite, too and very
    RBR r Adverb, comparative >> Adverbs with the comparative ending -er, with a strictly comparative meaning
    RBS r Adverb, superlative
    RP x Particle
    SYM Symbol >> Should be used for mathematical, scientific or technical symbols
    TO x to
    UH Interjection >> e.g. uh, well, yes, my...
    VB v Verb, base form >> subsumes imperatives, infinitives and subjunctives
    VBD fv Verb, past tense >> includes the conditional form of the verb to be
    VBG v Verb, gerund or present participle
    VBN v Verb, past participle
    VBP fv Verb, non-3rd person singular present
    VBZ fv Verb, 3rd person singular present
    WDT dx Wh-determiner >> e.g. which, and that when it is used as a relative pronoun
    WP nx Wh-pronoun >> e.g. what, who, whom...
    WP$ nx Possessive wh-pronoun
    WRB rx Wh-adverb >> how, where, why
    #
    $
    ''
    (
    )
    -LRB- (
    -RRB- )
    -LSB- [
    -RSB- ]
    -LCB- {
    -RCB- }
    ,
    .
    :
    ``
    '''
    info = {}
    for ln in TABLE.strip().splitlines():
        ln = ln.strip()
        entry = defaultdict(lambda: False)  # unset attributes read as False
        if ln[0] in '()[]{}-' or ' ' not in ln:
            # Punctuation/symbol tag, possibly followed by the plain
            # character it renders as (e.g. "-LRB- (")
            if ln[0] in '()[]{}-':
                entry['bracket'] = True
            entry['tag'] = ln.split()[0]
            entry['punct'] = True
            entry['symbol'] = True
            if ln in ("''", "``"): entry['quote'] = True
        else:
            # TODO: traces?
            # Anything after '>> ' is example/clarification text
            if '>>' in ln:
                entry['extra'] = ln[ln.index('>> ')+3:]
                ln = ln[:ln.index('>> ')].strip()
            parts = ln.split()
            entry['tag'] = parts[0]
            atts = ''
            # A lowercase second field is the attribute-group string
            if parts[1][0].islower():
                atts = parts[1]
                del parts[1]
            entry['description'] = ' '.join(parts[1:])
            if 'Possessive' in entry['description']:
                entry['possessive'] = True
            if atts:
                if 'x' in atts: entry['functional'] = True
                if 'n' in atts: entry['nominal'] = True
                if 'nx' in atts: entry['pronominal'] = True
                if 'j' in atts: entry['adjectival'] = True
                if 'v' in atts: entry['verbal'] = True
                if 'r' in atts: entry['adverbial'] = True
                if 'd' in atts: entry['determiner'] = True
                if 'f' in atts: entry['finite'] = True
                # n/v/d/r/j are mutually exclusive, so at most one can serve
                # as the coarse tag; otherwise fall back to the tag itself
                coarse = ['n' in atts, 'v' in atts, 'd' in atts, 'r' in atts, 'j' in atts]
                if sum(coarse)==1:
                    entry['coarse'] = 'nvdrj'[coarse.index(True)]
                else:
                    assert sum(coarse)==0
                    entry['coarse'] = entry['tag']
            else:
                entry['coarse'] = entry['tag']
        if entry['tag'].startswith('W'): entry['wh'] = True
        info[entry['tag']] = entry
    # Attributes not derivable from the table rows
    info['LS']['symbol'] = True
    info['SYM']['symbol'] = True
    info['NNP']['proper'] = True
    info['NNPS']['proper'] = True
    info['FW']['foreign'] = True
    return info
def poses(**criteria):
    '''
    Return the set of all tags meeting the specified criteria, where criteria
    are boolean attribute names in the entries returned by posinfo().

    >>> poses(finite=True)
    set(['VBZ', 'VBP', 'VBD'])
    >>> poses(nominal=True, pronominal=False)
    set(['NNPS', 'NNS', 'NN', 'NNP'])
    >>> poses(functional=True, nominal=False, verbal=False, adverbial=False, determiner=False, possessive=False)
    set(['CC', 'TO', 'RP', 'EX', 'IN'])
    '''
    matching = set()
    for entry in posinfo().values():
        if all(entry[attName]==wanted for attName,wanted in criteria.items()):
            matching.add(entry['tag'])
    return matching
def posAttributes(tag):
    '''Return the attribute entry (a defaultdict; see posinfo()) for the given PTB tag. Raises KeyError for unknown tags.'''
    return posinfo()[tag]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment