Given a new-style Penn Treebank English tree, produce the part-of-speech tags according to the Universal Dependencies project.
#!/usr/bin/env python2.7
'''
Converts new-style PTB POS tags to the English tagset from the Universal Dependencies project
(see universal-pos-en.html, from http://universaldependencies.github.io/docs/en/pos/all.html).
There are 17 such tags, expanded from the original 12 Universal POS tags of Petrov et al. 2011.
See the "limitations" comment below for some details on our interpretation of the difficult-to-map
categories.

In new-style PTB, TO only applies to infinitival (not prepositional) "to".

Options: -i to include the token inline; -m to enable unofficial tag refinement: AUX.MD for modal auxiliaries

Input: Parse trees, one per line, prefixed with a tab-separated sentence ID.

@author: Nathan Schneider ([email protected])
@since: 2015-08-23
'''
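
# Example usage (an illustrative sketch; the script and input file names below are
# hypothetical, not part of the original gist):
#   python2.7 ptb2upos.py -i trees.txt
# where each line of trees.txt is a sentence ID, a tab, and a bracketed parse, e.g.
#   sent-1<TAB>(S (NP-SBJ (PRP I)) (VP (VBP do) (RB n't) (VP (VB know))) (. .))
# This prints one tab-separated line per sentence, e.g.
#   sent-1<TAB>I|PRON do|AUX n't|PART know|VERB .|PUNCT
# and, after all input is consumed, a Counter of (PTB tag, UD tag) pairs on stderr.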
from __future__ import print_function
import sys, fileinput, re
from collections import Counter

INLINE_TOK = DISTINGUISH_MD = False
while len(sys.argv)>1 and sys.argv[1].startswith('-'):
    flag = sys.argv[1]
    assert flag in ('-i','-m'),'Unrecognized flag: '+flag
    if flag=='-i':
        INLINE_TOK = True
    elif flag=='-m':
        DISTINGUISH_MD = True
    sys.argv = sys.argv[:1] + sys.argv[2:]

EASY_MAP = {
    'JJ': 'ADJ', 'JJR': 'ADJ', 'JJS': 'ADJ',
    'RP': 'ADP',
    'MD': 'AUX.MD' if DISTINGUISH_MD else 'AUX',
    'CC': 'CONJ',
    'DT': 'DET', 'PDT': 'DET', 'WDT': 'DET',
    'UH': 'INTJ',
    'CD': 'NUM',
    'POS': 'PART', 'TO': 'PART',
    'PRP': 'PRON', 'PRP$': 'PRON', 'WP': 'PRON', 'WP$': 'PRON', 'EX': 'PRON',
    'NNP': 'PROPN', 'NNPS': 'PROPN',
    '``': 'PUNCT', "''": 'PUNCT', '-LRB-': 'PUNCT', '-RRB-': 'PUNCT', ',': 'PUNCT', '.': 'PUNCT', ':': 'PUNCT', 'HYPH': 'PUNCT',
    '#': 'SYM', '$': 'SYM', 'SYM': 'SYM', 'NFP': 'SYM',  # see note below about NFP
    'FW': 'X', 'LS': 'X', 'XX': 'X', 'ADD': 'X', 'AFX': 'X', 'GW': 'X'
}

AUX_FORMS = ('am', "'m", 'is', "'s", 'are', "'re", 'was', 'were', 'be', 'been', 'being',
             'm', 'r', 're', 's', 'v', 've', 'd',
             'ai', 'du', 'of',  # ai n't, du n no, would of [have]
             'have', "'ve", 'has', 'had', "'d", 'having',
             'do', 'does', 'did', 'done', 'doing',
             'get', 'gets', 'got', 'gotten', 'getting')

""" | |
Limitations of this script: | |
(1) The guidelines state: | |
Some uses of NFP (for lines of hyphens, asterisks or tildes) -> PUNCT | |
NFP (except for lines of separators, which become PUNCT) -> SYM | |
However, we ignore the technicality about lines of separators, presuming they will not | |
occur in the input. | |
(2) The guidelines prescribe AUX for | |
verbal tags (VB, VBP, VBG, VBN, VBD, VBZ) when they are forms of be, have, do, and get | |
when used as an auxiliary (we count passive get as an auxiliary) | |
We operationalize this by looking for these verb forms with a subsequent VP sister. | |
This is a fairly robust heuristic, even capturing fronted (topicalized) VPs | |
thanks to the treebanking convention of including a second VP with a trace: | |
(SINV (VP-TPC-1 (VBG Sailing) | |
(PP (IN with) | |
(NP (DT the) (NNP Roosevelt)))) | |
(VP (VBZ is) | |
(VP-1 (-NONE- *T*))) | |
(NP-SBJ (NP (DT the) (NNP Tarawa) (NNP Expeditionary) (NNP Strike) (NNP Force)))))) | |
Depending on what was intended by the guidelines, this rule may not capture everything. | |
Here is a tree fragment from the English Web Treebank where "have" and "get" | |
could arguably be considered auxiliaries, but they are followed by an S constituent: | |
you 'll have * to fight *PRO* to get it resolved * | |
(NP-SBJ-2 (PRP you)) | |
(VP (MD 'll) | |
(VP (VB have) | |
(S (NP-SBJ-2 (-NONE- *)) | |
(VP (TO to) | |
(VP (VB fight) | |
(S-PRP (NP-SBJ-2 (-NONE- *PRO*)) | |
(VP (TO to) | |
(VP (VB get) | |
(S (NP-SBJ-3 (PRP it)) | |
(VP (VBN resolved) | |
(NP-3 (-NONE- *)))))))))))) | |
""" | |
def unitags(tree):
    penntags = [(m.group(2), m.group(1), (m.start(0), m.end(0))) \
                for m in re.finditer(r'\(([^\s\(\)]+) ([^\s\(\)]+)\)', tree) if m.group(1)!='-NONE-']
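    # penntags now holds one (word, PTB tag, character span) triple per terminal, in
    # surface order, skipping empty categories (-NONE-). For example (illustrative,
    # not from the original), the fragment "(DT the) (NN dog)" contributes
    # ('the', 'DT', span) and ('dog', 'NN', span).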
    result = []
    for (w, t, (i,j)) in penntags:
        u = EASY_MAP.get(t)
        if not u:
            if t in ('NN','NNS'):  # NOUN: all cases of PTB NN and NNS, except for %, which we retag as SYM.
                if w=='%': u = 'SYM'
                else: u = 'NOUN'
            elif t in ('RB','RBR','RBS','WRB'):  # ADV: all uses of PTB RB, RBR, RBS, and WRB except the clausal negation "not" and reduced forms of it, which become PART.
                if w.lower() in ('not', "n't"): u = 'PART'
                else: u = 'ADV'
            elif t=='IN':  # SCONJ if complementizer or subordinating conjunction, ADP otherwise
                # Diagnostic for SCONJ: look for an S* constituent starting immediately
                # after this word. (May not be perfect, but should be good enough.)
                if tree[j+1:].strip().startswith('(S'): u = 'SCONJ'
                else: u = 'ADP'
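                # Illustrative cases (added here, not in the original comments): in
                # "(SBAR (IN that) (S ...", the material after "that)" begins with "(S",
                # so "that" -> SCONJ; in "(PP (IN in) (NP ...", it begins with "(NP",
                # so "in" -> ADP.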
            else:
                assert t in ('VB','VBP','VBG','VBN','VBD','VBZ'),(w,t)
                # read the sentence in as a stack (ignoring terminals)
                s = tree
                k = 0
                target_stack_depth = None  # bracket depth of the target terminal, once we reach it
                right_sister_vp = False
                right_sister_s = False
                stack = []
                while s:
                    first, s = s.split(' ', 1)
                    k += len(first)+1
                    if first.startswith('('):  # push
                        stack.append(first)
                        if target_stack_depth is not None and len(stack)==target_stack_depth:
                            # pushed the target word or one of its right sisters
                            if first.startswith('(VP'):
                                right_sister_vp = True
                            elif first.startswith('(S') and k-len(first)-1==j+1:
                                right_sister_s = True  # IMMEDIATE right sister: e.g., (VBD had) (S ... (VP (TO to) ...
                    else:  # terminal or close paren: pop
                        stack = stack[:-1]
                        if target_stack_depth is not None and len(stack)<target_stack_depth-1:
                            # we've popped the parent of the target constituent
                            break
                    if i==k:
                        # our target word is the next token to be pushed
                        target_stack_depth = len(stack)+1
                assert target_stack_depth is not None
                u = 'VERB'
                if w.lower() in AUX_FORMS:
                    if right_sister_vp:
                        u = 'AUX'
                    #elif right_sister_s:
                    #    u += '~S'
                #u += '^' + str(target_stack_depth)
        result.append((w,t,u))
    return result
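
# Illustrative sketch (added; not part of the original gist): given a tree preprocessed
# as in the main loop below (close parens spaced out, trailing space added), e.g.
#   (S (NP-SBJ (PRP I) ) (VP (VBP do) (RB n't) (VP (VB know) ) ) (. .) )
# unitags() returns (word, PTB tag, UD tag) triples in surface order:
#   [('I', 'PRP', 'PRON'), ('do', 'VBP', 'AUX'), ("n't", 'RB', 'PART'),
#    ('know', 'VB', 'VERB'), ('.', '.', 'PUNCT')]
# "do" becomes AUX because a VP sister ("(VP (VB know) ...") follows it.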

c = Counter()
for ln in fileinput.input():
    if not ln.strip(): continue
    sentid, tree = ln.split('\t')
    tree = tree.replace(')', ') ').replace('  ', ' ').strip()+' '  # space out the close parens
    uu = unitags(tree)
    for w,t,u in uu:
        c[t,u] += 1
    print(sentid, ' '.join((w+'|'+u if INLINE_TOK else u) for w,t,u in uu), sep='\t')
print(c, file=sys.stderr)