Script used to load Arabic supersense lexicons (from Arabic WordNet and OntoNotes) and list the possible matches for each token of an input text. One of the imports depends on code in https://github.com/nschneid/pyutil.
#coding=UTF-8
'''
To run the code:

METHOD 1: .stem_pos files
$ export PYTHONPATH=/path/to/AQMAR
$ python2.7 supersenseDefaults.py [mode] ar.stem_pos > ar.lexiconsst

METHOD 2: parallel .tok and .wd_pos_ne.txt files
$ export PYTHONPATH=/path/to/AQMAR
$ python2.7 supersenseDefaults.py [mode] ../tokanFiles/dev/*.tok > ../tokanFiles/dev/dev.lexiconsst

where 'mode' is a comma-separated list of parts of speech to be considered,
drawn from 'noun', 'verb', 'adj', and 'adv'. The default is "noun,verb".

Compute coverage statistics for Arabic Wikipedia articles given named entity annotations and Arabic WordNet.
Output for each nominal token a supersense selected from AWN and the class corresponding to the NE annotation.
The ultimate goal is to provide defaults for our annotators where possible.

TODO: OntoNotes entities with digits

@author: Nathan Schneider (nschneid)
@since: 2011-10-16
'''
from __future__ import print_function, division
import sys, codecs, fileinput, re, json, os
from glob import glob
from collections import Counter, defaultdict
from itertools import izip_longest

from edu.cmu.cs.lti.ark.pyutil.corpus.patb.patb import romanize
def czip(*iterables):
    '''
    Checked version of izip() that requires all arguments be of the same length.

    >>> list(czip('ABC','xyz'))
    [('A', 'x'), ('B', 'y'), ('C', 'z')]
    >>> list(czip('ABC','xy'))
    Traceback (most recent call last):
    ...
    ValueError: czip() arguments have unequal length
    >>> list(czip('AB','xyz'))
    Traceback (most recent call last):
    ...
    ValueError: czip() arguments have unequal length
    >>> list(czip('AB',[],'xy'))
    Traceback (most recent call last):
    ...
    ValueError: czip() arguments have unequal length
    '''
    SHORT = object()
    for y in izip_longest(*iterables, fillvalue=SHORT):
        if SHORT in y:
            raise ValueError('czip() arguments have unequal length')
        else:
            yield y
def normalize(d, by=None):
    '''
    Returns a copy of 'd', but with all values divided by the provided value.
    If that value is None, their sum is used.

    >>> c = normalize(Counter({9: 3, 's': 1, 8: 6}))
    >>> c==Counter({9: 0.3, 's': 0.1, 8: 0.6})
    True
    >>> c = normalize(Counter({9: 3, 's': 12, 8: 6}), 3)
    >>> c==Counter({9: 1, 's': 4, 8: 2})
    True
    '''
    if by is None:
        by = sum(d.values())
    # divide each value by 'by', preserving the mapping type (e.g. Counter)
    return d.__class__({k: v/by for k, v in d.items()})
def prop(x, y):
    return '{}/{} = {:.2%}'.format(x,y,x/y)
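# For reference, with true division in effect (via the __future__ import above),
# prop(3, 4) returns '3/4 = 75.00%'.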
# Map from OntoNotes entity classes to supersenses
# (some of these mappings are imperfect, but they're good enough for defaults).
ONTONOTES_ENTITY_S = '''
CARDINAL noun.QUANTITY
DATE noun.TIME
EVENT noun.EVENT
FAC noun.LOCATION
GPE noun.LOCATION
LANGUAGE noun.COMMUNICATION
LAW noun.COMMUNICATION
LOC noun.LOCATION
MONEY noun.QUANTITY
NORP noun.GROUP
ORDINAL noun.QUANTITY
ORG noun.GROUP
PERCENT noun.QUANTITY
PERSON noun.PERSON
PRODUCT noun.ARTIFACT
QUANTITY noun.QUANTITY
TIME noun.TIME
WORK_OF_ART noun.COMMUNICATION
'''.strip()
ONTONOTES_ENTITY_MAP = dict(ln.strip().split() for ln in ONTONOTES_ENTITY_S.split('\n'))
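# Example lookups (taken directly from the table above):
#   ONTONOTES_ENTITY_MAP['GPE'] == 'noun.LOCATION'
#   ONTONOTES_ENTITY_MAP['WORK_OF_ART'] == 'noun.COMMUNICATION'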
MODE = {'noun', 'verb'}  # default
if set(sys.argv[1].split(','))<={'noun','verb','adj','adv'}:
    MODE = set(sys.argv[1].split(','))
    del sys.argv[1]
# load lexicon
arsenses = defaultdict(lambda: defaultdict(set))
nMultiwordAWNLexemes = 0
with codecs.open('awnLexicon4SS.txt', 'r', 'utf-8') as lexF:
    for ln in lexF:
        info = ln[:-1].split()
        wU, awnSynset, wnSynset, sst = info[:4]
        if len(info)<7:  # no SST?
            print('No SST found for lexical entry:',info[-1],file=sys.stderr)
            continue
        assert sst.startswith('noun.') or sst.startswith('verb.') or sst.startswith('adj.') or sst.startswith('adv.'),'Invalid SST: {}'.format(sst)
        if sst.startswith('adj.') or sst.startswith('adv.'):
            continue
        if re.match(r'.+_([vnar])\d+AR$', awnSynset).group(1) not in ''.join(m[0] for m in MODE):
            continue
        wR = romanize(wU)
        if wR.count('_')>0: nMultiwordAWNLexemes += 1
        arsenses[wR][sst].add(awnSynset)

print('Multiword lexemes from AWN:',prop(nMultiwordAWNLexemes, len(arsenses)), file=sys.stderr)
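# At this point arsenses maps a romanized lexeme to {supersense -> set of supporting AWN synset IDs};
# e.g. a (hypothetical) entry might look like arsenses['ktAb'] == {'noun.COMMUNICATION': set(['ktAb_n1AR'])}.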
if 'noun' in MODE:
    onEntities = defaultdict(Counter)
    nMultiwordONLexemes = 0
    with codecs.open('ontonotes-4.0-entities.txt', 'r', 'utf-8') as neF:
        for ln in neF:
            wU, onNE = ln[:-1].replace(' ','_').split('\t')
            if re.search(r'\d', wU) is not None: continue  # for now, ignore entities with digits
            wR = romanize(wU)
            sst = ONTONOTES_ENTITY_MAP[onNE]
            onEntities[wR][sst] += 1
            if '_' in wR:
                onEntities[wR.split('_')[0]][sst] += 1  # for multiword NEs, also store the SST for the first word of the entity in case it occurs with similar names
    for wR in onEntities:
        if '_' in wR: nMultiwordONLexemes += 1
        for r,(sst,c) in enumerate(onEntities[wR].most_common()):
            arsenses[wR][sst].add('**ON**_n{}AR'.format(r+1))
    print('Multiword lexemes from OntoNotes:',prop(nMultiwordONLexemes, len(onEntities)), file=sys.stderr)
suffixes = defaultdict(set)  # word mapped to set of full lexemes starting with that word
for entry in arsenses:
    words = entry.split('_')
    suffixes[words[0]].add(entry)
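# e.g. (hypothetical): if arsenses contains both 'mjls' and 'mjls_AlAmn', then
# suffixes['mjls'] == {'mjls', 'mjls_AlAmn'}, so a token 'mjls' triggers a check
# for the longer multiword lexeme during lookup below.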
def rank(synsetId):
    return int(re.match(r'.+_[vnar](\d+)AR$', synsetId).group(1))

def voting(w):
    # Voting procedure for ambiguous entries: each known supersense is scored by the
    # number of supporting synsets divided by the best (lowest) rank among them,
    # and the scores are then normalized to sum to 1.
    result = Counter({sst: (len(syns)/min(rank(syn) for syn in syns)) for sst,syns in arsenses[w].items()})
    return normalize(result)
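# Worked example with a hypothetical entry: if arsenses[w] were
#   {'noun.GROUP': {'x_n1AR', 'x_n3AR'}, 'noun.ARTIFACT': {'x_n2AR'}},
# the raw scores would be noun.GROUP: 2/1 = 2.0 (two synsets, best rank 1) and
# noun.ARTIFACT: 1/2 = 0.5 (one synset, rank 2); after normalization, voting()
# returns Counter({'noun.GROUP': 0.8, 'noun.ARTIFACT': 0.2}).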
# load Wikipedia article tokens
ww = []  # vocalized, romanized stem from MADA (wa+ words discarded) or None if a Latin word, as well as newlines separating sentences
poses = []
tkns = []
nes = []
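# METHOD 1 input format (assumed from the parsing below): one tab-separated
# "stem<TAB>pos" pair per token, a blank line between sentences, and an
# @@LAT@@ prefix on Latin-script tokens, e.g. (hypothetical):
#   kitAb	noun
#   @@LAT@@UNESCO	noun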
if sys.argv[1].endswith('.stem_pos'):  # METHOD 1
    for ln in fileinput.input():
        ln = ln.decode('utf-8')[:-1]
        if not ln.strip():
            ww.append('\n')
            poses.append('\n')
            tkns.append('\n')
            nes.append('\n')
        else:
            stem, pos = ln.split('\t')
            ww.append(None if stem.startswith('@@LAT@@') else stem)
            poses.append(pos)
            tkns.append(None)
            nes.append(None)
else:  # METHOD 2
    skipping = False
    for ln in fileinput.input(None if len(sys.argv)>1 else glob('4Articles-MADA/*.mada.tok')):
        # 'ln' contains a romanized sentence, with three versions of each token
        # (the third of which is the vocalized lemma/stem): e.g. Al<ydrwjyn·AlAydrwjyn·<iydruwjiyn
        if fileinput.isfirstline():
            skipping = False
            postaggedFP = '../'+os.path.basename(fileinput.filename()).replace('Sent.','.').replace('.cleaned.txt.sent.bw.mada.tok','.wd_pos_ne.txt').replace('.txt.bw.mada.tok','.wd_pos_ne.txt')
            if not os.path.exists(postaggedFP):
                print('File not found: ', postaggedFP, file=sys.stderr)
                skipping = True
                continue
            postaggedF = codecs.open(postaggedFP, 'r', 'utf-8')
        elif skipping:
            continue  # corresponding file not found
        if ln=='\n': continue
        posLn = next(postaggedF)
        # 'posLn' contains word, POS, and NE annotation for the sentence
        posLn = posLn.replace(u'\u00a0','')  # remove nonbreaking space sometimes inserted by POS tagger
        posTkns = iter(posLn.strip().split())
        ln = ln.decode('utf-8')
        for tkn in ln.strip().split():
            if tkn.endswith('+'):  # e.g., conjunction (separated in MADA/TOKAN output only)
                continue
            tkns.append(tkn)
            posTkn = next(posTkns)
            w0, pos, ne = posTkn.split('___')
            # reduce the NE tag to one character: '' for O (outside); for B- tags, the first
            # letter of the class (with ORG's 'O' remapped to 'G' so it is not confused with
            # outside); '<' for I- (continuation) tags
            ne = '' if ne=='O' else ne.replace('B-','')[0].replace('O','G').replace('I','<')
            poses.append(pos)
            nes.append(ne)
            if tkn.startswith('@@LAT@@'):
                ww.append(None)
                continue  # ignore Latin characters in input
            assert tkn.count(u'·')==2,tkn.encode('utf-8')
            x, y, w = tkn.split(u'·')
            ww.append(w)  # lemma/stem from MADA
        ww.append('\n')
        poses.append('\n')
        tkns.append('\n')
        nes.append('\n')
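# After loading, ww/tkns/poses/nes are parallel lists with one entry per kept token,
# plus '\n' entries marking sentence boundaries: ww holds the vocalized stem (or None
# for Latin-script tokens), and nes holds '' or a one-character NE class
# (or None under METHOD 1, which has no NE annotations).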
# process Wikipedia articles
nMultiwordInstances = 0
nAmbigTkns = 0
nOOV = 0
nCoveredNE = 0
nCovered = 0
n = 0
waysAmbiguous = Counter()
#awnSSTs = []
#sstScores = {}  # index of first word matching a lexicon entry -> Counter with scores for each SST
#minRankSSTs = {}

LEGAL_POSES = set()
if 'noun' in MODE:
    LEGAL_POSES.update({'noun', 'pron', 'abbrev'})
if 'verb' in MODE:
    LEGAL_POSES.update({'verb'})
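# With the default MODE of {'noun', 'verb'}, LEGAL_POSES comes out as
# {'noun', 'pron', 'abbrev', 'verb'}; a token counts as relevant if its POS tag
# merely contains one of these strings as a substring (see sstLookup below).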
def sstLookup():
    global n, nCovered, nCoveredNE, nMultiwordInstances, nAmbigTkns, nOOV, waysAmbiguous
    i = 0
    while i<len(ww):  # assign supersense tags, preferring longer lexemes over shorter ones
        #assert len(awnSSTs)==i,(i,len(awnSSTs))
        w = ww[i]
        #print(w, end=' ')
        if w=='\n':
            yield '\n',None,None
            i += 1
            continue
        if w is None:  # latin
            yield '-',None,None
            i += 1
            continue
        pos = poses[i]
        ne = nes[i]
        relevant = False
        for relevantPOS in LEGAL_POSES:
            if relevantPOS in pos:
                relevant = True
                break
        if not relevant:
            yield '-',None,None
            i += 1
            continue
        n += 1
        if w not in suffixes:
            nOOV += 1
            waysAmbiguous[0] += 1
            nCovered += int(ne!='')
            nCoveredNE += int(ne!='')
            yield '_',None,None
            i += 1
            continue
        matched = False
        for entry in sorted(suffixes[w], key=lambda ent: ent.count('_'), reverse=True):
            entryWords = entry.split('_')
            k = len(entryWords)
            #print(entryWords,ww[i:i+len(entryWords)])
            if entryWords[1:]==ww[i+1:i+k]:  # match!
                sstOpts = voting(entry)  # score the possible tags
                # nschneid 2012-04-30: the following line was looking up arsenses[w] and arsenses[w][sst], which I think was a bug
                minRankSST = min(((sst,rank(syn)) for sst in arsenses[entry] for syn in arsenses[entry][sst]), key=lambda x:x[1])
                sstChoice = '_'
                waysAmbiguous[len(sstOpts)] += 1
                if len(sstOpts)>1:
                    nAmbigTkns += 1
                    (a,ascore),(b,bscore) = sstOpts.most_common(2)
                    if ascore>bscore:  # otherwise (a tie), punt
                        sstChoice = a
                else:
                    sstChoice = sstOpts.most_common(1)[0][0]
                if k>1: nMultiwordInstances += 1
                for x in [sstChoice]+list('<'*(k-1)):
                    yield x,sstOpts,minRankSST
                if sstChoice!='_':
                    nCovered += len(entryWords)  # (though some words might not be nouns or verbs)
                else:
                    nCovered += sum(1 for ne in nes[i:i+k] if ne!='')
                nCoveredNE += sum(1 for ne in nes[i:i+k] if ne!='')
                i += k
                n += k-1
                matched = True
                break
        if not matched:  # some multiword entries exist such that the first word is not a separate entry
            yield '_',None,None
            i += 1
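# sstLookup() yields one (tag, sstScores, minRankSST) triple per entry of ww:
# the tag is '\n' at sentence breaks, '-' for Latin or irrelevant-POS tokens,
# '_' for OOV or unresolved tokens, '<' for continuation words of a multiword
# match, or the winning supersense; sstScores and minRankSST are only non-None
# for tokens covered by an in-vocabulary match.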
iSent = 0
for i,(w,(sst,sstScores,minRankSST),ne) in enumerate(czip(ww,sstLookup(),nes)):
    if w=='\n':
        print()
        if iSent%10000==0:
            print('sentence',iSent, file=sys.stderr)
        iSent += 1
    else:
        #print(sst+(ne and ','+ne), end=' ')  # heuristically-selected SST and gold NE
        mentionPosition = 'I' if sst=='<' else ('O' if sstScores is None else 'B')
        print(json.dumps({k.lower():v for k,v in sstScores.items()})+'\t'+minRankSST[0].lower()+'\t'+mentionPosition if sstScores is not None else sst)  # normalized SST scores for the first token of in-vocabulary items; -, _, or < for other tokens
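# Under this scheme a token covered by a lexicon match prints a line like (hypothetical values):
#   {"noun.group": 0.8, "noun.artifact": 0.2}<TAB>noun.group<TAB>B
# (with 'I' in the last field for non-initial tokens of a multiword match),
# while out-of-scope and OOV tokens print just '-' or '_'.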
# Summary information
print('Multiword:', nMultiwordInstances, file=sys.stderr)
print('OOV:',prop(nOOV,n), file=sys.stderr)
print('Ways ambiguous (fraction of tokens):',normalize(waysAmbiguous,n), file=sys.stderr)
print('{}s covered by NE annotations:'.format(MODE),prop(nCoveredNE,n), file=sys.stderr)
print('{}s covered by NE annotations and/or AWN/OntoNotes NEs:'.format(MODE),prop(nCovered,n), file=sys.stderr)
'''Summary output:

(original)
10510
OOV: 3207/5368 = 59.74%
Ways ambiguous (fraction of tokens): Counter({0: 0.5974292101341282, 1: 0.19411326378539492, 2: 0.0868107302533532, 3: 0.05048435171385991, 4: 0.042846497764530554, 5: 0.010432190760059613, 6: 0.009314456035767511, 7: 0.005216095380029807, 9: 0.0027943368107302535, 8: 0.0005588673621460507})
nouns covered by NE annotations: 1178/5368 = 21.94%
nouns covered by NE annotations and/or AWN: 2307/5368 = 42.98%

3216
OOV: 1011/1106 = 91.41%
Ways ambiguous (fraction of tokens): Counter({0: 0.9141048824593129, 1: 0.045207956600361664, 2: 0.024412296564195298, 3: 0.007233273056057866, 4: 0.0054249547920434, 7: 0.003616636528028933})
verbs covered by NE annotations: 82/1106 = 7.41%
verbs covered by NE annotations and/or AWN: 144/1106 = 13.02%

(with MW entries)
Multiword: 16
OOV: 3062/5370 = 57.02%
Ways ambiguous (fraction of tokens): Counter({0: 0.5702048417132216, 1: 0.19646182495344505, 2: 0.08696461824953446, 3: 0.049720670391061456, 4: 0.03910614525139665, 5: 0.010428305400372439, 6: 0.00931098696461825, 7: 0.0052141527001862194, 9: 0.002793296089385475, 8: 0.00018621973929236498})
nouns covered by NE annotations: 1171/5370 = 21.81%
nouns covered by NE annotations and/or AWN: 2879/5370 = 53.61%

(with OntoNotes entities)
Multiword lexemes from AWN: 3681/10510 = 35.02%
Multiword lexemes from OntoNotes: 6890/12604 = 54.67%
OOV: 2610/5370 = 48.60%
Ways ambiguous (fraction of tokens): Counter({0: 0.4860335195530726, 1: 0.2595903165735568, 2: 0.09757914338919925, 3: 0.05176908752327747, 4: 0.03910614525139665, 5: 0.013221601489757914, 6: 0.00931098696461825, 7: 0.0052141527001862194, 9: 0.002793296089385475, 8: 0.00018621973929236498})
nouns covered by NE annotations: 1172/5370 = 21.82%
nouns covered by NE annotations and/or AWN/OntoNotes NEs: 2983/5370 = 55.55%
'''