Created
May 18, 2015 03:31
-
-
Save interrogator/2b3f37cc14712c5964c5 to your computer and use it in GitHub Desktop.
treebank sfl conversion
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse_sfl(n=3, xmlpath='XML'):
    """Print IOB-tagged SFL role annotations for treebank XML files.

    Reads up to ``n`` files from ``xmlpath`` (in sorted filename order),
    parses each one with BeautifulSoup and prints, per file:

    * the file path,
    * the left-stripped text of the first ``expressionplane`` element,
    * one line per word of every role-bearing constituent, formatted as
      ``cc_num,w_num,token,pos,roles``.

    ``cc_num`` is the 1-based index of the top-level ``Clause_Complex``
    constituent the word was found in, ``w_num`` is the 1-based index of
    the word inside its constituent, and ``roles`` is a comma-separated
    list of ``B-<role>`` (first word of the constituent) or ``I-<role>``
    (any later word) tags.

    Args:
        n: maximum number of files to process.
        xmlpath: directory containing the XML files.

    Returns:
        None. All results are written to standard output.

    Raises:
        OSError: if ``xmlpath`` cannot be listed or a file cannot be read.
        KeyError / IndexError: if a document lacks the attributes or
            elements accessed below.
    """
    from bs4 import BeautifulSoup
    import os
    from collections import defaultdict

    # metafunctions and the role names recognised under each of them
    sfl_list = [
        ['interpersonal', ['subject', 'finite', 'predicator', 'complement', 'adjunct', 'untyped']],
        ['experiential', ['participant', 'process', 'circumstance', 'untyped']],
        ['textual', ['theme', 'rheme', 'untyped']]
    ]

    def make_roledict(tree, sfl_list):
        """Map constituent ids to the list of SFL roles that reference them.

        Every ``constituent`` of type ``Clause`` under ``tree`` is scanned
        once. For each of its ``function`` elements whose ``metafunction``
        and ``name`` match an entry of ``sfl_list``, the role name is
        recorded against the ``idref`` of each ``constituentref`` inside
        that function. Roles are de-duplicated in first-seen order so the
        printed tag order is reproducible.
        """
        roledict = defaultdict(list)
        for clause in tree.find_all('constituent'):
            if clause['type'] != 'Clause':
                continue
            functions = clause.find_all('function')
            for metafunction, roles in sfl_list:
                for func in functions:
                    if func['metafunction'] != metafunction:
                        continue
                    role = func['name']
                    if role not in roles:
                        continue
                    for conref in func.find_all('constituentref'):
                        assigned = roledict[conref['idref']]
                        # nested clauses are visited more than once, so
                        # the same role can turn up repeatedly
                        if role not in assigned:
                            assigned.append(role)
        # hand back a plain dict so later lookups cannot create entries
        return dict(roledict)

    # os.listdir returns names in arbitrary order; sort so that the first
    # n files are the same on every run
    fs = [os.path.join(xmlpath, name) for name in sorted(os.listdir(xmlpath))[:n]]
    for f in fs:
        print(f)
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; the lower-case tag names used below suggest an HTML
        # parser is expected -- confirm before pinning one.
        with open(f) as handle:
            soup = BeautifulSoup(handle.read())
        text = soup.find_all('expressionplane')[0].text.lstrip()
        print(text)
        grammar = soup.grammar
        # roles are collected once per file over the whole document
        roledict = make_roledict(soup, sfl_list)
        # top-level clause complexes, numbered from 1 in document order
        ccs = [cc for cc in grammar.find_all('constituent', recursive=False)
               if cc['type'] == 'Clause_Complex']
        for cc_num, cc in enumerate(ccs, 1):
            # search inside this clause complex only, so each word is
            # reported once and under its own clause-complex number
            for constituent in cc.find_all('constituent'):
                lst_of_roles = roledict.get(constituent.get('id'))
                if not lst_of_roles:
                    continue
                # a role may point at a single word or at a larger
                # constituent that contains several words
                if constituent['type'] == 'Word':
                    words = [constituent]
                else:
                    words = [w for w in constituent.find_all('constituent')
                             if w['type'] == 'Word']
                for index, w in enumerate(words):
                    # IOB scheme: B- on the first word, I- on the rest
                    prefix = 'B' if index == 0 else 'I'
                    all_roles = ','.join('%s-%s' % (prefix, r) for r in lst_of_roles)
                    string_ref = w.find_all('stringref', limit=1)[0]
                    # start/end are used as offsets into the stripped text
                    token = text[int(string_ref['start']):int(string_ref['end'])]
                    # the part-of-speech label is read from the first
                    # feature of the first constituent found two levels
                    # above the stringref
                    mid_level = string_ref.parent.parent
                    const = mid_level.find_all('constituent')[0]
                    pos = const.find_all('features')[0].find_all('feature')[0]['value']
                    pos = pos.replace('label.', '')
                    print(','.join([str(cc_num), str(index + 1), token, pos, all_roles]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment