Skip to content

Instantly share code, notes, and snippets.

@interrogator
Created May 18, 2015 03:31
Show Gist options
  • Save interrogator/2b3f37cc14712c5964c5 to your computer and use it in GitHub Desktop.
Save interrogator/2b3f37cc14712c5964c5 to your computer and use it in GitHub Desktop.
treebank sfl conversion
def _typed_constituents(node, wanted_type, recursive=True):
    """Return the <constituent> tags under *node* whose ``type`` is *wanted_type*.

    With ``recursive=False`` only direct children of *node* are considered.
    Tags without a ``type`` attribute are skipped rather than raising.
    """
    return [tag for tag in node.find_all('constituent', recursive=recursive)
            if tag.get('type') == wanted_type]


def _make_roledict(soup, sfl_list):
    """Map constituent ids to the sorted list of SFL roles they fill.

    Every <function> tag inside every ``Clause`` constituent is inspected.
    When its ``name`` is one of the roles listed in *sfl_list* for its
    ``metafunction``, that role is recorded against the ``idref`` of each
    <constituentref> the function contains.

    *sfl_list* is a sequence of ``(metafunction, roles)`` pairs.
    Returns a plain ``dict`` of ``{constituent_id: [role, ...]}`` with each
    role list de-duplicated and sorted, so output is stable between runs.
    """
    from collections import defaultdict

    allowed = {metafunction: set(roles) for metafunction, roles in sfl_list}
    found = defaultdict(set)
    for clause in _typed_constituents(soup, 'Clause'):
        for func in clause.find_all('function'):
            role = func.get('name')
            if role not in allowed.get(func.get('metafunction'), ()):
                continue
            for conref in func.find_all('constituentref'):
                found[conref['idref']].add(role)
    return {key: sorted(roles) for key, roles in found.items()}


def _iob_tags(roles, position):
    """Return *roles* as a comma-joined string of IOB tags.

    The word at ``position == 0`` opens the constituent and gets ``B-``
    tags; every later word gets ``I-`` tags.
    """
    prefix = 'B' if position == 0 else 'I'
    return ','.join('%s-%s' % (prefix, role) for role in roles)


def _token_and_pos(word, text):
    """Return ``(token, pos)`` for one word constituent.

    The token is the slice of *text* given by the ``start``/``end``
    attributes of the word's first <stringref>. The part of speech is the
    ``value`` of the first <feature> in the first <features> of a nearby
    constituent, with any ``'label.'`` substring removed.
    """
    string_ref = word.find_all('stringref', limit=1)[0]
    token = text[int(string_ref['start']):int(string_ref['end'])]
    # NOTE(review): the tag carrying the features is found by going two
    # levels up from the <stringref> and taking the first <constituent>
    # below that point. Presumably that is the word's own entry -- confirm
    # against the XML schema; this lookup is kept exactly as it was.
    mid_level = string_ref.parent.parent
    const = mid_level.find_all('constituent')[0]
    pos = const.find_all('features')[0].find_all('feature')[0]['value']
    return token, pos.replace('label.', '')


def parse_sfl(n=3):
    """Print word-level SFL role tags for the first *n* files in ``XML/``.

    For each file this prints the path, then the text of the first
    <expressionplane> tag (leading whitespace stripped), then one line per
    word of every role-bearing constituent, in the form::

        cc_num,w_num,token,pos,roles

    ``cc_num`` is the 1-based number of the enclosing top-level
    ``Clause_Complex``, ``w_num`` the 1-based position of the word inside
    its constituent, and ``roles`` a comma-separated list of IOB tags.

    Files are taken in sorted name order so the selection is reproducible.
    Returns ``None``; all output goes to stdout.
    """
    from bs4 import BeautifulSoup
    import os

    # path to xml files
    xmlpath = 'XML'
    # sfl metafunctions and the role names recognised for each
    sfl_list = [
        ('interpersonal', ['subject', 'finite', 'predicator', 'complement',
                           'adjunct', 'untyped']),
        ('experiential', ['participant', 'process', 'circumstance',
                          'untyped']),
        ('textual', ['theme', 'rheme', 'untyped']),
    ]

    for name in sorted(os.listdir(xmlpath))[:n]:
        path = os.path.join(xmlpath, name)
        print(path)
        # Read bytes and let BeautifulSoup detect the encoding; the handle
        # is closed as soon as the document is parsed.
        # NOTE(review): no parser is named, so bs4 picks whichever is
        # installed. The all-lowercase tag names used in this file
        # presumably rely on an HTML-style parser -- confirm before
        # pinning one.
        with open(path, 'rb') as handle:
            soup = BeautifulSoup(handle.read())
        text = soup.find_all('expressionplane')[0].text.lstrip()
        print(text)

        # roles are collected once per file, not once per clause complex
        roledict = _make_roledict(soup, sfl_list)

        complexes = _typed_constituents(soup.grammar, 'Clause_Complex',
                                        recursive=False)
        for cc_num, cc in enumerate(complexes, 1):
            # Only look inside this clause complex. Searching the whole
            # document here would repeat every word once per complex.
            for constituent in cc.find_all('constituent'):
                roles = roledict.get(constituent.get('id'))
                if not roles:
                    continue
                # a role can attach to a single word or to a larger unit
                if constituent.get('type') == 'Word':
                    words = [constituent]
                else:
                    words = _typed_constituents(constituent, 'Word')
                for index, word in enumerate(words):
                    token, pos = _token_and_pos(word, text)
                    print(','.join([str(cc_num), str(index + 1), token,
                                    pos, _iob_tags(roles, index)]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment