interrogator · May 18, 2015 03:31
diff --git a/gistfile1.py b/gistfile1.py
 def parse_sfl(n=3):
    """Print IOB-tagged SFL roles for the role-bearing words of the first n XML files.

    For each file in the ``XML`` directory (at most ``n`` of them, taken in
    sorted name order) this prints the file path, then the left-stripped text
    of the first ``expressionplane`` element, then one comma-separated line
    per word: clause-complex number, word number within its constituent,
    token, the first feature value of the word's constituent (with
    ``'label.'`` removed; the code calls this ``pos``) and the word's
    ``B-``/``I-`` prefixed roles.

    n -- maximum number of files to process (default 3).

    Returns None; everything is written to standard output.  Missing elements
    or attributes surface as the IndexError/KeyError/AttributeError raised by
    the underlying lookups.
    """
    from bs4 import BeautifulSoup
    import os
    from collections import defaultdict

    # path to xml files
    xmlpath = 'XML'

    # function names recognised under each sfl metafunction
    sfl_roles = {
        'interpersonal': ('subject', 'finite', 'predicator', 'complement', 'adjunct', 'untyped'),
        'experiential': ('participant', 'process', 'circumstance', 'untyped'),
        'textual': ('theme', 'rheme', 'untyped'),
    }

    def constituents_of_type(tags, wanted):
        """Return the tags in ``tags`` whose ``type`` attribute equals ``wanted``."""
        return [tag for tag in tags if tag['type'] == wanted]

    def make_roledict(tree):
        """Map each referenced constituent id to the sorted list of its sfl roles."""
        found = defaultdict(set)
        # A single pass over every clause in the document is enough: repeating
        # the scan once per clause complex only ever re-added the same roles.
        for clause in constituents_of_type(tree.find_all('constituent'), 'Clause'):
            for func in clause.find_all('function'):
                metafunction = func['metafunction']
                if metafunction not in sfl_roles:
                    continue
                role = func['name']
                if role not in sfl_roles[metafunction]:
                    continue
                for conref in func.find_all('constituentref'):
                    found[conref['idref']].add(role)
        # sorted so the role order is the same on every run
        return dict((key, sorted(roles)) for key, roles in found.items())

    # os.listdir returns names in arbitrary order; sort so that the same
    # first n files are chosen on every run
    fs = [os.path.join(xmlpath, name) for name in sorted(os.listdir(xmlpath))[:n]]

    for f in fs:
        print(f)
        # read through a context manager so the handle is closed promptly
        with open(f) as handle:
            # NOTE(review): no parser is named, so bs4 picks whichever is
            # installed; left as-is because naming one could change how tag
            # names are cased and therefore what the lookups below match.
            soup = BeautifulSoup(handle.read())
        text = soup.find_all('expressionplane')[0].text.lstrip()
        print(text)
        grammar = soup.grammar

        # top-level clause complexes of this file
        ccs = constituents_of_type(
            grammar.find_all('constituent', recursive=False), 'Clause_Complex')

        # roles for every referenced constituent id in this file
        roledict = make_roledict(soup)

        for cc_num, cc in enumerate(ccs, 1):

            # Only look inside the current clause complex.  Searching the
            # whole document here printed every word once per clause complex,
            # each time under a different cc_num.
            candidates = [cc]
            candidates.extend(cc.find_all('constituent'))

            for constituent in candidates:
                if constituent['id'] not in roledict:
                    continue

                # all the roles that this constituent has
                lst_of_roles = roledict[constituent['id']]

                # a role may be attached to a single word or to a larger constituent
                if constituent['type'] == 'Word':
                    words = [constituent]
                else:
                    words = constituents_of_type(
                        constituent.find_all('constituent'), 'Word')

                for index, w in enumerate(words):
                    # the first word begins the chunk, later words continue it
                    prefix = 'B' if index == 0 else 'I'
                    all_roles = ','.join('%s-%s' % (prefix, r) for r in lst_of_roles)

                    # the token is the slice of the stripped text that the
                    # word's first stringref points at
                    string_ref = w.find_all('stringref', limit=1)[0]
                    token = text[int(string_ref['start']):int(string_ref['end'])]

                    # pos is read from the first constituent found under the
                    # stringref's grandparent
                    mid_level = string_ref.parent.parent
                    const = mid_level.find_all('constituent')[0]
                    pos = const.find_all('features')[0].find_all('feature')[0]['value']
                    pos = pos.replace('label.', '')

                    # index + 1 is the 1-based word number within this constituent
                    print(','.join([str(cc_num), str(index + 1), token, pos, all_roles]))
	def parse_sfl(n=3):
		"""Print IOB-tagged SFL roles for the role-bearing words of the first n XML files.

		For each file in the ``XML`` directory (at most ``n`` of them, taken in
		sorted name order) this prints the file path, then the left-stripped
		text of the first ``expressionplane`` element, then one comma-separated
		line per word: clause-complex number, word number within its
		constituent, token, the first feature value of the word's constituent
		(with ``'label.'`` removed; the code calls this ``pos``) and the word's
		``B-``/``I-`` prefixed roles.

		n -- maximum number of files to process (default 3).

		Returns None; everything is written to standard output.  Missing
		elements or attributes surface as the IndexError/KeyError/AttributeError
		raised by the underlying lookups.
		"""
		from bs4 import BeautifulSoup
		import os
		from collections import defaultdict

		# path to xml files
		xmlpath = 'XML'

		# function names recognised under each sfl metafunction
		sfl_roles = {
			'interpersonal': ('subject', 'finite', 'predicator', 'complement', 'adjunct', 'untyped'),
			'experiential': ('participant', 'process', 'circumstance', 'untyped'),
			'textual': ('theme', 'rheme', 'untyped'),
		}

		def constituents_of_type(tags, wanted):
			"""Return the tags in ``tags`` whose ``type`` attribute equals ``wanted``."""
			return [tag for tag in tags if tag['type'] == wanted]

		def make_roledict(tree):
			"""Map each referenced constituent id to the sorted list of its sfl roles."""
			found = defaultdict(set)
			# A single pass over every clause in the document is enough:
			# repeating the scan once per clause complex only ever re-added
			# the same roles.
			for clause in constituents_of_type(tree.find_all('constituent'), 'Clause'):
				for func in clause.find_all('function'):
					metafunction = func['metafunction']
					if metafunction not in sfl_roles:
						continue
					role = func['name']
					if role not in sfl_roles[metafunction]:
						continue
					for conref in func.find_all('constituentref'):
						found[conref['idref']].add(role)
			# sorted so the role order is the same on every run
			return dict((key, sorted(roles)) for key, roles in found.items())

		# os.listdir returns names in arbitrary order; sort so that the same
		# first n files are chosen on every run
		fs = [os.path.join(xmlpath, name) for name in sorted(os.listdir(xmlpath))[:n]]

		for f in fs:
			print(f)
			# read through a context manager so the handle is closed promptly
			with open(f) as handle:
				# NOTE(review): no parser is named, so bs4 picks whichever is
				# installed; left as-is because naming one could change how
				# tag names are cased and therefore what the lookups match.
				soup = BeautifulSoup(handle.read())
			text = soup.find_all('expressionplane')[0].text.lstrip()
			print(text)
			grammar = soup.grammar

			# top-level clause complexes of this file
			ccs = constituents_of_type(
				grammar.find_all('constituent', recursive=False), 'Clause_Complex')

			# roles for every referenced constituent id in this file
			roledict = make_roledict(soup)

			for cc_num, cc in enumerate(ccs, 1):

				# Only look inside the current clause complex.  Searching the
				# whole document here printed every word once per clause
				# complex, each time under a different cc_num.
				candidates = [cc]
				candidates.extend(cc.find_all('constituent'))

				for constituent in candidates:
					if constituent['id'] not in roledict:
						continue

					# all the roles that this constituent has
					lst_of_roles = roledict[constituent['id']]

					# a role may be attached to a single word or to a larger constituent
					if constituent['type'] == 'Word':
						words = [constituent]
					else:
						words = constituents_of_type(
							constituent.find_all('constituent'), 'Word')

					for index, w in enumerate(words):
						# the first word begins the chunk, later words continue it
						prefix = 'B' if index == 0 else 'I'
						all_roles = ','.join('%s-%s' % (prefix, r) for r in lst_of_roles)

						# the token is the slice of the stripped text that the
						# word's first stringref points at
						string_ref = w.find_all('stringref', limit=1)[0]
						token = text[int(string_ref['start']):int(string_ref['end'])]

						# pos is read from the first constituent found under
						# the stringref's grandparent
						mid_level = string_ref.parent.parent
						const = mid_level.find_all('constituent')[0]
						pos = const.find_all('features')[0].find_all('feature')[0]['value']
						pos = pos.replace('label.', '')

						# index + 1 is the 1-based word number within this constituent
						print(','.join([str(cc_num), str(index + 1), token, pos, all_roles]))