Skip to content

Instantly share code, notes, and snippets.

@abevieiramota
Created May 18, 2018 17:32
Show Gist options
  • Save abevieiramota/a695c781dd3294d3337ed74df4ee6e5f to your computer and use it in GitHub Desktop.
Save abevieiramota/a695c781dd3294d3337ed74df4ee6e5f to your computer and use it in GitHub Desktop.
import os
import xml.etree.ElementTree as ET
import pandas as pd
# Parse the 2-triples Airport training XML and keep the tree, its root and
# a materialised list of every <entry> element for the sections below.
tree = ET.parse(
    '../data/webnlg2017/challenge_data_train_dev/train/2triples/'
    '2triples_Airport_train_challenge.xml'
)
root = tree.getroot()
# root.iter() is lazy; a list lets the entries be traversed several times.
all_elem = list(root.iter('entry'))
# entries
# One record per <entry>, holding the three attributes read from its tag.
# Indexing .attrib directly means a missing attribute raises KeyError.
entries = [
    {key: elem.attrib[key] for key in ("category", "eid", "size")}
    for elem in all_elem
]
entries_df = pd.DataFrame(entries)
# original tripleset
# One record per <otriple>, tagged with the eid of the entry that owns it.
# The entry loop must be the first ``for`` clause: clauses nest left to
# right, so ``elem`` has to be bound before ``elem.find(...)`` is evaluated.
# With the clauses the other way round the lookup ran against an unbound
# (or stale) ``elem`` instead of the entry being visited.
# Element.find returns only the first <originaltripleset> child.
otriples = [
    {'eid': elem.attrib['eid'],
     'text': e.text}
    for elem in all_elem
    for e in elem.find('originaltripleset').findall('otriple')
]
otriples_df = pd.DataFrame(otriples)
# modified tripleset
# One record per <mtriple>, tagged with the eid of the entry that owns it.
# The entry loop must be the first ``for`` clause: clauses nest left to
# right, so ``elem`` has to be bound before ``elem.find(...)`` is evaluated.
# With the clauses the other way round the lookup ran against an unbound
# (or stale) ``elem`` instead of the entry being visited.
mtriples = [
    {'eid': elem.attrib['eid'],
     'text': e.text}
    for elem in all_elem
    for e in elem.find('modifiedtripleset').findall('mtriple')
]
mtriples_df = pd.DataFrame(mtriples)
# lexes
# One record per <lex> child of an entry: the owning entry's eid plus the
# lex text and its ``comment`` / ``lid`` attributes.
# The entry loop must be the first ``for`` clause: clauses nest left to
# right, so ``elem`` has to be bound before ``elem.findall(...)`` is
# evaluated. With the clauses the other way round the lookup ran against an
# unbound (or stale) ``elem`` instead of the entry being visited.
lexes = [
    {'eid': elem.attrib['eid'],
     'text': e.text,
     'comment': e.attrib['comment'],
     'lid': e.attrib['lid']}
    for elem in all_elem
    for e in elem.findall('lex')
]
lexes_df = pd.DataFrame(lexes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment