Created
July 25, 2018 08:41
-
-
Save winnydejong/3ff1bbe5c7762708b94dcfa01b34b770 to your computer and use it in GitHub Desktop.
Parsing XMLs with XPath and ElementTree XML API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| The ElementTree documentation shows how to parse XML using XPath: | |
| https://docs.python.org/3.4/library/xml.etree.elementtree.html#example | |
| """ | |
| import xml.etree.ElementTree as ET | |
# Build the element tree from the XML held in `countrydata`.
root = ET.fromstring(countrydata)

# "." selects the current node, i.e. the root element itself.
root.findall(".")

# Every 'neighbor' element that sits directly under a 'country' child of root.
root.findall("./country/neighbor")

# Elements whose name attribute is 'Singapore' and which contain a 'year' child.
root.findall(".//year/..[@name='Singapore']")

# The 'year' children of any element whose name attribute is 'Singapore'.
root.findall(".//*[@name='Singapore']/year")

# Every 'neighbor' element that is the second 'neighbor' child of its parent.
root.findall(".//neighbor[2]")
| """ | |
| Knowing this, I wrote a function to parse XMLs, | |
| and had the script iterate over XMLs in a directory: | |
| """ | |
| from lxml import etree | |
| import csv | |
| import os | |
# Create the output CSV and emit the header row before any data rows.
# newline='' leaves line-ending handling to the csv module.
f1 = open('1.csv', mode='w', encoding='utf8', newline='')
wf1 = csv.writer(f1)  # ',' is the csv module's default delimiter
wf1.writerow(('Status', 'AantalInwoners'))
def _first_text(tree, expression):
    """Return the first result of an XPath query on *tree*, or '' if there is none."""
    matches = tree.xpath(expression)
    # An empty result list means the element is absent from this document;
    # testing for it explicitly avoids a catch-all exception handler.
    return matches[0] if matches else ''


def parse(text):
    """Extract Status and AantalInwoners from one XML document into the CSV.

    Args:
        text: the XML document to parse (the caller passes a file's raw bytes).

    One row is appended through the module-level ``wf1`` writer. A value
    whose element is missing is written as an empty string. Errors raised
    while parsing the document propagate to the caller.
    """
    tree = etree.fromstring(text)
    # Status is free text, so surrounding whitespace is trimmed.
    status = _first_text(tree, '//Status/text()').strip()
    # AantalInwoners is written exactly as the query returns it (not stripped).
    aantal_inwoners = _first_text(tree, "//AantalInwoners/text()")
    # write data to csv
    wf1.writerow([status, aantal_inwoners])
# Parse every entry in the 'xmls' directory, writing one CSV row per file.
try:
    for name in os.listdir('xmls'):
        try:
            # The context manager closes the input even if read() fails.
            with open(os.path.join('xmls', name), 'rb') as xml_file:
                text = xml_file.read()
            parse(text)
        except Exception as ex:  # report the offending file and keep going
            print(name, ':', ex)
finally:
    # Close the output CSV so every buffered row reaches disk.
    f1.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment