Created
December 2, 2021 15:47
-
-
Save rickythefox/9fb2966878c852ce91729f960d48e1eb to your computer and use it in GitHub Desktop.
HSA xml to pandas dataframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree | |
import pandas as pd | |
def get_hsa_attribute_value(attribute_name, child_element):
    """Yield ``(key, text)`` pairs for one value element of an attribute.

    The shape of the output depends on the local (namespace-free) tag name
    of ``child_element``:

    * ``Address``: one pair per child, keyed ``<attribute_name>_<position>``
      where position is the zero-based index of the child.
    * ``TimeSpan``, ``Coordinate``, ``BusinessClassificationType``: one pair
      per child, keyed ``<attribute_name>_<child local tag name>``.
    * anything else (including ``S``): a single pair keyed by
      ``attribute_name`` carrying the element's own text.

    Args:
        attribute_name: Key (or key prefix) to use for the yielded pairs.
        child_element: lxml element holding the value(s).

    Yields:
        Tuples of ``(key, text)``.
    """
    tag = etree.QName(child_element).localname

    # Build every pair up front, then emit them from one place.
    if tag == 'Address':
        pairs = [
            (f'{attribute_name}_{position}', line.text)
            for position, line in enumerate(child_element.iterchildren())
        ]
    elif tag in ('TimeSpan', 'Coordinate', 'BusinessClassificationType'):
        pairs = [
            (f'{attribute_name}_{etree.QName(part).localname}', part.text)
            for part in child_element.iterchildren()
        ]
    else:
        # Plain values ('S' and any unrecognised tag) map straight to text.
        pairs = [(attribute_name, child_element.text)]

    yield from pairs
def get_hsa_attribute(el):
    """Yield ``(key, text)`` pairs for one child element of an HSA object.

    An element whose local tag name is not ``Attribute`` first contributes
    a pair made of its own local tag name and text. After that, every child
    of ``el`` is expanded through ``get_hsa_attribute_value`` using the
    element's ``name`` XML attribute as the key.

    Args:
        el: lxml element to flatten.

    Yields:
        Tuples of ``(key, text)``.
    """
    tag = etree.QName(el).localname
    is_attribute = tag == 'Attribute'

    if not is_attribute:
        yield tag, el.text

    for value_element in el.iterchildren():
        for pair in get_hsa_attribute_value(el.get('name'), value_element):
            yield pair
def get_hsa_object_dict(hsa_object_element):
    """Flatten one HSA object element into a dict.

    Each child of ``hsa_object_element`` is expanded with
    ``get_hsa_attribute``; the resulting pairs are collected into a single
    dict. When the same key is produced more than once, the last value wins.

    Args:
        hsa_object_element: lxml element whose children are flattened.

    Returns:
        dict mapping each yielded key to its text value.
    """
    flattened = {}
    for child in hsa_object_element.iterchildren():
        # dict.update accepts an iterable of (key, value) pairs.
        flattened.update(get_hsa_attribute(child))
    return flattened
def parse_hsa_file(file_name):
    """Parse an HSA information-list XML file into a pandas DataFrame.

    Every child of the ``HsaObjects`` element (namespace
    ``urn:riv:hsa:HsaInformationList:2``) becomes one row, flattened by
    ``get_hsa_object_dict``. Columns that are empty for every row are
    dropped and the remaining columns are sorted by name.

    A few summary statistics (shape, ``describe()``, dtypes, total memory
    usage and total number of missing values) are printed to stdout.

    Args:
        file_name: Path of the XML file to read.

    Returns:
        pandas.DataFrame with one row per HSA object.

    Raises:
        ValueError: If the document has no ``HsaObjects`` element.
    """
    # Read raw bytes so the parser decodes the document using the encoding
    # declared in the XML itself, not the locale-dependent default that
    # text mode would apply.
    with open(file_name, 'rb') as f:
        xml = etree.parse(f)

    hsa_objects = xml.find('{urn:riv:hsa:HsaInformationList:2}HsaObjects')
    if hsa_objects is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(
            f'No HsaObjects element '
            f'(namespace urn:riv:hsa:HsaInformationList:2) found in {file_name!r}'
        )

    hsa_dict_list = [
        get_hsa_object_dict(hsa_object)
        for hsa_object in hsa_objects.iterchildren()
    ]

    # Drop columns with no data at all, then order the rest alphabetically.
    df = pd.DataFrame(hsa_dict_list).dropna(axis=1, how='all')
    df = df.reindex(sorted(df.columns), axis=1)

    print(df.shape)
    print(df.describe())
    print(df.dtypes)
    print(df.memory_usage(index=True).sum())
    print(df.isna().sum().sum())
    return df
# df.to_excel('publicunits-1004-0900.xlsx') | |
if __name__ == '__main__':
    # Script entry point: parse the sample export that sits next to the script.
    _input_path = 'publicunits-1004-0900.xml'
    parse_hsa_file(_input_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment