Skip to content

Instantly share code, notes, and snippets.

@ivlevdenis
Last active July 24, 2017 15:34
Show Gist options
  • Select an option

  • Save ivlevdenis/622537277eba2a28164adc984cb03940 to your computer and use it in GitHub Desktop.

Select an option

Save ivlevdenis/622537277eba2a28164adc984cb03940 to your computer and use it in GitHub Desktop.
Fast conversion of a big XML file to CSV.
import csv
from time import time

from lxml import etree
# Stream-convert a large XML file to CSV.
#
# Every <T> element in 'in.xml' becomes one row of 'out.csv'. The text of each
# child element is placed in the column whose name (in ``order_pattern``)
# equals the child's tag. Repeated tags within one record are joined with
# '||'. Processed elements are released as parsing proceeds, so the whole
# document is never held in memory at once.

# Wall-clock start; used only for the elapsed-time report at the end.
st = time()
# Number of <T> records written so far.
c = 0

# Output column order: each entry is the tag name of a child of <T>.
# Replace these placeholder names with the real tags, in the desired order.
order_pattern = [
    '1st_column',
    '2nd_column',
    '3rd_column',
    # ... add the remaining column tags here ...
    'n_column',
]

# Blank row template; copied for every record so unmatched columns stay empty.
el_holder = ['', ] * len(order_pattern)

# Tag -> column position, built once so each child costs a single dict lookup
# instead of repeated linear scans of ``order_pattern``. ``setdefault`` keeps
# the first position if a tag is listed twice, matching ``list.index``.
column_index = {}
for position, tag in enumerate(order_pattern):
    column_index.setdefault(tag, position)

# newline='' hands line-ending control to the csv module; the explicit
# encoding avoids failures on non-ASCII text under a narrow platform default.
with open('out.csv', 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes values containing commas, quotes or line breaks, which
    # a plain ','.join() would silently turn into a malformed row.
    writer = csv.writer(f, lineterminator='\n')
    for event, element in etree.iterparse('in.xml', tag='T'):
        h = el_holder[:]
        for child in element:
            position = column_index.get(child.tag)
            # Skip children whose tag is not a requested column, and children
            # without any text.
            if position is None or not child.text:
                continue
            value = child.text.strip()
            # Whitespace-only text carries no data; skipping it avoids
            # dangling '||' separators in the output.
            if not value:
                continue
            if h[position]:
                h[position] += '||' + value
            else:
                h[position] = value
        writer.writerow(h)
        # Free the record just written, then drop the already-processed
        # siblings that precede it so the parsed tree does not keep growing.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]
        c += 1
print('Processed: {0}\tTime: {1}'.format(c, time() - st))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment