Skip to content

Instantly share code, notes, and snippets.

@seandavi
Created December 22, 2017 12:47
Show Gist options
  • Save seandavi/a844099c5cbb5e090edac826d8c65f61 to your computer and use it in GitHub Desktop.
Save seandavi/a844099c5cbb5e090edac826d8c65f61 to your computer and use it in GitHub Desktop.
Split a large XML file into smaller XML files, grouping elements that match a given split "tag"
#!/usr/bin/env python
import argparse
import lxml.etree
import os, sys
import bz2
# XML declaration written at the top of every output chunk.
XML_DECLARATION = b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"


def parse_args(argv=None):
    """Parse the command line.

    Positional arguments, in order: ``tag``, ``n``, ``wrapper``, ``basename``.
    Only ``tag`` is required; the others fall back to their defaults when
    omitted (``nargs='?'`` is what makes a positional's default effective).

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``; ``n`` is an ``int`` >= 1.
    """
    parser = argparse.ArgumentParser(
        description='Split XML read from stdin into bz2-compressed chunks.')
    parser.add_argument(
        'tag', help='tag of the elements to split on (compared to elem.tag)')
    parser.add_argument(
        'n', nargs='?', type=int, default=100000,
        help='number of matching elements per output file')
    parser.add_argument(
        'wrapper', nargs='?', default=None,
        help='name of a root element to wrap each chunk in '
             '(omit or pass "" for no wrapper)')
    parser.add_argument(
        'basename', nargs='?', default='file',
        help='output files are named <basename>-<index>.xml.bz2')
    opts = parser.parse_args(argv)
    if opts.n < 1:
        # n == 0 would otherwise surface as a ZeroDivisionError-style failure
        # in the middle of the run.
        parser.error('n must be a positive integer')
    return opts


def chunk_filename(basename, index):
    """Return the output file name for chunk number ``index``."""
    return "{}-{}.xml.bz2".format(basename, index)


def render_document(fragments, wrapper=None):
    """Build the bytes of one output document.

    Args:
        fragments: Iterable of serialized elements (``bytes``).
        wrapper: Optional root element name placed around the fragments.
            ``None`` or an empty string means no wrapper element.

    Returns:
        The XML declaration followed by the (optionally wrapped) fragments.
    """
    body = b"".join(fragments)
    if wrapper:
        name = wrapper.encode()
        body = b"<" + name + b">" + body + b"</" + name + b">"
    return XML_DECLARATION + body


def write_chunk(filename, fragments, wrapper=None):
    """Write one bz2-compressed chunk file containing ``fragments``."""
    with bz2.BZ2File(filename, 'wb') as out:
        out.write(render_document(fragments, wrapper))


def split_stream(stream, tag, n, wrapper=None, basename='file'):
    """Split the XML in ``stream`` into files of ``n`` ``tag`` elements each.

    Args:
        stream: Binary file-like object containing the XML input.
        tag: Elements whose ``.tag`` equals this string are collected.
        n: Number of collected elements per output file.
        wrapper: Optional root element name for every output file.
        basename: Prefix of the output file names.

    Returns:
        The number of files written.
    """
    fragments = []
    file_index = 0

    def flush():
        nonlocal fragments, file_index
        write_chunk(chunk_filename(basename, file_index), fragments, wrapper)
        file_index += 1
        print(file_index)  # progress: number of files written so far
        fragments = []

    for _event, elem in lxml.etree.iterparse(stream, events=('end', )):
        if elem.tag != tag:
            # Do NOT clear non-matching elements: they may be children of a
            # matching element whose 'end' event has not fired yet, and
            # clearing them here is what produced empty output elements.
            continue
        fragments.append(lxml.etree.tostring(elem))
        # The element is serialized; release it and any already-processed
        # preceding siblings so the in-memory tree does not keep growing.
        # NOTE: if `tag` elements are nested inside one another, the inner
        # ones are emitted first and the outer one will then appear emptied.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
        if len(fragments) == n:
            flush()

    if fragments:
        # Final partial batch: without this the trailing elements are lost.
        flush()
    return file_index


def main(argv=None):
    """Script entry point: split the XML arriving on standard input."""
    opts = parse_args(argv)
    with sys.stdin.buffer as source:
        split_stream(source, opts.tag, opts.n, opts.wrapper, opts.basename)


if __name__ == "__main__":
    main()
@havardox
Copy link

This code results in every tag having empty contents.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment