Created
December 22, 2017 12:47
-
-
Save seandavi/a844099c5cbb5e090edac826d8c65f61 to your computer and use it in GitHub Desktop.
Split an XML document into smaller XML files based on a split "tag".
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse
import bz2
import os
import sys

import lxml.etree
def build_parser():
    """Return the command-line parser for the splitter.

    Positional arguments, in order:

    tag       -- element name that marks one record to split on (required).
    n         -- number of records per output file (default 100000).
    wrapper   -- name of a root element to wrap each output file's records
                 in; when omitted the records are written without a wrapper.
    basename  -- prefix of the output file names (default 'file').

    The last three are declared with ``nargs='?'`` because argparse ignores
    ``default=`` on a positional argument that is required.
    """
    parser = argparse.ArgumentParser(
        description='Split an XML document read from stdin into '
                    'bz2-compressed files holding N records each.')
    parser.add_argument('tag', help='element name to split on')
    parser.add_argument('n', nargs='?', type=int, default=100000,
                        help='records per output file (default: %(default)s)')
    parser.add_argument('wrapper', nargs='?', default=None,
                        help='root element wrapped around each output file')
    parser.add_argument('basename', nargs='?', default='file',
                        help='output file name prefix (default: %(default)s)')
    return parser


def write_chunk(filename, pieces, wrapper=None):
    """Write one bz2-compressed output file.

    filename -- path of the file to create.
    pieces   -- iterable of ``bytes``, each one serialized record.
    wrapper  -- optional element name; when given, the records are enclosed
                in ``<wrapper>...</wrapper>``.

    The file always starts with an XML declaration.
    """
    with bz2.BZ2File(filename, 'wb') as out:
        out.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
        if wrapper is not None:
            out.write("<{}>".format(wrapper).encode())
        out.write(b"".join(pieces))
        if wrapper is not None:
            out.write("</{}>".format(wrapper).encode())


def split_stream(source, tag, n, wrapper=None, basename='file'):
    """Split the XML read from *source* into files of *n* records each.

    source   -- binary file object handed to ``lxml.etree.iterparse``.
    tag      -- element name identifying one record.
    n        -- records per output file; must be >= 1.
    wrapper  -- optional root element name for every output file.
    basename -- output files are named ``<basename>-<index>.xml.bz2`` with
                the index starting at 0.

    Returns the number of files written.  After each file is written the
    running file count is printed, as a progress indicator.
    """
    pieces = []
    files_written = 0

    def flush():
        nonlocal files_written
        filename = "{}-{}.xml.bz2".format(basename, files_written)
        files_written += 1
        print(files_written)
        write_chunk(filename, pieces, wrapper)
        pieces.clear()

    for _event, elem in lxml.etree.iterparse(source, events=('end',)):
        if elem.tag != tag:
            # Leave non-record elements alone: they may be children of a
            # record whose own 'end' event has not fired yet, and clearing
            # them here would serialize the record with empty contents.
            continue
        pieces.append(lxml.etree.tostring(elem))
        # The record is serialized, so release it and the siblings that
        # precede it to keep memory use flat on large inputs.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
        if len(pieces) == n:
            flush()

    # Write the final, partially filled batch instead of dropping it.
    if pieces:
        flush()
    return files_written


def main(argv=None):
    """Parse the command line and split the XML document on stdin."""
    parser = build_parser()
    opts = parser.parse_args(argv)
    if opts.n < 1:
        parser.error('n must be a positive integer')
    split_stream(sys.stdin.buffer, opts.tag, opts.n,
                 opts.wrapper, opts.basename)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This code results in every tag having empty contents.