Skip to content

Instantly share code, notes, and snippets.

@mdamien
Created August 4, 2015 13:21
Show Gist options
  • Save mdamien/2cc2ad9ecaccf7724eac to your computer and use it in GitHub Desktop.
Save mdamien/2cc2ad9ecaccf7724eac to your computer and use it in GitHub Desktop.
Parse StackOverflow & StackExchange
import xmltodict, json, sys, os
from os.path import join
# Directory names to skip. The original script tested `DIR in done` without
# ever defining `done`, which raised NameError on the first iteration. An
# empty set keeps the check in place while letting every directory run.
# NOTE(review): presumably meant to list already-converted dumps -- confirm.
done = set()

# (XML file name, root key that xmltodict produces for it), in load order.
TABLES = (
    ('Posts.xml', 'posts'),
    ('Tags.xml', 'tags'),
    ('Users.xml', 'users'),
    ('Comments.xml', 'comments'),
)


def rows_of(section):
    """Return the ``row`` entries of one parsed table as a list.

    xmltodict yields a list when an element has several ``<row>`` children,
    a single mapping when it has exactly one, and ``None`` for an empty root
    element. All three shapes are normalised to a list here so callers can
    iterate and count rows uniformly.
    """
    if not isinstance(section, dict):
        return []
    rows = section.get('row')
    if rows is None:
        return []
    if isinstance(rows, list):
        return rows
    return [rows]


def normalize_rows(rows):
    """Return copies of *rows* with lower-cased keys and every ``@`` removed.

    xmltodict prefixes XML attribute names with ``@``; the output uses the
    bare lower-case attribute name instead (``@PostTypeId`` -> ``posttypeid``).
    """
    return [
        {key.lower().replace('@', ''): val for key, val in row.items()}
        for row in rows
    ]


def output_name(dir_name):
    """Return the output file name for the dump directory *dir_name*.

    Slashes are dropped and ``stackexchange.com`` becomes ``js``, so
    ``cooking.stackexchange.com`` maps to ``cooking.js``. Names that do not
    contain ``stackexchange.com`` are otherwise left as they are.
    """
    return dir_name.replace('/', '').replace('stackexchange.com', 'js')


def convert_dir(dir_name):
    """Convert ``extracted/<dir_name>`` into ``out/<output name>``.

    Loads the four XML tables, prints a row count for each, and writes the
    normalised rows as a JavaScript file of the form ``var DATA = {...}``.
    """
    src = 'extracted/' + dir_name
    parsed = {}
    for file_name, table in TABLES:
        # Read bytes so the XML parser decodes the document itself instead
        # of relying on the platform's default text encoding.
        with open(join(src, file_name), 'rb') as fd:
            parsed.update(xmltodict.parse(fd.read()))
        print("STATS", len(rows_of(parsed[table])), table)

    data = {
        table: normalize_rows(rows_of(section))
        for table, section in parsed.items()
    }

    os.makedirs('out', exist_ok=True)  # do not fail on a fresh checkout
    with open(join('out', output_name(dir_name)), 'w', encoding='utf-8') as f:
        f.write('var DATA = ')
        json.dump(data, f, indent=2)


def main(argv=None):
    """Convert one dump directory (first CLI argument) or all of them.

    With an argument, a leading ``extracted/`` is stripped so both
    ``extracted/foo`` and ``foo`` are accepted. Without one, every entry of
    ``extracted/`` is processed.
    """
    argv = sys.argv if argv is None else argv
    specific_dir = argv[1].replace('extracted/', '') if len(argv) > 1 else None
    dirs = [specific_dir] if specific_dir else os.listdir('extracted/')
    for dir_name in dirs:
        print(dir_name)
        if dir_name in done:
            continue
        convert_dir(dir_name)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment