Skip to content

Instantly share code, notes, and snippets.

@estasney
Created July 13, 2018 01:46
Show Gist options
  • Save estasney/f899ce4e0098344496e8796ac0dd5ff5 to your computer and use it in GitHub Desktop.
Save estasney/f899ce4e0098344496e8796ac0dd5ff5 to your computer and use it in GitHub Desktop.
Parsing XML of Stack Overflow Data Dumps
from gensim.utils import smart_open
from collections import defaultdict, OrderedDict
import csv
import xml.etree.ElementTree as ET
# Attribute names extracted from every XML row; also the CSV column order.
headers = [
    "AcceptedAnswerId",
    "AnswerCount",
    "ClosedDate",
    "CommentCount",
    "CommunityOwnedDate",
    "CreationDate",
    "FavoriteCount",
    "Id",
    "LastActivityDate",
    "LastEditDate",
    "LastEditorDisplayName",
    "LastEditorUserId",
    "OwnerDisplayName",
    "OwnerUserId",
    "ParentId",
    "PostTypeId",
    "Score",
    "Tags",
    "Title",
    "ViewCount",
]

# Location of the XML file handed to SmartXML; empty placeholder to fill in.
file_path = ""
class SmartXML(object):
    """Read a file line by line, parsing each line as one XML element.

    Every line that parses as a standalone XML element is turned into a
    dict keyed by ``headers``; attributes missing on the element map to
    the empty string. Lines that do not parse on their own (for example
    an opening or closing wrapper tag) are skipped.

    Args:
        fp: Path (or whatever the opener accepts) of the file to read.
        headers: Iterable of attribute names to extract from each element.
        opener: Optional callable ``opener(fp)`` returning an iterable of
            lines that is also a context manager. Defaults to the
            module-level ``smart_open``.
    """

    def __init__(self, fp, headers, opener=None):
        self.fp = fp
        self.headers = headers
        self.opener = opener

    def _open(self):
        """Open ``self.fp`` with the configured opener."""
        # Resolve smart_open lazily so a custom opener never needs it.
        opener = smart_open if self.opener is None else self.opener
        return opener(self.fp)

    def _parse(self, line):
        """Return the header -> attribute-value dict for one XML line.

        Raises:
            xml.etree.ElementTree.ParseError: if ``line`` is not a
                well-formed standalone XML element.
        """
        attributes = dict(ET.fromstring(line).items())
        return {h: attributes.get(h, "") for h in self.headers}

    def __iter__(self):
        """Yield one dict per parsable line, skipping unparsable lines."""
        with self._open() as stream:
            for line in stream:
                try:
                    record = self._parse(line)
                except (ET.ParseError, ValueError):
                    # Best effort: ignore lines that are not a complete
                    # element instead of aborting the whole pass.
                    continue
                # Yield outside the try block so that closing the
                # generator (GeneratorExit) is never swallowed.
                yield record

    def __getitem__(self, index):
        """Return the dict for the line at zero-based position ``index``.

        The file is scanned from the start on every call. Returns ``None``
        when that line cannot be parsed or when no line has that position.
        """
        with self._open() as stream:
            for position, line in enumerate(stream):
                if position == index:
                    try:
                        return self._parse(line)
                    except (ET.ParseError, ValueError):
                        return None
        return None
# Stream every parsed row from the XML file into posts_data.csv.
smart_xml = SmartXML(file_path, headers)
# An explicit UTF-8 encoding keeps non-ASCII text from failing on platforms
# whose default encoding cannot represent it; newline='' is what the csv
# module expects for files it writes.
with open(r'posts_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)
    for i, doc in enumerate(smart_xml):
        if not doc:
            continue
        # Select values in header order explicitly rather than relying on
        # the dict's iteration order matching the header row.
        writer.writerow([doc.get(h, "") for h in headers])
        # Progress indicator: print the running row index periodically.
        if i % 100000 == 0:
            print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment