Created July 13, 2018 01:46
-
-
Save estasney/f899ce4e0098344496e8796ac0dd5ff5 to your computer and use it in GitHub Desktop.
Parsing XML of Stack Overflow Data Dumps
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim.utils import smart_open | |
from collections import defaultdict, OrderedDict | |
import csv | |
import xml.etree.ElementTree as ET | |
# Attribute names copied from each XML element into the CSV, in column order.
# An attribute that is absent from an element is written as an empty string.
headers = [
    'AcceptedAnswerId',
    'AnswerCount',
    'ClosedDate',
    'CommentCount',
    'CommunityOwnedDate',
    'CreationDate',
    'FavoriteCount',
    'Id',
    'LastActivityDate',
    'LastEditDate',
    'LastEditorDisplayName',
    'LastEditorUserId',
    'OwnerDisplayName',
    'OwnerUserId',
    'ParentId',
    'PostTypeId',
    'Score',
    'Tags',
    'Title',
    'ViewCount',
]

# Path of the XML file to read; left empty here and must be filled in
# before the script is run.
file_path = ""
class SmartXML(object):
    """Read a line-oriented XML file as a stream of attribute dictionaries.

    Every line of the file is parsed on its own as one XML element. A line
    that is not a well-formed element by itself (for example an opening or
    closing wrapper tag sitting on its own line) is skipped.

    Args:
        fp: Path of the file, passed unchanged to ``smart_open``.
        headers: Attribute names to extract from each element. An attribute
            that an element does not carry is reported as ``""``.
    """

    def __init__(self, fp, headers):
        self.fp = fp
        self.headers = headers

    @staticmethod
    def _parse_line(line, headers):
        """Parse one line into ``{header: value}``.

        Returns:
            A dict with exactly the keys in ``headers``, or ``None`` when the
            line is not a well-formed XML element.
        """
        # Keep the try body to the one call that can fail, and catch only the
        # parser's own error so interrupts and generator shutdown propagate.
        try:
            element = ET.fromstring(line)
        except ET.ParseError:
            return None
        attributes = dict(element.items())
        return {h: attributes.get(h, "") for h in headers}

    def __iter__(self):
        """Yield one attribute dict per parseable line, in file order."""
        # The context manager closes the file even if the consumer stops
        # iterating early.
        with smart_open(self.fp) as lines:
            for line in lines:
                record = self._parse_line(line, self.headers)
                if record is not None:
                    # Yield outside any try/except so that closing the
                    # generator is never intercepted.
                    yield record

    def __getitem__(self, index):
        """Return the attribute dict for the line at position ``index``.

        ``index`` counts raw lines from zero, including lines that iteration
        would skip. The file is scanned from the start on every call.

        Returns:
            The attribute dict, or ``None`` when the line cannot be parsed or
            no line has that position.
        """
        with smart_open(self.fp) as lines:
            for position, line in enumerate(lines):
                if position == index:
                    return self._parse_line(line, self.headers)
        return None
smart_xml = SmartXML(file_path, headers)

# Write every parsed record to posts_data.csv, one row per XML element.
# The encoding is set explicitly so non-Latin text does not fail on platforms
# whose default encoding cannot represent it. newline='' is what the csv
# module requires to control line endings itself.
with open(r'posts_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)
    for i, doc in enumerate(smart_xml):
        if not doc:
            continue
        # Build the row from the header list so each value lands under its
        # own column regardless of the dict's internal ordering.
        writer.writerow([doc.get(h, "") for h in headers])
        # Progress indicator: print the running count every 100000 records.
        if i % 100000 == 0:
            print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.