Skip to content

Instantly share code, notes, and snippets.

@tiancheng91
Created August 17, 2017 02:54
Show Gist options
  • Save tiancheng91/c4f45289a49dc225834f08c334cdbd20 to your computer and use it in GitHub Desktop.
Save tiancheng91/c4f45289a49dc225834f08c334cdbd20 to your computer and use it in GitHub Desktop.
Import a Stack Overflow dump file into MongoDB
# coding:utf-8
import time
import datetime
import re
import logging
import json
from xml.etree import ElementTree as etree
from pymongo import MongoClient
# Handle to the "stackoverflow" database on the MongoDB server; the script
# entry point inserts documents into its "posts" collection.
db = MongoClient("mongodb://10.138.0.2:27017").stackoverflow
# Progress file, opened (and truncated) at import time; read_file() rewrites
# it with the current line index every 10000 lines.
run_tmp = open('run_tmp', 'w')
def parse_line(line):
    """Parse one line of text as a standalone XML element.

    Args:
        line: Text expected to contain a single, complete XML element.

    Returns:
        The attribute dict of the parsed element, or ``None`` when the line
        is not well-formed XML on its own (the failure is logged as a
        warning).
    """
    try:
        element = etree.fromstring(line)
    except etree.ParseError as exc:
        # Only malformed XML is treated as "skip this line"; any other
        # exception is a programming error and is allowed to propagate.
        logging.warning('parse line error : %s', exc)
        return None
    return element.attrib
# Compiled once: the conversion runs for every attribute of every input line.
# Any character followed by a capitalised word ("tType" in "PostType").
_CAMEL_WORD_RE = re.compile(r'(.)([A-Z][a-z]+)')
# A lower-case letter or digit directly followed by a capital ("eI" in "TypeId").
_CAMEL_BOUNDARY_RE = re.compile(r'([a-z0-9])([A-Z])')


def camel_2_snake_convert(name):
    """Convert a CamelCase identifier to snake_case.

    Args:
        name: Identifier such as ``"PostTypeId"``.

    Returns:
        The lower-cased identifier with ``_`` inserted at word boundaries,
        e.g. ``"post_type_id"``.
    """
    # First pass separates capitalised words; the second pass catches the
    # boundaries the first one consumed or could not see (e.g. a trailing "Id").
    partial = _CAMEL_WORD_RE.sub(r'\1_\2', name)
    return _CAMEL_BOUNDARY_RE.sub(r'\1_\2', partial).lower()
def dict_camel_2_snake_convert(obj):
    """Return a new dict with every key of *obj* converted to snake_case.

    Args:
        obj: Mapping whose keys are CamelCase strings.

    Returns:
        A new dict holding the same values under keys produced by
        ``camel_2_snake_convert``. If two keys convert to the same name, the
        one iterated last wins. *obj* itself is not modified.
    """
    return {camel_2_snake_convert(key): value for key, value in obj.items()}
def read_file(file):
    """Yield the parsed attributes of each line of *file*.

    Every 10000 lines the current line index is logged and written to the
    module-level ``run_tmp`` progress file.

    Args:
        file: Path of the text file to read; each line is handed to
            ``parse_line`` after stripping surrounding whitespace.

    Yields:
        The result of ``parse_line`` for each line, which is ``None`` for
        lines it could not parse.
    """
    # NOTE(review): the file is decoded with the platform default encoding,
    # as before; confirm that matches the dump being read.
    # The context manager closes the input once the generator is exhausted
    # or discarded.
    with open(file) as handle:
        for index, line in enumerate(handle):
            if index % 10000 == 0:
                logging.info('running at %s', index)
                # Overwrite the marker in place. The index only grows, so the
                # new text is never shorter than what it replaces.
                run_tmp.seek(0)
                run_tmp.write("%s" % (index))
                # Flush so the marker is on disk while the import is running,
                # not only once the buffer fills or the process exits.
                run_tmp.flush()
            yield parse_line(line.strip())
# Fields stored as integers; a field missing from the input is stored as 0.
# 'accepted_answer_id' is the snake_case form of the dump's AcceptedAnswerId
# attribute.
_INT_FIELDS = (
    'id', 'post_type_id', 'parent_id', 'accepted_answer_id', 'score',
    'view_count', 'owner_user_id', 'last_editor_user_id', 'answer_count',
    'comment_count', 'favorite_count',
)


def trans_post(obj):
    """Turn the raw attributes of one post into a MongoDB document.

    Args:
        obj: Mapping of CamelCase attribute names to string values.

    Returns:
        A new dict with snake_case keys in which:
          * ``title_cn`` and ``body_cn`` are set to empty strings,
          * ``tags`` (when present) is a list of tag names,
          * every field in ``_INT_FIELDS`` is an ``int`` (0 when absent),
          * the post id is stored under ``_id`` instead of ``id``.

    Raises:
        ValueError: If one of the integer fields holds a non-numeric string.
    """
    item = dict_camel_2_snake_convert(obj)
    # Empty placeholders; presumably filled in by a later step - TODO confirm.
    item['title_cn'] = ''
    item['body_cn'] = ''
    if 'tags' in item:
        # "<python><mongodb>" -> ["python", "mongodb"]
        item['tags'] = item['tags'].replace('><', ',').strip('<').strip('>').split(',')
    for key in _INT_FIELDS:
        item[key] = int(item[key]) if key in item else 0
    # Store the post id under "_id" instead of "id".
    item['_id'] = item.pop('id')
    return item
if __name__ == '__main__':
    # Read Posts.xml line by line and insert the posts into db.posts in
    # batches of BATCH_SIZE documents.
    BATCH_SIZE = 50
    batch = []
    for attrs in read_file('Posts.xml'):
        # Skip lines that could not be parsed or carry no attributes.
        if not attrs:
            continue
        batch.append(trans_post(attrs))
        # ">=" rather than "==": after a failed insert the batch is kept and
        # keeps growing, and it must still be retried on the next document.
        if len(batch) >= BATCH_SIZE:
            try:
                db.posts.insert_many(batch)
            except Exception:
                # NOTE(review): if the failed insert was partially applied,
                # retrying the same batch may keep failing on duplicate _id
                # values; confirm the desired recovery strategy.
                logging.warning('insert_error, sleep', exc_info=True)
                time.sleep(1)
            else:
                batch = []
    # insert_many rejects an empty list, so only flush a non-empty remainder.
    if batch:
        db.posts.insert_many(batch)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment