Skip to content

Instantly share code, notes, and snippets.

@mdamien
Created April 28, 2017 01:31
Show Gist options
  • Save mdamien/976b4ae2c1c108286755de4fd1c45eee to your computer and use it in GitHub Desktop.
Save mdamien/976b4ae2c1c108286755de4fd1c45eee to your computer and use it in GitHub Desktop.
wikipedia to git
from slugify import slugify
import os, shlex, sys
from bs4 import BeautifulSoup
def call(cmd):
    """Run *cmd* through the system shell and fail loudly if it errors.

    Args:
        cmd: Complete shell command line, handed verbatim to ``os.system``.

    Raises:
        Exception: If ``os.system`` reports a non-zero status. The message
            contains the command and the status value that was returned.
    """
    code = os.system(cmd)
    if code != 0:
        # ``code`` is whatever os.system returned (on POSIX that is the
        # encoded wait status rather than the plain exit code).
        raise Exception('"{}" returned {}'.format(cmd, code))
def call_bash(cmd):
    """Execute *cmd* as a single ``bash -c`` script.

    The script text is shell-quoted so it reaches bash as one argument;
    failures surface through ``call`` as an exception.
    """
    quoted_script = shlex.quote(cmd)
    call('bash -c ' + quoted_script)
# Replay a MediaWiki XML export as a git history: one file per page, one
# commit per revision, ordered by revision timestamp.
#
# Command line: <script> EXPORT.xml TARGET_DIR
# WARNING: TARGET_DIR is removed and recreated from scratch.

# Binary mode lets the XML parser pick up the encoding declared in the
# document instead of relying on the locale's default text encoding, and the
# context manager guarantees the handle is closed.
with open(sys.argv[1], 'rb') as doc:
    soup = BeautifulSoup(doc, 'xml')

DIR = sys.argv[2]
# DIR ends up inside shell command lines, so quote it once here; "--" keeps a
# directory name starting with "-" from being parsed as an option.
quoted_dir = shlex.quote(DIR)

# Best effort: a failing removal is ignored, exactly as before.
os.system('rm -rf -- ' + quoted_dir)
call('mkdir -p -- ' + quoted_dir)
# "&&" so git never runs in the current directory if the cd fails.
call('cd -- ' + quoted_dir + ' && git init')

# Collect every revision of every page as a flat list of plain dicts.
insertions = []
for page in soup.find_all('page'):
    title = page.find('title').text
    for revision in page.find_all('revision'):
        contributor = revision.find('contributor')
        username_tag = contributor.find('username')
        ip_tag = contributor.find('ip')
        if username_tag is not None:
            # Registered user: the numeric <id> is used as the "email" part.
            username = username_tag.text
            username_email = contributor.find('id').text
        elif ip_tag is not None:
            # Anonymous edit: only an IP address is available.
            username = ip_tag.text
            username_email = ''
        else:
            # Contributor entry with neither <username> nor <ip>; git
            # refuses an empty author name, so use a placeholder.
            username = 'unknown'
            username_email = ''
        insertions.append({
            'username': username,
            'username_email': username_email,
            'title': title,
            'text': revision.find('text').text,
            'timestamp': revision.find('timestamp').text,
        })

# Timestamps are compared as strings, as in the original script.
insertions.sort(key=lambda entry: entry['timestamp'])

for entry in insertions:
    # Overwrite the page's file with this revision's text.
    page_path = os.path.join(DIR, slugify(entry['title']))
    with open(page_path, 'w', encoding='utf-8') as out:
        out.write(entry['text'])

    author = entry['username'] + ' <' + entry['username_email'] + '>'
    # The GIT_COMMITTER_* assignments are prefixed to "git commit" so they
    # apply to that process; --date/--author set the author side.
    # --allow-empty keeps revisions that did not change the file content.
    call_bash(
        'cd -- {dir} && git add -A . && '
        'GIT_COMMITTER_NAME={name} '
        'GIT_COMMITTER_EMAIL={email} '
        'GIT_COMMITTER_DATE={date} '
        'git commit --date={date} --author={author} -m {message} '
        '--allow-empty -q'.format(
            dir=quoted_dir,
            name=shlex.quote(entry['username']),
            email=shlex.quote(entry['username_email']),
            date=shlex.quote(entry['timestamp']),
            author=shlex.quote(author),
            message=shlex.quote(entry['title'] + ' updated'),
        )
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment