Skip to content

Instantly share code, notes, and snippets.

@rueycheng
Last active February 12, 2023 14:41
Show Gist options
  • Save rueycheng/a3a22c2f9a5c35d899430534601ebf4c to your computer and use it in GitHub Desktop.
Save rueycheng/a3a22c2f9a5c35d899430534601ebf4c to your computer and use it in GitHub Desktop.
extracting tuples from Wikipedia database dumps by directly parsing SQL INSERT statements
"""
This is a tool for extracting tuples directly out of Wikipedia database dumps
by parsing SQL INSERT statements. For now, the output is in (tab-delimited)
CSV format, where each column indicate a field and each row a record. The tool works
with gzip'ed files out of the box.
To use this tool, simply go (take `pagelinks.sql.gz` as an example):
python extract_wp_tuples.py enwiki-20150515-pagelinks.sql.gz
"""
import fileinput
import gzip
import sys
import re
LEAD1 = 'INSERT INTO `'
LEAD2 = '` VALUES '
LEN_LEAD1 = len(LEAD1)
LEN_LEAD2 = len(LEAD2)
def get_tuples(line):
mode = 0
start = None
skip = False
parts = []
for i, c in enumerate(line):
if mode == 0:
if c != '(': raise Exception
mode = 1
elif mode == 1:
if c == "'":
mode = 2
start = i + 1
else:
mode = 4
start = i
elif mode == 2:
if skip:
skip = False
continue
if c == "'":
parts.append(line[start:i])
start = None
mode = 3
elif c == '\\':
skip = True
else:
continue
elif mode == 3:
if c == ',': mode = 1
elif c == ')':
mode = 10
else: raise Exception
elif mode == 4:
if c == ',' or c == ')':
parts.append(line[start:i])
start = None
mode = 1 if c == ',' else 10
elif mode == 10:
yield parts
parts = []
if c == ',': mode = 0
else: raise Exception
assert mode == 10
yield parts
if __name__ == '__main__':
for line in fileinput.input(openhook=fileinput.hook_compressed):
line = line.strip()
if line.startswith(LEAD1) and line.endswith(';'):
pos = line.find(LEAD2, LEN_LEAD1)
if pos > 0:
for t in get_tuples(line[pos + LEN_LEAD2:-1]):
print '\t'.join(t)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment