Last active
February 12, 2023 14:41
-
-
Save rueycheng/a3a22c2f9a5c35d899430534601ebf4c to your computer and use it in GitHub Desktop.
extracting tuples from Wikipedia database dumps by directly parsing SQL INSERT statements
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is a tool for extracting tuples directly out of Wikipedia database dumps | |
by parsing SQL INSERT statements. For now, the output is in (tab-delimited) | |
CSV format, where each column indicate a field and each row a record. The tool works | |
with gzip'ed files out of the box. | |
To use this tool, simply go (take `pagelinks.sql.gz` as an example): | |
python extract_wp_tuples.py enwiki-20150515-pagelinks.sql.gz | |
""" | |
import fileinput | |
import gzip | |
import sys | |
import re | |
LEAD1 = 'INSERT INTO `' | |
LEAD2 = '` VALUES ' | |
LEN_LEAD1 = len(LEAD1) | |
LEN_LEAD2 = len(LEAD2) | |
def get_tuples(line): | |
mode = 0 | |
start = None | |
skip = False | |
parts = [] | |
for i, c in enumerate(line): | |
if mode == 0: | |
if c != '(': raise Exception | |
mode = 1 | |
elif mode == 1: | |
if c == "'": | |
mode = 2 | |
start = i + 1 | |
else: | |
mode = 4 | |
start = i | |
elif mode == 2: | |
if skip: | |
skip = False | |
continue | |
if c == "'": | |
parts.append(line[start:i]) | |
start = None | |
mode = 3 | |
elif c == '\\': | |
skip = True | |
else: | |
continue | |
elif mode == 3: | |
if c == ',': mode = 1 | |
elif c == ')': | |
mode = 10 | |
else: raise Exception | |
elif mode == 4: | |
if c == ',' or c == ')': | |
parts.append(line[start:i]) | |
start = None | |
mode = 1 if c == ',' else 10 | |
elif mode == 10: | |
yield parts | |
parts = [] | |
if c == ',': mode = 0 | |
else: raise Exception | |
assert mode == 10 | |
yield parts | |
if __name__ == '__main__': | |
for line in fileinput.input(openhook=fileinput.hook_compressed): | |
line = line.strip() | |
if line.startswith(LEAD1) and line.endswith(';'): | |
pos = line.find(LEAD2, LEN_LEAD1) | |
if pos > 0: | |
for t in get_tuples(line[pos + LEN_LEAD2:-1]): | |
print '\t'.join(t) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment