rueycheng · February 12, 2023 14:41
diff --git a/extract_wikipedia_tuples.py b/extract_wikipedia_tuples.py
 """
 This is a tool for extracting tuples directly out of Wikipedia database dumps
 by parsing SQL INSERT statements.  For now, the output is in (tab-delimited)
 CSV format, where each column indicate a field and each row a record.  The tool works
 with gzip'ed files out of the box.

 To use this tool, simply go (take `pagelinks.sql.gz` as an example):

    python extract_wp_tuples.py enwiki-20150515-pagelinks.sql.gz
 """
 import fileinput
 import gzip
 import sys
 import re

 LEAD1 = 'INSERT INTO `'
 LEAD2 = '` VALUES '

 LEN_LEAD1 = len(LEAD1)
 LEN_LEAD2 = len(LEAD2)

 def get_tuples(line):
    mode = 0
    start = None
    skip = False
    parts = []
    for i, c in enumerate(line):
        if mode == 0:
            if c != '(': raise Exception
            mode = 1
        elif mode == 1:
            if c == "'": 
                mode = 2
                start = i + 1
            else: 
                mode = 4
                start = i
        elif mode == 2:
            if skip:
                skip = False
                continue
            if c == "'":
                parts.append(line[start:i])
                start = None
                mode = 3
            elif c == '\\':
                skip = True
            else:
                continue
        elif mode == 3:
            if c == ',': mode = 1
            elif c == ')': 
                mode = 10
            else: raise Exception
        elif mode == 4:
            if c == ',' or c == ')': 
                parts.append(line[start:i])
                start = None
                mode = 1 if c == ',' else 10
        elif mode == 10:
            yield parts
            parts = []
            if c == ',': mode = 0
            else: raise Exception
    assert mode == 10
    yield parts


 if __name__ == '__main__':
    for line in fileinput.input(openhook=fileinput.hook_compressed):
        line = line.strip()
        if line.startswith(LEAD1) and line.endswith(';'):
            pos = line.find(LEAD2, LEN_LEAD1)
            if pos > 0:
                for t in get_tuples(line[pos + LEN_LEAD2:-1]):
                    print '\t'.join(t)
	"""
	This is a tool for extracting tuples directly out of Wikipedia database dumps
	by parsing SQL INSERT statements. For now, the output is in (tab-delimited)
	CSV format, where each column indicate a field and each row a record. The tool works
	with gzip'ed files out of the box.

	To use this tool, simply go (take `pagelinks.sql.gz` as an example):

	python extract_wp_tuples.py enwiki-20150515-pagelinks.sql.gz
	"""
	import fileinput
	import gzip
	import sys
	import re

	LEAD1 = 'INSERT INTO `'
	LEAD2 = '` VALUES '

	LEN_LEAD1 = len(LEAD1)
	LEN_LEAD2 = len(LEAD2)

	def get_tuples(line):
	mode = 0
	start = None
	skip = False
	parts = []
	for i, c in enumerate(line):
	if mode == 0:
	if c != '(': raise Exception
	mode = 1
	elif mode == 1:
	if c == "'":
	mode = 2
	start = i + 1
	else:
	mode = 4
	start = i
	elif mode == 2:
	if skip:
	skip = False
	continue
	if c == "'":
	parts.append(line[start:i])
	start = None
	mode = 3
	elif c == '\\':
	skip = True
	else:
	continue
	elif mode == 3:
	if c == ',': mode = 1
	elif c == ')':
	mode = 10
	else: raise Exception
	elif mode == 4:
	if c == ',' or c == ')':
	parts.append(line[start:i])
	start = None
	mode = 1 if c == ',' else 10
	elif mode == 10:
	yield parts
	parts = []
	if c == ',': mode = 0
	else: raise Exception
	assert mode == 10
	yield parts


	if __name__ == '__main__':
	for line in fileinput.input(openhook=fileinput.hook_compressed):
	line = line.strip()
	if line.startswith(LEAD1) and line.endswith(';'):
	pos = line.find(LEAD2, LEN_LEAD1)
	if pos > 0:
	for t in get_tuples(line[pos + LEN_LEAD2:-1]):
	print '\t'.join(t)