
@r0yfire
Created October 12, 2015 23:09
Python script to convert mysqldump output to a JSON file

mysqldump to JSON

Python script to convert mysqldump output to a JSON file. Most of the code was borrowed from github.com/jamesmishra/mysqldump-to-csv.

You'll want to update the 'parse_row' function to map each item in a row to a dictionary.
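For example, if the dump contained a hypothetical comments table whose columns are (id, user_id, body, created_at), the mapping might look like this. Note that parse_values skips any row whose 'text' value is empty, so either keep a 'text' key or adjust that check as well.

def parse_row(row):
    # Hypothetical column order: (id, user_id, body, created_at).
    # row is the list of column values for one tuple, in column order.
    return {
        'id': row[0],
        'created': row[3],
        'text': row[2]
    }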

Running the script

cat MySQLdump20151012.sql | python mysqldump2json.py > dump.json
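Because the script reads its input via fileinput, you can also pass the dump file as an argument instead of piping it:

python mysqldump2json.py MySQLdump20151012.sql > dump.json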
#!/usr/bin/env python
import fileinput
import csv
import json
import sys

# This prevents prematurely closed pipes from raising
# an exception in Python
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

# The csv file might contain very huge fields,
# therefore increase the field_size_limit
csv.field_size_limit(sys.maxsize)


def is_insert(line):
    """
    Returns true if the line begins a SQL insert statement.
    """
    return line.startswith('INSERT INTO')


def get_values(line):
    """
    Returns the portion of an INSERT statement containing values.
    """
    return line.partition('` VALUES ')[2]


def values_sanity_check(values):
    """
    Ensures that values from the INSERT statement meet basic checks.
    """
    assert values
    assert values[0] == '('
    # Assertions have not been raised
    return True


def parse_row(row):
    """
    Maps the list of column values for one row to a dictionary.
    Adjust the indexes to match your table's column order.
    """
    return {
        'id': row[2],
        'created': row[5],
        'text': row[8]
    }


def parse_values(values, outfile):
    """
    Given a file handle and the raw values from a MySQL INSERT
    statement, write the equivalent JSON to the file.
    """
    latest_row = []
    reader = csv.reader([values], delimiter=',',
                        doublequote=False,
                        escapechar='\\',
                        quotechar="'",
                        strict=True
                        )
    transactions = []
    for reader_row in reader:
        for column in reader_row:
            # If our current string is empty...
            if len(column) == 0:
                latest_row.append('""')
                continue
            # If our string starts with an open paren
            if column[0] == "(":
                # Assume that this column does not begin
                # a new row.
                new_row = False
                # If we've been filling out a row
                if len(latest_row) > 0:
                    # Check if the previous entry ended in
                    # a close paren. If so, the row we've
                    # been filling out has been COMPLETED
                    # as:
                    # 1) the previous entry ended in a )
                    # 2) the current entry starts with a (
                    if latest_row[-1][-1] == ")":
                        # Remove the close paren.
                        latest_row[-1] = latest_row[-1][:-1]
                        new_row = True
                # If we've found a new row, write it out
                # and begin our new one
                if new_row:
                    json_row = parse_row(latest_row)
                    if json_row.get('text', '') != '':
                        transactions.append(json_row)
                    latest_row = []
                # If we're beginning a new row, eliminate the
                # opening parentheses.
                if len(latest_row) == 0:
                    column = column[1:]
            # Add our column to the row we're working on.
            latest_row.append(column)
        # At the end of an INSERT statement, we'll
        # have the semicolon.
        # Make sure to remove the semicolon and
        # the close paren.
        if latest_row[-1][-2:] == ");":
            latest_row[-1] = latest_row[-1][:-2]
            json_row = parse_row(latest_row)
            if json_row.get('text', '') != '':
                transactions.append(json_row)
    outfile.write(json.dumps(transactions))


def main():
    """
    Parse arguments and start the program
    """
    # Iterate over all lines in all files
    # listed in sys.argv[1:]
    # or stdin if no args given.
    try:
        for line in fileinput.input():
            # Look for an INSERT statement and parse it.
            if is_insert(line):
                values = get_values(line)
                if values_sanity_check(values):
                    parse_values(values, sys.stdout)
    except KeyboardInterrupt:
        sys.exit(0)


if __name__ == "__main__":
    main()
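A minimal sketch of calling the parser directly, assuming the script above is saved as mysqldump2json.py and a hypothetical 10-column messages table where column 2 holds the id, column 5 the timestamp, and column 8 the text (matching the default parse_row):

import sys
from mysqldump2json import is_insert, get_values, values_sanity_check, parse_values

# One extended-insert line containing two hypothetical rows.
line = ("INSERT INTO `messages` VALUES "
        "(1,'a','msg-001',0,NULL,'2015-10-12 23:09:00',NULL,0,'hello world',0),"
        "(2,'b','msg-002',0,NULL,'2015-10-12 23:10:00',NULL,0,'second row',0);")

if is_insert(line) and values_sanity_check(get_values(line)):
    parse_values(get_values(line), sys.stdout)
    # Prints something like:
    # [{"id": "msg-001", "created": "2015-10-12 23:09:00", "text": "hello world"},
    #  {"id": "msg-002", "created": "2015-10-12 23:10:00", "text": "second row"}]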
@goors

goors commented Jun 2, 2016

You get encoded special characters; for example, Z\u00fcrich for Zürich. There is nothing you can do about that.
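(For reference, the \uXXXX sequences are JSON's default ASCII-safe escaping: json.loads turns them back into the original characters, and passing ensure_ascii=False to json.dumps in parse_values would write them unescaped in the first place.)

import json
json.loads('"Z\\u00fcrich"')               # -> 'Zürich'
json.dumps('Zürich', ensure_ascii=False)   # -> '"Zürich"'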

@Turbine1991

Traceback (most recent call last):
  File "mysqldump2json.py", line 133, in <module>
    main()
  File "mysqldump2json.py", line 126, in main
    if values_sanity_check(values):
  File "mysqldump2json.py", line 36, in values_sanity_check
    assert values
AssertionError

It doesn't work. The Node version doesn't work either, because the database is too big.
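(For what it's worth, one common cause of that AssertionError is an INSERT line that carries an explicit column list, e.g. from a dump made with mysqldump --complete-insert: get_values only splits on the literal '` VALUES ' marker, so it returns an empty string for such lines and values_sanity_check fails.)

line = "INSERT INTO `city` (`id`,`name`) VALUES (1,'Zurich');"
get_values(line)   # -> '' because ') VALUES ' rather than '` VALUES ' precedes the tuples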
