#
#
# Exports the contents of a flat SQL dump to CSV files named <table>.csv,
# plus an .xlsx representation of the database
# Requires Python 3.7 (on a Mac, first: brew install python3)
# Prior to running: python3 -m pip install pandas
# Prior to running: python3 -m pip install openpyxl
# Usage: python3 sqlcsvxlsexport.py <sql dump file> <target directory>
# Send accolades for the script to [email protected] or hatemail to <devnull>
#
#
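# Example invocation (hypothetical file names):
#   python3 sqlcsvxlsexport.py ./dump.sql ./exports/
# writes ./exports/<table>.csv for each table found in dump.sql, plus
# ./exports/database.xlsx with one sheet per table.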
import re
import csv
import sys
import os.path
import argparse
import pandas as pd

# Matches a _binary blob up to and including the "','" that terminates it
PATTERN = r"_binary.*?','"

# allow large content in the dump
csv.field_size_limit(sys.maxsize)
def is_insert(line):
    return 'INSERT INTO' in line

def get_values(line):
    # Everything after ' VALUES ' is the list of row tuples
    return line.partition(' VALUES ')[2]

def get_table_name(line):
    match = re.search('INSERT INTO `([0-9_a-zA-Z]+)`', line)
    if match:
        return match.group(1)
    else:
        print(line)
def get_columns(line):
    match = re.search(r'INSERT INTO `.*` \(([^\)]+)\)', line)
    if match:
        return list(map(lambda x: x.replace('`', '').strip(), match.group(1).split(',')))

def values_sanity_check(values):
    assert values
    assert values[0] == '('
    # Assertions have not been raised
    return True
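# Both parsers below tokenize the VALUES list with csv.reader, treating a
# single quote as the quote character and backslash as the escape character,
# then reassemble the row tuples by watching for "(" and ")" markers.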
def parse_values_2(values):
    # Variant of parse_values() used by the binary-blob fallback in main().
    # A VALUES segment with no comma holds a single column, e.g. "('abc')"
    if ',' not in values:
        return [values[1:-1]]
    rows = []
    latest_row = []
    reader = csv.reader([values], delimiter=',',
                        doublequote=False,
                        escapechar='\\',
                        quotechar="'",
                        strict=True)
    for reader_row in reader:
        for column in reader_row:
            # Empty and NULL columns become a NUL placeholder,
            # converted back to '' when the row is flushed
            if len(column) == 0 or column == 'NULL':
                latest_row.append(chr(0))
                continue
            # A leading "(" starts a new row tuple; if the previous
            # column ended with ")", the row in progress is complete
            if column[0] == "(":
                new_row = False
                if len(latest_row) > 0:
                    if latest_row[-1][-1] == ")":
                        latest_row[-1] = latest_row[-1][:-1]
                        new_row = True
                if new_row:
                    latest_row = ['' if field == '\x00' else field for field in latest_row]
                    rows.append(latest_row)
                    latest_row = []
                # Strip the opening paren from the first column of a row
                if len(latest_row) == 0:
                    column = column[1:]
            latest_row.append(column)
        # Flush the final row, trimming ");" (end of statement) or ")"
        # (segments wrapped in parens by the caller)
        if latest_row[-1][-2:] == ");":
            latest_row[-1] = latest_row[-1][:-2]
        elif latest_row[-1][-1:] == ")":
            latest_row[-1] = latest_row[-1][:-1]
        latest_row = ['' if field == '\x00' else field for field in latest_row]
        rows.append(latest_row)
    return rows
def parse_values(values):
    rows = []
    latest_row = []
    reader = csv.reader([values], delimiter=',',
                        doublequote=False,
                        escapechar='\\',
                        quotechar="'",
                        strict=True)
    for reader_row in reader:
        for column in reader_row:
            # Empty and NULL columns become a NUL placeholder,
            # converted back to '' when the row is flushed
            if len(column) == 0 or column == 'NULL':
                latest_row.append(chr(0))
                continue
            # A leading "(" starts a new row tuple; if the previous
            # column ended with ")", the row in progress is complete
            if column[0] == "(":
                new_row = False
                if len(latest_row) > 0:
                    if latest_row[-1][-1] == ")":
                        latest_row[-1] = latest_row[-1][:-1]
                        new_row = True
                if new_row:
                    latest_row = ['' if field == '\x00' else field for field in latest_row]
                    rows.append(latest_row)
                    latest_row = []
                # Strip the opening paren from the first column of a row
                if len(latest_row) == 0:
                    column = column[1:]
            latest_row.append(column)
        # The statement's final row ends with ");" rather than ")"
        if latest_row[-1][-2:] == ");":
            latest_row[-1] = latest_row[-1][:-2]
        latest_row = ['' if field == '\x00' else field for field in latest_row]
        rows.append(latest_row)
    return rows
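# Illustrative example (not part of the original script): for a VALUES
# string such as
#   (1,'a'),(2,'b');
# parse_values returns [['1', 'a'], ['2', 'b']].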
def main(filepath, output_folder):
    dataframes = {}
    with open(filepath, 'rb') as f:
        for line in f.readlines():
            try:
                line = line.decode('utf8')
                if is_insert(line):
                    table_name = get_table_name(line)
                    columns = get_columns(line)
                    values = get_values(line)
                    if values_sanity_check(values):
                        rows = parse_values(values)
                        # Diagnostic: the parsed width should match the INSERT's column list
                        if rows and len(rows[0]) != len(columns):
                            print(str(len(rows[0])) + ' - ' + str(len(columns)) + ' : ' + table_name)
                        # Accumulate each table's rows in a DataFrame for the Excel export
                        df_to_add = pd.DataFrame(rows, columns=columns)
                        if table_name not in dataframes.keys():
                            dataframes[table_name] = df_to_add
                        else:
                            # DataFrame.append was removed in pandas 2.x, so concat instead
                            dataframes[table_name] = pd.concat([dataframes[table_name], df_to_add], ignore_index=True)
                        # Write (or append to) one CSV per table as we go
                        if not os.path.isfile(output_folder + table_name + '.csv'):
                            with open(output_folder + table_name + '.csv', 'w', newline='') as outcsv:
                                writer = csv.writer(outcsv, quoting=csv.QUOTE_ALL)
                                writer.writerow(columns)
                                for row in rows:
                                    writer.writerow(row)
                        else:
                            with open(output_folder + table_name + '.csv', 'a', newline='') as outcsv:
                                writer = csv.writer(outcsv, quoting=csv.QUOTE_ALL)
                                for row in rows:
                                    writer.writerow(row)
            except UnicodeDecodeError:
                # Fallback for INSERT lines that aren't valid UTF-8 because they
                # carry _binary blobs: split the VALUES list manually and pull
                # the blobs out with PATTERN. Note that the fields collected in
                # tmp_fields are not currently written to any output.
                line = str(line)
                table_name = get_table_name(line)
                columns = get_columns(line)
                line = line.split('VALUES ')[1].strip()
                line = line[:-3]
                values = line[1:-3].split('),(')
                for value in values:
                    tmp_value = value
                    start_idx = 0
                    tmp_fields = []
                    quitted = False
                    binary_end = len(value)
                    while 'binary' in tmp_value:
                        first_binary = re.search(PATTERN, tmp_value[start_idx:])
                        if not first_binary:
                            tmp_fields.append(tmp_value)
                            quitted = True
                            break
                        binary_start, binary_end = first_binary.span()
                        # Columns before the blob parse as an ordinary row segment
                        tmp_fields += parse_values_2('(' + tmp_value[start_idx:binary_start - 1] + ')')
                        # Keep the blob itself, trimming the delimiter characters
                        # consumed by the pattern match
                        tmp_fields.append(first_binary.group()[:-5])
                        start_idx = binary_end
                        tmp_value = tmp_value[binary_end:]
                    # Any remainder after the last blob splits on "','"
                    if binary_end != len(value) and not quitted:
                        splitted = tmp_value.split("','")
                        tmp_fields += splitted
    # One workbook with a sheet per table; Excel caps sheet names at 31 chars.
    # openpyxl writes .xlsx (it cannot produce legacy .xls files)
    writer = pd.ExcelWriter(output_folder + 'database.xlsx', engine='openpyxl')
    for table in dataframes.keys():
        dataframes[table].to_excel(writer, sheet_name=table[:30], index=False)
    writer.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert a SQL dump to CSV')
    parser.add_argument('sql_filepath', action="store", type=str)
    # nargs='?' makes the output directory optional so the '.' default applies
    parser.add_argument('output_dir', action="store", nargs='?', default='.', type=str)
    args = parser.parse_args()
    file_path = args.sql_filepath
    # Normalize the output directory to end with a slash
    out_dir = args.output_dir if args.output_dir.endswith('/') else args.output_dir + '/'
    main(file_path, out_dir)