Benchmarking MySQL INSERTs into InnoDB and MyISAM
import csv
import json
import os
import statistics
import time

import pymysql


def prepare_values(row):
    # JSON-encode any list cells (tags, shared_with) so they can be bound as strings.
    new_row = []
    for cell in row:
        if isinstance(cell, list):
            new_row.append(json.dumps(cell))
        else:
            new_row.append(cell)
    return new_row


def batch_insert(tbl_name, rows, cursor, is_full):
    # Build one multi-row INSERT, time its execution, and record the timing
    # under the "empty" or "full" bucket for the target table.
    query = f"INSERT INTO `{tbl_name}` (user_id,user_email,created_at,tags,shared_with,is_completed,is_overdue,description,title,user_name) VALUES "
    values_str_list = []
    for row in rows:
        row = prepare_values(row)
        value_str = "(" + ",".join(["%s" for _ in row]) + ")"
        values_str_list.append(cursor.mogrify(value_str, row))
    query += ",".join(values_str_list) + ";"
    start_time = time.time()
    cursor.execute(query)
    end_time = time.time()
    if is_full:
        timings[tbl_name]["full"].append(end_time - start_time)
    else:
        timings[tbl_name]["empty"].append(end_time - start_time)


def process_csv_file(
    cursor, csv_file_path, tbl_name, chunk_size, num_rows, is_full: bool
):
    # Stream the CSV and insert it in chunk_size batches, flushing any remainder.
    values = []
    with open(csv_file_path, "r") as csvfile:
        csvreader = csv.reader(csvfile, quotechar="'")
        for i, row in enumerate(csvreader, 1):
            values.append(row)
            if i % chunk_size == 0:
                print(
                    f"inserting batch {i // chunk_size} / {num_rows // chunk_size} into {tbl_name} ({'full' if is_full else 'empty'})"
                )
                batch_insert(tbl_name, values, cursor, is_full)
                values = []
    if values:
        batch_insert(tbl_name, values, cursor, is_full)


db_params = {
    "host": "YOUR_HOST",
    "password": "YOUR_PASS",
    "user": "YOUR_USER",
    "db": "YOUR_DB",
    "autocommit": True,
    "ssl": {"fake_flag_to_enable_tls": True},
}

csv_file_path1 = "/mnt/ramdisk/xaa.csv"
csv_file_path2 = "/mnt/ramdisk/xab.csv"
csv_file_size = os.path.getsize(csv_file_path1)
table_name1 = "test_innodb"
table_name2 = "test_myisam"
num_rows = 12_500_000
chunk_size = 10_000
timings = {
    table_name1: {"empty": [], "full": []},
    table_name2: {"empty": [], "full": []},
}

connection = pymysql.connect(**db_params)
cursor = connection.cursor()

# The first file loads into an empty table; the second loads into the now-full table.
for tbl in (table_name1, table_name2):
    for j, f in enumerate([csv_file_path1, csv_file_path2]):
        print(f"\nloading file {j + 1} / 2 into {tbl} in batches of {chunk_size} rows\n")
        process_csv_file(cursor, f, tbl, chunk_size, num_rows, bool(j))

cursor.close()
connection.close()

for table, table_timings in timings.items():
    for category, category_timings in table_timings.items():
        min_time = min(category_timings)
        max_time = max(category_timings)
        avg_time = statistics.mean(category_timings)
        stdev_time = statistics.stdev(category_timings)
        total_time = sum(category_timings)
        print(f"\ntimings for {table} ({category}):")
        print(f"\tMin: {min_time:.3f} seconds")
        print(f"\tMax: {max_time:.3f} seconds")
        print(f"\tAvg: {avg_time:.3f} seconds")
        print(f"\tStd Dev: {stdev_time:.3f} seconds")
        print(
            f"\tLoaded {num_rows} rows in {total_time:.2f} seconds at {((csv_file_size / total_time) / 2**20):.2f} MiB/s"
        )
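
The gist does not include the DDL for test_innodb and test_myisam; the script assumes both tables already exist with the ten columns it inserts into. The sketch below is a hypothetical reconstruction under those assumptions: column types, sizes, and nullability are guesses, and only the table names, column names, storage engines, and the CHAR(32) UUIDv7 primary key (described in the comment below) come from the gist itself.

# Hypothetical DDL for the two benchmark tables, identical except for the
# storage engine. Column types are assumptions; the CHAR(32) PK matches the
# dash-less UUIDv7 values in the data sample below.
DDL_TEMPLATE = """
CREATE TABLE IF NOT EXISTS `{table}` (
    user_id      CHAR(32)     NOT NULL,
    user_email   VARCHAR(255) NOT NULL,
    created_at   DATETIME     NOT NULL,
    tags         TEXT,
    shared_with  TEXT,
    is_completed TINYINT(1)   NOT NULL,
    is_overdue   TINYINT(1)   NOT NULL,
    description  TEXT,
    title        VARCHAR(255),
    user_name    VARCHAR(255),
    PRIMARY KEY (user_id)
) ENGINE={engine};
"""


def create_tables(cursor):
    # Same table/engine pairs the benchmark above iterates over.
    for table, engine in (("test_innodb", "InnoDB"), ("test_myisam", "MyISAM")):
        cursor.execute(DDL_TEMPLATE.format(table=table, engine=engine))

If used, create_tables(cursor) would be called once before the load loop. tags and shared_with are left as TEXT here because the script JSON-encodes them to strings before binding; a JSON column would work equally well on the InnoDB side.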
stephanGarland commented on Sep 30, 2023:
Data sample that was used in this and the bulk load. The PK (the only key) for the tables is a CHAR(32), holding a UUIDv7 as shown here. While not as fast as an AUTO_INCREMENT INT, a UUIDv7 is K-sortable, so it won't cause nearly as many page splits as a UUIDv4 would; a sketch of the UUIDv7 layout follows the sample.
'user_id','user_email','created_at','tags','shared_with','is_completed','is_overdue','description','title','user_name'
'018aae66103a7e7aa19e35f1b4330c22','[email protected]','2014-12-26 12:30:05','[]','[]','1','0','refute skittle unshipped thinly stark icing bribe subsystem ecard prudishly','polar','Killie.Kironde'
'018aae66103b752a81967307d5898b67','[email protected]','2023-03-20 16:38:05','[]','[]','1','0','video badass surprise gander bloating jalapeno overpower conjure underpay oboe','starboard','Augusto.Schluter'
'018aae66103c7f4399a5aedd96521b89','[email protected]','2017-08-07 21:17:02','[]','[]','0','0','basin proximity bonfire fantasy camcorder broadness waged willed strike moaner','chevy','Orton.Wong'
'018aae66103d7db488774840e2c2387d','[email protected]','2014-06-26 22:37:13','[]','[]','1','0','shock happier tiara galleria sharply expire coeditor baritone zen dangling','cardinal','Bucky.Shaffert'
'018aae66103e7aa1a5a0b5717a876eb5','[email protected]','1997-07-23 11:14:30','[]','[]','1','0','tinderbox darkening dangle expand rash marine caliber hardness foyer wound','resort','Vasily.Colbye'
'018aae66103f7df78a68aaaaa6668501','[email protected]','2016-10-13 00:52:01','[]','[]','1','0','circus saffron draw supreme audience stipulate acrobat context womankind impure','armless','Maddy.Childs'
'018aae6610407e4dbdba2d01093860b4','[email protected]','2001-07-03 16:29:45','[]','[]','1','0','omen skillful purposely drove pond uptown clumsily hardly womanlike obsession','backspin','Lucine.Kunz'
'018aae6610417848b92c5b8d19efe100','[email protected]','2000-04-14 23:49:16','[]','[]','1','0','passing puzzling until condiment getting spud coconut quiver pronounce casually','flick','Roderick.Linson'
'018aae6610427200b0e03f984f352ed4','[email protected]','2010-09-25 08:27:01','[]','[]','1','0','smartness banknote thursday kindly pesticide selection dude snub shrink drone','geek','Jorrie.Atkinson'
'018aae66104372ad91c0fbcd9aae46b9','[email protected]','2014-12-12 16:44:49','[]','["018aae050842771f9eaa5b9ab581cfe1","018aae4c6c967046a0548438d1b7b9dc","018aadaf5172779f84b5e508557dbd9b"]','0','0','approval squeamish cornstalk spectator perfectly residue crushed resubmit swipe student','capillary','Artur.Sikata'
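
The user_id values above are UUIDv7s rendered as 32 hex characters with the dashes stripped, which is how they fit a CHAR(32) column. The sketch below is only an illustration of the UUIDv7 layout (it is not the generator used to produce this sample): the leading 48 bits are a Unix timestamp in milliseconds, so hex-encoded IDs sort roughly by creation time, which is what keeps page splits rare compared to a fully random UUIDv4.

# Illustrative-only UUIDv7 generator (not the one used for the sample above).
# Layout: 48-bit ms timestamp | 4-bit version (7) | 12 random bits |
#         2-bit variant (0b10) | 62 random bits.
import os
import time


def uuid7_hex() -> str:
    ts_ms = int(time.time() * 1000)
    rand_a = int.from_bytes(os.urandom(2), "big") & 0x0FFF
    rand_b = int.from_bytes(os.urandom(8), "big") & 0x3FFFFFFFFFFFFFFF
    value = (
        (ts_ms & 0xFFFFFFFFFFFF) << 80  # timestamp in the top 48 bits
        | 0x7 << 76                     # version nibble
        | rand_a << 64
        | 0b10 << 62                    # variant bits
        | rand_b
    )
    return f"{value:032x}"  # 32 hex chars, no dashes, matching the CHAR(32) PK


# IDs created in a later millisecond always sort after earlier ones.
a, b = uuid7_hex(), uuid7_hex()
assert a < b or a[:12] == b[:12]  # ties only within the same millisecond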