Skip to content

Instantly share code, notes, and snippets.

@Mytherin
Created December 28, 2020 18:52
Show Gist options
  • Save Mytherin/77d54f2242453d7612f953644c5d18a8 to your computer and use it in GitHub Desktop.
Save Mytherin/77d54f2242453d7612f953644c5d18a8 to your computer and use it in GitHub Desktop.
import duckdb
import sqlite3
import time, random, string
base_tuples = 200*1000
repeat_insert = 8
def run_benchmark_strings(db, dbname, print_tuple_count, count_distinct=True):
db.execute("""CREATE TABLE data(id INTEGER, dt INTEGER, shop TEXT, product TEXT, aa TEXT, bb INTEGER,
customer TEXT);""")
db.execute("begin transaction")
for i in range(base_tuples):
dt = random.randint(1600000000, 1600100000) # random datetime (unix timestamp)
shop = ''.join(random.choices(string.ascii_uppercase, k=1)) # 26 possibilities
product = ''.join(random.choices(string.ascii_uppercase, k=2)) # 676 possibilities
aa = ''.join(random.choices(string.ascii_uppercase, k=2)) # 676 possibilities
customer = ''.join(random.choices(string.ascii_uppercase, k=2)) # 676 possibilities
db.execute("INSERT INTO data(id, dt, shop, product, aa, bb, customer) VALUES (?, ?, ?, ?, ?, ?, ?)",
[i, dt, shop, product, aa, 0, customer])
for i in range(repeat_insert):
db.execute('INSERT INTO data SELECT * FROM data')
db.commit()
if print_tuple_count:
tuple_count = db.execute('SELECT COUNT(*) FROM data').fetchall()[0][0]
print("Total tuples: %d" % (tuple_count,))
start = time.time()
if not count_distinct:
db.execute("SELECT product, COUNT(customer) count FROM data WHERE shop == 'A' GROUP BY product").fetchall()
else:
db.execute("SELECT product, COUNT(DISTINCT customer) count FROM data WHERE shop == 'A' GROUP BY product").fetchall()
print("%s: %.1f ms" % (dbname, (time.time() - start)*1000))
def run_benchmark_integers(db, dbname, print_tuple_count, count_distinct=True):
db.execute("""CREATE TABLE data(id INTEGER, dt INTEGER, shop INTEGER, product INTEGER, aa INTEGER, bb INTEGER,
customer INTEGER);""")
db.execute("begin transaction")
for i in range(base_tuples):
dt = random.randint(1600000000, 1600100000) # random datetime (unix timestamp)
shop = random.randint(0, 26) # 26 possibilities
product = random.randint(0, 676) # 676 possibilities
aa = random.randint(0, 676) # 676 possibilities
customer = random.randint(0, 676) # 676 possibilities
db.execute("INSERT INTO data(id, dt, shop, product, aa, bb, customer) VALUES (?, ?, ?, ?, ?, ?, ?)",
[i, dt, shop, product, aa, 0, customer])
for i in range(repeat_insert):
db.execute('INSERT INTO data SELECT * FROM data')
db.commit()
if print_tuple_count:
tuple_count = db.execute('SELECT COUNT(*) FROM data').fetchall()[0][0]
print("Total tuples: %d" % (tuple_count,))
start = time.time()
if not count_distinct:
db.execute("SELECT product, COUNT(customer) count FROM data WHERE shop == 1 GROUP BY product").fetchall()
else:
db.execute("SELECT product, COUNT(DISTINCT customer) count FROM data WHERE shop == 1 GROUP BY product").fetchall()
print("%s: %.1f ms" % (dbname, (time.time() - start)*1000))
duck_db = duckdb.connect(':memory:')
sqlite_db = sqlite3.connect(':memory:')
print("COUNT(DISTINCT) - VARCHAR")
run_benchmark_strings(duck_db, 'DuckDB', True)
run_benchmark_strings(sqlite_db, 'SQLite', False)
duck_db = duckdb.connect(':memory:')
sqlite_db = sqlite3.connect(':memory:')
print("COUNT(DISTINCT) - INTEGER")
run_benchmark_integers(duck_db, 'DuckDB', True)
run_benchmark_integers(sqlite_db, 'SQLite', False)
print("COUNT - VARCHAR")
duck_db = duckdb.connect(':memory:')
sqlite_db = sqlite3.connect(':memory:')
run_benchmark_strings(duck_db, 'DuckDB', True, False)
run_benchmark_strings(sqlite_db, 'SQLite', False, False)
duck_db = duckdb.connect(':memory:')
sqlite_db = sqlite3.connect(':memory:')
print("COUNT - INTEGER")
run_benchmark_integers(duck_db, 'DuckDB', True, False)
run_benchmark_integers(sqlite_db, 'SQLite', False, False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment