Skip to content

Instantly share code, notes, and snippets.

@josephernest
Last active January 1, 2021 13:51
Show Gist options
  • Save josephernest/2c02f7627b83a32fd2086fe9dde15215 to your computer and use it in GitHub Desktop.
Save josephernest/2c02f7627b83a32fd2086fe9dde15215 to your computer and use it in GitHub Desktop.
Quick benchmark of DuckDB / Sqlite3 (disclaimer: it doesn't necessarily showcase the power of both solutions, there might be better ways I'm not aware of)
"""
Quick benchmark of DuckDB / Sqlite3 (disclaimer: it doesn't necessarily showcase the power of both solutions, there might be better ways I'm not aware of)
https://gist.github.com/josephernest/2c02f7627b83a32fd2086fe9dde15215.js
https://github.com/cwida/duckdb/issues/1249
1M rows 10M rows 20M rows 50M rows
duckdb 13ms ? 130ms ? 285ms ? ?
sqlite 13ms 26 MB 113ms 260 MB 221ms 527 MB ?
"""
import time, random, string, sqlite3, duckdb, pandas, numpy
class elapsed:
def __enter__(self): self.start = time.time()
def __exit__(self, *args): print("%.1f ms" % ((time.time() - self.start)*1000))
#################### QUERIES
CREATE = """CREATE TABLE data(id INTEGER, dt INTEGER, shop INTEGER, product INTEGER, aa INTEGER, bb INTEGER, customer INTEGER, PRIMARY KEY(shop, product, customer, id))"""
SELECT = "SELECT product, COUNT(DISTINCT customer) count FROM data WHERE shop == 2 GROUP BY product"
N = 10*1000*1000
#################### DUCKDB
db = duckdb.connect(':memory:')
db.execute(CREATE)
db.execute(f"""INSERT INTO data SELECT i id,
round((random()*100000),0)::int + 1600000000 dt,
round((random()*26),0)::int shop,
round((random()*676),0)::int product,
round((random()*676),0)::int aa, 0 bb,
round((random()*676),0)::int customer FROM range(0,{N}) tbl(i);""")
db.commit()
with elapsed():
db.execute(SELECT).fetchnumpy()
# how to measure the DB size of a :memory: DuckDB?
#################### SQLITE
db = sqlite3.connect(':memory:')
db.execute(CREATE + ' WITHOUT ROWID') # https://sqlite.org/withoutrowid.html optimization
for i in range(N):
dt = random.randint(1600000000, 1600100000) # random timestamp
shop = random.randrange(26)
product = random.randrange(676)
aa = random.randrange(676)
bb = 0
customer = random.randrange(676)
db.execute("INSERT INTO data(id, dt, shop, product, aa, bb, customer) VALUES (?, ?, ?, ?, ?, ?, ?)", (i, dt, shop, product, aa, bb, customer))
db.commit()
with elapsed():
for _ in db.execute(SELECT):
pass
print('size (MB):', next(db.execute('PRAGMA page_count;'))[0] * next(db.execute('PRAGMA page_size;'))[0] / 1024**2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment