Skip to content

Instantly share code, notes, and snippets.

@pdet
Created December 3, 2024 14:48
Show Gist options
  • Save pdet/7459d5b6ad7430dfc86eb5dfe0b643e0 to your computer and use it in GitHub Desktop.
Save pdet/7459d5b6ad7430dfc86eb5dfe0b643e0 to your computer and use it in GitHub Desktop.
import duckdb
import time
def gen_tpch():
    """Generate TPC-H data and export ``lineitem`` in three file formats.

    Runs ``CALL dbgen(sf=20)`` on a new DuckDB connection and then copies
    the resulting ``lineitem`` table to three relative paths:
    ``lineitem.csv``, ``lineitem.parquet`` and ``lineitem_zstd.parquet``
    (the last one with the zstd codec at compression level 1).
    """
    # Close the connection deterministically once the exports are done
    # instead of relying on garbage collection to release the generated data.
    with duckdb.connect() as con:
        con.execute("CALL dbgen(sf=20);")
        con.execute("COPY lineitem to 'lineitem.csv'")
        con.execute("COPY lineitem to 'lineitem.parquet'")
        con.execute("COPY lineitem TO 'lineitem_zstd.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 1);")
# DDL for the empty ``lineitem`` target table; load_data() executes it on a
# fresh connection before every timed COPY. Note that the numeric columns
# (quantity, price, discount, tax) are declared DOUBLE PRECISION here.
schema = '''
CREATE TABLE lineitem
(
l_orderkey BIGINT not null,
l_partkey BIGINT not null,
l_suppkey BIGINT not null,
l_linenumber BIGINT not null,
l_quantity DOUBLE PRECISION not null,
l_extendedprice DOUBLE PRECISION not null,
l_discount DOUBLE PRECISION not null,
l_tax DOUBLE PRECISION not null,
l_returnflag CHAR(1) not null,
l_linestatus CHAR(1) not null,
l_shipdate DATE not null,
l_commitdate DATE not null,
l_receiptdate DATE not null,
l_shipinstruct CHAR(25) not null,
l_shipmode CHAR(10) not null,
l_comment VARCHAR(44) not null
);
'''
def load_data(file):
    """Time five loads of ``file`` into a freshly created ``lineitem`` table.

    Every run opens a new DuckDB connection, applies two settings, creates
    the table from the module-level ``schema`` DDL and then measures only
    the ``COPY lineitem FROM`` statement. One line per run is printed in
    the form ``<file> Time: <seconds>``.

    Args:
        file: Path of the data file to load (the script passes CSV and
            Parquet files).
    """
    # Double any single quote so the path remains a single SQL string literal.
    quoted_path = file.replace("'", "''")
    for _ in range(5):
        # A new connection per run keeps the runs independent; the context
        # manager closes it before the next run starts.
        with duckdb.connect() as con:
            # NOTE(review): presumably this forbids spilling to the temp
            # directory so the load must fit in memory -- confirm in DuckDB docs.
            con.execute("SET max_temp_directory_size = '0GB'")
            con.execute('SET preserve_insertion_order = false;')
            con.execute(schema)
            # perf_counter is monotonic, so the interval cannot be skewed by
            # wall-clock adjustments during a long load.
            start_time = time.perf_counter()
            con.execute(f"COPY lineitem FROM '{quoted_path}'")
            elapsed = time.perf_counter() - start_time
        print(file + " Time: " + str(elapsed))
# Produce the input files once, then benchmark loading each format in turn.
gen_tpch()
for _dataset in ('lineitem.csv', 'lineitem.parquet', 'lineitem_zstd.parquet'):
    load_data(_dataset)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment