Skip to content

Instantly share code, notes, and snippets.

@sebastianknopf
Last active October 13, 2024 19:30
Show Gist options
  • Save sebastianknopf/0edc85873263b7f7faef47e25b32eb79 to your computer and use it in GitHub Desktop.
Save sebastianknopf/0edc85873263b7f7faef47e25b32eb79 to your computer and use it in GitHub Desktop.
python helper script for memory efficient importing huge CSV files into a DuckDB database
import click
import csv
import duckdb
import polars
@click.command
@click.option('--file', '-f', help='Input CSV file')
@click.option('--db', '-d', help='DuckDB filename')
@click.option('--table', '-t', help='Destination table name to import the CSV file')
@click.option('--create-statement', '-c', default=None, help='Create statement filename for destination table')
@click.option('--batch', '-b', default=100000, help='Batch size to import into database')
@click.option('--verbose', '-v', default=False, is_flag=True, help='Print out verbose information messages')
def csv2ddb(file, db, table, create_statement, batch, verbose):
# open connection to destination DB
connection = duckdb.connect(db)
# run create statement if present
if create_statement is not None:
with open(create_statement, 'r') as stmt_file:
stmt = stmt_file.read()
connection.sql(stmt)
# open CSV file
with open(file, 'r') as csv_file:
csv_reader = csv.reader(csv_file)
headers = list()
records = list()
batch_count = 1
for row in csv_reader:
if len(headers) == 0:
headers = row
else:
records.append(row)
# insert if batch size was reached
if len(records) >= batch:
if verbose:
print(f"inserting batch {batch_count} ...")
df = polars.DataFrame(records, schema=headers, orient='row')
connection.sql(f"INSERT INTO {table} SELECT * FROM df")
records = list()
batch_count = batch_count + 1
# final insert for all remaining datasets
if verbose:
print(f"inserting final batch ...")
df = polars.DataFrame(records, schema=headers, orient='row')
connection.sql(f"INSERT INTO {table} SELECT * FROM df")
records = list()
# print out num total datasets in verbose mode
if verbose:
connection.sql(f"SELECT COUNT(*) FROM {table}").show()
# close connection
connection.close()
if __name__ == '__main__':
csv2ddb()
click
duckdb
polars
pyarrow
CREATE TABLE shapes
(
shape_id TEXT NOT NULL,
shape_pt_lat FLOAT NOT NULL,
shape_pt_lon FLOAT NOT NULL,
shape_pt_sequence INTEGER NOT NULL,
shape_dist_traveled FLOAT NOT NULL
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment