Last active
October 13, 2024 19:30
-
-
Save sebastianknopf/0edc85873263b7f7faef47e25b32eb79 to your computer and use it in GitHub Desktop.
python helper script for memory efficient importing huge CSV files into a DuckDB database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import click | |
import csv | |
import duckdb | |
import polars | |
@click.command | |
@click.option('--file', '-f', help='Input CSV file') | |
@click.option('--db', '-d', help='DuckDB filename') | |
@click.option('--table', '-t', help='Destination table name to import the CSV file') | |
@click.option('--create-statement', '-c', default=None, help='Create statement filename for destination table') | |
@click.option('--batch', '-b', default=100000, help='Batch size to import into database') | |
@click.option('--verbose', '-v', default=False, is_flag=True, help='Print out verbose information messages') | |
def csv2ddb(file, db, table, create_statement, batch, verbose): | |
# open connection to destination DB | |
connection = duckdb.connect(db) | |
# run create statement if present | |
if create_statement is not None: | |
with open(create_statement, 'r') as stmt_file: | |
stmt = stmt_file.read() | |
connection.sql(stmt) | |
# open CSV file | |
with open(file, 'r') as csv_file: | |
csv_reader = csv.reader(csv_file) | |
headers = list() | |
records = list() | |
batch_count = 1 | |
for row in csv_reader: | |
if len(headers) == 0: | |
headers = row | |
else: | |
records.append(row) | |
# insert if batch size was reached | |
if len(records) >= batch: | |
if verbose: | |
print(f"inserting batch {batch_count} ...") | |
df = polars.DataFrame(records, schema=headers, orient='row') | |
connection.sql(f"INSERT INTO {table} SELECT * FROM df") | |
records = list() | |
batch_count = batch_count + 1 | |
# final insert for all remaining datasets | |
if verbose: | |
print(f"inserting final batch ...") | |
df = polars.DataFrame(records, schema=headers, orient='row') | |
connection.sql(f"INSERT INTO {table} SELECT * FROM df") | |
records = list() | |
# print out num total datasets in verbose mode | |
if verbose: | |
connection.sql(f"SELECT COUNT(*) FROM {table}").show() | |
# close connection | |
connection.close() | |
if __name__ == '__main__': | |
csv2ddb() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
click | |
duckdb | |
polars | |
pyarrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE shapes | |
( | |
shape_id TEXT NOT NULL, | |
shape_pt_lat FLOAT NOT NULL, | |
shape_pt_lon FLOAT NOT NULL, | |
shape_pt_sequence INTEGER NOT NULL, | |
shape_dist_traveled FLOAT NOT NULL | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment