Skip to content

Instantly share code, notes, and snippets.

@kolibril13
Created November 29, 2024 15:20
Show Gist options
  • Save kolibril13/f4597c16452b4b72965c8d20fe6c0978 to your computer and use it in GitHub Desktop.
Save kolibril13/f4597c16452b4b72965c8d20fe6c0978 to your computer and use it in GitHub Desktop.
pandas vs. Polars for reading CSV files
# /// script
# requires-python = ">=3.12,<3.13"
# dependencies = [
# "pandas",
# "polars",
# ]
# ///
# note: the header of this script was generated by the command
# uv add --script benchmark.py pandas polars
import csv
import random
import time
import pandas as pd
import polars as pl
# Constants
# Number of random (x, y) rows written to the CSV, not counting the header row.
NUM_ROWS: int = 10_000_000
# Path of the CSV file that is generated once and then read back by each library.
OUTPUT_FILE: str = "random_coordinates.csv"
# Step 1: Generate the CSV file
# Writes an "x,y" header followed by NUM_ROWS rows of random floats to
# OUTPUT_FILE and stores the elapsed time in csv_generation_time.
print(f"Generating {NUM_ROWS} random coordinates and writing to {OUTPUT_FILE}...")
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments like time.time() can.
start_time = time.perf_counter()
# newline="" lets the csv module control line endings itself.
with open(OUTPUT_FILE, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["x", "y"])  # Header row
    # Feed all rows through a single writerows() call instead of issuing one
    # Python-level writerow() call per row; the generator keeps memory flat.
    writer.writerows(
        # x first, then y: each a random float between -1000 and 1000.
        (random.uniform(-1_000, 1_000), random.uniform(-1_000, 1_000))
        for _ in range(NUM_ROWS)
    )
end_time = time.perf_counter()
csv_generation_time = end_time - start_time
print(f"CSV file {OUTPUT_FILE} created successfully in {csv_generation_time:.2f} seconds.\n")
# Step 2: Benchmark pandas read_csv
# Reads the generated CSV with pandas and stores the elapsed time in
# pandas_time; the loaded frame is kept in `data`.
print("Benchmarking pandas read_csv...")
# perf_counter() is the clock intended for measuring short intervals: it is
# monotonic and unaffected by wall-clock adjustments, unlike time.time().
start_time = time.perf_counter()
data = pd.read_csv(OUTPUT_FILE)
end_time = time.perf_counter()
pandas_time = end_time - start_time
print(f"Pandas read_csv completed in {pandas_time:.2f} seconds.\n")
# Step 3: Benchmark Polars read_csv
# Reads the same generated CSV with Polars and stores the elapsed time in
# polars_time; the loaded frame is kept in `data_polars`.
print("Benchmarking Polars read_csv...")
# perf_counter() is the clock intended for measuring short intervals: it is
# monotonic and unaffected by wall-clock adjustments, unlike time.time().
start_time = time.perf_counter()
data_polars = pl.read_csv(OUTPUT_FILE)
end_time = time.perf_counter()
polars_time = end_time - start_time
print(f"Polars read_csv completed in {polars_time:.2f} seconds.\n")
# Final summary
# Emit the framed report in one print call; sep="\n" puts each entry on its
# own line, and the three timings are shown in seconds with two decimals.
print(
    "----- Benchmark Summary -----",
    f"CSV generation time: {csv_generation_time:.2f} seconds",
    f"Pandas read_csv time: {pandas_time:.2f} seconds",
    f"Polars read_csv time: {polars_time:.2f} seconds",
    "------------------------------",
    sep="\n",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment