Created
November 29, 2024 15:20
-
-
Save kolibril13/f4597c16452b4b72965c8d20fe6c0978 to your computer and use it in GitHub Desktop.
pandas vs polars for reading csv files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# requires-python = ">=3.12,<3.13"
# dependencies = [
#     "pandas",
#     "polars",
# ]
# ///
# Note: the header of this script was generated by the command
#   uv add --script benchmark.py pandas polars
import csv | |
import random | |
import time | |
import pandas as pd | |
import polars as pl | |
# Benchmark configuration: how many rows the synthetic data set has, and the
# path of the CSV file that is written once and then read back.
NUM_ROWS = 10 ** 7
OUTPUT_FILE = "random_coordinates.csv"
# Step 1: Generate the CSV file
# Writes an "x,y" header followed by NUM_ROWS rows of two random floats to
# OUTPUT_FILE and stores the elapsed time (seconds) in csv_generation_time.
print(f"Generating {NUM_ROWS} random coordinates and writing to {OUTPUT_FILE}...")
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
# newline="" lets the csv module control line endings itself; the encoding is
# spelled out so the file does not depend on the platform's locale default.
with open(OUTPUT_FILE, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["x", "y"])  # Header row
    # One writerows() call over a generator avoids a Python-level writerow()
    # call per row. Each row draws x first, then y, both uniformly from
    # [-1000, 1000].
    writer.writerows(
        (random.uniform(-1_000, 1_000), random.uniform(-1_000, 1_000))
        for _ in range(NUM_ROWS)
    )
end_time = time.perf_counter()
csv_generation_time = end_time - start_time
print(f"CSV file {OUTPUT_FILE} created successfully in {csv_generation_time:.2f} seconds.\n")
# Step 2: Benchmark pandas read_csv
# Reads OUTPUT_FILE with pandas using default options and stores the elapsed
# time (seconds) in pandas_time.
print("Benchmarking pandas read_csv...")
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
data = pd.read_csv(OUTPUT_FILE)
end_time = time.perf_counter()
pandas_time = end_time - start_time
print(f"Pandas read_csv completed in {pandas_time:.2f} seconds.\n")
# Step 3: Benchmark Polars read_csv
# Reads OUTPUT_FILE with Polars using default options and stores the elapsed
# time (seconds) in polars_time.
print("Benchmarking Polars read_csv...")
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
data_polars = pl.read_csv(OUTPUT_FILE)
end_time = time.perf_counter()
polars_time = end_time - start_time
print(f"Polars read_csv completed in {polars_time:.2f} seconds.\n")
# Final summary
# Report the three timings gathered above, one line each, framed by rules.
print("----- Benchmark Summary -----")
for label, seconds in (
    ("CSV generation time", csv_generation_time),
    ("Pandas read_csv time", pandas_time),
    ("Polars read_csv time", polars_time),
):
    print(f"{label}: {seconds:.2f} seconds")
print("------------------------------")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment