Skip to content

Instantly share code, notes, and snippets.

@guidocaru
Created January 15, 2025 05:51
Show Gist options
  • Save guidocaru/b553470004dd3e70bdfa1ae6bb02fc6e to your computer and use it in GitHub Desktop.
Save guidocaru/b553470004dd3e70bdfa1ae6bb02fc6e to your computer and use it in GitHub Desktop.
Methods to grow a DataFrame
from methods import list_of_dicts, concat, loc_without_alloc, loc_with_alloc
import time
# Number of rows each strategy is asked to build.
num_rows = 1000

# Each entry pairs the label shown in the report with the function to time.
benchmarks = [
    ("List of dicts", list_of_dicts),
    ("Concat", concat),
    (".loc without alloc", loc_without_alloc),
    (".loc with alloc", loc_with_alloc),
]

for label, grow_dataframe in benchmarks:
    start_time = time.perf_counter()
    grow_dataframe(num_rows)
    end_time = time.perf_counter()
    # The f-prefix is required: without it the braces are printed literally
    # instead of being replaced by the elapsed time.
    print(f"- {label}: {end_time - start_time:.3f} seconds")
from methods import list_of_dicts, concat, loc_without_alloc, loc_with_alloc
import perfplot
# Benchmark every DataFrame-growing strategy over a doubling range of row
# counts and display the resulting plot.
kernels = [list_of_dicts, concat, loc_without_alloc, loc_with_alloc]

out = perfplot.bench(
    kernels=kernels,
    # Label each curve with the name of the function it measures.
    labels=[kernel.__name__ for kernel in kernels],
    # The kernels take the row count itself, so setup hands n through as-is.
    setup=lambda n: n,
    n_range=[1000, 2000, 4000, 8000, 16000],
    # NOTE(review): None presumably turns off perfplot's comparison of kernel
    # outputs - confirm against the perfplot documentation.
    equality_check=None,
    title="Methods to grow a DataFrame",
    xlabel="Number of rows",
)

out.show()
import pandas as pd
def function_that_generates_data(n):
    """Yield the same three-field sample row n times.

    Each yielded row is the tuple (22, 12.3, "abc"). Nothing is yielded when
    range(n) is empty.
    """
    sample_row = (22, 12.3, "abc")
    yield from (sample_row for _ in range(n))
def list_of_dicts(n):
    """Collect n generated rows in a plain list, then build the DataFrame once.

    Despite the name, each collected row is the tuple yielded by
    function_that_generates_data, not a dict.

    Returns a DataFrame with columns "column1", "column2" and "column3".
    """
    column_names = ["column1", "column2", "column3"]
    rows = []
    generated = function_that_generates_data(n)
    for record in generated:
        rows.append(record)
    frame = pd.DataFrame(rows, columns=column_names)
    return frame
def concat(n):
    """Grow a DataFrame by concatenating one single-row frame per generated row.

    Starts from an empty DataFrame and rebinds the result of pd.concat on
    every iteration; ignore_index=True renumbers the rows each time.

    Returns the accumulated DataFrame (the initial empty frame if no rows
    are generated).
    """
    column_names = ["column1", "column2", "column3"]
    result = pd.DataFrame()
    for record in function_that_generates_data(n):
        single_row = pd.DataFrame([record], columns=column_names)
        result = pd.concat([result, single_row], ignore_index=True)
    return result
def loc_without_alloc(n):
    """Grow a DataFrame one row at a time through .loc, without preallocation.

    Starts from a frame that has the three columns but no rows, and assigns
    each generated row to the next integer label.

    Returns a DataFrame with columns "column1", "column2" and "column3".
    """
    df = pd.DataFrame(columns=["column1", "column2", "column3"])
    for row in function_that_generates_data(n):
        # Use len(df) as the next label rather than df.index.max() + 1:
        # max() of the initially empty index is NaN, so NaN + 1 would label
        # the first row NaN instead of 0. The frame only ever grows here, so
        # len(df) is always the first unused integer label.
        df.loc[len(df)] = row
    return df
def loc_with_alloc(n):
    """Fill a preallocated DataFrame row by row through .loc.

    The frame is created up front with the three columns and index
    range(n), then each generated row is written to its integer label.

    Returns the filled DataFrame.
    """
    column_names = ["column1", "column2", "column3"]
    frame = pd.DataFrame(columns=column_names, index=range(n))
    # Pair each label 0..n-1 with the row generated for it.
    for position, record in zip(range(n), function_that_generates_data(n)):
        frame.loc[position] = record
    return frame
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment