Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save senhalil/6371ccab298aeae0fdb77ed150f0726d to your computer and use it in GitHub Desktop.
Save senhalil/6371ccab298aeae0fdb77ed150f0726d to your computer and use it in GitHub Desktop.
pydantic vs pandera performance while working with dataframes
import time
import pandas as pd
import pandera as pa
from pandera import Column, DataFrameSchema
from pandera.engines.pandas_engine import PydanticModel
from pydantic import BaseModel, Field, ValidationError
# Generate a synthetic DataFrame
def generate_data(n: int) -> pd.DataFrame:
    """Build a synthetic DataFrame with ``n`` rows.

    Columns, all derived from a 1-based row number ``i``:
      * ``id``     -- ``i``
      * ``name``   -- ``"name_<i>"``
      * ``age``    -- ``i % 100``
      * ``salary`` -- ``i * 1000.0``
    """
    row_numbers = list(range(1, n + 1))
    names = [f"name_{k}" for k in row_numbers]
    ages = [k % 100 for k in row_numbers]
    salaries = [k * 1000.0 for k in row_numbers]
    return pd.DataFrame(
        {
            "id": row_numbers,
            "name": names,
            "age": ages,
            "salary": salaries,
        }
    )
# Pydantic model for validating a row
class RowModel(BaseModel):
    """Pydantic model describing one row of the benchmark DataFrame."""

    # No default is supplied to Field(), so every field remains required.
    id: int = Field(ge=1)
    name: str
    age: int = Field(ge=0, le=100)
    salary: float = Field(ge=0.0)
# Pandera schema for validating the entire DataFrame
schema = DataFrameSchema(
    columns={
        # Identifier must be at least 1.
        "id": Column(int, checks=[pa.Check.ge(1)]),
        "name": Column(str),
        # Age is bounded to the closed interval [0, 100].
        "age": Column(int, checks=[pa.Check.ge(0), pa.Check.le(100)]),
        # Salary must be non-negative.
        "salary": Column(float, checks=[pa.Check.ge(0.0)]),
    },
)
# https://pandera.readthedocs.io/en/stable/pydantic_integration.html#using-pydantic-models-in-pandera-schemas
class PydanticSchema(pa.DataFrameModel):
    """Pandera DataFrame model whose dataframe-level dtype is ``RowModel``."""

    class Config:
        """Schema-wide options: dataframe-level dtype plus coercion."""

        dtype = PydanticModel(RowModel)
        # Coercion must be enabled; otherwise a SchemaInitError is raised.
        coerce = True
# Validate rows using Pydantic
def validate_with_pydantic(df: pd.DataFrame) -> bool:
    """Validate ``df`` row by row by instantiating ``RowModel`` per record.

    Every failing row is reported on stdout; validation continues through
    the remaining rows rather than stopping at the first failure.

    Returns:
        True when no row raised ``ValidationError``, otherwise False.
    """
    failure_count = 0
    records = df.to_dict(orient="records")
    for record in records:
        try:
            RowModel(**record)
        except ValidationError as exc:
            failure_count += 1
            print(f"Pydantic Validation Error: {exc}")
    return failure_count == 0
# Validate entire DataFrame using Pandera
def validate_with_pandera(df: pd.DataFrame) -> bool:
    """Validate the whole of ``df`` against the module-level ``schema``.

    Returns:
        True when ``schema.validate`` completes without raising; False when
        it raises ``pa.errors.SchemaError`` (the error is printed first).
    """
    try:
        schema.validate(df)
    except pa.errors.SchemaError as exc:
        print(f"Pandera Validation Error: {exc}")
        return False
    else:
        return True
# Validate entire DataFrame using PanderaPydanticSchema
def validate_with_pandera_pydantic_schema(df: pd.DataFrame) -> bool:
    """Validate the whole of ``df`` through ``PydanticSchema``.

    Returns:
        True when ``PydanticSchema.validate`` completes without raising;
        False when it raises ``pa.errors.SchemaError`` (the error is
        printed first).
    """
    passed = True
    try:
        PydanticSchema.validate(df)
    except pa.errors.SchemaError as exc:
        print(f"PanderaPydanticSchema Validation Error: {exc}")
        passed = False
    return passed
# Benchmark function
def benchmark(n: int) -> None:
    """Time each validation strategy on a synthetic DataFrame of ``n`` rows.

    The same frame is validated three times, in this order: row-by-row
    pydantic, the native pandera schema, and the pandera schema backed by
    the pydantic model. Once all three have run, one summary line per
    strategy is printed with its pass/fail result and elapsed seconds.

    Args:
        n: Number of rows to generate and validate.
    """
    df = generate_data(n)

    strategies = (
        ("Pydantic", validate_with_pydantic),
        ("Pandera", validate_with_pandera),
        ("Pandera Pydantic Schema", validate_with_pandera_pydantic_schema),
    )

    # Run everything first and report afterwards, so the summary lines stay
    # together even if a validator prints error messages while running.
    results = []
    for label, validate in strategies:
        # perf_counter() is monotonic and high resolution; time.time() is a
        # wall clock that can jump (e.g. NTP adjustments) and skew intervals.
        started = time.perf_counter()
        passed = validate(df)
        elapsed = time.perf_counter() - started
        results.append((label, passed, elapsed))

    for label, passed, elapsed in results:
        print(f"{label} validation passed: {passed}, Time taken: {elapsed:.6f} seconds")
# Run the benchmark with 1M rows
if __name__ == "__main__":
    # Script entry point: benchmark on one million rows.
    benchmark(n=1_000_000)
@senhalil
Copy link
Author

senhalil commented Jul 3, 2025

Pydantic validation passed: True, Time taken: 1.489471 seconds
Pandera validation passed: True, Time taken: 0.139608 seconds
Pandera Pydantic Schema validation passed: True, Time taken: 3.208070 seconds

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment