-
-
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
| """A quick benchmark comparing the performance of: | |
| - msgspec: https://github.com/jcrist/msgspec | |
| - pydantic V1: https://docs.pydantic.dev/1.10/ | |
| - pydantic V2: https://docs.pydantic.dev/dev-v2/ | |
| The benchmark is modified from the one in the msgspec repo here: | |
| https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py | |
| I make no claims that it's illustrative of all use cases. I wrote this up | |
| mostly to get an understanding of how msgspec's performance compares with that | |
| of pydantic V2. | |
| """ | |
| from __future__ import annotations | |
| import datetime | |
| import random | |
| import string | |
| import timeit | |
| import uuid | |
| from typing import List, Literal, Union, Annotated | |
| import msgspec | |
| import pydantic | |
| import pydantic.v1 | |
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Args:
        capacity: Total number of nodes to generate, counting the root
            directory itself.

    Returns:
        A nested ``dict`` for the root directory. Every node carries
        ``type``, ``name``, ``created_by``, ``created_at`` and
        ``updated_at``; directories add ``contents`` (a list of child
        nodes) and files add ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    # The UUID pool is random on every run; everything else is drawn from the
    # seeded generator below.
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(low, high):
        """Return a random UTC datetime between ``low`` and ``high``."""
        # ``timestamp()`` returns a float but ``randint`` needs integers;
        # float bounds raise TypeError on Python 3.12+.
        ts = rand.randint(int(low.timestamp()), int(high.timestamp()))
        # Convert straight to UTC. Going through naive local time and then
        # relabelling it as UTC shifts the value by the local UTC offset,
        # which can push ``created_at`` past DATE_2023 and make the following
        # ``randint`` call fail on an empty range.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(low=None, high=None):
        """Return random ASCII letters with a length drawn from [low, high]."""
        if high is not None:
            low = rand.randint(low, high)
        return "".join(rand.choices(string.ascii_letters, k=low))

    def make_node(is_dir):
        """Build one file or directory node, recursing into directories."""
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget before recursing so
            # nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory counts towards the budget
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time one library's JSON ``dumps`` and ``loads`` callables.

    Args:
        raw_data: Plain data handed to ``convert``.
        dumps: Callable turning the converted message into JSON.
        loads: Callable turning that JSON back into a message.
        convert: Callable turning ``raw_data`` into the library's message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple, each being the total time
        reported by ``timeit`` divided by the number of loops it ran.

    Raises:
        AssertionError: If ``loads(dumps(msg))`` does not equal ``msg``.
    """
    typed = convert(raw_data)
    encoded = dumps(typed)
    # Round-trip sanity check; the decoded copy is dropped again before any
    # timing starts.
    assert typed == loads(encoded)

    def per_call(func, payload):
        clock = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": payload}
        )
        loops, elapsed = clock.autorange()
        return elapsed / loops

    dumps_time = per_call(dumps, typed)
    loads_time = per_call(loads, encoded)
    return dumps_time, loads_time
| ############################################################################# | |
| # msgspec # | |
| ############################################################################# | |
| class File(msgspec.Struct, tag="file"): | |
| name: Annotated[str, msgspec.Meta(min_length=1)] | |
| created_by: uuid.UUID | |
| created_at: datetime.datetime | |
| updated_at: datetime.datetime | |
| nbytes: Annotated[int, msgspec.Meta(ge=0)] | |
| class Directory(msgspec.Struct, tag="directory"): | |
| name: Annotated[str, msgspec.Meta(min_length=1)] | |
| created_by: uuid.UUID | |
| created_at: datetime.datetime | |
| updated_at: datetime.datetime | |
| contents: List[Union[File, Directory]] | |
| def bench_msgspec(data): | |
| enc = msgspec.json.Encoder() | |
| dec = msgspec.json.Decoder(Directory) | |
| def convert(data): | |
| return msgspec.convert(data, Directory) | |
| return bench(data, enc.encode, dec.decode, convert) | |
| ############################################################################# | |
| # pydantic V2 # | |
| ############################################################################# | |
| class FileModel(pydantic.BaseModel): | |
| type: Literal["file"] = "file" | |
| name: str = pydantic.Field(min_length=1) | |
| created_by: uuid.UUID | |
| created_at: datetime.datetime | |
| updated_at: datetime.datetime | |
| nbytes: pydantic.NonNegativeInt | |
| class DirectoryModel(pydantic.BaseModel): | |
| type: Literal["directory"] = "directory" | |
| name: str = pydantic.Field(min_length=1) | |
| created_by: uuid.UUID | |
| created_at: datetime.datetime | |
| updated_at: datetime.datetime | |
| contents: List[Union[DirectoryModel, FileModel]] | |
| def bench_pydantic_v2(data): | |
| return bench( | |
| data, | |
| lambda p: p.model_dump_json(), | |
| DirectoryModel.model_validate_json, | |
| lambda data: DirectoryModel(**data), | |
| ) | |
| ############################################################################# | |
| # pydantic V1 # | |
| ############################################################################# | |
| class FileModelV1(pydantic.v1.BaseModel): | |
| type: Literal["file"] = "file" | |
| name: str = pydantic.v1.Field(min_length=1) | |
| created_by: uuid.UUID | |
| created_at: datetime.datetime | |
| updated_at: datetime.datetime | |
| nbytes: pydantic.v1.NonNegativeInt | |
| class DirectoryModelV1(pydantic.v1.BaseModel): | |
| type: Literal["directory"] = "directory" | |
| name: str = pydantic.v1.Field(min_length=1) | |
| created_by: uuid.UUID | |
| created_at: datetime.datetime | |
| updated_at: datetime.datetime | |
| contents: List[Union[DirectoryModelV1, FileModelV1]] | |
| def bench_pydantic_v1(data): | |
| return bench( | |
| data, | |
| lambda p: p.json(), | |
| DirectoryModelV1.parse_raw, | |
| lambda data: DirectoryModelV1(**data), | |
| ) | |
| if __name__ == "__main__": | |
| N = 1000 | |
| data = make_filesystem_data(N) | |
| ms_dumps, ms_loads = bench_msgspec(data) | |
| ms_total = ms_dumps + ms_loads | |
| title = f"msgspec {msgspec.__version__}" | |
| print(title) | |
| print("-" * len(title)) | |
| print(f"dumps: {ms_dumps * 1e6:.1f} us") | |
| print(f"loads: {ms_loads * 1e6:.1f} us") | |
| print(f"total: {ms_total * 1e6:.1f} us") | |
| for title, func in [ | |
| (f"pydantic {pydantic.__version__}", bench_pydantic_v2), | |
| (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1) | |
| ]: | |
| print() | |
| print(title) | |
| print("-" * len(title)) | |
| dumps, loads = func(data) | |
| total = dumps + loads | |
| print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)") | |
| print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)") | |
| print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)") |
Definitely worth either removing pydantic v1, or using the actual package - the pydantic.v1 code is not compiled with Cython, unlike installing pydantic==1.10.9.
Thanks for the feedback Samuel! That's fair, although using the compiled version for V1 seems to have a minimal improvement on this benchmark:
msgspec vs pydantic V1 benchmark (using cython compiled pydantic V1 package)
from __future__ import annotations
import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Args:
        capacity: Total number of nodes to generate, counting the root
            directory itself.

    Returns:
        A nested ``dict`` for the root directory. Every node carries
        ``type``, ``name``, ``created_by``, ``created_at`` and
        ``updated_at``; directories add ``contents`` (a list of child
        nodes) and files add ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    # The UUID pool is random on every run; everything else is drawn from the
    # seeded generator below.
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(low, high):
        """Return a random UTC datetime between ``low`` and ``high``."""
        # ``timestamp()`` returns a float but ``randint`` needs integers;
        # float bounds raise TypeError on Python 3.12+.
        ts = rand.randint(int(low.timestamp()), int(high.timestamp()))
        # Convert straight to UTC. Going through naive local time and then
        # relabelling it as UTC shifts the value by the local UTC offset,
        # which can push ``created_at`` past DATE_2023 and make the following
        # ``randint`` call fail on an empty range.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(low=None, high=None):
        """Return random ASCII letters with a length drawn from [low, high]."""
        if high is not None:
            low = rand.randint(low, high)
        return "".join(rand.choices(string.ascii_letters, k=low))

    def make_node(is_dir):
        """Build one file or directory node, recursing into directories."""
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget before recursing so
            # nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory counts towards the budget
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time one library's JSON ``dumps`` and ``loads`` callables.

    Args:
        raw_data: Plain data handed to ``convert``.
        dumps: Callable turning the converted message into JSON.
        loads: Callable turning that JSON back into a message.
        convert: Callable turning ``raw_data`` into the library's message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple, each being the total time
        reported by ``timeit`` divided by the number of loops it ran.

    Raises:
        AssertionError: If ``loads(dumps(msg))`` does not equal ``msg``.
    """
    typed = convert(raw_data)
    encoded = dumps(typed)
    # Round-trip sanity check; the decoded copy is dropped again before any
    # timing starts.
    assert typed == loads(encoded)

    def per_call(func, payload):
        clock = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": payload}
        )
        loops, elapsed = clock.autorange()
        return elapsed / loops

    dumps_time = per_call(dumps, typed)
    loads_time = per_call(loads, encoded)
    return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
name: Annotated[str, msgspec.Meta(min_length=1)]
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
name: Annotated[str, msgspec.Meta(min_length=1)]
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union[File, Directory]]
def bench_msgspec(data):
enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(Directory)
def convert(data):
return msgspec.convert(data, Directory)
return bench(data, enc.encode, dec.decode, convert)
#############################################################################
# pydantic V1 #
#############################################################################
class FileModel(pydantic.BaseModel):
type: Literal["file"] = "file"
name: str = pydantic.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
type: Literal["directory"] = "directory"
name: str = pydantic.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union[DirectoryModel, FileModel]]
def bench_pydantic_v1(data):
return bench(
data,
lambda p: p.json(),
DirectoryModel.parse_raw,
lambda data: DirectoryModel(**data),
)
if __name__ == "__main__":
N = 1000
data = make_filesystem_data(N)
ms_dumps, ms_loads = bench_msgspec(data)
ms_total = ms_dumps + ms_loads
title = f"msgspec {msgspec.__version__}"
print(title)
print("-" * len(title))
print(f"dumps: {ms_dumps * 1e6:.1f} us")
print(f"loads: {ms_loads * 1e6:.1f} us")
print(f"total: {ms_total * 1e6:.1f} us")
for title, func in [
(f"pydantic {pydantic.__version__}", bench_pydantic_v1)
]:
print()
print(title)
print("-" * len(title))
dumps, loads = func(data)
total = dumps + loads
print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
Output:
msgspec 0.16.0
--------------
dumps: 178.6 us
loads: 497.8 us
total: 676.3 us
pydantic 1.10.9
---------------
dumps: 18206.4 us (102.0x slower)
loads: 55122.1 us (110.7x slower)
total: 73328.5 us (108.4x slower)
Either way, the main point of the benchmark in this gist was to compare pydantic V2 and msgspec, happy to remove v1 if it's a distraction.
Up to you, just making the observation really.
quick update on the numbers as pydantic v2 became stable:
msgspec 0.16.0
--------------
dumps: 179.3 us
loads: 477.0 us
total: 656.3 us
pydantic 2.0.1
--------------
dumps: 4292.0 us (23.9x slower)
loads: 6666.6 us (14.0x slower)
total: 10958.6 us (16.7x slower)
pydantic 1.10.11
----------------
dumps: 24176.3 us (134.8x slower)
loads: 73471.1 us (154.0x slower)
total: 97647.4 us (148.8x slower)
Fix python 3.12
https://gist.github.com/jcrist/d62f450594164d284fbea957fd48b743#file-bench-py-L38
should be
ts = rand.randint(int(min.timestamp()), int(max.timestamp()))
BTW @samuelcolvin You said that
Although msgspec and pydantic have different aims and features
What are the different aims if I may ask?
Leaving here my benchmark results
import json
import timeit
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Iterator, TypedDict
import mimesis
import msgspec
import pydantic
from pydantic.type_adapter import TypeAdapter
provider = mimesis.Generic()
def create_user() -> dict:
    """Build one fake user record from the module-level ``provider``.

    Returns:
        A dict of user fields. ``friend`` is either another record built the
        same way or ``None``, chosen by ``provider.development.boolean()``.
    """
    user = {
        "id": provider.person.identifier(),
        "username": provider.person.username(),
        "password": provider.person.password(),
        "email": provider.person.email(),
        "blog": provider.internet.url(),
        "first_name": provider.person.name(),
        "last_name": provider.person.last_name(),
        "is_active": provider.development.boolean(),
        "is_staff": provider.development.boolean(),
        "is_superuser": provider.development.boolean(),
        "date_joined": provider.person.birthdate(),
        "last_login": provider.person.birthdate(),
    }
    # The coin is flipped only after every other field has been generated,
    # and the recursion happens only when it comes up true.
    user["friend"] = create_user() if provider.development.boolean() else None
    return user
data = [create_user() for _ in range(100000)]
data_raw = msgspec.json.encode(data)
class MsgSpecUser(msgspec.Struct):
id: str
username: str
password: str
email: str
blog: str
first_name: str
last_name: str
is_active: bool
is_staff: bool
is_superuser: bool
date_joined: str
last_login: str
friend: "MsgSpecUser | None"
class PydanticUser(pydantic.BaseModel):
id: str
username: str
password: str
email: str
blog: str
first_name: str
last_name: str
is_active: bool
is_staff: bool
is_superuser: bool
date_joined: str
last_login: str
friend: "PydanticUser | None"
@dataclass
class TimeitResult:
task: str
seconds: float | None = None
@contextmanager
def time_it(task: str) -> Iterator[TimeitResult]:
    """Time the body of a ``with`` block.

    Yields a ``TimeitResult`` for ``task``. Once the block finishes, the
    elapsed time is printed and then stored on the result's ``seconds``.
    """
    started = timeit.default_timer()
    result = TimeitResult(task=task)
    yield result
    elapsed = timeit.default_timer() - started
    print(f"{task} took {elapsed:1f} seconds")
    result.seconds = elapsed
def match_precentage(pydantic: float, msgspec: float) -> str:
    """Describe which library was faster and by what percentage.

    Args:
        pydantic: Time measured for pydantic.
        msgspec: Time measured for msgspec.

    Returns:
        A sentence naming the faster library and the relative difference,
        expressed as a percentage of the faster time. Ties are reported in
        favour of MsgSpec.
    """
    if pydantic < msgspec:
        winner, fast, slow = "Pydantic", pydantic, msgspec
    else:
        winner, fast, slow = "MsgSpec", msgspec, pydantic
    return f"{winner} is faster by %{((slow - fast) / fast) * 100:1f}"
msgspec_decoder = msgspec.json.Decoder(list[MsgSpecUser])
with time_it("msgspec_decode") as msgspec_res:
msgspec_data = msgspec_decoder.decode(data_raw)
users_ta = TypeAdapter(list[PydanticUser])
with time_it("pydantic_decode") as pydantic_res:
pydantic_data = users_ta.validate_json(data_raw)
print(f"DECODE: {match_precentage(pydantic_res.seconds, msgspec_res.seconds)}")
# ------------ encode ------------
msgspec_encoder = msgspec.json.Encoder()
with time_it("msgspec_encode") as msgspec_res:
msgspec_data_raw = msgspec_encoder.encode(msgspec_data)
with time_it("pydantic_encode") as pydantic_res:
pydantic_data_raw = users_ta.dump_json(pydantic_data)
print(f"ENCODE: {match_precentage(pydantic_res.seconds, msgspec_res.seconds)}")
msgspec_decode took 0.162186 seconds
pydantic_decode took 1.120969 seconds
DECODE: MsgSpec is faster by %591.163625
msgspec_encode took 0.044265 seconds
pydantic_encode took 0.223537 seconds
ENCODE: MsgSpec is faster by %404.997775
Created another benchmark that uses custom types
https://gist.github.com/nrbnlulu/e983ab23bed5806cff5bb8ba97434d6d
results are quite surprising
msgspec_decode took 0.050580 seconds
pydantic_decode took 0.150948 seconds
DECODE: MsgSpec is faster by %198.433165
msgspec_encode took 0.015060 seconds
pydantic_encode took 0.060530 seconds
ENCODE: MsgSpec is faster by %301.920586
Updated results with python 3.12 and latest available versions of pydantic and msgspec:
msgspec 0.18.6
--------------
dumps: 178.8 us
loads: 509.6 us
total: 688.4 us
pydantic 2.9.2
--------------
dumps: 9064.2 us (50.7x slower)
loads: 10563.7 us (20.7x slower)
total: 19627.9 us (28.5x slower)
pydantic 1.10.18
----------------
dumps: 13753.4 us (76.9x slower)
loads: 53922.3 us (105.8x slower)
total: 67675.7 us (98.3x slower)
I tested myself and did not notice a 10x + difference.
from my test, msgspec is about 70% faster than pydantic.
@dataclass
class Item:
product_id: int
name: str
quantity: int
price: float
@dataclass
class Order:
order_id: str
customer_name: str
customer_email: str
items: List[Item]
shipping_address: str
payment_status: str
total_amount: float
discount: Optional[float] = 0.0
# Step 2: Example body data in dictionary format
order_data = {
"order_id": "ORD12345",
"customer_name": "Jane Doe",
"customer_email": "[email protected]",
"items": [
{"product_id": 101, "name": "Laptop", "quantity": 1, "price": 1200.00},
{"product_id": 102, "name": "Mouse", "quantity": 2, "price": 25.50},
{"product_id": 103, "name": "Keyboard", "quantity": 1, "price": 75.75},
],
"shipping_address": "1234 Elm Street, Springfield, IL",
"payment_status": "Paid",
"total_amount": 1400.75,
"discount": 100.0, # optional discount
}
data = json.dumps(order_data).encode()
rounds = 10000
order_adapter = TypeAdapter(Order)
p1 = perf_counter()
for _ in range(rounds):
porder = order_adapter.validate_json(data)
p2 = perf_counter()
r1 = round(p2 - p1, 6)
p1 = perf_counter()
for _ in range(rounds):
morder = decode(data, type=Order)
p2 = perf_counter()
r2 = round(p2 - p1, 6)
print(f"pydantic costs {r1} seconds")
print(f"msgspec costs {r2} seconds")
print(f"pydantic is {round(r1/r2,3)}x slower")
pydantic costs 0.023508 seconds
msgspec costs 0.013425 seconds
pydantic is 1.751x slower
PYDANTIC_VERSION = '2.10.6'
MSGSPEC_VERSION = '0.19.0'
========== Update =========
When using msgspec.Struct and pydantic.BaseModel to define the models instead of dataclasses, msgspec is much more performant than pydantic.
Interestingly, it is even faster to use a TypeAdapter(list[dataclasses.dataclass]) than a TypeAdapter(list[pydantic.BaseModel]).
I get an error, maybe
"""A quick benchmark comparing the performance of:
- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/
The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py
I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
import datetime
import random
import string
import timeit
import uuid
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
import pydantic.v1
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Args:
        capacity: Total number of nodes to generate, counting the root
            directory itself.

    Returns:
        A nested ``dict`` for the root directory. Every node carries
        ``type``, ``name``, ``created_by``, ``created_at`` and
        ``updated_at``; directories add ``contents`` (a list of child
        nodes) and files add ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    # The UUID pool is random on every run; everything else is drawn from the
    # seeded generator below.
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(low, high):
        """Return a random UTC datetime between ``low`` and ``high``."""
        # ``randint`` needs integer bounds, hence the ``int()`` conversions.
        ts = rand.randint(int(low.timestamp()), int(high.timestamp()))
        # Convert straight to UTC. Going through naive local time and then
        # relabelling it as UTC shifts the value by the local UTC offset,
        # which can push ``created_at`` past DATE_2023 and make the following
        # ``randint`` call fail on an empty range.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(low=None, high=None):
        """Return random ASCII letters with a length drawn from [low, high]."""
        if high is not None:
            low = rand.randint(low, high)
        return "".join(rand.choices(string.ascii_letters, k=low))

    def make_node(is_dir):
        """Build one file or directory node, recursing into directories."""
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget before recursing so
            # nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory counts towards the budget
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time one library's JSON ``dumps`` and ``loads`` callables.

    Args:
        raw_data: Plain data handed to ``convert``.
        dumps: Callable turning the converted message into JSON.
        loads: Callable turning that JSON back into a message.
        convert: Callable turning ``raw_data`` into the library's message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple, each being the total time
        reported by ``timeit`` divided by the number of loops it ran.

    Raises:
        AssertionError: If ``loads(dumps(msg))`` does not equal ``msg``.
    """
    typed = convert(raw_data)
    encoded = dumps(typed)
    # Round-trip sanity check; the decoded copy is dropped again before any
    # timing starts.
    assert typed == loads(encoded)

    def per_call(func, payload):
        clock = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": payload}
        )
        loops, elapsed = clock.autorange()
        return elapsed / loops

    dumps_time = per_call(dumps, typed)
    loads_time = per_call(loads, encoded)
    return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
name: Annotated[str, msgspec.Meta(min_length=1)]
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
name: Annotated[str, msgspec.Meta(min_length=1)]
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union[File, "Directory"]]
def bench_msgspec(data):
enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(Directory)
def convert(data):
return msgspec.convert(data, Directory)
return bench(data, enc.encode, dec.decode, convert)
#############################################################################
# pydantic V2 #
#############################################################################
class FileModel(pydantic.BaseModel):
type: Literal["file"] = "file"
name: str = pydantic.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
type: Literal["directory"] = "directory"
name: str = pydantic.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union["DirectoryModel", FileModel]]
# Rebuild the model to resolve forward references
DirectoryModel.model_rebuild()
def bench_pydantic_v2(data):
return bench(
data,
lambda p: p.model_dump_json(),
DirectoryModel.model_validate_json,
lambda data: DirectoryModel(**data),
)
#############################################################################
# pydantic V1 #
#############################################################################
class FileModelV1(pydantic.v1.BaseModel):
type: Literal["file"] = "file"
name: str = pydantic.v1.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
type: Literal["directory"] = "directory"
name: str = pydantic.v1.Field(min_length=1)
created_by: uuid.UUID
created_at: datetime.datetime
updated_at: datetime.datetime
contents: List[Union["DirectoryModelV1", FileModelV1]]
# Update forward references for pydantic V1
DirectoryModelV1.update_forward_refs()
def bench_pydantic_v1(data):
return bench(
data,
lambda p: p.json(),
DirectoryModelV1.parse_raw,
lambda data: DirectoryModelV1(**data),
)
if __name__ == "__main__":
N = 1000
data = make_filesystem_data(N)
ms_dumps, ms_loads = bench_msgspec(data)
ms_total = ms_dumps + ms_loads
title = f"msgspec {msgspec.__version__}"
print(title)
print("-" * len(title))
print(f"dumps: {ms_dumps * 1e6:.1f} us")
print(f"loads: {ms_loads * 1e6:.1f} us")
print(f"total: {ms_total * 1e6:.1f} us")
for title, func in [
(f"pydantic {pydantic.__version__}", bench_pydantic_v2),
(f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1)
]:
print()
print(title)
print("-" * len(title))
dumps, loads = func(data)
total = dumps + loads
print(f"dumps: {dumps * 1e6:.1f} us ({dumps / ms_dumps:.1f}x slower)")
print(f"loads: {loads * 1e6:.1f} us ({loads / ms_loads:.1f}x slower)")
print(f"total: {total * 1e6:.1f} us ({total / ms_total:.1f}x slower)")
Building on top of the previous comment:
msgspec 0.20.0
--------------
dumps: 190.1 us
loads: 551.2 us
total: 741.3 us
pydantic 2.12.5
---------------
dumps: 2509.5 us (13.2x slower)
loads: 8589.7 us (15.6x slower)
total: 11099.3 us (15.0x slower)
pydantic 1.10.21
----------------
dumps: 14967.6 us (78.7x slower)
loads: 64642.1 us (117.3x slower)
total: 79609.6 us (107.4x slower)
duct tapeed py dataclass
------------------------
dumps: 28349.2 us (149.1x slower)
loads: 3176.0 us (5.8x slower)
total: 31525.3 us (42.5x slower)
"""A quick benchmark comparing the performance of:
- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/
- python dataclasses: https://docs.python.org/3.14/library/dataclasses.html
The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py
I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
import datetime
import json
import random
import string
import timeit
import uuid
from dataclasses import dataclass, field, asdict
from typing import List, Literal, Union, Annotated
import msgspec
import pydantic
import pydantic.v1
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    Args:
        capacity: Total number of nodes to generate, counting the root
            directory itself.

    Returns:
        A nested ``dict`` for the root directory. Every node carries
        ``type``, ``name``, ``created_by``, ``created_at`` and
        ``updated_at``; directories add ``contents`` (a list of child
        nodes) and files add ``nbytes``.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)
    # The UUID pool is random on every run; everything else is drawn from the
    # seeded generator below.
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(low, high):
        """Return a random UTC datetime between ``low`` and ``high``."""
        # ``randint`` needs integer bounds, hence the ``int()`` conversions.
        ts = rand.randint(int(low.timestamp()), int(high.timestamp()))
        # Convert straight to UTC. Going through naive local time and then
        # relabelling it as UTC shifts the value by the local UTC offset,
        # which can push ``created_at`` past DATE_2023 and make the following
        # ``randint`` call fail on an empty range.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(low=None, high=None):
        """Return random ASCII letters with a length drawn from [low, high]."""
        if high is not None:
            low = rand.randint(low, high)
        return "".join(rand.choices(string.ascii_letters, k=low))

    def make_node(is_dir):
        """Build one file or directory node, recursing into directories."""
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget before recursing so
            # nested directories cannot overspend it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    capacity -= 1  # the root directory counts towards the budget
    out = make_node(True)
    # Spend whatever budget is left on extra top-level entries.
    while capacity:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time one library's JSON ``dumps`` and ``loads`` callables.

    Args:
        raw_data: Plain data handed to ``convert``.
        dumps: Callable turning the converted message into JSON.
        loads: Callable turning that JSON back into a message.
        convert: Callable turning ``raw_data`` into the library's message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple, each being the total time
        reported by ``timeit`` divided by the number of loops it ran.

    Raises:
        AssertionError: If ``loads(dumps(msg))`` does not equal ``msg``.
    """
    typed = convert(raw_data)
    encoded = dumps(typed)
    # Round-trip sanity check; the decoded copy is dropped again before any
    # timing starts.
    assert typed == loads(encoded)

    def per_call(func, payload):
        clock = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": payload}
        )
        loops, elapsed = clock.autorange()
        return elapsed / loops

    dumps_time = per_call(dumps, typed)
    loads_time = per_call(loads, encoded)
    return dumps_time, loads_time
#############################################################################
# msgspec #
#############################################################################
class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node, tagged ``"file"``."""

    # Must contain at least one character.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Must be zero or greater.
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node, tagged ``"directory"``."""

    # Must contain at least one character.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be files or further directories; the class refers to
    # itself by name because it is still being defined at this point.
    contents: List[Union[File, "Directory"]]
def bench_msgspec(data):
    """Benchmark msgspec JSON encoding and decoding of ``Directory``.

    Args:
        data: Raw nested dict that is converted into a ``Directory``.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """
    encoder = msgspec.json.Encoder()
    # The decoder is built once for the target type and reused on every call.
    decoder = msgspec.json.Decoder(Directory)

    def to_struct(raw):
        return msgspec.convert(raw, Directory)

    return bench(data, encoder.encode, decoder.decode, to_struct)
#############################################################################
# pydantic V2 #
#############################################################################
class FileModel(pydantic.BaseModel):
    """pydantic V2 model for a file node."""

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must contain at least one character.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """pydantic V2 model for a directory node."""

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must contain at least one character.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be directories or files.
    contents: List[Union["DirectoryModel", FileModel]]


# "DirectoryModel" is referenced as a string inside its own body, so rebuild
# the model now that the class exists to resolve that forward reference.
DirectoryModel.model_rebuild()
def bench_pydantic_v2(data):
    """Benchmark pydantic V2 JSON dumping and loading of ``DirectoryModel``.

    Args:
        data: Raw nested dict used to construct a ``DirectoryModel``.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """

    def to_json(model):
        return model.model_dump_json()

    def to_model(raw):
        return DirectoryModel(**raw)

    return bench(data, to_json, DirectoryModel.model_validate_json, to_model)
#############################################################################
# pydantic V1 #
#############################################################################
class FileModelV1(pydantic.v1.BaseModel):
    """pydantic V1 model for a file node."""

    # Only the literal "file" is accepted; it is also the default.
    type: Literal["file"] = "file"
    # Must contain at least one character.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
    """pydantic V1 model for a directory node."""

    # Only the literal "directory" is accepted; it is also the default.
    type: Literal["directory"] = "directory"
    # Must contain at least one character.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children may be directories or files.
    contents: List[Union["DirectoryModelV1", FileModelV1]]


# "DirectoryModelV1" is referenced as a string inside its own body, so update
# the forward references now that the class exists.
DirectoryModelV1.update_forward_refs()
def bench_pydantic_v1(data):
    """Benchmark pydantic V1 JSON dumping and loading of ``DirectoryModelV1``.

    Args:
        data: Raw nested dict used to construct a ``DirectoryModelV1``.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """

    def to_json(model):
        return model.json()

    def to_model(raw):
        return DirectoryModelV1(**raw)

    return bench(data, to_json, DirectoryModelV1.parse_raw, to_model)
#############################################################################
# Python dataclass #
#############################################################################
@dataclass
class FileDataclass:
    """Plain dataclass for a file node with hand-written validation.

    Raises:
        ValueError: On construction, if ``name`` is empty or ``nbytes`` is
            negative.
    """

    name: str
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: int
    type: str = "file"

    def __post_init__(self):
        # The name is checked first, so it wins when both values are invalid.
        if len(self.name) == 0:
            raise ValueError("name must have min_length=1")
        if self.nbytes < 0:
            raise ValueError("nbytes must be >= 0")
@dataclass
class DirectoryDataclass:
    """Plain dataclass for a directory node with hand-written validation.

    Raises:
        ValueError: On construction, if ``name`` is empty.
    """

    name: str
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Each instance gets its own fresh list when no contents are supplied.
    contents: List[Union["DirectoryDataclass", FileDataclass]] = field(
        default_factory=list
    )
    type: str = "directory"

    def __post_init__(self):
        if len(self.name) == 0:
            raise ValueError("name must have min_length=1")
def _json_serializer(obj):
"""JSON serializer for objects not serializable by default json code"""
if isinstance(obj, (datetime.datetime, datetime.date)):
return obj.isoformat()
elif isinstance(obj, uuid.UUID):
return str(obj)
raise TypeError(f"Type {type(obj)} not serializable")
def _parse_if_str(value, parse):
    """Return ``parse(value)`` for strings and ``value`` unchanged otherwise."""
    return parse(value) if isinstance(value, str) else value


def _dict_to_dataclass(data):
    """Convert a node dict, and everything nested in it, to dataclasses.

    Args:
        data: Node dict with ``type``, ``name``, ``created_by``,
            ``created_at`` and ``updated_at`` keys, plus ``nbytes`` for files
            or ``contents`` for directories.

    Returns:
        A ``FileDataclass`` when ``type`` is ``"file"``; a
        ``DirectoryDataclass`` for any other ``type`` value.
    """
    is_file = data["type"] == "file"
    # Fields shared by both node kinds. String values are parsed; values that
    # are not strings are passed through untouched.
    shared = {
        "type": data["type"],
        "name": data["name"],
        "created_by": _parse_if_str(data["created_by"], uuid.UUID),
        "created_at": _parse_if_str(
            data["created_at"], datetime.datetime.fromisoformat
        ),
        "updated_at": _parse_if_str(
            data["updated_at"], datetime.datetime.fromisoformat
        ),
    }
    if is_file:
        return FileDataclass(nbytes=data["nbytes"], **shared)
    children = [_dict_to_dataclass(item) for item in data["contents"]]
    return DirectoryDataclass(contents=children, **shared)
def bench_dataclass(data):
    """Benchmark the stdlib ``json`` plus dataclass round trip.

    Args:
        data: Raw nested dict that is converted into dataclass instances.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """

    def to_json(obj):
        # Flatten the dataclass tree to plain dicts, then let the fallback
        # serializer handle the values ``json`` cannot encode itself.
        plain = asdict(obj)
        return json.dumps(plain, default=_json_serializer)

    def from_json(json_data):
        parsed = json.loads(json_data)
        return _dict_to_dataclass(parsed)

    def to_dataclass(raw):
        return _dict_to_dataclass(raw)

    return bench(data, to_json, from_json, to_dataclass)
if __name__ == "__main__":
    # One shared dataset so every library is measured on the same input.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first; its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads
    title = f"msgspec {msgspec.__version__}"
    print(title, "-" * len(title), sep="\n")
    for label, seconds in (
        ("dumps", ms_dumps),
        ("loads", ms_loads),
        ("total", ms_total),
    ):
        print(f"{label}: {seconds * 1e6:.1f} us")

    contenders = [
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
        ("duct tapeed py dataclass", bench_dataclass),
    ]
    for title, func in contenders:
        # The heading is printed before the benchmark for that entry runs.
        print()
        print(title)
        print("-" * len(title))
        dumps, loads = func(data)
        total = dumps + loads
        for label, seconds, baseline in (
            ("dumps", dumps, ms_dumps),
            ("loads", loads, ms_loads),
            ("total", total, ms_total),
        ):
            print(
                f"{label}: {seconds * 1e6:.1f} us "
                f"({seconds / baseline:.1f}x slower)"
            )
Since a few people have asked about how msgspec's performance compares to pydantic v2, I've updated the gist above with a benchmark that works with the current pydantic V2 betas. Since pydantic v1 is available as pydantic.v1, this benchmark also compares pydantic v1.
Results:
This benchmark is a modified version of the one in the msgspec repo. In general my benchmarks show pydantic v2 is ~15-30x slower than msgspec at JSON encoding, and ~6-15x slower at JSON decoding. Whether that matters for your specific application is workload dependent. Also note that I'm not a pydantic expert, this was mainly for my own understanding of how these libraries compare. As always, I recommend doing your own benchmarks when making technical decisions.