Last active
October 1, 2024 08:54
-
-
Save jcrist/d62f450594164d284fbea957fd48b743 to your computer and use it in GitHub Desktop.
A quick benchmark comparing msgspec (https://github.com/jcrist/msgspec), pydantic v1, and pydantic v2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A quick benchmark comparing the performance of: | |

- msgspec: https://github.com/jcrist/msgspec
- pydantic V1: https://docs.pydantic.dev/1.10/
- pydantic V2: https://docs.pydantic.dev/dev-v2/

The benchmark is modified from the one in the msgspec repo here:
https://github.com/jcrist/msgspec/blob/main/benchmarks/bench_validation.py

I make no claims that it's illustrative of all use cases. I wrote this up
mostly to get an understanding of how msgspec's performance compares with that
of pydantic V2.
"""
from __future__ import annotations | |
import datetime | |
import random | |
import string | |
import timeit | |
import uuid | |
from typing import List, Literal, Union, Annotated | |
import msgspec | |
import pydantic | |
import pydantic.v1 | |
def make_filesystem_data(capacity):
    """Generate a tree structure representing a fake filesystem.

    The tree is built from plain dicts and lists. Every node carries the keys
    ``type`` (``"directory"`` or ``"file"``), ``name``, ``created_by``,
    ``created_at`` and ``updated_at`` (ISO 8601 strings in UTC). Directories
    additionally carry ``contents`` (a list of child nodes) and files carry
    ``nbytes``.

    Args:
        capacity: Total number of nodes to generate, counting the root
            directory. For values below 1 a lone, empty root directory is
            returned.

    Returns:
        The root directory node.
    """
    UTC = datetime.timezone.utc
    DATE_2018 = datetime.datetime(2018, 1, 1, tzinfo=UTC)
    DATE_2023 = datetime.datetime(2023, 1, 1, tzinfo=UTC)

    # NOTE: uuid4() is not driven by the seeded generator below, so the
    # ``created_by`` values differ from run to run while everything else is
    # reproducible.
    UUIDS = [str(uuid.uuid4()) for _ in range(30)]
    rand = random.Random(42)

    def randdt(lo, hi):
        # randint() only accepts integers; passing the float returned by
        # timestamp() raises TypeError on Python 3.12+.
        ts = rand.randint(int(lo.timestamp()), int(hi.timestamp()))
        # Build the aware UTC datetime directly. Converting to local time and
        # relabelling it as UTC shifts the value by the local offset, which
        # can even push ``created_at`` past DATE_2023.
        return datetime.datetime.fromtimestamp(ts, tz=UTC)

    def randstr(lo=None, hi=None):
        # With only ``lo`` given the length is fixed; with both it is drawn
        # uniformly from [lo, hi].
        length = lo if hi is None else rand.randint(lo, hi)
        return "".join(rand.choices(string.ascii_letters, k=length))

    def make_node(is_dir):
        nonlocal capacity
        name = randstr(4, 30)
        created_by = rand.choice(UUIDS)
        created_at = randdt(DATE_2018, DATE_2023)
        updated_at = randdt(created_at, DATE_2023)
        data = {
            "type": "directory" if is_dir else "file",
            "name": name,
            "created_by": created_by,
            "created_at": created_at.isoformat(),
            "updated_at": updated_at.isoformat(),
        }
        if is_dir:
            # Reserve the children from the shared budget *before* recursing
            # so nested directories can never overdraw it.
            n = min(rand.randint(0, 30), capacity)
            capacity -= n
            data["contents"] = [make_node(rand.random() > 0.9) for _ in range(n)]
        else:
            data["nbytes"] = rand.randint(0, 1000000)
        return data

    # The root directory itself consumes one unit of the budget.
    capacity -= 1
    out = make_node(True)
    # Spend whatever budget the recursive build left over on extra top-level
    # entries (roughly one in ten of them is a directory).
    while capacity > 0:
        capacity -= 1
        out["contents"].append(make_node(rand.random() > 0.9))
    return out
def bench(raw_data, dumps, loads, convert):
    """Time one encode call and one decode call for a library.

    Args:
        raw_data: Plain Python data handed to ``convert``.
        dumps: Callable that serializes the converted message.
        loads: Callable that parses the serialized form back into a message.
        convert: Callable that turns ``raw_data`` into the library's typed
            message.

    Returns:
        A ``(dumps_time, loads_time)`` tuple of average seconds per call.

    Raises:
        AssertionError: If decoding the encoded message does not compare
            equal to the converted message.
    """
    typed = convert(raw_data)
    encoded = dumps(typed)

    # Round-trip sanity check before measuring anything.
    decoded = loads(encoded)
    assert typed == decoded
    del decoded

    def seconds_per_call(func, arg):
        # autorange() chooses the loop count itself and reports it together
        # with the total elapsed time.
        stopwatch = timeit.Timer(
            "func(data)", setup="", globals={"func": func, "data": arg}
        )
        loops, elapsed = stopwatch.autorange()
        return elapsed / loops

    dumps_time = seconds_per_call(dumps, typed)
    loads_time = seconds_per_call(loads, encoded)
    return dumps_time, loads_time
############################################################################# | |
# msgspec # | |
############################################################################# | |
class File(msgspec.Struct, tag="file"):
    """msgspec schema for a file node, tagged ``"file"``."""

    # Names must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Sizes must be zero or greater.
    nbytes: Annotated[int, msgspec.Meta(ge=0)]
class Directory(msgspec.Struct, tag="directory"):
    """msgspec schema for a directory node, tagged ``"directory"``."""

    # Names must be non-empty.
    name: Annotated[str, msgspec.Meta(min_length=1)]
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are files or nested directories; the self-reference relies on
    # the module's postponed evaluation of annotations.
    contents: List[Union[File, Directory]]
def bench_msgspec(data):
    """Benchmark msgspec on ``data``.

    Args:
        data: Plain dict tree describing the root directory.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """
    # The encoder and the typed decoder are built once, outside the timed
    # calls, and only their bound methods are handed to the benchmark.
    encoder = msgspec.json.Encoder()
    decoder = msgspec.json.Decoder(Directory)
    return bench(
        data,
        encoder.encode,
        decoder.decode,
        lambda raw: msgspec.convert(raw, Directory),
    )
############################################################################# | |
# pydantic V2 # | |
############################################################################# | |
class FileModel(pydantic.BaseModel):
    """pydantic V2 model for a file node."""

    # Fixed marker that distinguishes files from directories.
    type: Literal["file"] = "file"
    # Names must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.NonNegativeInt
class DirectoryModel(pydantic.BaseModel):
    """pydantic V2 model for a directory node."""

    # Fixed marker that distinguishes directories from files.
    type: Literal["directory"] = "directory"
    # Names must be non-empty.
    name: str = pydantic.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are nested directories or files.
    contents: List[Union[DirectoryModel, FileModel]]
def bench_pydantic_v2(data):
    """Benchmark pydantic V2 on ``data``.

    Args:
        data: Plain dict tree describing the root directory.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """

    def to_json(model):
        # Thin wrapper so the timed call goes through the instance method.
        return model.model_dump_json()

    def build(raw):
        return DirectoryModel(**raw)

    return bench(data, to_json, DirectoryModel.model_validate_json, build)
############################################################################# | |
# pydantic V1 # | |
############################################################################# | |
class FileModelV1(pydantic.v1.BaseModel):
    """pydantic V1 model for a file node."""

    # Fixed marker that distinguishes files from directories.
    type: Literal["file"] = "file"
    # Names must be non-empty.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    nbytes: pydantic.v1.NonNegativeInt
class DirectoryModelV1(pydantic.v1.BaseModel):
    """pydantic V1 model for a directory node."""

    # Fixed marker that distinguishes directories from files.
    type: Literal["directory"] = "directory"
    # Names must be non-empty.
    name: str = pydantic.v1.Field(min_length=1)
    created_by: uuid.UUID
    created_at: datetime.datetime
    updated_at: datetime.datetime
    # Children are nested directories or files.
    contents: List[Union[DirectoryModelV1, FileModelV1]]
def bench_pydantic_v1(data):
    """Benchmark pydantic V1 on ``data``.

    Args:
        data: Plain dict tree describing the root directory.

    Returns:
        The ``(dumps_time, loads_time)`` tuple produced by ``bench``.
    """

    def to_json(model):
        # Thin wrapper so the timed call goes through the instance method.
        return model.json()

    def build(raw):
        return DirectoryModelV1(**raw)

    return bench(data, to_json, DirectoryModelV1.parse_raw, build)
if __name__ == "__main__":
    # Number of nodes in the generated fake filesystem.
    N = 1000
    data = make_filesystem_data(N)

    # msgspec runs first; its timings are the baseline for the ratios below.
    ms_dumps, ms_loads = bench_msgspec(data)
    ms_total = ms_dumps + ms_loads

    title = f"msgspec {msgspec.__version__}"
    print(title)
    print("-" * len(title))
    for label, seconds in (
        ("dumps", ms_dumps),
        ("loads", ms_loads),
        ("total", ms_total),
    ):
        print(f"{label}: {seconds * 1e6:.1f} us")

    contenders = [
        (f"pydantic {pydantic.__version__}", bench_pydantic_v2),
        (f"pydantic {pydantic.v1.__version__}", bench_pydantic_v1),
    ]
    for title, func in contenders:
        print()
        print(title)
        print("-" * len(title))
        dumps, loads = func(data)
        total = dumps + loads
        # Each row shows the absolute time plus the slowdown versus msgspec.
        for label, seconds, baseline in (
            ("dumps", dumps, ms_dumps),
            ("loads", loads, ms_loads),
            ("total", total, ms_total),
        ):
            print(
                f"{label}: {seconds * 1e6:.1f} us ({seconds / baseline:.1f}x slower)"
            )
I created another benchmark that uses custom types:
https://gist.github.com/nrbnlulu/e983ab23bed5806cff5bb8ba97434d6d
The results are quite surprising:
msgspec_decode took 0.050580 seconds
pydantic_decode took 0.150948 seconds
DECODE: MsgSpec is faster by %198.433165
msgspec_encode took 0.015060 seconds
pydantic_encode took 0.060530 seconds
ENCODE: MsgSpec is faster by %301.920586
Updated results with Python 3.12 and the latest available versions of pydantic and msgspec:
msgspec 0.18.6
--------------
dumps: 178.8 us
loads: 509.6 us
total: 688.4 us
pydantic 2.9.2
--------------
dumps: 9064.2 us (50.7x slower)
loads: 10563.7 us (20.7x slower)
total: 19627.9 us (28.5x slower)
pydantic 1.10.18
----------------
dumps: 13753.4 us (76.9x slower)
loads: 53922.3 us (105.8x slower)
total: 67675.7 us (98.3x slower)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Leaving my benchmark results here