Last active
October 24, 2024 12:35
-
-
Save alonsosilvaallende/93017fe8a86868a8d659e83b72a68995 to your computer and use it in GitHub Desktop.
Benchmark Outlines v0.1.1 vs v0.0.46
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "outlines-core==0.1.14",
#     "outlines==0.1.1",
#     "protobuf==5.28.3",
#     "pydantic==2.9.2",
#     "rich==13.9.3",
#     "sentencepiece==0.2.0",
#     "setuptools==75.2.0",
#     "transformers==4.45.2",
#     "uvtrick==0.3.0",
# ]
# ///
def main() -> None:
    """Benchmark FSM index construction in outlines 0.0.46 versus 0.1.1.

    Builds a JSON schema from a small Pydantic model, loads the
    ``mistralai/Mistral-7B-Instruct-v0.3`` tokenizer, and then uses
    ``uvtrick`` to run ``bench`` three times against each outlines version.
    Each run prints its own timing; a per-version summary is printed at the
    end.
    """
    # Get a json schema from a Pydantic class
    from enum import Enum

    from pydantic import BaseModel

    class Name(str, Enum):
        john = "John"
        paul = "Paul"

    class Age(int, Enum):
        twenty = 20
        thirty = 30

    class Character(BaseModel):
        name: Name
        age: Age

    json_schema = Character.model_json_schema()

    # Use a tokenizer, for example `mistralai/Mistral-7B-Instruct-v0.3`
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

    # Define bench to compare creation of the index
    def bench(json_schema, tokenizer):
        """Time the schema -> regex -> FSM -> token index pipeline.

        All imports are kept inside the function on purpose: it is handed to
        ``uvtrick``'s ``Env.run``, which is expected to execute it in a
        separate environment holding the requested outlines version, so it
        should not rely on names from the enclosing scope.

        Args:
            json_schema: JSON schema (as produced by ``model_json_schema``).
            tokenizer: A Hugging Face tokenizer to wrap for outlines.

        Returns:
            Elapsed time in seconds for building the index.
        """
        from importlib import metadata
        from time import perf_counter

        from rich import print
        from outlines.fsm.json_schema import build_regex_from_schema
        import interegular
        from outlines.models.transformers import TransformerTokenizer

        version = metadata.version("outlines")
        # The helpers moved between releases; anything other than 0.1.1 is
        # assumed to use the older (0.0.46) module layout.
        if version == "0.1.1":
            from outlines.fsm.json_schema import convert_json_schema_to_str
            from outlines_core.fsm.regex import make_deterministic_fsm
            from outlines_core.fsm.regex import create_fsm_index_tokenizer
        else:
            from outlines.integrations.utils import convert_json_schema_to_str
            from outlines.fsm.regex import make_deterministic_fsm
            from outlines.fsm.regex import create_fsm_index_tokenizer

        # perf_counter is monotonic and high-resolution, unlike time.time().
        tic = perf_counter()
        schema_str = convert_json_schema_to_str(json_schema=json_schema)
        regex_str = build_regex_from_schema(schema_str, whitespace_pattern=r"")
        list_of_strings_fsm = interegular.parse_pattern(regex_str).to_fsm()
        new_fsm, _ = make_deterministic_fsm(list_of_strings_fsm)
        new_tokenizer = TransformerTokenizer(tokenizer)
        # Only the cost of building the index matters; the result is discarded.
        create_fsm_index_tokenizer(new_fsm, new_tokenizer)
        toc = perf_counter()

        elapsed = toc - tic
        print(f"[green]outlines=={version}, time: {elapsed}[/green]")
        return elapsed

    # Use uvtrick to compare the execution time
    from uvtrick import Env

    runs_per_version = 3
    timings = {}
    for outlines_version in ("0.0.46", "0.1.1"):
        # A fresh Env is created for every run, as in the original script.
        timings[outlines_version] = [
            Env(f"outlines=={outlines_version}", python="3.12").run(
                bench, json_schema, tokenizer
            )
            for _ in range(runs_per_version)
        ]

    # Summarize the collected timings instead of discarding them.
    for outlines_version, durations in timings.items():
        mean = sum(durations) / len(durations)
        print(
            f"outlines=={outlines_version}: "
            f"min {min(durations):.3f}s, mean {mean:.3f}s "
            f"over {len(durations)} runs"
        )
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment