@Lyken17
Last active December 3, 2024 17:34
LiteLLM Debug
ligengz@:~/workspace/VILA-dev$ python serving/lmsys_test.py --model openai/nvila-8b-dev --api-base http://localhost:8000 --req-per-sec 1
/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884
  warnings.warn(
/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/pydantic/main.py:347: UserWarning: Pydantic serializer warnings:
  Expected `str` but got `int` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.
Provider List: https://docs.litellm.ai/docs/providers
Traceback (most recent call last):
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 1683, in __next__
    response = self.model_response_creator(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 889, in model_response_creator
    model_response = ModelResponse(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/types/utils.py", line 907, in __init__
    super().__init__(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelResponse
id
  Input should be a valid string [type=string_type, input_value=1, input_type=int]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 135, in <module>
    main(args)
  File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 85, in main
    future = litellm_completion(args, tokenizer)
  File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 45, in litellm_completion
    for chunk in response:
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 1737, in __next__
    raise exception_type(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py", line 2141, in exception_type
    raise e  # it's already mapped
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py", line 415, in exception_type
    raise APIConnectionError(
litellm.exceptions.APIConnectionError: litellm.APIConnectionError: APIConnectionError: OpenAIException - 1 validation error for ModelResponse
id
  Input should be a valid string [type=string_type, input_value=1, input_type=int]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
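
Reading the traceback: the OpenAI-compatible server at --api-base is returning an integer id (1) in its streaming chunks, while LiteLLM's ModelResponse declares id as a string. Pydantic v2 does not coerce int to str, so LiteLLM's chunk re-assembly fails and the error surfaces as an APIConnectionError; the `litellm.set_verbose=True` hint above should help confirm the raw payload. The failure is reproducible with pydantic alone (a minimal sketch; the Chunk model here is hypothetical, standing in for ModelResponse):

from pydantic import BaseModel

class Chunk(BaseModel):
    id: str  # same constraint ModelResponse places on its id field

Chunk(id=1)  # ValidationError: Input should be a valid string [type=string_type, ...]

The repro script (serving/lmsys_test.py) follows: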
import argparse
import time
import threading
from concurrent.futures import ThreadPoolExecutor
import uuid
import traceback
import numpy as np
from transformers import AutoTokenizer
from litellm import completion
def litellm_completion(args, tokenizer, image_url=None):
    if image_url:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Tell me a story about this image."},
                ],
            },
        ]
    else:
        messages = [
            {"role": "user", "content": "Tell me a story about this image."}
        ]
    start = time.time()
    additional_api_kwargs = {}
    if args.api_key:
        additional_api_kwargs["api_key"] = args.api_key
    if args.api_base:
        additional_api_kwargs["api_base"] = args.api_base
    response = completion(
        model=args.model,
        messages=messages,
        stream=True,
        **additional_api_kwargs,
    )
    ttft = None  # time to first token
    itl_list = []  # inter-token latencies
    content = ""
    for chunk in response:
        if chunk.choices[0].delta.content:
            end_time = time.time()
            if ttft is None:
                ttft = end_time - start
            content += chunk.choices[0].delta.content
            num_tokens = len(tokenizer.encode(content))
            itl_list.append((end_time - start) / num_tokens)
            start = end_time
    print(content); input()  # debug: show the response and pause before returning
    return content, ttft, itl_list
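
# For reference, litellm_completion can be exercised standalone; the model
# name and endpoint below are placeholders matching the command at the top
# of this gist:
#
#   args = argparse.Namespace(
#       model="openai/nvila-8b-dev",
#       api_base="http://localhost:8000",
#       api_key=None,
#   )
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   content, ttft, itl_list = litellm_completion(args, tokenizer)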
def main(args):
    n = args.num_total_responses
    batch_size = args.req_per_sec  # requests per second
    start = time.time()
    all_results = []
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # with ThreadPoolExecutor(max_workers=batch_size) as executor:
    for i in range(0, n, batch_size):
        batch_futures = []
        batch = range(i, min(i + batch_size, n))
        for _ in batch:
            if args.include_image:
                if args.randomize_image_dimensions:
                    y_dimension = np.random.randint(100, 1025)
                else:
                    y_dimension = 512
                image_url = f"https://placehold.co/1024x{y_dimension}/png"
                # future = executor.submit(
                #     litellm_completion, args, tokenizer, image_url
                # )
                future = litellm_completion(args, tokenizer, image_url)
            else:
                # future = executor.submit(litellm_completion, args, tokenizer)
                future = litellm_completion(args, tokenizer)
            batch_futures.append(future)
        exit(0)  # debug: stop after the first batch
        # Wait for batch to complete. With the executor commented out,
        # `batch_futures` holds plain result tuples rather than Future
        # objects, so they are appended directly instead of via .result().
        for future in batch_futures:
            all_results.append(future)
        if i + batch_size < n:
            time.sleep(1)  # wait 1 second before next batch
    successful_completions = [
        c for c in all_results if isinstance(c, tuple) and len(c) == 3
    ]
    ttft_list = np.array([float(c[1]) for c in successful_completions])
    itl_list_flattened = np.array(
        [
            float(item)
            for sublist in [c[2] for c in successful_completions]
            for item in sublist
        ]
    )
    # Write errors to load_test_errors.log
    with open("load_test_errors.log", "a") as error_log:
        for result in all_results:
            if isinstance(result, str):
                error_log.write(result + "\n")
    print(f"Completed requests: {len(successful_completions)}")
    print(f"P99 TTFT: {np.percentile(ttft_list, 99)}")
    print(f"Mean TTFT: {np.mean(ttft_list)}")
    print(f"P99 ITL: {np.percentile(itl_list_flattened, 99)}")
    print(f"Mean ITL: {np.mean(itl_list_flattened)}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="azure-gpt-3.5")
    parser.add_argument("--api-base", type=str, default=None)
    parser.add_argument("--api-key", type=str, default=None)
    parser.add_argument("--num-total-responses", type=int, default=50)
    parser.add_argument("--req-per-sec", type=int, default=5)
    parser.add_argument("--include-image", action="store_true")
    parser.add_argument("--randomize-image-dimensions", action="store_true")
    args = parser.parse_args()
    # Blank out contents of load_test_errors.log
    open("load_test_errors.log", "w").close()
    main(args)
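
Assuming the nvila server builds its own OpenAI-style streaming chunks (not confirmed from this log), the root-cause fix is to emit a string id in every chunk. A minimal sketch of a spec-shaped chunk, with hypothetical helper make_chunk:

import time
import uuid

def make_chunk(delta_text, model):
    # "id" must be a string; an integer here triggers the pydantic
    # ValidationError shown in the traceback above.
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {"index": 0, "delta": {"content": delta_text}, "finish_reason": None}
        ],
    }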