Last active
December 3, 2024 17:34
-
-
Save Lyken17/bf31b857caa6dcb83299f50b93493bec to your computer and use it in GitHub Desktop.
LiteLLM Debug
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ligengz@:~/workspace/VILA-dev$ python serving/lmsys_test.py --model openai/nvila-8b-dev --api-base http://localhost:8000 --req-per-sec 1 | |
/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884 | |
warnings.warn( | |
/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/pydantic/main.py:347: UserWarning: Pydantic serializer warnings: | |
Expected `str` but got `int` - serialized value may not be as expected | |
return self.__pydantic_serializer__.to_python( | |
Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new | |
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'. | |
Provider List: https://docs.litellm.ai/docs/providers | |
Traceback (most recent call last): | |
File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 1683, in __next__ | |
response = self.model_response_creator( | |
File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 889, in model_response_creator | |
model_response = ModelResponse( | |
File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/types/utils.py", line 907, in __init__ | |
super().__init__( | |
File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/pydantic/main.py", line 176, in __init__ | |
self.__pydantic_validator__.validate_python(data, self_instance=self) | |
pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelResponse | |
id | |
Input should be a valid string [type=string_type, input_value=1, input_type=int] | |
For further information visit https://errors.pydantic.dev/2.7/v/string_type | |
During handling of the above exception, another exception occurred: | |
Traceback (most recent call last): | |
File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 135, in <module> | |
main(args) | |
File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 85, in main | |
future = litellm_completion(args, tokenizer) | |
File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 45, in litellm_completion | |
for chunk in response: | |
File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 1737, in __next__ | |
raise exception_type( | |
File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py", line 2141, in exception_type | |
raise e # it's already mapped | |
File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py", line 415, in exception_type | |
raise APIConnectionError( | |
litellm.exceptions.APIConnectionError: litellm.APIConnectionError: APIConnectionError: OpenAIException - 1 validation error for ModelResponse | |
id | |
Input should be a valid string [type=string_type, input_value=1, input_type=int] | |
For further information visit https://errors.pydantic.dev/2.7/v/string_type |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import time | |
import threading | |
from concurrent.futures import ThreadPoolExecutor | |
import uuid | |
import traceback | |
import numpy as np | |
from transformers import AutoTokenizer | |
from litellm import completion | |
def litellm_completion(args, tokenizer, image_url=None):
    """Stream one chat completion and measure its latency.

    Args:
        args: Parsed command-line namespace. Reads ``model`` and, when set,
            ``api_key`` and ``api_base``.
        tokenizer: Object exposing ``encode(text)``; used only to estimate
            how many tokens each streamed chunk carries.
        image_url: Optional image URL. When given, the request contains an
            image part followed by the text prompt; otherwise it is
            text-only.

    Returns:
        A ``(content, ttft, itl_list)`` tuple:

        * ``content`` - the concatenated streamed text.
        * ``ttft`` - seconds from request start to the first chunk that
          carried text, or ``None`` if no such chunk arrived.
        * ``itl_list`` - one per-token latency estimate per text chunk: the
          time since the previous text chunk (or since request start for the
          first one) divided by the number of tokens in that chunk.
    """
    prompt = "Tell me a story about this image."
    if image_url:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
    else:
        messages = [{"role": "user", "content": prompt}]

    # Only forward credentials/endpoint overrides that were actually supplied.
    additional_api_kwargs = {}
    if args.api_key:
        additional_api_kwargs["api_key"] = args.api_key
    if args.api_base:
        additional_api_kwargs["api_base"] = args.api_base

    start = time.time()
    response = completion(
        model=args.model,
        messages=messages,
        stream=True,
        **additional_api_kwargs,
    )

    ttft = None
    itl_list = []
    pieces = []
    last_event = start
    for chunk in response:
        # Defensive: skip chunks that carry no choices instead of raising
        # IndexError on choices[0].
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if not piece:
            continue

        now = time.time()
        if ttft is None:
            ttft = now - start
        pieces.append(piece)

        # Divide by the tokens in *this* chunk. Dividing by the token count
        # of everything received so far would make the latency estimate
        # shrink artificially as the response grows.
        num_tokens = max(len(tokenizer.encode(piece)), 1)
        itl_list.append((now - last_event) / num_tokens)
        last_event = now

    return "".join(pieces), ttft, itl_list
def main(args):
    """Run the load test described by *args* and print latency statistics.

    Requests are sent in batches of ``args.req_per_sec``. Each batch runs
    concurrently on a thread pool; once every request of the batch has
    finished, the function sleeps one second before starting the next batch.
    Failed requests are recorded as strings and appended to
    ``load_test_errors.log``; successful ones contribute to the TTFT / ITL
    statistics printed at the end.

    Args:
        args: Parsed command-line namespace. Reads ``num_total_responses``,
            ``req_per_sec``, ``include_image`` and
            ``randomize_image_dimensions`` (plus whatever
            ``litellm_completion`` reads from it).

    Raises:
        ValueError: If ``args.req_per_sec`` is not a positive integer.
    """
    n = args.num_total_responses
    batch_size = args.req_per_sec  # Requests per batch (one batch per second)
    if batch_size < 1:
        raise ValueError("--req-per-sec must be a positive integer")

    all_results = []
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        for i in range(0, n, batch_size):
            batch_futures = []
            for _ in range(i, min(i + batch_size, n)):
                image_url = None
                if args.include_image:
                    if args.randomize_image_dimensions:
                        y_dimension = np.random.randint(100, 1025)
                    else:
                        y_dimension = 512
                    image_url = f"https://placehold.co/1024x{y_dimension}/png"
                batch_futures.append(
                    executor.submit(
                        litellm_completion, args, tokenizer, image_url
                    )
                )

            # Wait for the batch to complete. A failed request is kept as an
            # error string so one bad response does not abort the whole run.
            for future in batch_futures:
                try:
                    all_results.append(future.result())
                except Exception as exc:
                    all_results.append(f"{type(exc).__name__}: {exc}")

            if i + batch_size < n:
                time.sleep(1)  # Wait 1 second before next batch

    successful_completions = [
        c for c in all_results if isinstance(c, tuple) and len(c) == 3
    ]
    # A completion that streamed no text has ttft None; leave it out of the
    # TTFT sample instead of failing on float(None).
    ttft_list = np.array(
        [float(c[1]) for c in successful_completions if c[1] is not None]
    )
    itl_list_flattened = np.array(
        [float(item) for c in successful_completions for item in c[2]]
    )

    # Append this run's errors to load_test_errors.log
    with open("load_test_errors.log", "a", encoding="utf-8") as error_log:
        for result in all_results:
            if isinstance(result, str):
                error_log.write(result + "\n")

    print(f"Completed requests: {len(successful_completions)}")
    if ttft_list.size == 0 or itl_list_flattened.size == 0:
        # np.percentile raises on an empty array, so report and stop here.
        print("No latency samples collected; skipping TTFT/ITL statistics.")
        return
    print(f"P99 TTFT: {np.percentile(ttft_list, 99)}")
    print(f"Mean TTFT: {np.mean(ttft_list)}")
    print(f"P99 ITL: {np.percentile(itl_list_flattened, 99)}")
    print(f"Mean ITL: {np.mean(itl_list_flattened)}")
if __name__ == "__main__":
    # Command-line entry point: declare the options, reset the error log,
    # then hand the parsed namespace to main().
    cli = argparse.ArgumentParser()

    # String-valued options.
    for flag, default in (
        ("--model", "azure-gpt-3.5"),
        ("--api-base", None),
        ("--api-key", None),
    ):
        cli.add_argument(flag, type=str, default=default)

    # Integer-valued options.
    for flag, default in (
        ("--num-total-responses", 50),
        ("--req-per-sec", 5),
    ):
        cli.add_argument(flag, type=int, default=default)

    # Boolean switches.
    for flag in ("--include-image", "--randomize-image-dimensions"):
        cli.add_argument(flag, action="store_true")

    args = cli.parse_args()

    # Truncate load_test_errors.log so it only holds errors from this run.
    with open("load_test_errors.log", "w"):
        pass

    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment