@Lyken17
Last active December 3, 2024 17:34
LiteLLM Debug
ligengz@:~/workspace/VILA-dev$ python serving/lmsys_test.py --model openai/nvila-8b-dev --api-base http://localhost:8000 --req-per-sec 1
/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884
  warnings.warn(
/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/pydantic/main.py:347: UserWarning: Pydantic serializer warnings:
  Expected `str` but got `int` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.
Provider List: https://docs.litellm.ai/docs/providers
Traceback (most recent call last):
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 1683, in __next__
    response = self.model_response_creator(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 889, in model_response_creator
    model_response = ModelResponse(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/types/utils.py", line 907, in __init__
    super().__init__(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/pydantic/main.py", line 176, in __init__
    self.__pydantic_validator__.validate_python(data, self_instance=self)
pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelResponse
id
  Input should be a valid string [type=string_type, input_value=1, input_type=int]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 135, in <module>
    main(args)
  File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 85, in main
    future = litellm_completion(args, tokenizer)
  File "/lustre/fs12/portfolios/nvr/users/ligengz/workspace/VILA-dev/serving/lmsys_test.py", line 45, in litellm_completion
    for chunk in response:
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/streaming_handler.py", line 1737, in __next__
    raise exception_type(
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py", line 2141, in exception_type
    raise e  # it's already mapped
  File "/home/ligengz/anaconda3/envs/hf/lib/python3.10/site-packages/litellm/litellm_core_utils/exception_mapping_utils.py", line 415, in exception_type
    raise APIConnectionError(
litellm.exceptions.APIConnectionError: litellm.APIConnectionError: APIConnectionError: OpenAIException - 1 validation error for ModelResponse
id
  Input should be a valid string [type=string_type, input_value=1, input_type=int]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
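
Reading the traceback: the OpenAI-compatible server at --api-base is returning an integer id (1) in its streaming chunks, while LiteLLM's ModelResponse declares id as a string. Pydantic v2 does not coerce int to str, so LiteLLM's chunk re-assembly fails and the error surfaces as an APIConnectionError; the `litellm.set_verbose=True` hint above should help confirm the raw payload. The failure is reproducible with pydantic alone (a minimal sketch; the Chunk model here is hypothetical, standing in for ModelResponse):

from pydantic import BaseModel

class Chunk(BaseModel):
    id: str  # same constraint ModelResponse places on its id field

Chunk(id=1)  # ValidationError: Input should be a valid string [type=string_type, ...]

The repro script (serving/lmsys_test.py) follows: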
import argparse
import time
import threading
from concurrent.futures import ThreadPoolExecutor
import uuid
import traceback
import numpy as np
from transformers import AutoTokenizer
from litellm import completion
def litellm_completion(args, tokenizer, image_url=None):
    if image_url:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Tell me a story about this image."},
                ],
            },
        ]
    else:
        messages = [
            {"role": "user", "content": "Tell me a story about this image."}
        ]
    start = time.time()
    additional_api_kwargs = {}
    if args.api_key:
        additional_api_kwargs["api_key"] = args.api_key
    if args.api_base:
        additional_api_kwargs["api_base"] = args.api_base
    response = completion(
        model=args.model,
        messages=messages,
        stream=True,
        **additional_api_kwargs,
    )
    ttft = None  # time to first token
    itl_list = []  # inter-token latencies
    content = ""
    for chunk in response:
        if chunk.choices[0].delta.content:
            end_time = time.time()
            if ttft is None:
                ttft = end_time - start
            content += chunk.choices[0].delta.content
            num_tokens = len(tokenizer.encode(content))
            itl_list.append((end_time - start) / num_tokens)
            start = end_time
    print(content); input()  # debug: show the response and pause before returning
    return content, ttft, itl_list
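
# For reference, litellm_completion can be exercised standalone; the model
# name and endpoint below are placeholders matching the command at the top
# of this gist:
#
#   args = argparse.Namespace(
#       model="openai/nvila-8b-dev",
#       api_base="http://localhost:8000",
#       api_key=None,
#   )
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   content, ttft, itl_list = litellm_completion(args, tokenizer)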
def main(args):
    n = args.num_total_responses
    batch_size = args.req_per_sec  # requests per second
    start = time.time()
    all_results = []
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # with ThreadPoolExecutor(max_workers=batch_size) as executor:
    for i in range(0, n, batch_size):
        batch_futures = []
        batch = range(i, min(i + batch_size, n))
        for _ in batch:
            if args.include_image:
                if args.randomize_image_dimensions:
                    y_dimension = np.random.randint(100, 1025)
                else:
                    y_dimension = 512
                image_url = f"https://placehold.co/1024x{y_dimension}/png"
                # future = executor.submit(
                #     litellm_completion, args, tokenizer, image_url
                # )
                future = litellm_completion(args, tokenizer, image_url)
            else:
                # future = executor.submit(litellm_completion, args, tokenizer)
                future = litellm_completion(args, tokenizer)
            batch_futures.append(future)
        exit(0)  # debug: stop after the first batch
        # Wait for batch to complete. With the executor commented out,
        # `batch_futures` holds plain result tuples rather than Future
        # objects, so they are appended directly instead of via .result().
        for future in batch_futures:
            all_results.append(future)
        if i + batch_size < n:
            time.sleep(1)  # wait 1 second before next batch
    successful_completions = [
        c for c in all_results if isinstance(c, tuple) and len(c) == 3
    ]
    ttft_list = np.array([float(c[1]) for c in successful_completions])
    itl_list_flattened = np.array(
        [
            float(item)
            for sublist in [c[2] for c in successful_completions]
            for item in sublist
        ]
    )
    # Write errors to load_test_errors.log
    with open("load_test_errors.log", "a") as error_log:
        for result in all_results:
            if isinstance(result, str):
                error_log.write(result + "\n")
    print(f"Completed requests: {len(successful_completions)}")
    print(f"P99 TTFT: {np.percentile(ttft_list, 99)}")
    print(f"Mean TTFT: {np.mean(ttft_list)}")
    print(f"P99 ITL: {np.percentile(itl_list_flattened, 99)}")
    print(f"Mean ITL: {np.mean(itl_list_flattened)}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="azure-gpt-3.5")
    parser.add_argument("--api-base", type=str, default=None)
    parser.add_argument("--api-key", type=str, default=None)
    parser.add_argument("--num-total-responses", type=int, default=50)
    parser.add_argument("--req-per-sec", type=int, default=5)
    parser.add_argument("--include-image", action="store_true")
    parser.add_argument("--randomize-image-dimensions", action="store_true")
    args = parser.parse_args()
    # Blank out contents of load_test_errors.log
    open("load_test_errors.log", "w").close()
    main(args)
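
Assuming the nvila server builds its own OpenAI-style streaming chunks (not confirmed from this log), the root-cause fix is to emit a string id in every chunk. A minimal sketch of a spec-shaped chunk, with hypothetical helper make_chunk:

import time
import uuid

def make_chunk(delta_text, model):
    # "id" must be a string; an integer here triggers the pydantic
    # ValidationError shown in the traceback above.
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {"index": 0, "delta": {"content": delta_text}, "finish_reason": None}
        ],
    }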