Skip to content

Instantly share code, notes, and snippets.

@grahama1970
Last active December 20, 2024 14:44
Show Gist options
  • Save grahama1970/ebe5fa78eab07efea16c62b4dce4f4f7 to your computer and use it in GitHub Desktop.
Save grahama1970/ebe5fa78eab07efea16c62b4dce4f4f7 to your computer and use it in GitHub Desktop.
Comparing lorax requests to OpenAI call: curl = 11 seconds, requests = 19 seconds, OpenAI: 30 seconds
2024-12-20 09:39:32.640 | INFO | __main__:run_curl_version:10 -
=== Running curl version ===
2024-12-20 09:39:32.641 | INFO | __main__:run_curl_version:43 - Initial request time: 0.00 seconds
2024-12-20 09:39:32.641 | INFO | __main__:run_curl_version:47 - Response tokens:
To determine the number of rugby players on a touch rugby team, we can refer to the relevant section of the document.
1. **Understanding Team Composition**: The document states that a team consists of a maximum of 14 players. However, this number includes reserves, meaning that only six (6) players are allowed on the field at any given time during a match.
2. **Player Limitation**: Additionally, teams are encouraged to include mixed genders (four males and four females), indicating that2024-12-20 09:39:43.862 | INFO | __main__:run_curl_version:68 -
Tokens generated: 100
2024-12-20 09:39:43.863 | INFO | __main__:run_curl_version:69 - Token processing time: 11.22 seconds
2024-12-20 09:39:43.863 | INFO | __main__:run_curl_version:70 - Total execution time: 11.22 seconds
2024-12-20 09:39:43.864 | INFO | __main__:<module>:162 -
==================================================
2024-12-20 09:39:43.865 | INFO | __main__:run_requests_version:74 -
=== Running requests version ===
2024-12-20 09:39:43.873 | INFO | __main__:run_requests_version:96 - Initial request time: 0.01 seconds
2024-12-20 09:39:43.873 | INFO | __main__:run_requests_version:100 - Response tokens:
To determine how many rugby players are on a touch rugby team, we can refer to the rules provided in the document.
1. A team consists of a maximum of 14 players.
2. Each team has a maximum of 6 players on the field at any time.
3. If fewer than 4 players are on the field due to sin bins, dismissals, or injuries, the team will be declared to have a player less.
4. No more than 32024-12-20 09:40:03.338 | INFO | __main__:run_requests_version:118 -
Tokens generated: 100
2024-12-20 09:40:03.339 | INFO | __main__:run_requests_version:119 - Token processing time: 19.47 seconds
2024-12-20 09:40:03.339 | INFO | __main__:run_requests_version:120 - Total execution time: 19.47 seconds
2024-12-20 09:40:03.339 | INFO | __main__:<module>:164 -
==================================================
2024-12-20 09:40:03.339 | INFO | __main__:run_openai_version:124 -
=== Running OpenAI client version ===
2024-12-20 09:40:03.417 | INFO | __main__:run_openai_version:134 - Response tokens:
To determine how many rugby players are on a touch rugby team, we can refer to the relevant section of the document.
1. The document states that a team consists of a maximum of 14 players.
2. It also mentions that teams are formed by mixing community and competition players, suggesting that the number of players per team is consistent.
3. Therefore, the total number of players on a touch rugby team is 14.
So, the final answer is: 142024-12-20 09:40:31.319 | INFO | __main__:run_openai_version:152 -
Tokens generated: 98
2024-12-20 09:40:31.320 | INFO | __main__:run_openai_version:153 - Total execution time: 27.98 seconds
2024-12-20 09:40:31.320 | INFO | __main__:<module>:176 -
=== Performance Comparison ===
+----------+------------+--------------------+
| Method | Time (s) | Diff vs Curl (s) |
+==========+============+====================+
| Curl | 11.22 | 0 |
+----------+------------+--------------------+
| Requests | 19.47 | 8.25 |
+----------+------------+--------------------+
| OpenAI | 27.98 | 16.76 |
+----------+------------+--------------------+
import json
import time
import requests
import subprocess
from openai import OpenAI
from loguru import logger
def run_curl_version():
    """Benchmark a streaming chat completion issued through the ``curl`` CLI.

    Spawns ``curl`` against the local server, reads the SSE stream line by
    line, counts content-bearing delta chunks, and logs timing.

    Returns:
        float: total wall-clock execution time in seconds.
    """
    logger.info("\n=== Running curl version ===")
    start_time = time.time()
    payload = {
        "model": "Trelis/Meta-Llama-3.1-8B-Instruct-touch-rugby-2-adapters",
        "messages": [
            {"role": "system", "content": "You are a friendly chatbot"},
            {"role": "user", "content": "How many rugby players are on a touch rugby team?"},
        ],
        "max_tokens": 100,
        "stream": True,
    }
    request_start = time.time()
    # Argument list with the default shell=False avoids shell-quoting pitfalls
    # and injection risk; json.dumps guarantees a well-formed request body.
    process = subprocess.Popen(
        [
            "curl",
            "http://127.0.0.1:30002/v1/chat/completions",
            "-X", "POST",
            "-H", "Content-Type: application/json",
            "-d", json.dumps(payload),
        ],
        stdout=subprocess.PIPE,
        # Discard stderr instead of piping it: a PIPE that is never read can
        # fill up and deadlock the child (curl writes its progress meter there).
        stderr=subprocess.DEVNULL,
        text=True,
    )
    # NOTE(review): Popen returns as soon as the child is spawned, so this
    # measures process startup only, not the HTTP round trip.
    request_time = time.time() - request_start
    logger.info(f"Initial request time: {request_time:.2f} seconds")
    tokens_start = time.time()
    token_count = 0
    logger.info("Response tokens:")
    # Iterating the pipe yields lines until EOF (curl exiting).
    for line in process.stdout:
        if line.startswith('data: '):
            line = line[6:]  # strip the SSE "data: " prefix
        if line.strip() == '[DONE]':
            continue
        try:
            chunk = json.loads(line)
            if chunk['choices'][0]['delta'].get('content'):
                token_count += 1
                print(chunk['choices'][0]['delta']['content'], end='', flush=True)
        except json.JSONDecodeError:
            # Non-JSON lines (blank keep-alives, partial output) are skipped.
            continue
    process.wait()
    tokens_time = time.time() - tokens_start
    total_time = time.time() - start_time
    logger.info(f"\nTokens generated: {token_count}")
    logger.info(f"Token processing time: {tokens_time:.2f} seconds")
    logger.info(f"Total execution time: {total_time:.2f} seconds")
    return total_time
def run_requests_version():
    """Benchmark the same streaming chat completion via the ``requests`` library.

    Posts the request with ``stream=True``, iterates the SSE lines, counts
    content-bearing delta chunks, and logs timing.

    Returns:
        float: total wall-clock execution time in seconds.
    """
    logger.info("\n=== Running requests version ===")
    start_time = time.time()
    request_start = time.time()
    # ``with`` guarantees the underlying connection is released even if
    # iteration raises (the original leaked the streaming response); the
    # timeout keeps an unreachable server from hanging the benchmark forever.
    with requests.post(
        "http://127.0.0.1:30002/v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json={
            "model": "Trelis/Meta-Llama-3.1-8B-Instruct-touch-rugby-2-adapters",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a friendly chatbot",
                },
                {"role": "user", "content": "How many rugby players are on a touch rugby team?"},
            ],
            "max_tokens": 100,
            "stream": True
        },
        stream=True,
        timeout=(10, 120),  # (connect, read) seconds
    ) as response:
        response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error body
        request_time = time.time() - request_start
        logger.info(f"Initial request time: {request_time:.2f} seconds")
        tokens_start = time.time()
        token_count = 0
        logger.info("Response tokens:")
        for raw_line in response.iter_lines():
            if not raw_line:
                continue  # keep-alive blank lines
            line = raw_line.decode('utf-8')
            if line.startswith('data: '):
                line = line[6:]  # strip the SSE "data: " prefix
            if line.strip() == '[DONE]':
                continue
            try:
                chunk = json.loads(line)
                if chunk['choices'][0]['delta'].get('content'):
                    token_count += 1
                    print(chunk['choices'][0]['delta']['content'], end='', flush=True)
            except json.JSONDecodeError:
                continue
    tokens_time = time.time() - tokens_start
    total_time = time.time() - start_time
    logger.info(f"\nTokens generated: {token_count}")
    logger.info(f"Token processing time: {tokens_time:.2f} seconds")
    logger.info(f"Total execution time: {total_time:.2f} seconds")
    return total_time
def run_openai_version():
    """Benchmark the same streaming chat completion via the OpenAI client.

    Points the client at the local OpenAI-compatible server, consumes the
    streamed chunks, counts content-bearing deltas, and logs timing.

    Returns:
        float: total wall-clock execution time in seconds.
    """
    logger.info("\n=== Running OpenAI client version ===")
    start_time = time.time()
    client = OpenAI(
        api_key="EMPTY",  # local server ignores the key but the client requires one
        base_url="http://127.0.0.1:30002/v1",
    )
    # (Removed unused `client_start` timer from the original.)
    token_count = 0
    logger.info("Response tokens:")
    for chunk in client.chat.completions.create(
        model="Trelis/Meta-Llama-3.1-8B-Instruct-touch-rugby-2-adapters",
        messages=[
            {
                "role": "system",
                "content": "You are a friendly chatbot",
            },
            {"role": "user", "content": "How many rugby players are on a touch rugby team?"},
        ],
        max_tokens=100,
        stream=True
    ):
        if chunk.choices[0].delta.content is not None:
            token_count += 1
            print(chunk.choices[0].delta.content, end="", flush=True)
    total_time = time.time() - start_time
    logger.info(f"\nTokens generated: {token_count}")
    logger.info(f"Total execution time: {total_time:.2f} seconds")
    return total_time
if __name__ == "__main__":
    from tabulate import tabulate
    # Run each benchmark in turn, separated by a divider in the log.
    timings = []
    for label, runner in (
        ("Curl", run_curl_version),
        ("Requests", run_requests_version),
        ("OpenAI", run_openai_version),
    ):
        if timings:
            logger.info("\n" + "=" * 50)
        timings.append((label, runner()))
    # Build the comparison rows relative to the curl baseline.
    baseline = timings[0][1]
    rows = [["Method", "Time (s)", "Diff vs Curl (s)"]]
    rows.append(["Curl", f"{baseline:.2f}", "0.00"])
    for label, elapsed in timings[1:]:
        rows.append([label, f"{elapsed:.2f}", f"{elapsed - baseline:.2f}"])
    logger.info("\n=== Performance Comparison ===")
    print("\n" + tabulate(rows, headers="firstrow", tablefmt="grid"))
=== Performance Comparison ===
+----------+------------+--------------------+
| Method | Time (s) | Diff vs Curl (s) |
+==========+============+====================+
| Curl | 11.22 | 0 |
+----------+------------+--------------------+
| Requests | 19.47 | 8.25 |
+----------+------------+--------------------+
| OpenAI | 27.98 | 16.76 |
+----------+------------+--------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment