Skip to content

Instantly share code, notes, and snippets.

@rom1504
Created January 26, 2025 20:49
Show Gist options
  • Save rom1504/73561eefe3d69d27833af14d63c93431 to your computer and use it in GitHub Desktop.
Save rom1504/73561eefe3d69d27833af14d63c93431 to your computer and use it in GitHub Desktop.
join wikivg and mcdata proto and add descriptions

generated by talking with chatgpt

I instantly hit rate limits, so I'm not sure it really works, but something like this should work.

import os
import json
import time
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Pull your API key from an environment variable (replace with your var name if different).
# NOTE(review): os.getenv returns None when unset, so a missing key only surfaces as an
# authentication error on the first API call — consider failing fast here instead.
openai.api_key = os.getenv("OPENAI_API_KEY")
def split_content_into_packets(content):
    """
    Split content into distinct packets by identifying boundaries using OpenAI,
    handling cases where packets span multiple chunks.

    Args:
        content (str): The file content to split.

    Returns:
        list: List of individual packets as strings.
    """
    chunk_size = 8000  # Define a safe chunk size for OpenAI input
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    packets = []
    buffer = ""  # Buffer to hold incomplete packets
    print(f"Number of API calls to be made: {len(chunks)}")
    for idx, chunk in enumerate(chunks):
        # Include the buffer from the previous chunk to handle incomplete packets
        chunk_with_buffer = buffer + chunk
        # The prompt requests JSON output with "packets" and "buffer"
        prompt = f"""
You are analyzing a file containing protocol or code-like definitions.
Your task is to split the input into distinct packets or sections.
Each packet begins with a distinctive identifier or comment (e.g., '# MC:')
and ends when the next identifier or section starts.
Return your result in valid JSON with two keys: "packets" and "buffer".
- "packets" should be a list of the complete packets you identified in this chunk.
- "buffer" should be a string with any incomplete packet that needs to carry over.
Input:
{chunk_with_buffer}
"""
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",  # or your custom fine-tuned model
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert in parsing structured files."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    },
                ],
                temperature=0,
            )
            # BUGFIX: the openai>=1.0 client returns an object, not a dict —
            # subscripting it ([...]) raises TypeError; use attribute access.
            response_text = response.choices[0].message.content.strip()
            # Models often wrap JSON in a markdown code fence; strip it before parsing.
            if response_text.startswith("```"):
                response_text = response_text.strip("`").strip()
                if response_text.startswith("json"):
                    response_text = response_text[4:].strip()
            # Safely parse JSON instead of using eval
            try:
                response_content = json.loads(response_text)
            except json.JSONDecodeError:
                print(f"Could not decode JSON for chunk {idx + 1}. Got:\n{response_text}")
                continue
            # Extract the packets and buffer, if present
            if "packets" in response_content and isinstance(response_content["packets"], list):
                packets.extend(response_content["packets"])
            else:
                print(f"Warning: No 'packets' list found in chunk {idx + 1}")
            if "buffer" in response_content and isinstance(response_content["buffer"], str):
                buffer = response_content["buffer"]
            else:
                buffer = ""
        except Exception as e:
            # Best-effort: log and move on so one bad chunk doesn't abort the whole split.
            print(f"Error processing chunk {idx + 1}/{len(chunks)}: {e}")
            continue
        time.sleep(1)  # Add a delay between API calls to avoid rate limits
    # Add any remaining content in the buffer as the last packet
    if buffer.strip():
        packets.append(buffer.strip())
    return packets
def generate_embeddings(texts):
    """
    Generate embeddings for a list of texts using OpenAI.

    Args:
        texts (list): List of strings to generate embeddings for.

    Returns:
        list: List of embeddings as numpy arrays, one per input text.
    """
    # Robustness: avoid a pointless (and invalid) API call for empty input.
    if not texts:
        return []
    # BUGFIX: openai.Embedding.create was removed in openai>=1.0; this file
    # already uses the v1 call style elsewhere (openai.chat.completions.create),
    # so use the matching v1 embeddings endpoint and attribute access.
    response = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=texts
    )
    return [np.array(item.embedding) for item in response.data]
def map_packets(proto_packets, protocol_docs):
    """
    Map proto packets to protocol documentation sections using embeddings.

    Each packet is paired with the documentation section whose embedding has
    the highest cosine similarity to the packet's embedding.

    Args:
        proto_packets (list): List of packet definitions from proto.yml.
        protocol_docs (list): List of documentation sections.

    Returns:
        dict: Mapping of proto packets to their most similar documentation sections.
    """
    packet_vectors = generate_embeddings(proto_packets)
    doc_vectors = generate_embeddings(protocol_docs)
    scores = cosine_similarity(packet_vectors, doc_vectors)
    return {
        packet: protocol_docs[int(np.argmax(scores[row]))]
        for row, packet in enumerate(proto_packets)
    }
def add_comments(proto_packets, packet_mapping):
    """
    Add comments to proto packets based on the mapped documentation.

    Args:
        proto_packets (list): List of packet definitions from proto.yml.
        packet_mapping (dict): Mapping of proto packets to documentation.

    Returns:
        list: List of commented packet definitions.
    """
    commented_packets = []
    for proto_packet in proto_packets:
        documentation = packet_mapping[proto_packet]
        prompt = f"""
You are provided with a Minecraft packet definition and its corresponding documentation.
Your task is to add detailed and precise comments to the packet definition based on the documentation.
Documentation:
{documentation}
Packet Definition:
{proto_packet}
Return the packet definition with added comments (in-line or above the relevant fields).
"""
        # BUGFIX: openai.ChatCompletion.create was removed in openai>=1.0; use the
        # v1 call style already used by split_content_into_packets, and attribute
        # access on the response object (dict subscripting raises TypeError).
        response = openai.chat.completions.create(
            model="gpt-4",  # or "gpt-4o-mini" if you have a more powerful custom model
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant for enhancing code with comments based on documentation."
                },
                {
                    "role": "user",
                    "content": prompt
                },
            ],
            temperature=0,
        )
        commented_packet = response.choices[0].message.content.strip()
        commented_packets.append(commented_packet)
    return commented_packets
if __name__ == "__main__":
    proto_path = "proto.yml"
    protocol_doc_path = "wikivg.txt"

    # Read proto.yml content
    with open(proto_path, "r", encoding="utf-8") as f:
        proto_content = f.read()

    # Split proto.yml into packets
    proto_packets = split_content_into_packets(proto_content)
    print("Proto packets split complete. Number of packets:", len(proto_packets))
    # BUGFIX: removed a stray debug `exit()` here that aborted the script after
    # the first split, so the mapping/commenting pipeline below never ran.

    # Read wikivg.txt content
    with open(protocol_doc_path, "r", encoding="utf-8") as f:
        protocol_docs_content = f.read()

    # Split wikivg.txt into sections
    protocol_docs = split_content_into_packets(protocol_docs_content)
    print("Protocol docs split complete. Number of sections:", len(protocol_docs))

    # Map proto packets to documentation
    packet_mapping = map_packets(proto_packets, protocol_docs)

    # Generate commented packets
    commented_packets = add_comments(proto_packets, packet_mapping)

    # Write output
    output_path = "proto_commented.yml"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(commented_packets))
    print(f"Commented proto.yml saved to {output_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment