Generated by talking with ChatGPT.
I instantly hit rate limits, so I'm not sure it really works, but something like this should work.
import os
import json
import time

import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Pull the API key from an environment variable (change the variable name if yours differs).
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
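
# The note up top mentions hitting rate limits immediately. One mitigation is a
# small retry wrapper with exponential backoff. A minimal sketch; the
# max_retries/base_delay defaults are arbitrary assumptions to tune:
def with_backoff(call, max_retries=5, base_delay=2.0):
    """Retry a zero-argument callable with exponential backoff on rate-limit errors."""
    for attempt in range(max_retries):
        try:
            return call()
        except openai.RateLimitError:
            wait = base_delay * (2 ** attempt)
            print(f"Rate limited; retrying in {wait:.0f}s...")
            time.sleep(wait)
    raise RuntimeError("Still rate limited after retries")

# Usage: with_backoff(lambda: client.chat.completions.create(...))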

def split_content_into_packets(content):
    """
    Split content into distinct packets by identifying boundaries with OpenAI,
    handling cases where packets span multiple chunks.

    Args:
        content (str): The file content to split.

    Returns:
        list: Individual packets as strings.
    """
    chunk_size = 8000  # A conservative chunk size for a single OpenAI request
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    packets = []
    buffer = ""  # Holds any incomplete packet carried over from the previous chunk

    print(f"Number of API calls to be made: {len(chunks)}")
    for idx, chunk in enumerate(chunks):
        # Prepend the buffer from the previous chunk so split packets are reassembled
        chunk_with_buffer = buffer + chunk

        # The prompt requests JSON output with "packets" and "buffer" keys
        prompt = f"""
        You are analyzing a file containing protocol or code-like definitions.
        Your task is to split the input into distinct packets or sections.
        Each packet begins with a distinctive identifier or comment (e.g., '# MC:')
        and ends when the next identifier or section starts.

        Return your result in valid JSON with two keys: "packets" and "buffer".
        - "packets" should be a list of the complete packets you identified in this chunk.
        - "buffer" should be a string with any incomplete packet that needs to carry over.

        Input:
        {chunk_with_buffer}
        """
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",  # or your own fine-tuned model
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert in parsing structured files.",
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                temperature=0,
                response_format={"type": "json_object"},  # ask the API for strict JSON output
            )
            response_text = response.choices[0].message.content.strip()

            # Parse the JSON safely instead of using eval
            try:
                response_content = json.loads(response_text)
            except json.JSONDecodeError:
                print(f"Could not decode JSON for chunk {idx + 1}. Got:\n{response_text}")
                continue

            # Extract the packets and buffer, if present
            if "packets" in response_content and isinstance(response_content["packets"], list):
                packets.extend(response_content["packets"])
            else:
                print(f"Warning: no 'packets' list found in chunk {idx + 1}")

            if "buffer" in response_content and isinstance(response_content["buffer"], str):
                buffer = response_content["buffer"]
            else:
                buffer = ""
        except Exception as e:
            print(f"Error processing chunk {idx + 1}/{len(chunks)}: {e}")
            continue

        time.sleep(1)  # Pause between API calls to avoid rate limits
    # Add any remaining content in the buffer as the final packet
    if buffer.strip():
        packets.append(buffer.strip())

    return packets

def generate_embeddings(texts):
    """
    Generate embeddings for a list of texts using OpenAI.

    Args:
        texts (list): Strings to embed.

    Returns:
        list: Embeddings as numpy arrays.
    """
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=texts,
    )
    return [np.array(item.embedding) for item in response.data]
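
# The embeddings endpoint caps how many inputs a single request can carry, so a
# long packet list may need batching. A minimal sketch; the batch size of 100
# is an arbitrary assumption:
def generate_embeddings_batched(texts, batch_size=100):
    """Embed texts in batches to stay under per-request input limits."""
    embeddings = []
    for start in range(0, len(texts), batch_size):
        embeddings.extend(generate_embeddings(texts[start:start + batch_size]))
    return embeddings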

def map_packets(proto_packets, protocol_docs):
    """
    Map proto packets to protocol documentation sections using embeddings.

    Args:
        proto_packets (list): Packet definitions from proto.yml.
        protocol_docs (list): Documentation sections.

    Returns:
        dict: Mapping of each proto packet to its most similar documentation section.
    """
    proto_embeddings = generate_embeddings(proto_packets)
    doc_embeddings = generate_embeddings(protocol_docs)
    similarity_matrix = cosine_similarity(proto_embeddings, doc_embeddings)

    packet_mapping = {}
    for i, proto_packet in enumerate(proto_packets):
        most_similar_idx = np.argmax(similarity_matrix[i])
        packet_mapping[proto_packet] = protocol_docs[most_similar_idx]
    return packet_mapping
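
# np.argmax always picks something, even when no documentation section is a
# good match. A minimal thresholded variant; the 0.8 cutoff is an arbitrary
# assumption to tune against your data:
def map_packets_with_threshold(proto_packets, protocol_docs, min_similarity=0.8):
    """Like map_packets, but maps a packet to None when its best match is weak."""
    similarity_matrix = cosine_similarity(
        generate_embeddings(proto_packets), generate_embeddings(protocol_docs)
    )
    mapping = {}
    for i, proto_packet in enumerate(proto_packets):
        best = int(np.argmax(similarity_matrix[i]))
        score = similarity_matrix[i][best]
        mapping[proto_packet] = protocol_docs[best] if score >= min_similarity else None
    return mapping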

def add_comments(proto_packets, packet_mapping):
    """
    Add comments to proto packets based on the mapped documentation.

    Args:
        proto_packets (list): Packet definitions from proto.yml.
        packet_mapping (dict): Mapping of proto packets to documentation.

    Returns:
        list: Commented packet definitions.
    """
    commented_packets = []
    for proto_packet in proto_packets:
        documentation = packet_mapping[proto_packet]
        prompt = f"""
        You are provided with a Minecraft packet definition and its corresponding documentation.
        Your task is to add detailed and precise comments to the packet definition based on the documentation.

        Documentation:
        {documentation}

        Packet Definition:
        {proto_packet}

        Return the packet definition with added comments (inline or above the relevant fields).
        """
        response = client.chat.completions.create(
            model="gpt-4",  # or "gpt-4o-mini" to keep costs down
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant for enhancing code with comments based on documentation.",
                },
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            temperature=0,
        )
        commented_packet = response.choices[0].message.content.strip()
        commented_packets.append(commented_packet)
        time.sleep(1)  # Same pacing as the splitter to stay under rate limits
    return commented_packets

if __name__ == "__main__":
    proto_path = "proto.yml"
    protocol_doc_path = "wikivg.txt"

    # Read proto.yml content
    with open(proto_path, "r", encoding="utf-8") as f:
        proto_content = f.read()

    # Split proto.yml into packets
    proto_packets = split_content_into_packets(proto_content)
    print("Proto packets split complete. Number of packets:", len(proto_packets))

    # Read wikivg.txt content
    with open(protocol_doc_path, "r", encoding="utf-8") as f:
        protocol_docs_content = f.read()

    # Split wikivg.txt into sections
    protocol_docs = split_content_into_packets(protocol_docs_content)
    print("Protocol docs split complete. Number of sections:", len(protocol_docs))

    # Map proto packets to documentation
    packet_mapping = map_packets(proto_packets, protocol_docs)

    # Generate commented packets
    commented_packets = add_comments(proto_packets, packet_mapping)

    # Write the commented output
    output_path = "proto_commented.yml"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(commented_packets))

    print(f"Commented proto.yml saved to {output_path}")