Generated by talking with ChatGPT.

I instantly hit rate limits, so I'm not sure it really works, but something like this should work.
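Since the rate limits were the first thing to go wrong, the obvious fix is to wrap every API call in a retry with exponential backoff instead of relying on a fixed `time.sleep(1)`. A minimal sketch, assuming the `openai` v1 SDK used in the script below; the `call_with_backoff` name and the retry numbers are my own choices, tune to taste:

```python
import random
import time

from openai import RateLimitError


def call_with_backoff(fn, *args, max_retries=5, base_delay=2.0, **kwargs):
    """Call fn(*args, **kwargs), retrying on rate limits with exponential backoff."""
    for attempt in range(max_retries):
        try:
            return fn(*args, **kwargs)
        except RateLimitError:
            if attempt == max_retries - 1:
                raise  # out of retries, let the caller see the error
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)  # jitter
            print(f"Rate limited, retrying in {delay:.1f}s ({attempt + 1}/{max_retries})")
            time.sleep(delay)
```

You'd then call e.g. `response = call_with_backoff(client.chat.completions.create, model="gpt-4o-mini", messages=messages, temperature=0)`.

The script: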
```python
import os
import json
import time

import numpy as np
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Pull your API key from an environment variable (replace with your var name if different).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def split_content_into_packets(content):
    """
    Split content into distinct packets by identifying boundaries using OpenAI,
    handling cases where packets span multiple chunks.

    Args:
        content (str): The file content to split.

    Returns:
        list: List of individual packets as strings.
    """
    chunk_size = 8000  # Define a safe chunk size for OpenAI input
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    packets = []
    buffer = ""  # Buffer to hold incomplete packets
    print(f"Number of API calls to be made: {len(chunks)}")

    for idx, chunk in enumerate(chunks):
        # Include the buffer from the previous chunk to handle incomplete packets
        chunk_with_buffer = buffer + chunk

        # The prompt requests JSON output with "packets" and "buffer"
        prompt = f"""
You are analyzing a file containing protocol or code-like definitions.
Your task is to split the input into distinct packets or sections.
Each packet begins with a distinctive identifier or comment (e.g., '# MC:')
and ends when the next identifier or section starts.

Return your result in valid JSON with two keys: "packets" and "buffer".
- "packets" should be a list of the complete packets you identified in this chunk.
- "buffer" should be a string with any incomplete packet that needs to carry over.

Input:
{chunk_with_buffer}
"""
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",  # or your custom fine-tuned model
                messages=[
                    {"role": "system", "content": "You are an expert in parsing structured files."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0,
            )
            response_text = response.choices[0].message.content.strip()

            # Safely parse JSON instead of using eval
            try:
                response_content = json.loads(response_text)
            except json.JSONDecodeError:
                print(f"Could not decode JSON for chunk {idx + 1}. Got:\n{response_text}")
                continue

            # Extract the packets and buffer, if present
            if "packets" in response_content and isinstance(response_content["packets"], list):
                packets.extend(response_content["packets"])
            else:
                print(f"Warning: No 'packets' list found in chunk {idx + 1}")
            if "buffer" in response_content and isinstance(response_content["buffer"], str):
                buffer = response_content["buffer"]
            else:
                buffer = ""
        except Exception as e:
            print(f"Error processing chunk {idx + 1}/{len(chunks)}: {e}")
            continue

        time.sleep(1)  # Add a delay between API calls to avoid rate limits

    # Add any remaining content in the buffer as the last packet
    if buffer.strip():
        packets.append(buffer.strip())

    return packets

def generate_embeddings(texts):
    """
    Generate embeddings for a list of texts using OpenAI.

    Args:
        texts (list): List of strings to generate embeddings for.

    Returns:
        list: List of embeddings as numpy arrays.
    """
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=texts,
    )
    return [np.array(item.embedding) for item in response.data]

def map_packets(proto_packets, protocol_docs):
    """
    Map proto packets to protocol documentation sections using embeddings.

    Args:
        proto_packets (list): List of packet definitions from proto.yml.
        protocol_docs (list): List of documentation sections.

    Returns:
        dict: Mapping of proto packets to their most similar documentation sections.
    """
    proto_embeddings = generate_embeddings(proto_packets)
    doc_embeddings = generate_embeddings(protocol_docs)
    similarity_matrix = cosine_similarity(proto_embeddings, doc_embeddings)

    packet_mapping = {}
    for i, proto_packet in enumerate(proto_packets):
        most_similar_idx = np.argmax(similarity_matrix[i])
        packet_mapping[proto_packet] = protocol_docs[most_similar_idx]
    return packet_mapping

def add_comments(proto_packets, packet_mapping):
    """
    Add comments to proto packets based on the mapped documentation.

    Args:
        proto_packets (list): List of packet definitions from proto.yml.
        packet_mapping (dict): Mapping of proto packets to documentation.

    Returns:
        list: List of commented packet definitions.
    """
    commented_packets = []
    for proto_packet in proto_packets:
        documentation = packet_mapping[proto_packet]
        prompt = f"""
You are provided with a Minecraft packet definition and its corresponding documentation.
Your task is to add detailed and precise comments to the packet definition based on the documentation.

Documentation:
{documentation}

Packet Definition:
{proto_packet}

Return the packet definition with added comments (in-line or above the relevant fields).
"""
        response = client.chat.completions.create(
            model="gpt-4",  # or "gpt-4o-mini" to keep costs down
            messages=[
                {"role": "system", "content": "You are a helpful assistant for enhancing code with comments based on documentation."},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
        )
        commented_packet = response.choices[0].message.content.strip()
        commented_packets.append(commented_packet)
    return commented_packets

if __name__ == "__main__":
    proto_path = "proto.yml"
    protocol_doc_path = "wikivg.txt"

    # Read proto.yml content
    with open(proto_path, "r", encoding="utf-8") as f:
        proto_content = f.read()

    # Split proto.yml into packets
    proto_packets = split_content_into_packets(proto_content)
    print("Proto packets split complete. Number of packets:", len(proto_packets))

    # Read wikivg.txt content
    with open(protocol_doc_path, "r", encoding="utf-8") as f:
        protocol_docs_content = f.read()

    # Split wikivg.txt into sections
    protocol_docs = split_content_into_packets(protocol_docs_content)
    print("Protocol docs split complete. Number of sections:", len(protocol_docs))

    # Map proto packets to documentation
    packet_mapping = map_packets(proto_packets, protocol_docs)

    # Generate commented packets
    commented_packets = add_comments(proto_packets, packet_mapping)

    # Write output
    output_path = "proto_commented.yml"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(commented_packets))
    print(f"Commented proto.yml saved to {output_path}")
```