Skip to content

Instantly share code, notes, and snippets.

@rom1504
Created January 26, 2025 20:49
Show Gist options
  • Save rom1504/73561eefe3d69d27833af14d63c93431 to your computer and use it in GitHub Desktop.
Save rom1504/73561eefe3d69d27833af14d63c93431 to your computer and use it in GitHub Desktop.
join wikivg and mcdata proto and add descriptions

generated by talking with chatgpt

I instantly hit rate limits, so I'm not sure it really works, but something like this should work.

import os
import json
import time
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Pull your API key from an environment variable (replace with your var name if different).
# NOTE(review): os.getenv returns None when unset, so a missing key only surfaces as an
# authentication error on the first API call — consider failing fast here instead.
openai.api_key = os.getenv("OPENAI_API_KEY")
def split_content_into_packets(content):
    """
    Split content into distinct packets by identifying boundaries using OpenAI,
    handling cases where packets span multiple chunks.

    Args:
        content (str): The file content to split.

    Returns:
        list: List of individual packets as strings.
    """
    chunk_size = 8000  # Define a safe chunk size for OpenAI input
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    packets = []
    buffer = ""  # Buffer to hold incomplete packets
    print(f"Number of API calls to be made: {len(chunks)}")
    for idx, chunk in enumerate(chunks):
        # Include the buffer from the previous chunk to handle incomplete packets
        chunk_with_buffer = buffer + chunk
        # The prompt requests JSON output with "packets" and "buffer"
        prompt = f"""
You are analyzing a file containing protocol or code-like definitions.
Your task is to split the input into distinct packets or sections.
Each packet begins with a distinctive identifier or comment (e.g., '# MC:')
and ends when the next identifier or section starts.
Return your result in valid JSON with two keys: "packets" and "buffer".
- "packets" should be a list of the complete packets you identified in this chunk.
- "buffer" should be a string with any incomplete packet that needs to carry over.
Input:
{chunk_with_buffer}
"""
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",  # or your custom fine-tuned model
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert in parsing structured files."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    },
                ],
                temperature=0,
            )
            # BUGFIX: the openai>=1.0 client returns an object, not a dict —
            # subscripting it ([...]) raises TypeError; use attribute access.
            response_text = response.choices[0].message.content.strip()
            # Models often wrap JSON in a markdown code fence; strip it before parsing.
            if response_text.startswith("```"):
                response_text = response_text.strip("`").strip()
                if response_text.startswith("json"):
                    response_text = response_text[4:].strip()
            # Safely parse JSON instead of using eval
            try:
                response_content = json.loads(response_text)
            except json.JSONDecodeError:
                print(f"Could not decode JSON for chunk {idx + 1}. Got:\n{response_text}")
                continue
            # Extract the packets and buffer, if present
            if "packets" in response_content and isinstance(response_content["packets"], list):
                packets.extend(response_content["packets"])
            else:
                print(f"Warning: No 'packets' list found in chunk {idx + 1}")
            if "buffer" in response_content and isinstance(response_content["buffer"], str):
                buffer = response_content["buffer"]
            else:
                buffer = ""
        except Exception as e:
            # Best-effort: log and move on so one bad chunk doesn't abort the whole split.
            print(f"Error processing chunk {idx + 1}/{len(chunks)}: {e}")
            continue
        time.sleep(1)  # Add a delay between API calls to avoid rate limits
    # Add any remaining content in the buffer as the last packet
    if buffer.strip():
        packets.append(buffer.strip())
    return packets
def generate_embeddings(texts):
    """
    Generate embeddings for a list of texts using OpenAI.

    Args:
        texts (list): List of strings to generate embeddings for.

    Returns:
        list: List of embeddings as numpy arrays, one per input text.
    """
    # Robustness: avoid a pointless (and invalid) API call for empty input.
    if not texts:
        return []
    # BUGFIX: openai.Embedding.create was removed in openai>=1.0; this file
    # already uses the v1 call style elsewhere (openai.chat.completions.create),
    # so use the matching v1 embeddings endpoint and attribute access.
    response = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=texts
    )
    return [np.array(item.embedding) for item in response.data]
def map_packets(proto_packets, protocol_docs):
    """
    Map proto packets to protocol documentation sections using embeddings.

    Each packet is paired with the documentation section whose embedding has
    the highest cosine similarity to the packet's embedding.

    Args:
        proto_packets (list): List of packet definitions from proto.yml.
        protocol_docs (list): List of documentation sections.

    Returns:
        dict: Mapping of proto packets to their most similar documentation sections.
    """
    packet_vectors = generate_embeddings(proto_packets)
    doc_vectors = generate_embeddings(protocol_docs)
    scores = cosine_similarity(packet_vectors, doc_vectors)
    return {
        packet: protocol_docs[int(np.argmax(scores[row]))]
        for row, packet in enumerate(proto_packets)
    }
def add_comments(proto_packets, packet_mapping):
    """
    Add comments to proto packets based on the mapped documentation.

    Args:
        proto_packets (list): List of packet definitions from proto.yml.
        packet_mapping (dict): Mapping of proto packets to documentation.

    Returns:
        list: List of commented packet definitions.
    """
    commented_packets = []
    for proto_packet in proto_packets:
        documentation = packet_mapping[proto_packet]
        prompt = f"""
You are provided with a Minecraft packet definition and its corresponding documentation.
Your task is to add detailed and precise comments to the packet definition based on the documentation.
Documentation:
{documentation}
Packet Definition:
{proto_packet}
Return the packet definition with added comments (in-line or above the relevant fields).
"""
        # BUGFIX: openai.ChatCompletion.create was removed in openai>=1.0; use the
        # v1 call style already used by split_content_into_packets, and attribute
        # access on the response object (dict subscripting raises TypeError).
        response = openai.chat.completions.create(
            model="gpt-4",  # or "gpt-4o-mini" if you have a more powerful custom model
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant for enhancing code with comments based on documentation."
                },
                {
                    "role": "user",
                    "content": prompt
                },
            ],
            temperature=0,
        )
        commented_packet = response.choices[0].message.content.strip()
        commented_packets.append(commented_packet)
    return commented_packets
if __name__ == "__main__":
    proto_path = "proto.yml"
    protocol_doc_path = "wikivg.txt"

    # Read proto.yml content
    with open(proto_path, "r", encoding="utf-8") as f:
        proto_content = f.read()

    # Split proto.yml into packets
    proto_packets = split_content_into_packets(proto_content)
    print("Proto packets split complete. Number of packets:", len(proto_packets))
    # BUGFIX: removed a stray debug `exit()` here that aborted the script after
    # the first split, so the mapping/commenting pipeline below never ran.

    # Read wikivg.txt content
    with open(protocol_doc_path, "r", encoding="utf-8") as f:
        protocol_docs_content = f.read()

    # Split wikivg.txt into sections
    protocol_docs = split_content_into_packets(protocol_docs_content)
    print("Protocol docs split complete. Number of sections:", len(protocol_docs))

    # Map proto packets to documentation
    packet_mapping = map_packets(proto_packets, protocol_docs)

    # Generate commented packets
    commented_packets = add_comments(proto_packets, packet_mapping)

    # Write output
    output_path = "proto_commented.yml"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(commented_packets))
    print(f"Commented proto.yml saved to {output_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment