@Luiz-Monad
Last active December 28, 2024 15:45
translate LLM poc (api): a proof of concept that uses the OpenAI chat API to translate C source code to C#, one chunk at a time.

requirements.txt:
aiohttp==3.8.5
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
certifi==2023.7.22
charset-normalizer==3.2.0
colorama==0.4.6
filelock==3.12.2
frozenlist==1.4.0
fsspec==2023.6.0
huggingface-hub==0.16.4
idna==3.4
multidict==6.0.4
numpy==1.25.1
openai==0.27.8
packaging==23.1
PyYAML==6.0.1
regex==2023.6.3
requests==2.31.0
safetensors==0.3.1
tokenizers==0.13.3
tqdm==4.65.0
transformers==4.31.0
typing_extensions==4.7.1
urllib3==2.0.4
yarl==1.9.2
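
These pins target the pre-1.0 openai client (module-level ChatCompletion API) and a transformers release that still bundles the GPT-2 tokenizer. As a minimal sketch, assuming the environment was installed from this file, you can confirm that the two libraries the script imports resolve to the pinned versions:

# Sanity check: the script below depends on the 0.27-era openai API
# and on transformers' GPT2Tokenizer, so verify the pinned versions.
import openai
import transformers

print(openai.__version__)        # expected: 0.27.8
print(transformers.__version__)  # expected: 4.31.0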
system.txt, the system prompt sent with every request:

You are a translator that translates C code to C#; don't give hints, just generate the code, and don't embed the code in markdown or format the code.
Refactor functions to CamelCase.
Pointers to primitive types should be converted to SpanPtr<>.
Pointers to non-primitive types are references.
Don't use the Slice method; SpanPtr has overloads for "+", "-", "++" and "--".
Keep the memcpy and memmove functions.
Don't use unsafe.
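
To make the rules concrete, here is an illustrative sketch of the rewrite the prompt asks for; the C fragment and the expected properties listed in the comments are my own assumptions about a typical case, not captured model output:

# Illustrative only: a small C chunk and the shape its translation
# should take under the rules above (the names are hypothetical).
c_chunk = """
void copy_data(unsigned char *dst, unsigned char *src, int n) {
    while (n-- > 0) { *dst++ = *src++; }
}
"""
# Expected properties of the model's answer:
#   - plain C# text with no markdown fences
#   - the function renamed to CamelCase: CopyData
#   - unsigned char* parameters typed as SpanPtr<byte>
#   - pointer stepping via the SpanPtr "++" overload, never Slice
#   - no 'unsafe' keyword anywhere in the output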
The Python driver script:

import os
import openai
from transformers import GPT2Tokenizer

# OpenAI's GPT-2 tokenizer is the same as the GPT-3 tokenizer;
# we use it to count the number of tokens in the text.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

openai.api_key = os.getenv("OPENAI_API_KEY")

# ANSI escape sequences for colored, in-place terminal status output
esc = "\x1b"
red = f"{esc}[31m"
green = f"{esc}[92m"
blue = f"{esc}[94m"
white = f"{esc}[37m"
yellow = f"{esc}[33m"
zero_cursor = f"{esc}[2;0H"
save_cursor = f"{esc}[s"
restore_cursor = f"{esc}[u"

# read() preserves the file's own newlines; joining readlines() with "\n"
# would double them, since readlines() keeps each line's trailing newline
with open("system.txt", "r") as system_file:
    system = system_file.read()
def prompter(argument):
    cur_prompt = {"role": "user", "content": argument}
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        # model="gpt-4",
        messages=[
            {"role": "system", "content": system},
            cur_prompt,
        ]
    )
    # retrieve the model's message from the response
    return response['choices'][0]['message']['content']
# https://github.com/openai/openai-cookbook/blob/main/examples/book_translation/translate_latex_book.ipynb
def group_chunks(chunks, ntokens, max_len=1000, hard_max_len=3000):
    """
    Group very short chunks to form approximately page-long chunks.
    """
    batches = []
    cur_batch = ""
    cur_tokens = 0
    # iterate over chunks, grouping the short ones together
    for chunk, ntoken in zip(chunks, ntokens):
        # discard chunks that exceed the hard maximum length
        if ntoken > hard_max_len:
            print(f"Warning: chunk discarded for being too long "
                  f"({ntoken} tokens > {hard_max_len} token limit). "
                  f"Preview: '{chunk[:50]}...'")
            continue
        # if there is room in the current batch, add the new chunk
        if cur_tokens + 1 + ntoken <= max_len:
            cur_batch += chunk
            cur_tokens += 1 + ntoken  # the extra token accounts for the newline
        # otherwise, record the batch and start a new one
        else:
            batches.append(cur_batch)
            cur_batch = chunk
            cur_tokens = ntoken
    if cur_batch:  # add the last batch if it's not empty
        batches.append(cur_batch)
    return batches
def main():
    with open("source.c", "r") as source_file:
        chunks = source_file.readlines()
    with open("target.cs", "w") as target_file:
        target_file.write("/* automatic code translation from C to C# */\n")
    ntokens = []
    for chunk in chunks:
        ntokens.append(len(tokenizer.encode(chunk)))
    print(f"{white}longest number of tokens in a single line: {max(ntokens)}")
    chunks = group_chunks(chunks, ntokens)

    def status():
        # print progress on a fixed screen line, then restore the cursor
        print(f"{save_cursor}{zero_cursor}{yellow}chunk {i} / {len(chunks)}{restore_cursor}")

    for i in range(0, len(chunks)):
        code_slice = chunks[i]
        print(f"{green}{code_slice}\n")
        status()
        translation = prompter(code_slice)
        print(f"{blue}{translation}\n")
        status()
        if translation:
            # append each translated chunk as soon as it arrives, so a
            # failure partway through does not lose earlier work
            with open("target.cs", "a") as target_file:
                target_file.write(f"{translation}\n")
        else:
            print(f"{red}Failed to translate chunk {i}")
            break

if __name__ == "__main__":
    main()
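
For reference, prompter indexes into the response as response['choices'][0]['message']['content']; in the pinned openai==0.27.x client, ChatCompletion.create returns a dict-like object whose relevant shape looks roughly like this (abridged to the fields the script actually reads):

# Abridged response shape for openai==0.27.x ChatCompletion;
# only the path the script traverses is shown.
response_shape = {
    "choices": [
        {"message": {"role": "assistant", "content": "...translated C#..."}}
    ]
}
print(response_shape["choices"][0]["message"]["content"])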
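
A quick way to see the batching behavior of group_chunks is to run it on synthetic input; the lines and token counts here are made up for illustration, and the function above must be in scope:

# With max_len=8: 0+1+3=4, then 4+1+3=8 (still fits), then 8+1+3=12
# overflows, so the third line starts a second batch.
lines = ["int a;\n", "int b;\n", "int c;\n"]
counts = [3, 3, 3]
print(group_chunks(lines, counts, max_len=8))
# -> ['int a;\nint b;\n', 'int c;\n']

To run the translator itself, set OPENAI_API_KEY, place the C input in source.c next to system.txt, and run the script; each translated chunk is appended to target.cs as it arrives, so partial progress survives a mid-run failure.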