translate LLM poc (api)
requirements.txt:
aiohttp==3.8.5
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
certifi==2023.7.22
charset-normalizer==3.2.0
colorama==0.4.6
filelock==3.12.2
frozenlist==1.4.0
fsspec==2023.6.0
huggingface-hub==0.16.4
idna==3.4
multidict==6.0.4
numpy==1.25.1
openai==0.27.8
packaging==23.1
PyYAML==6.0.1
regex==2023.6.3
requests==2.31.0
safetensors==0.3.1
tokenizers==0.13.3
tqdm==4.65.0
transformers==4.31.0
typing_extensions==4.7.1
urllib3==2.0.4
yarl==1.9.2
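Install the pinned dependencies with "pip install -r requirements.txt" before running the script.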
system.txt:
You are a translator that translates C code to C#. Don't give hints, just generate the code; don't embed the code in markdown or format the code.
Refactor functions to CamelCase.
Pointers to primitive types should be converted to SpanPtr<>.
Pointers to non-primitive types are references.
Don't use the method Slice; SpanPtr has overloads for "+", "-", "++" and "--".
Keep memcpy and memmove functions.
Don't use unsafe.
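For instance, under these rules a C parameter like "char *buf" should come back as "SpanPtr<char> buf", advancing it would be written "buf++" rather than "buf = buf.Slice(1)", and a function named "my_func" would be renamed "MyFunc". (This is an illustrative expectation only; the actual output depends on the model.)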
The translation script (Python):
import os
import openai
from transformers import GPT2Tokenizer

# The OpenAI GPT-2 tokenizer is the same as the GPT-3 tokenizer;
# we use it to count the number of tokens in the text.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
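# e.g. len(tokenizer.encode("int x = 0;")) gives the token count of that
# line; these counts drive the batching in group_chunks below.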
openai.api_key = os.getenv("OPENAI_API_KEY")

# ANSI escape sequences for colored output and cursor control
esc = "\x1b"
red = f"{esc}[31m"
green = f"{esc}[92m"
blue = f"{esc}[94m"
white = f"{esc}[37m"
yellow = f"{esc}[33m"
zero_cursor = f"{esc}[2;0H"
save_cursor = f"{esc}[s"
restore_cursor = f"{esc}[u"

with open("system.txt", "r") as system_file:
    # read the whole prompt as a single string
    system = system_file.read()
def prompter(argument):
    cur_prompt = {"role": "user", "content": argument}
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        # model="gpt-4",
        messages=[
            {"role": "system", "content": system},
            cur_prompt,
        ],
    )
    # retrieve the model's message from the response
    return response["choices"][0]["message"]["content"]
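# Note: this uses the legacy pre-1.0 OpenAI client (openai==0.27.8 in
# requirements.txt); openai.ChatCompletion and the dict-style response
# access above were removed in the 1.x library.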
# adapted from:
# https://github.com/openai/openai-cookbook/blob/main/examples/book_translation/translate_latex_book.ipynb
def group_chunks(chunks, ntokens, max_len=1000, hard_max_len=3000):
    """
    Group very short chunks, to form approximately page-long chunks.
    """
    batches = []
    cur_batch = ""
    cur_tokens = 0
    # iterate over chunks, and group the short ones together
    for chunk, ntoken in zip(chunks, ntokens):
        # discard chunks that exceed the hard max length
        if ntoken > hard_max_len:
            print(f"Warning: chunk discarded for being too long ({ntoken} tokens > {hard_max_len} token limit). Preview: '{chunk[:50]}...'")
            continue
        # if there is room in the current batch, add the new chunk
        if cur_tokens + 1 + ntoken <= max_len:
            cur_batch += chunk
            cur_tokens += 1 + ntoken  # adds 1 token for the newline
        # otherwise, record the batch and start a new one
        else:
            batches.append(cur_batch)
            cur_batch = chunk
            cur_tokens = ntoken
    if cur_batch:  # add the last batch if it's not empty
        batches.append(cur_batch)
    return batches
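# Illustrative example (hypothetical inputs): with max_len=4, two one-token
# lines are grouped together (each costs its tokens plus 1 for the newline),
# while a 10-token line starts a new batch:
#   group_chunks(["a\n", "b\n", "c\n"], [1, 1, 10], max_len=4)
#   -> ["a\nb\n", "c\n"]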
def main():
    with open("source.c", "r") as source_file:
        chunks = source_file.readlines()
    with open("target.cs", "w") as target_file:
        target_file.write("/* automatic code translation from C to C# */\n")
    ntokens = []
    for chunk in chunks:
        ntokens.append(len(tokenizer.encode(chunk)))
    print(f"{white}longest number of tokens in a single line: {max(ntokens)}")
    chunks = group_chunks(chunks, ntokens)

    def status():
        # reads the loop variable i from the enclosing scope at call time
        print(f"{save_cursor}{zero_cursor}{yellow}chunk {i} / {len(chunks)}{restore_cursor}")

    for i in range(len(chunks)):
        code_slice = chunks[i]
        print(f"{green}{code_slice}\n")
        status()
        translation = prompter(code_slice)
        print(f"{blue}{translation}\n")
        status()
        if translation:
            with open("target.cs", "a") as target_file:
                target_file.write(f"{translation}\n")
        else:
            print(f"{red}Failed to translate chunk {i}")
            break

if __name__ == "__main__":
    main()
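To run the script, set the OPENAI_API_KEY environment variable, put the C source in source.c and the prompt above in system.txt next to the script, and run it with Python 3; the translated C# is appended chunk by chunk to target.cs.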