Translation of grist-help into French using GPT-4o
# !IMPORTANT
# Switch the model from gpt-4o to gpt-4o-mini when the translated file is larger than 4096 tokens
# (a sketch of automating this choice follows the script)
import os
import openai
from pathlib import Path
import json
import argparse
import datetime
from openai.types.batch import Batch

client = openai.Client(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def create_batch_jsonl_item_for_markdown_to_translate(markdown_file: Path) -> str:
    """Create a single jsonl item, consumable by the OpenAI batch API, for each markdown file in the source directory."""
    system_prompt = """
You will be provided with markdown content written in English, and your task is to translate it into French. You must take into account the following glossary in TSV format:
```
user attributes	propriété d'utilisateur
lookup column	cible
Lookup table	table d'appairage
special rules	Règles avancées
access rules	permissions avancées
seed rules	règles par défaut
personal site	espace personnel
team site	espace d'équipe
range	intervalle
raw data tables	données sources
record card	vue fiche
widget	vue
currency	devise
sandbox	sandbox
workspace	dossier
table	table
trigger formula	formule d'initialisation
```
"""
    markdown_content = markdown_file.read_text(encoding='utf-8')
    messages = [{
        "role": "system",
        "content": system_prompt
    }, {
        "role": "user",
        "content": markdown_content,
    }]
    return json.dumps({
        "custom_id": str(markdown_file).replace('en', 'fr', 1),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o",
            "temperature": 0.3,
            "top_p": 1,
            "messages": messages,
        }
    })

def create_translation_batch(markdown_files: list[str]):
    """Send a translation jsonl file to the OpenAI batch API."""
    jsonl_items = [
        create_batch_jsonl_item_for_markdown_to_translate(Path(markdown_file)) for markdown_file in markdown_files
    ]
    jsonl_content = "\n".join(jsonl_items)
    batch_input_file = client.files.create(
        file=bytes(jsonl_content, encoding="utf-8"),
        purpose="batch",
    )
    batch_input_file_id = batch_input_file.id
    print('Batch input file ID:', batch_input_file_id)
    input_file_path = f'batch_input-{batch_input_file_id}.jsonl'
    Path(input_file_path).write_text(jsonl_content, encoding='utf-8')
    print(f'Saved jsonl content to {input_file_path}')
    client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )

def print_batch_info(batch: Batch):
    print('-----------------')
    print("Batch ID:", batch.id)
    print("Created At:", datetime.datetime.fromtimestamp(batch.created_at).isoformat().replace('T', ' '))
    print("Status:", batch.status)
    print('Output file ID:', batch.output_file_id)
    print('Errors:', batch.errors)
    print('-----------------')
    return client.batches.retrieve(batch.id)

if __name__ == "__main__":
    """
    Usage:
        python batch.py upload md_file1 md_file2 md_file3...
        python batch.py check batch_id|all
        python batch.py download file_id --write
    """
    parser = argparse.ArgumentParser(description='Translate grist-help markdown files into French through the OpenAI batch API.')
    parser.add_argument('command', type=str, help='upload|check|download')
    parser.add_argument('args', nargs='+', help='List of markdown files or batch id')
    parser.add_argument('--write', action='store_true', help='Write the downloaded content to a file', default=False)
    parser.add_argument('--raw', action='store_true', help='Show raw batch info', default=False)
    args = parser.parse_args()
    if args.command == 'upload':
        create_translation_batch(args.args)
    elif args.command == 'check':
        if args.args[0] == 'all':
            batches = client.batches.list().data
            batches.sort(key=lambda x: x.created_at, reverse=False)
            for batch in batches:
                print_batch_info(batch)
        else:
            if args.raw:
                print(client.batches.retrieve(args.args[0]))
            else:
                print_batch_info(client.batches.retrieve(args.args[0]))
    elif args.command == 'download':
        file_response = client.files.content(args.args[0])
        responses = [json.loads(line) for line in file_response.text.split('\n') if line]
        print('-----------------')
        for response in responses:
            if response['response']['status_code'] == 200:
                print(f'Treating {response["custom_id"]}')
                if args.write:
                    with open(response['custom_id'], 'w', encoding='utf-8') as f:
                        f.write(response['response']['body']['choices'][0]['message']['content'])
                    print(f'Saved to {response["custom_id"]}')
                else:
                    print(response['response']['body']['choices'][0]['message']['content'])
            else:
                print('Error:', response['response']['error'])
            print('-----------------')
    else:
        print('Invalid command')
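
The note at the top of the script asks you to swap the model by hand when a file is too long for gpt-4o. If you would rather automate that choice, a minimal sketch along the following lines could feed the "model" field in create_batch_jsonl_item_for_markdown_to_translate. The pick_model helper, the 4096-token threshold, and the use of tiktoken for counting are illustrative assumptions, not part of the original gist; tiktoken has to be installed separately (pip install tiktoken).

```
import tiktoken
from pathlib import Path


def pick_model(markdown_file: Path, threshold: int = 4096) -> str:
    """Return "gpt-4o-mini" when the source file alone already exceeds the assumed
    token budget for gpt-4o, otherwise "gpt-4o". The threshold is an assumption."""
    # gpt-4o is mapped to the o200k_base encoding in recent tiktoken releases.
    encoding = tiktoken.encoding_for_model("gpt-4o")
    token_count = len(encoding.encode(markdown_file.read_text(encoding="utf-8")))
    return "gpt-4o-mini" if token_count > threshold else "gpt-4o"
```

Counting the English source only approximates the length of the French output, so a file just under the threshold may still produce a translation longer than the smaller output window.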