Skip to content

Instantly share code, notes, and snippets.

@fflorent
Last active August 21, 2024 08:28
Show Gist options
  • Save fflorent/96c057da668b2d0e1c4b924f882e9448 to your computer and use it in GitHub Desktop.
Save fflorent/96c057da668b2d0e1c4b924f882e9448 to your computer and use it in GitHub Desktop.
Translation of grist-help into French using GPT-4o
# IMPORTANT:
# Switch the model from gpt-4o to gpt-4o-mini when a translated file is larger than 4096 tokens.
import os
import openai
from pathlib import Path
import json
import argparse
import datetime
from openai.types.batch import Batch
# Shared OpenAI client used by every command below; the API key is read from
# the OPENAI_API_KEY environment variable.
client = openai.Client(api_key=os.environ.get("OPENAI_API_KEY"))
# System prompt sent with every translation request. This is runtime text:
# the glossary lines are passed to the model verbatim.
_TRANSLATION_SYSTEM_PROMPT = """
You will be provided with markdown content written in English, and your task is to translate it into French. You must take into account the following glossary in TSV format:
```
user attributes propriété d'utilisateur
lookup column cible
Lookup table table d'appairage
special rules Règles avancées
access rules permissions avancées
seed rules règles par défaut
personal site espace personnel
team site espace d'équipe
range intervalle
raw data tables données sources
record card vue fiche
widget vue
currency devise
sandbox sandbox
workspace dossier
table table
trigger formula formule d'initialisation
```
"""


def _french_custom_id(markdown_file: Path) -> str:
    """Return the path under which the French translation should be stored.

    The first path component that is exactly ``en`` is swapped for ``fr``
    (``help/en/docs/a.md`` -> ``help/fr/docs/a.md``). Working on whole
    components avoids corrupting unrelated names that merely contain the
    letters "en" (``content``, ``documents``, a user name, ...).
    """
    parts = list(markdown_file.parts)
    if "en" in parts:
        parts[parts.index("en")] = "fr"
        return str(Path(*parts))
    # No ``en`` directory in the path: keep the historical behaviour of
    # replacing the first "en" substring so existing invocations still work.
    return str(markdown_file).replace("en", "fr", 1)


def create_batch_jsonl_item_for_markdown_to_translate(markdown_file: Path, model: str = "gpt-4o") -> str:
    """Build one JSONL line for the OpenAI batch API that translates a markdown file.

    Args:
        markdown_file: Path of the English markdown file; it is read as UTF-8.
        model: Chat model to request. Defaults to ``"gpt-4o"``.

    Returns:
        A JSON-encoded batch request (a single line, no trailing newline) whose
        ``custom_id`` is the destination path of the French translation.
    """
    markdown_content = markdown_file.read_text(encoding='utf-8')
    messages = [
        {"role": "system", "content": _TRANSLATION_SYSTEM_PROMPT},
        {"role": "user", "content": markdown_content},
    ]
    return json.dumps({
        # The custom_id doubles as the output file path when results are downloaded.
        "custom_id": _french_custom_id(markdown_file),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "temperature": 0.3,
            "top_p": 1,
            "messages": messages,
        },
    })
def create_translation_batch(markdown_files: list[str]) -> "Batch":
    """Upload the given markdown files as one OpenAI batch translation job.

    One JSONL request line is built per markdown file, the resulting JSONL is
    uploaded as a batch input file, a copy is saved locally as
    ``batch_input-<file id>.jsonl`` (relative to the current directory), and a
    batch targeting ``/v1/chat/completions`` is created from it.

    Args:
        markdown_files: Paths of the markdown files to translate.

    Returns:
        The batch object returned by ``client.batches.create``. Its ID is also
        printed so it can be passed to the ``check`` command afterwards.
    """
    jsonl_content = "\n".join(
        create_batch_jsonl_item_for_markdown_to_translate(Path(markdown_file))
        for markdown_file in markdown_files
    )

    batch_input_file = client.files.create(
        file=bytes(jsonl_content, encoding="utf-8"),
        purpose="batch",
    )
    batch_input_file_id = batch_input_file.id
    print('Batch input file ID:', batch_input_file_id)

    # Keep a local copy of exactly what was uploaded, named after the remote file ID.
    input_file_path = f'batch_input-{batch_input_file_id}.jsonl'
    Path(input_file_path).write_text(jsonl_content, encoding='utf-8')
    print(f'Saved jsonl content to {input_file_path}')

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )
    # Surface the batch ID: without it the job can only be found via `check all`.
    print('Batch ID:', batch.id)
    return batch
def print_batch_info(batch: "Batch"):
    """Print a short human-readable summary of a batch.

    The summary shows the batch ID, creation time (converted from the
    ``created_at`` timestamp to local time), status, output file ID and errors.

    Args:
        batch: The batch object to describe.

    Returns:
        The same ``batch`` object that was passed in. No additional API
        request is made: the caller already holds the data being printed.
    """
    separator = '-----------------'
    created_at = datetime.datetime.fromtimestamp(batch.created_at).isoformat(sep=' ')
    print(separator)
    print("Batch ID:", batch.id)
    print("Created At:", created_at)
    print("Status:", batch.status)
    print('Output file ID:', batch.output_file_id)
    print('Errors:', batch.errors)
    print(separator)
    return batch
# Shown at the bottom of `--help`.
_USAGE = """
Usage:
python batch.py upload md_file1 md_file2 md_file3...
python batch.py check batch_id|all
python batch.py download file_id --write
"""


def _print_or_save_batch_results(jsonl_text: str, write: bool) -> None:
    """Handle every line of a downloaded batch output file.

    Each non-empty line is parsed as JSON. For a request that completed with
    status 200 the translated content is either written to the path stored in
    ``custom_id`` (when ``write`` is true) or printed. For any other line the
    reported error is printed instead.
    """
    # Split on "\n" only: str.splitlines() would also break on characters
    # (e.g. U+2028) that may legally appear unescaped inside a JSON string.
    results = [json.loads(line) for line in jsonl_text.split('\n') if line.strip()]
    print('-----------------')
    for result in results:
        custom_id = result.get('custom_id')
        # `response` can be null when the request itself failed.
        response = result.get('response') or {}
        if response.get('status_code') == 200:
            print(f'Treating {custom_id}')
            content = response['body']['choices'][0]['message']['content']
            if write:
                target = Path(custom_id)
                # The destination directory may not exist yet.
                target.parent.mkdir(parents=True, exist_ok=True)
                target.write_text(content, encoding='utf-8')
                print(f'Saved to {custom_id}')
            else:
                print(content)
        else:
            # The error may sit at the top level of the line, inside the
            # response body, or directly on the response; try each in turn
            # rather than raising on a missing key.
            error = (
                result.get('error')
                or (response.get('body') or {}).get('error')
                or response.get('error')
            )
            print('Error:', error)
        print('-----------------')


def main() -> None:
    """Command-line entry point: dispatch the upload, check and download commands."""
    parser = argparse.ArgumentParser(
        description='Translate markdown files into French with the OpenAI batch API.',
        epilog=_USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('command', type=str, help='upload|check|download')
    parser.add_argument('args', nargs='+', help='List of markdown files or batch id')
    parser.add_argument('--write', action='store_true', help='Write the downloaded content to a file', default=False)
    parser.add_argument('--raw', action='store_true', help='Show raw batch info', default=False)
    args = parser.parse_args()

    if args.command == 'upload':
        create_translation_batch(args.args)
    elif args.command == 'check':
        target = args.args[0]
        if target == 'all':
            batches = client.batches.list().data
            # Oldest first, so the most recent batch ends up at the bottom of the output.
            batches.sort(key=lambda x: x.created_at, reverse=False)
            for batch in batches:
                print_batch_info(batch)
        elif args.raw:
            print(client.batches.retrieve(target))
        else:
            print_batch_info(client.batches.retrieve(target))
    elif args.command == 'download':
        file_response = client.files.content(args.args[0])
        _print_or_save_batch_results(file_response.text, args.write)
    else:
        print('Invalid command')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment