Skip to content

Instantly share code, notes, and snippets.

@kioku
Created August 26, 2024 21:36
Show Gist options
  • Save kioku/72cec85738e08080d2e3b5093e2045e7 to your computer and use it in GitHub Desktop.
Save kioku/72cec85738e08080d2e3b5093e2045e7 to your computer and use it in GitHub Desktop.
A script to translate markdown files from one language to another using OpenAI's GPT model.
#!/usr/bin/env node
/**
* @file translate-docs.js
* @description A script to translate markdown files from one language to another using OpenAI's GPT model.
* @usage node translate-docs.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language>
* @license MIT
*/
const fs = require('fs').promises;
const path = require('path');
const { createOpenAI } = require('@ai-sdk/openai');
const { RateLimiter } = require('limiter');
const { generateText } = require('ai');
/**
 * Parses named command line arguments of the form `--key=value`.
 *
 * Only the FIRST `=` separates the key from the value, so values that contain
 * `=` themselves (paths, query strings, ...) are preserved intact.
 * Arguments that do not start with `--` are ignored; a bare `--key` without
 * `=` is recorded with the value `undefined`.
 *
 * @returns {Object} The parsed arguments. `sourceLanguage` falls back to 'zh'
 * and `targetLanguage` falls back to 'en' when missing or empty.
 */
const parseArgs = () => {
  const argsMap = {};
  for (const arg of process.argv.slice(2)) {
    if (!arg.startsWith('--')) {
      continue;
    }
    const separatorIndex = arg.indexOf('=');
    if (separatorIndex === -1) {
      // Flag without a value, e.g. `--verbose`.
      argsMap[arg.substring(2)] = undefined;
    } else {
      argsMap[arg.substring(2, separatorIndex)] = arg.substring(separatorIndex + 1);
    }
  }
  return {
    ...argsMap,
    sourceLanguage: argsMap.sourceLanguage || 'zh',
    targetLanguage: argsMap.targetLanguage || 'en',
  };
};
// Parse named command line arguments once at module load; the resulting
// bindings are used by the rest of the script.
const { directoryPath, sourceLanguage, targetLanguage } = parseArgs();
// Validate command line arguments: directoryPath is the only one without a
// default, so exit with a usage message when it is missing.
// NOTE(review): the usage text names `script.js` while the file header names
// `translate-docs.js` — confirm which one is intended.
if (!directoryPath) {
console.error('Usage: node script.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language>');
process.exit(1);
}
// Create a rate limiter: 5 requests per minute
const limiter = new RateLimiter({ tokensPerInterval: 5, interval: 'minute' });
// Provider is created with no explicit options.
// NOTE(review): presumably credentials come from the environment — confirm against the @ai-sdk/openai docs.
const openai = createOpenAI();
// Model handle used for every translation request.
const model = openai('gpt-4o-mini');
// System instructions for the model. The source/target languages are
// interpolated here, at module load time.
const systemPrompt = `
You are a TRANSLATOR. You are provided with text in ${sourceLanguage}.
Your TASK is to translate the text into ${targetLanguage}.
Consider that the text is formatted in markdown.
Respond only with the translated text.
If there's a need to add a comment, please use the following format:
<!--
TNOTE: <comment>
-->
`;
// User prompt template; the `<md_file_content>` placeholder is substituted
// with each chunk of the file before the request is sent.
const prompt = `Translate: <md_file_content>`;
/**
 * Translates every markdown (`.md`) file directly inside the given directory,
 * one file at a time.
 *
 * A file name may carry a language tag as its second-to-last dot-separated
 * segment (e.g. `guide.zh.md`, `v1.2.zh.md`); untagged files are assumed to be
 * in the source language. Files already tagged with the target language are
 * skipped so that the outputs of a previous run are not translated again.
 *
 * Errors are logged, not thrown, so the returned promise always resolves.
 *
 * @param {string} directoryPath - The path to the directory containing markdown files.
 * @param {string} sourceLanguage - The source language code.
 * @param {string} targetLanguage - The target language code.
 * @returns {Promise<void>}
 */
async function processFiles(directoryPath, sourceLanguage, targetLanguage) {
  try {
    const files = await fs.readdir(directoryPath);
    const markdownFiles = files.filter((file) => file.endsWith('.md'));
    const totalCount = markdownFiles.length;
    for (const [index, filename] of markdownFiles.entries()) {
      const currCount = index + 1;
      const nameParts = filename.split('.');
      // The tag sits right before the `.md` extension, not right after the
      // first dot (`v1.2.zh.md` is tagged `zh`, not `2`).
      const language = nameParts.length > 2 ? nameParts[nameParts.length - 2] : sourceLanguage;
      if (language === targetLanguage && targetLanguage !== sourceLanguage) {
        console.log(`${currCount} / ${totalCount} Skipping ${filename}: already in ${targetLanguage}.`);
        continue;
      }
      console.log(`${currCount} / ${totalCount} Parsing ${filename} in ${language}...`);
      // Sequential on purpose: requests share one rate limiter.
      await translateFileLimited(path.join(directoryPath, filename), sourceLanguage, targetLanguage);
    }
  } catch (err) {
    console.error('Error processing the directory', err);
  }
}
/**
 * Translates a file chunk by chunk, respecting the rate limit, and appends
 * every chunk to the output file in its original order.
 *
 * When the source language is 'zh', chunks without any Chinese characters are
 * not sent to the model; they are copied to the output unchanged so that the
 * translated file does not lose content.
 *
 * Errors are logged, not thrown, so the returned promise always resolves.
 *
 * @param {string} filePath - The path to the file to be translated.
 * @param {string} sourceLanguage - The source language code.
 * @param {string} targetLanguage - The target language code.
 * @returns {Promise<void>}
 */
async function translateFileLimited(filePath, sourceLanguage, targetLanguage) {
  try {
    const chunks = await splitFileIntoChunks(filePath);
    let count = 0;
    for (const chunk of chunks) {
      count += 1; // 1-based, to match the "of N" total in the log lines
      if (sourceLanguage === 'zh' && !containsChinese(chunk)) {
        await appendToFile(filePath, chunk, sourceLanguage, targetLanguage);
        console.log(`Skipping chunk ${count} of ${chunks.length}`);
        console.log(chunk);
        continue;
      }
      // Wait for a token from the rate limiter
      await limiter.removeTokens(1);
      const { text } = await generateText({
        model,
        // The AI SDK option is named `system`; an unknown `systemPrompt` key
        // would be ignored and the instructions would never reach the model.
        system: systemPrompt,
        // A replacer function inserts the chunk literally; a plain string
        // replacement would expand `$$`, `$&`, `$1`, ... found in the markdown.
        prompt: prompt.replace('<md_file_content>', () => chunk),
      });
      await appendToFile(filePath, text, sourceLanguage, targetLanguage);
      console.log(`Translated chunk ${count} of ${chunks.length}`);
    }
    console.log('Done.');
  } catch (err) {
    console.error('Error translating the file', err);
  }
}
/**
 * Appends translated content (plus a trailing newline) to the output file
 * that corresponds to the given source file.
 *
 * The output path is derived from the END of the source path:
 *   - `name.<sourceLanguage>.md` -> `name.<targetLanguage>.md`
 *   - `name.md` (no matching tag) -> `name.<targetLanguage>.md`
 *   - anything else               -> `<filePath>.<targetLanguage>.md`
 * If the derived path equals the source path, nothing is written, so the
 * source file is never modified.
 *
 * Errors are logged, not thrown.
 *
 * @param {string} filePath - The path to the original file.
 * @param {string} content - The translated content.
 * @param {string} sourceLanguage - The source language code.
 * @param {string} targetLanguage - The target language code.
 * @returns {Promise<void>}
 */
async function appendToFile(filePath, content, sourceLanguage, targetLanguage) {
  const sourceSuffix = `.${sourceLanguage}.md`;
  const targetSuffix = `.${targetLanguage}.md`;
  const plainSuffix = '.md';

  // Only the end of the path is rewritten; a directory that happens to
  // contain the source suffix must not be touched.
  let targetPath;
  if (filePath.endsWith(sourceSuffix)) {
    targetPath = `${filePath.slice(0, -sourceSuffix.length)}${targetSuffix}`;
  } else if (filePath.endsWith(plainSuffix)) {
    targetPath = `${filePath.slice(0, -plainSuffix.length)}${targetSuffix}`;
  } else {
    targetPath = `${filePath}${targetSuffix}`;
  }

  if (targetPath === filePath) {
    console.error('Error appending to the file', new Error(`Refusing to append to the source file: ${filePath}`));
    return;
  }

  try {
    await fs.appendFile(targetPath, `${content}\n`);
  } catch (err) {
    console.error('Error appending to the file', err);
  }
}
/**
 * Splits the file content into chunks of at most `chunkSize` lines,
 * preferring to cut right before a markdown heading.
 *
 * Whenever the buffer reaches `chunkSize` lines it is cut before the LAST
 * line that starts with `#` (excluding the buffer's first line, which would
 * yield an empty chunk). The heading and the lines after it start the next
 * chunk. If there is no such line the whole buffer is emitted as one chunk.
 * Line endings are normalised to `\n`; joining the returned chunks with `\n`
 * reproduces the normalised file content.
 *
 * NOTE(review): any line starting with `#` counts as a heading, including
 * comment lines inside fenced code blocks.
 *
 * @param {string} filePath - The path to the file to be split.
 * @param {number} chunkSize - The maximum number of lines per chunk.
 * @returns {Promise<string[]>} - An array of file content chunks (empty for an empty file).
 */
async function splitFileIntoChunks(filePath, chunkSize = 750) {
  const fileContent = await fs.readFile(filePath, 'utf8');
  if (fileContent.length === 0) {
    // Nothing to translate; avoid producing a single empty chunk.
    return [];
  }
  const lines = fileContent.split(/\r\n|\r|\n/);
  const chunks = [];
  let currentChunk = [];
  for (const line of lines) {
    currentChunk.push(line);
    if (currentChunk.length < chunkSize) {
      continue;
    }
    // Search backwards so the emitted chunk is as large as possible, and stop
    // before index 0: after a heading split the buffer always starts with a
    // heading, which must not hide the later ones.
    let splitIndex = -1;
    for (let i = currentChunk.length - 1; i > 0; i--) {
      if (currentChunk[i].startsWith('#')) {
        splitIndex = i;
        break;
      }
    }
    if (splitIndex > 0) {
      chunks.push(currentChunk.slice(0, splitIndex).join('\n'));
      currentChunk = currentChunk.slice(splitIndex);
    } else {
      chunks.push(currentChunk.join('\n'));
      currentChunk = [];
    }
  }
  if (currentChunk.length > 0) {
    chunks.push(currentChunk.join('\n'));
  }
  return chunks;
}
/**
 * Reports whether the given text has at least one character in the ranges
 * U+3400-U+4DBF, U+4E00-U+9FFF or U+F900-U+FAFF.
 * @param {string} text - The text to inspect.
 * @returns {boolean} - True when such a character is present, false otherwise.
 */
function containsChinese(text) {
  const ideographPattern = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/;
  const firstMatch = ideographPattern.exec(text);
  return firstMatch !== null;
}
// Entry point: start processing files with the parsed command line arguments.
// The returned promise is intentionally not awaited; processFiles wraps its
// body in try/catch and logs failures itself.
processFiles(directoryPath, sourceLanguage, targetLanguage);
@kioku
Copy link
Author

kioku commented Aug 26, 2024

Common issues:

  • If {targetLanguage} is in a source file, then it might be translated to {sourceLanguage}.
  • Sometimes unnecessarily adds a preamble such as "Here's your translation of the provided text:" at the top of the file.
  • Missing newline characters above some markdown headings.
  • Does not translate /{sourceLanguage}/... links to /{targetLanguage}/....
  • Does not translate link parts to {targetLanguage}.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment