Created
August 26, 2024 21:36
-
-
Save kioku/72cec85738e08080d2e3b5093e2045e7 to your computer and use it in GitHub Desktop.
A script to translate markdown files from one language to another using OpenAI's GPT model.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
/** | |
* @file translate-docs.js | |
* @description A script to translate markdown files from one language to another using OpenAI's GPT model. | |
* @usage node translate-docs.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language> | |
* @license MIT | |
*/ | |
const fs = require('fs').promises; | |
const path = require('path'); | |
const { createOpenAI } = require('@ai-sdk/openai'); | |
const { RateLimiter } = require('limiter'); | |
const { generateText } = require('ai'); | |
/**
 * Parses named command line arguments of the form `--key=value`.
 *
 * Only the first `=` separates the key from the value, so values may
 * themselves contain `=` (for example a path such as `/docs/a=b`).
 * Arguments that do not start with `--` are ignored, and a flag given
 * without `=` is recorded with the value `undefined`.
 *
 * @returns {Object<string, (string|undefined)>} Every parsed argument, with
 *   `sourceLanguage` falling back to 'zh' and `targetLanguage` to 'en' when
 *   they are missing or empty.
 */
const parseArgs = () => {
  const argsMap = {};
  for (const arg of process.argv.slice(2)) {
    if (!arg.startsWith('--')) {
      continue;
    }
    const separatorIndex = arg.indexOf('=');
    const hasValue = separatorIndex !== -1;
    const key = hasValue ? arg.slice(2, separatorIndex) : arg.slice(2);
    // Keep everything after the first '=' so the value is never truncated.
    argsMap[key] = hasValue ? arg.slice(separatorIndex + 1) : undefined;
  }
  return {
    ...argsMap,
    sourceLanguage: argsMap.sourceLanguage || 'zh',
    targetLanguage: argsMap.targetLanguage || 'en',
  };
};
// Parse named command line arguments
const { directoryPath, sourceLanguage, targetLanguage } = parseArgs();

// Validate command line arguments: the directory is the only mandatory one.
if (!directoryPath) {
  console.error('Usage: node translate-docs.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language>');
  process.exit(1);
}

// Create a rate limiter: 5 requests per minute
const limiter = new RateLimiter({ tokensPerInterval: 5, interval: 'minute' });

// Provider is created without explicit options.
// NOTE(review): presumably credentials then come from the environment — confirm against the AI SDK docs.
const openai = createOpenAI();
const model = openai('gpt-4o-mini');

// System instructions for the model; the language names are interpolated once at startup.
const systemPrompt = `
You are a TRANSLATOR. You are provided with text in ${sourceLanguage}.
Your TASK is to translate the text into ${targetLanguage}.
Consider that the text is formatted in markdown.
Respond only with the translated text.
If there's a need to add a comment, please use the following format:
<!--
TNOTE: <comment>
-->
`;

// User prompt template; `<md_file_content>` is the placeholder replaced with each chunk.
const prompt = `Translate: <md_file_content>`;
/**
 * Translates every markdown file found directly inside the given directory,
 * one file after another.
 *
 * Files whose name already ends in `.<targetLanguage>.md` are skipped: they are
 * output of a previous run, and treating them as sources would translate
 * translations again.
 *
 * Errors (for example an unreadable directory) are logged, not rethrown.
 *
 * @param {string} directoryPath - The path to the directory containing markdown files.
 * @param {string} sourceLanguage - The source language code.
 * @param {string} targetLanguage - The target language code.
 * @returns {Promise<void>}
 */
async function processFiles(directoryPath, sourceLanguage, targetLanguage) {
  try {
    const files = await fs.readdir(directoryPath);
    const targetSuffix = `.${targetLanguage}.md`;
    const markdownFiles = files.filter(
      (file) => file.endsWith('.md') && !file.endsWith(targetSuffix)
    );
    const totalCount = markdownFiles.length;
    for (const [index, filename] of markdownFiles.entries()) {
      // `name.<lang>.md` carries its language tag as the second dot-separated part;
      // it is only used for the progress message.
      const nameParts = filename.split('.');
      const language = nameParts.length > 2 ? nameParts[1] : sourceLanguage;
      console.log(`${index + 1} / ${totalCount} Parsing ${filename} in ${language}...`);
      await translateFileLimited(path.join(directoryPath, filename), sourceLanguage, targetLanguage);
    }
  } catch (err) {
    console.error('Error processing the directory', err);
  }
}
/**
 * Translates a file chunk by chunk, respecting the rate limit, and appends
 * each resulting chunk to the output file in order.
 *
 * When the source language is 'zh', chunks without any Chinese characters are
 * not sent to the model; they are copied to the output unchanged so the
 * translated file still contains the whole document.
 *
 * Errors are logged, not rethrown, so one failing file does not stop the caller.
 *
 * @param {string} filePath - The path to the file to be translated.
 * @param {string} sourceLanguage - The source language code.
 * @param {string} targetLanguage - The target language code.
 * @returns {Promise<void>}
 */
async function translateFileLimited(filePath, sourceLanguage, targetLanguage) {
  try {
    const chunks = await splitFileIntoChunks(filePath);
    let count = 0;
    for (const chunk of chunks) {
      count += 1;
      if (sourceLanguage === 'zh' && !containsChinese(chunk)) {
        // Nothing to translate: keep the chunk verbatim in the output.
        await appendToFile(filePath, chunk, sourceLanguage, targetLanguage);
        console.log(`Skipping chunk ${count} of ${chunks.length}`);
        console.log(chunk);
        continue;
      }
      // Wait for a token from the rate limiter
      await limiter.removeTokens(1);
      const { text } = await generateText({
        model,
        // The option is named `system`; any other key is not sent as the system prompt.
        system: systemPrompt,
        // A replacer function inserts the chunk literally, so `$$`, `$&`, etc.
        // inside the markdown are not treated as replacement patterns.
        prompt: prompt.replace('<md_file_content>', () => chunk),
      });
      await appendToFile(filePath, text, sourceLanguage, targetLanguage);
      console.log(`Translated chunk ${count} of ${chunks.length}`);
    }
    console.log('Done.');
  } catch (err) {
    console.error('Error translating the file', err);
  }
}
/**
 * Appends translated content (plus a trailing newline) to the target-language
 * counterpart of a source file.
 *
 * The output path is derived from the end of `filePath`:
 *   - `name.<sourceLanguage>.md` -> `name.<targetLanguage>.md`
 *   - `name.md`                  -> `name.<targetLanguage>.md`
 *   - anything else              -> `<filePath>.<targetLanguage>.md`
 * so the translation is never appended to the source file itself.
 *
 * Write errors are logged, not rethrown.
 *
 * @param {string} filePath - The path to the original file.
 * @param {string} content - The translated content.
 * @param {string} sourceLanguage - The source language code.
 * @param {string} targetLanguage - The target language code.
 * @returns {Promise<void>}
 */
async function appendToFile(filePath, content, sourceLanguage, targetLanguage) {
  try {
    const sourceSuffix = `.${sourceLanguage}.md`;
    let stem = filePath;
    // Only the end of the path is rewritten; a matching directory name is left alone.
    if (filePath.endsWith(sourceSuffix)) {
      stem = filePath.slice(0, -sourceSuffix.length);
    } else if (filePath.endsWith('.md')) {
      stem = filePath.slice(0, -'.md'.length);
    }
    await fs.appendFile(`${stem}.${targetLanguage}.md`, content.concat('\n'));
  } catch (err) {
    console.error('Error appending to the file', err);
  }
}
/**
 * Splits the file content into chunks of at most `chunkSize` lines, preferring
 * to end a chunk right before a markdown heading.
 *
 * Whenever the pending chunk reaches `chunkSize` lines it is cut at the LAST
 * line starting with `#` (other than its first line); the heading and what
 * follows carry over into the next chunk. If there is no such heading the
 * chunk is emitted whole. Line endings are normalised to `\n`, and joining the
 * returned chunks with `\n` reproduces the normalised content.
 *
 * @param {string} filePath - The path to the file to be split.
 * @param {number} chunkSize - The maximum number of lines per chunk.
 * @returns {Promise<string[]>} - An array of file content chunks (empty for an empty file).
 */
async function splitFileIntoChunks(filePath, chunkSize = 750) {
  const fileContent = await fs.readFile(filePath, 'utf8');
  if (fileContent.length === 0) {
    // An empty file has nothing to translate; do not emit an empty chunk.
    return [];
  }
  const lines = fileContent.split(/\r\n|\r|\n/);
  const chunks = [];
  let currentChunk = [];
  for (const line of lines) {
    currentChunk.push(line);
    if (currentChunk.length < chunkSize) {
      continue;
    }
    // Walk backwards to the last heading; index 0 is excluded because cutting
    // there would emit an empty chunk.
    let splitIndex = currentChunk.length - 1;
    while (splitIndex > 0 && !currentChunk[splitIndex].startsWith('#')) {
      splitIndex -= 1;
    }
    if (splitIndex > 0) {
      chunks.push(currentChunk.slice(0, splitIndex).join('\n'));
      currentChunk = currentChunk.slice(splitIndex);
    } else {
      chunks.push(currentChunk.join('\n'));
      currentChunk = [];
    }
  }
  if (currentChunk.length > 0) {
    chunks.push(currentChunk.join('\n'));
  }
  return chunks;
}
/**
 * Reports whether the given text has at least one character in the code point
 * ranges U+3400–U+4DBF, U+4E00–U+9FFF or U+F900–U+FAFF (CJK ideographs).
 * @param {string} text - The text to check.
 * @returns {boolean} - True if such a character occurs anywhere in the text, false otherwise.
 */
function containsChinese(text) {
  const ideographPattern = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/;
  const firstMatch = ideographPattern.exec(text);
  return firstMatch !== null;
}
// Start processing files. The returned promise is not left floating: any
// rejection that escapes processFiles() is reported and reflected in the exit code.
processFiles(directoryPath, sourceLanguage, targetLanguage).catch((err) => {
  console.error('Unexpected error while translating', err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Common issues:
- If `{targetLanguage}` is in a source file, then it might be translated to `{sourceLanguage}`.
- `Here's your translation of the provided text:` at the top of the file unnecessarily.
- `/{sourceLanguage}/...` links to `/{targetLanguage}/...`.
- `{targetLanguage}`.