kioku · August 26, 2024 21:36 · kioku · Aug 26, 2024
diff --git a/translate-docs.js b/translate-docs.js
 #!/usr/bin/env node

 /**
 * @file translate-docs.js
 * @description A script to translate markdown files from one language to another using OpenAI's GPT model.
 * @usage node translate-docs.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language>
 * @license MIT
 */

 const fs = require('fs').promises;
 const path = require('path');
 const { createOpenAI } = require('@ai-sdk/openai');
 const { RateLimiter } = require('limiter');
 const { generateText } = require('ai');

 /**
  * Parses named command line arguments of the form `--key=value` from `process.argv`.
  *
  * Arguments that do not start with `--` are ignored. A flag without `=` is stored
  * with the value `undefined`. Only the first `=` separates key from value, so values
  * that themselves contain `=` (for example paths) are kept intact.
  *
  * @returns {Object} All parsed arguments, with `sourceLanguage` defaulting to 'zh'
  *   and `targetLanguage` defaulting to 'en' when missing or empty.
  */
 const parseArgs = () => {
  const argsMap = {};
  for (const arg of process.argv.slice(2)) {
    if (!arg.startsWith('--')) {
      continue;
    }
    // Split on the first "=" only; String.split('=') would drop everything after a second "=".
    const separatorIndex = arg.indexOf('=');
    const key = separatorIndex === -1 ? arg.substring(2) : arg.substring(2, separatorIndex);
    const value = separatorIndex === -1 ? undefined : arg.substring(separatorIndex + 1);
    argsMap[key] = value;
  }
  return {
    ...argsMap,
    sourceLanguage: argsMap.sourceLanguage || 'zh',
    targetLanguage: argsMap.targetLanguage || 'en',
  };
 };

 // Parse named command line arguments
 const { directoryPath, sourceLanguage, targetLanguage } = parseArgs();

 // Validate command line arguments: the directory is the only required one,
 // the two languages fall back to defaults inside parseArgs.
 if (!directoryPath) {
  console.error('Usage: node translate-docs.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language>');
  process.exit(1);
 }

 // Create a rate limiter: 5 requests per minute
 const limiter = new RateLimiter({ tokensPerInterval: 5, interval: 'minute' });

 // Provider instance created with default settings (no explicit options passed here).
 const openai = createOpenAI();

 // Model handle passed to generateText for every chunk.
 const model = openai('gpt-4o-mini');

 // Instructions sent alongside every chunk; the language codes are interpolated once at startup.
 const systemPrompt = `
 You are a TRANSLATOR. You are provided with text in ${sourceLanguage}.
 Your TASK is to translate the text into ${targetLanguage}.
 Consider that the text is formatted in markdown.
 Respond only with the translated text.
 If there's a need to add a comment, please use the following format:
 <!--
 TNOTE: <comment>
 -->
 `;

 // Per-chunk prompt template; `<md_file_content>` is replaced with the chunk text.
 const prompt = `Translate: <md_file_content>`;

 /**
  * Processes all markdown files in the specified directory, one after another.
  *
  * A file name with more than one dot (e.g. `guide.en.md`) is treated as carrying a
  * language tag in the segment right before `.md`. Files tagged with a language other
  * than `sourceLanguage` are skipped so existing translations are not fed back in as
  * source text. Untagged files are assumed to be written in `sourceLanguage`.
  *
  * Errors (for example an unreadable directory) are logged, not rethrown.
  *
  * @param {string} directoryPath - The path to the directory containing markdown files.
  * @param {string} sourceLanguage - The source language code.
  * @param {string} targetLanguage - The target language code.
  */
 async function processFiles(directoryPath, sourceLanguage, targetLanguage) {
  try {
    const files = await fs.readdir(directoryPath);
    const markdownFiles = files.filter((file) => file.endsWith('.md'));
    const totalCount = markdownFiles.length;

    for (const [index, filename] of markdownFiles.entries()) {
      const currCount = index + 1;
      const nameParts = filename.split('.');
      // The tag is the segment directly before the extension, not the second segment overall.
      const language = nameParts.length > 2 ? nameParts[nameParts.length - 2] : sourceLanguage;

      if (language !== sourceLanguage) {
        console.log(`${currCount} / ${totalCount} Skipping ${filename} (tagged ${language}, not ${sourceLanguage})`);
        continue;
      }

      console.log(`${currCount} / ${totalCount} Parsing ${filename} in ${language}...`);
      await translateFileLimited(path.join(directoryPath, filename), sourceLanguage, targetLanguage);
    }
  } catch (err) {
    console.error('Error processing the directory', err);
  }
 }

 /**
  * Translates a file chunk by chunk, respecting the rate limit, and appends each
  * result to the target-language file in the original chunk order.
  *
  * When the source language is 'zh', chunks without any Chinese characters are not
  * sent to the model; they are copied to the output unchanged so the translated file
  * keeps the full document. If no chunk needs translation, nothing is written.
  *
  * Errors are logged, not rethrown, so one failing file does not stop the run.
  *
  * @param {string} filePath - The path to the file to be translated.
  * @param {string} sourceLanguage - The source language code.
  * @param {string} targetLanguage - The target language code.
  */
 async function translateFileLimited(filePath, sourceLanguage, targetLanguage) {
  try {
    const chunks = await splitFileIntoChunks(filePath);
    const needsTranslation = (chunk) => sourceLanguage !== 'zh' || containsChinese(chunk);

    if (!chunks.some(needsTranslation)) {
      console.log(`Nothing to translate in ${filePath}`);
      return;
    }

    let count = 0;
    for (const chunk of chunks) {
      count += 1;

      if (!needsTranslation(chunk)) {
        // Keep untranslated chunks in the output so the document stays complete.
        await appendToFile(filePath, chunk, sourceLanguage, targetLanguage);
        console.log(`Skipping chunk ${count} of ${chunks.length}`);
        continue;
      }

      // Wait for a token from the rate limiter
      await limiter.removeTokens(1);

      const { text } = await generateText({
        model,
        // The AI SDK option is named `system`; an unknown `systemPrompt` key is not sent to the model.
        system: systemPrompt,
        // A replacer function inserts the chunk verbatim; a string replacement would
        // expand `$&`, `$'`, `$$` etc. that may occur in the markdown.
        prompt: prompt.replace('<md_file_content>', () => chunk),
      });

      await appendToFile(filePath, text, sourceLanguage, targetLanguage);
      console.log(`Translated chunk ${count} of ${chunks.length}`);
    }

    console.log('Done.');
  } catch (err) {
    console.error('Error translating the file', err);
  }
 }

 /**
  * Appends translated content (plus a trailing newline) to the target-language file.
  *
  * The target path is derived from the source path:
  * `name.<sourceLanguage>.md` becomes `name.<targetLanguage>.md`, and an untagged
  * `name.md` becomes `name.<targetLanguage>.md`. If the derived path equals the
  * source path, nothing is written, so the original file is never modified.
  *
  * Write errors are logged, not rethrown.
  *
  * @param {string} filePath - The path to the original file.
  * @param {string} content - The translated content.
  * @param {string} sourceLanguage - The source language code.
  * @param {string} targetLanguage - The target language code.
  */
 async function appendToFile(filePath, content, sourceLanguage, targetLanguage) {
  const sourceSuffix = `.${sourceLanguage}.md`;
  let basePath = filePath;
  if (filePath.endsWith(sourceSuffix)) {
    basePath = filePath.slice(0, -sourceSuffix.length);
  } else if (filePath.endsWith('.md')) {
    basePath = filePath.slice(0, -'.md'.length);
  }
  const targetPath = `${basePath}.${targetLanguage}.md`;

  if (targetPath === filePath) {
    console.error('Error appending to the file', new Error(`Target path equals source path: ${filePath}`));
    return;
  }

  try {
    await fs.appendFile(targetPath, `${content}\n`);
  } catch (err) {
    console.error('Error appending to the file', err);
  }
 }

 /**
  * Splits the file content into chunks of at most `chunkSize` lines, preferring to
  * end a chunk right before a markdown heading.
  *
  * When a chunk reaches `chunkSize` lines it is cut at the last heading it contains,
  * so the heading and the lines after it start the next chunk. If the chunk has no
  * heading after its first line, it is emitted whole. Lines inside fenced code blocks
  * (``` or ~~~) are never treated as headings. A remainder that fits into a single
  * chunk is not split further. Joining the chunks with "\n" reproduces the content
  * with line endings normalized to "\n".
  *
  * @param {string} filePath - The path to the file to be split.
  * @param {number} chunkSize - The maximum number of lines per chunk.
  * @returns {Promise<string[]>} - An array of file content chunks.
  */
 async function splitFileIntoChunks(filePath, chunkSize = 750) {
  const fileContent = await fs.readFile(filePath, 'utf8');
  const lines = fileContent.split(/\r\n|\r|\n/);
  // Guard against zero, negative or non-numeric sizes.
  const maxLines = Math.max(1, Math.floor(chunkSize) || 1);

  // Pre-compute which lines are real headings, tracking fenced code blocks so that
  // e.g. "# comment" inside a shell snippet is not used as a split point.
  let inFence = false;
  const isHeading = lines.map((line) => {
    if (/^\s{0,3}(```|~~~)/.test(line)) {
      inFence = !inFence;
      return false;
    }
    return !inFence && /^#{1,6}(\s|$)/.test(line);
  });

  const chunks = [];
  let start = 0;

  for (let i = 0; i < lines.length; i++) {
    if (i === lines.length - 1) {
      // The remainder is within the size limit; keep it together.
      chunks.push(lines.slice(start).join('\n'));
      break;
    }
    if (i - start + 1 < maxLines) {
      continue;
    }

    // Look backwards for the last heading after the chunk's first line.
    let splitIndex = i;
    while (splitIndex > start && !isHeading[splitIndex]) {
      splitIndex--;
    }

    if (splitIndex > start) {
      chunks.push(lines.slice(start, splitIndex).join('\n'));
      start = splitIndex;
    } else {
      chunks.push(lines.slice(start, i + 1).join('\n'));
      start = i + 1;
    }
  }

  return chunks;
 }

 /**
  * Tells whether the given text has at least one character in the code point ranges
  * U+3400–U+4DBF, U+4E00–U+9FFF or U+F900–U+FAFF.
  * @param {string} text - The text to inspect.
  * @returns {boolean} - True when such a character is present, false otherwise.
  */
 function containsChinese(text) {
  const ideographPattern = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/;
  const found = ideographPattern.test(text);
  return found;
 }

 // Entry point: start processing the markdown files in the requested directory.
 // The returned promise is not awaited here; processFiles wraps its work in a
 // try/catch and logs failures itself.
 processFiles(directoryPath, sourceLanguage, targetLanguage);
	#!/usr/bin/env node

	/**
	* @file translate-docs.js
	* @description A script to translate markdown files from one language to another using OpenAI's GPT model.
	* @usage node translate-docs.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language>
	* @license MIT
	*/

	const fs = require('fs').promises;
	const path = require('path');
	const { createOpenAI } = require('@ai-sdk/openai');
	const { RateLimiter } = require('limiter');
	const { generateText } = require('ai');

	/**
	 * Parses named command line arguments of the form `--key=value` from `process.argv`.
	 *
	 * Arguments that do not start with `--` are ignored. A flag without `=` is stored
	 * with the value `undefined`. Only the first `=` separates key from value, so values
	 * that themselves contain `=` (for example paths) are kept intact.
	 *
	 * @returns {Object} All parsed arguments, with `sourceLanguage` defaulting to 'zh'
	 *   and `targetLanguage` defaulting to 'en' when missing or empty.
	 */
	const parseArgs = () => {
	  const argsMap = {};
	  for (const arg of process.argv.slice(2)) {
	    if (!arg.startsWith('--')) {
	      continue;
	    }
	    // Split on the first "=" only; String.split('=') would drop everything after a second "=".
	    const separatorIndex = arg.indexOf('=');
	    const key = separatorIndex === -1 ? arg.substring(2) : arg.substring(2, separatorIndex);
	    const value = separatorIndex === -1 ? undefined : arg.substring(separatorIndex + 1);
	    argsMap[key] = value;
	  }
	  return {
	    ...argsMap,
	    sourceLanguage: argsMap.sourceLanguage || 'zh',
	    targetLanguage: argsMap.targetLanguage || 'en',
	  };
	};

	// Parse named command line arguments
	const { directoryPath, sourceLanguage, targetLanguage } = parseArgs();

	// Validate command line arguments: the directory is the only required one,
	// the two languages fall back to defaults inside parseArgs.
	if (!directoryPath) {
	  console.error('Usage: node translate-docs.js --directoryPath=<path> --sourceLanguage=<language> --targetLanguage=<language>');
	  process.exit(1);
	}

	// Create a rate limiter: 5 requests per minute
	const limiter = new RateLimiter({ tokensPerInterval: 5, interval: 'minute' });

	// Provider instance created with default settings (no explicit options passed here).
	const openai = createOpenAI();

	// Model handle passed to generateText for every chunk.
	const model = openai('gpt-4o-mini');

	// Instructions sent alongside every chunk; the language codes are interpolated once at startup.
	const systemPrompt = `
	You are a TRANSLATOR. You are provided with text in ${sourceLanguage}.
	Your TASK is to translate the text into ${targetLanguage}.
	Consider that the text is formatted in markdown.
	Respond only with the translated text.
	If there's a need to add a comment, please use the following format:
	<!--
	TNOTE: <comment>
	-->
	`;

	// Per-chunk prompt template; `<md_file_content>` is replaced with the chunk text.
	const prompt = `Translate: <md_file_content>`;

	/**
	 * Processes all markdown files in the specified directory, one after another.
	 *
	 * A file name with more than one dot (e.g. `guide.en.md`) is treated as carrying a
	 * language tag in the segment right before `.md`. Files tagged with a language other
	 * than `sourceLanguage` are skipped so existing translations are not fed back in as
	 * source text. Untagged files are assumed to be written in `sourceLanguage`.
	 *
	 * Errors (for example an unreadable directory) are logged, not rethrown.
	 *
	 * @param {string} directoryPath - The path to the directory containing markdown files.
	 * @param {string} sourceLanguage - The source language code.
	 * @param {string} targetLanguage - The target language code.
	 */
	async function processFiles(directoryPath, sourceLanguage, targetLanguage) {
	  try {
	    const files = await fs.readdir(directoryPath);
	    const markdownFiles = files.filter((file) => file.endsWith('.md'));
	    const totalCount = markdownFiles.length;

	    for (const [index, filename] of markdownFiles.entries()) {
	      const currCount = index + 1;
	      const nameParts = filename.split('.');
	      // The tag is the segment directly before the extension, not the second segment overall.
	      const language = nameParts.length > 2 ? nameParts[nameParts.length - 2] : sourceLanguage;

	      if (language !== sourceLanguage) {
	        console.log(`${currCount} / ${totalCount} Skipping ${filename} (tagged ${language}, not ${sourceLanguage})`);
	        continue;
	      }

	      console.log(`${currCount} / ${totalCount} Parsing ${filename} in ${language}...`);
	      await translateFileLimited(path.join(directoryPath, filename), sourceLanguage, targetLanguage);
	    }
	  } catch (err) {
	    console.error('Error processing the directory', err);
	  }
	}

	/**
	 * Translates a file chunk by chunk, respecting the rate limit, and appends each
	 * result to the target-language file in the original chunk order.
	 *
	 * When the source language is 'zh', chunks without any Chinese characters are not
	 * sent to the model; they are copied to the output unchanged so the translated file
	 * keeps the full document. If no chunk needs translation, nothing is written.
	 *
	 * Errors are logged, not rethrown, so one failing file does not stop the run.
	 *
	 * @param {string} filePath - The path to the file to be translated.
	 * @param {string} sourceLanguage - The source language code.
	 * @param {string} targetLanguage - The target language code.
	 */
	async function translateFileLimited(filePath, sourceLanguage, targetLanguage) {
	  try {
	    const chunks = await splitFileIntoChunks(filePath);
	    const needsTranslation = (chunk) => sourceLanguage !== 'zh' || containsChinese(chunk);

	    if (!chunks.some(needsTranslation)) {
	      console.log(`Nothing to translate in ${filePath}`);
	      return;
	    }

	    let count = 0;
	    for (const chunk of chunks) {
	      count += 1;

	      if (!needsTranslation(chunk)) {
	        // Keep untranslated chunks in the output so the document stays complete.
	        await appendToFile(filePath, chunk, sourceLanguage, targetLanguage);
	        console.log(`Skipping chunk ${count} of ${chunks.length}`);
	        continue;
	      }

	      // Wait for a token from the rate limiter
	      await limiter.removeTokens(1);

	      const { text } = await generateText({
	        model,
	        // The AI SDK option is named `system`; an unknown `systemPrompt` key is not sent to the model.
	        system: systemPrompt,
	        // A replacer function inserts the chunk verbatim; a string replacement would
	        // expand `$&`, `$'`, `$$` etc. that may occur in the markdown.
	        prompt: prompt.replace('<md_file_content>', () => chunk),
	      });

	      await appendToFile(filePath, text, sourceLanguage, targetLanguage);
	      console.log(`Translated chunk ${count} of ${chunks.length}`);
	    }

	    console.log('Done.');
	  } catch (err) {
	    console.error('Error translating the file', err);
	  }
	}

	/**
	 * Appends translated content (plus a trailing newline) to the target-language file.
	 *
	 * The target path is derived from the source path:
	 * `name.<sourceLanguage>.md` becomes `name.<targetLanguage>.md`, and an untagged
	 * `name.md` becomes `name.<targetLanguage>.md`. If the derived path equals the
	 * source path, nothing is written, so the original file is never modified.
	 *
	 * Write errors are logged, not rethrown.
	 *
	 * @param {string} filePath - The path to the original file.
	 * @param {string} content - The translated content.
	 * @param {string} sourceLanguage - The source language code.
	 * @param {string} targetLanguage - The target language code.
	 */
	async function appendToFile(filePath, content, sourceLanguage, targetLanguage) {
	  const sourceSuffix = `.${sourceLanguage}.md`;
	  let basePath = filePath;
	  if (filePath.endsWith(sourceSuffix)) {
	    basePath = filePath.slice(0, -sourceSuffix.length);
	  } else if (filePath.endsWith('.md')) {
	    basePath = filePath.slice(0, -'.md'.length);
	  }
	  const targetPath = `${basePath}.${targetLanguage}.md`;

	  if (targetPath === filePath) {
	    console.error('Error appending to the file', new Error(`Target path equals source path: ${filePath}`));
	    return;
	  }

	  try {
	    await fs.appendFile(targetPath, `${content}\n`);
	  } catch (err) {
	    console.error('Error appending to the file', err);
	  }
	}

	/**
	 * Splits the file content into chunks of at most `chunkSize` lines, preferring to
	 * end a chunk right before a markdown heading.
	 *
	 * When a chunk reaches `chunkSize` lines it is cut at the last heading it contains,
	 * so the heading and the lines after it start the next chunk. If the chunk has no
	 * heading after its first line, it is emitted whole. Lines inside fenced code blocks
	 * (``` or ~~~) are never treated as headings. A remainder that fits into a single
	 * chunk is not split further. Joining the chunks with "\n" reproduces the content
	 * with line endings normalized to "\n".
	 *
	 * @param {string} filePath - The path to the file to be split.
	 * @param {number} chunkSize - The maximum number of lines per chunk.
	 * @returns {Promise<string[]>} - An array of file content chunks.
	 */
	async function splitFileIntoChunks(filePath, chunkSize = 750) {
	  const fileContent = await fs.readFile(filePath, 'utf8');
	  const lines = fileContent.split(/\r\n|\r|\n/);
	  // Guard against zero, negative or non-numeric sizes.
	  const maxLines = Math.max(1, Math.floor(chunkSize) || 1);

	  // Pre-compute which lines are real headings, tracking fenced code blocks so that
	  // e.g. "# comment" inside a shell snippet is not used as a split point.
	  let inFence = false;
	  const isHeading = lines.map((line) => {
	    if (/^\s{0,3}(```|~~~)/.test(line)) {
	      inFence = !inFence;
	      return false;
	    }
	    return !inFence && /^#{1,6}(\s|$)/.test(line);
	  });

	  const chunks = [];
	  let start = 0;

	  for (let i = 0; i < lines.length; i++) {
	    if (i === lines.length - 1) {
	      // The remainder is within the size limit; keep it together.
	      chunks.push(lines.slice(start).join('\n'));
	      break;
	    }
	    if (i - start + 1 < maxLines) {
	      continue;
	    }

	    // Look backwards for the last heading after the chunk's first line.
	    let splitIndex = i;
	    while (splitIndex > start && !isHeading[splitIndex]) {
	      splitIndex--;
	    }

	    if (splitIndex > start) {
	      chunks.push(lines.slice(start, splitIndex).join('\n'));
	      start = splitIndex;
	    } else {
	      chunks.push(lines.slice(start, i + 1).join('\n'));
	      start = i + 1;
	    }
	  }

	  return chunks;
	}

	/**
	 * Tells whether the given text has at least one character in the code point ranges
	 * U+3400–U+4DBF, U+4E00–U+9FFF or U+F900–U+FAFF.
	 * @param {string} text - The text to inspect.
	 * @returns {boolean} - True when such a character is present, false otherwise.
	 */
	function containsChinese(text) {
	  const ideographPattern = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/;
	  const found = ideographPattern.test(text);
	  return found;
	}

	// Entry point: start processing the markdown files in the requested directory.
	// The returned promise is not awaited here; processFiles wraps its work in a
	// try/catch and logs failures itself.
	processFiles(directoryPath, sourceLanguage, targetLanguage);