Last active
July 7, 2023 11:01
-
-
Save badbye/7202980eb29b5feaad76ecda98e9317b to your computer and use it in GitHub Desktop.
translate Rmd files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(httr2) | |
library(knitr) | |
# Azure OpenAI completion endpoint and API key used by davinci_003_translate().
# Both are read from the environment variables of the same name when set, so
# real credentials do not have to be written into this file; the original
# placeholder values remain as the fallback.
AZURE_API_URL <- Sys.getenv(
  "AZURE_API_URL",
  unset = "https://xxx.openai.azure.com/openai/deployments/davinci-003/completions?api-version=2022-12-01"
)
AZURE_API_KEY <- Sys.getenv("AZURE_API_KEY", unset = "xxx")

# Few-shot prompt template (written in Chinese). It asks the model to translate
# markdown text into Chinese line by line, to leave `r ...` and \ref(...)
# untouched, and to skip lines wrapped in `<--` ... `-->`. The single `%s` is
# filled with the text to translate via sprintf().
# NOTE(review): the prompt describes `<--` markers while is_commented() checks
# for `<!--`; confirm whether that difference is intentional.
PROMPT <- "你是一个优秀的文学家和工程师。你将逐行翻译 Human 给定的 markdown 格式的文本,返回流畅易懂的中文。如果给定的内容中某一行以`<--`开头,以`-->`结尾,则忽略该行不翻译。\n行内的文本遇到以下规则时,... 所表示的内容也不翻译:\n1. `r ...`\n2. \\ref(...)\n\nHuman: China is the greatest country in the world. ^[You have to admit it.]\n<-- The US may disagree -->\nsee table \\ref(tb), its population is `r 10**8`.\nAI: 中国是世界上最伟大的国家。 ^[你必须承认这个事实。]\n<-- The US may disagree -->\n见表 \\ref(tb), 它的人口数量是 `r 10**8`。\n\nHuman: %s\nAI: "
# Hide one line of text inside an HTML comment.
#
# line: a single string.
# Returns `line` unchanged when is_commented() already recognises it as an
# HTML comment; otherwise the line wrapped as '<!-- line -->'.
commented <- function(line) {
  if (!is_commented(line)) {
    return(paste0('<!-- ', line, ' -->'))
  }
  line
}
# Test whether lines are HTML comments.
#
# line: a character vector; each element is one line of text.
# Returns a logical vector of the same length: TRUE where the element, after
# stripping surrounding whitespace, starts with '<!--' and ends with '-->'.
is_commented <- function(line) {
  stripped <- trimws(line)
  # Elementwise `&` rather than scalar `&&`: `&&` raises an error for inputs
  # longer than one element in R >= 4.3, which forced callers to wrap this
  # function in Vectorize(). Scalar input still yields a single TRUE/FALSE.
  startsWith(stripped, "<!--") & endsWith(stripped, "-->")
}
# Translate a block of text, skipping lines that are already HTML comments.
#
# content: a single string, possibly containing several '\n'-separated lines.
# Returns the text produced by davinci_003_translate() for the uncommented
# lines, or "" when no translatable text remains.
translate <- function(content) {
  lines <- strsplit(content, "\n", fixed = TRUE)[[1]]
  # vapply() always yields a logical vector (also for zero lines), unlike
  # Vectorize()/mapply(), which return an empty list on empty input.
  is_comment <- vapply(lines, is_commented, logical(1), USE.NAMES = FALSE)
  text <- paste(lines[!is_comment], collapse = "\n")
  # Nothing left to translate: avoid a pointless (and billable) API request.
  if (!nzchar(trimws(text))) {
    return("")
  }
  davinci_003_translate(text)
}
# Send `text` to the Azure OpenAI completion endpoint and return the completion.
#
# text: the string substituted into the `%s` slot of PROMPT.
# Side effect: prints the token usage reported in the response via cat().
# Returns the `text` field of the first choice in the JSON response.
# Raises an error when the request fails (req_perform) or when the response
# contains no choices.
davinci_003_translate <- function(text) {
  payload <- list(
    prompt = sprintf(PROMPT, text),
    max_tokens = 2000,
    temperature = 0.9,
    frequency_penalty = 0,
    presence_penalty = 0,
    top_p = 1,
    best_of = 1,
    stop = c("Human:", "AI:")
  )
  req <- request(AZURE_API_URL) %>%
    req_headers("Content-Type" = "application/json") %>%
    req_headers("api-key" = AZURE_API_KEY) %>%
    req_body_json(payload)
  resp <- req_perform(req)

  # Parse the body once and reuse it for both the usage report and the result.
  json_resp <- resp_body_json(resp)
  usage <- json_resp$usage
  cat(sprintf('; total_tokens usage: %s; prompt_tokens: %s; completion_tokens: %s',
              usage$total_tokens,
              usage$prompt_tokens,
              usage$completion_tokens))

  choices <- json_resp$choices
  if (length(choices) == 0L) {
    # Fail with a clear message instead of "subscript out of bounds".
    stop("completion response contained no choices", call. = FALSE)
  }
  choices[[1]]$text
}
# S3 generic: turn one parsed chunk into its output text.
# Dispatches on the class of `x` (methods below: 'block' and 'inline').
process_chunk <- function(x) UseMethod("process_chunk")
# Rebuild the source of a code chunk, translating it when its engine is 'block2'.
#
# x: a 'block' chunk; its `params` and `params.src` elements are read here.
# For 'block2' chunks the original lines are kept as HTML comments and the
# translation is appended after them; chunks with any other engine are
# reproduced with their code unchanged.
# Returns a single string holding the fenced chunk.
process_chunk.block <- function(x) {
  params <- opts_chunk$merge(x$params)
  code <- knit_code$get(params$label)
  # The engine comes from the merged chunk options; the previous code read it
  # from `code_param`, an object that is never defined.
  engine <- tolower(params$engine)
  if (identical(engine, "block2")) {
    translated_code <- translate(paste(code, collapse = "\n"))
    # keep the original code and comment it
    originals <- vapply(code, commented, character(1), USE.NAMES = FALSE)
    code <- c(originals, "\n", translated_code)
  }
  sprintf("```{%s %s}\n%s\n```",
          engine,
          x$params.src,
          paste(code, collapse = "\n"))
}
# Process a text ('inline') chunk: keep every original line as an HTML comment
# and append the translation of the chunk's text below them.
#
# x: an 'inline' chunk; only its `input` element is read.
# Returns the commented originals, a newline, then the translated text.
process_chunk.inline <- function(x) {
  source_text <- x$input
  source_lines <- strsplit(source_text, '\n')[[1]]
  hidden_originals <- paste(
    vapply(source_lines, commented, character(1), USE.NAMES = FALSE),
    collapse = '\n'
  )
  translated_text <- translate(source_text)
  sprintf('%s\n%s', hidden_originals, translated_text)
}
# Translate an R Markdown file chunk by chunk and write the result next to it.
#
# file_path: path of the source file.
# The output path is `file_path` with a trailing '.Rmd' replaced by
# '-trans.Rmd'; when the path has no such extension, '-trans.Rmd' is appended
# so the input file is never overwritten.
# Side effects: resets knitr's chunk options, code store and knit options,
# prints progress after every chunk, and writes the output file as UTF-8.
# Returns the output path invisibly.
translate_rmd <- function(file_path) {
  # Anchor the pattern and escape the dot: the previous unanchored '.Rmd'
  # regex could match anywhere in the path, or not at all.
  translated_file_path <- sub("\\.Rmd$", "-trans.Rmd", file_path,
                              ignore.case = TRUE)
  if (identical(translated_file_path, file_path)) {
    translated_file_path <- paste0(file_path, "-trans.Rmd")
  }

  lines <- xfun::read_utf8(file_path)
  opts_chunk$restore()
  knit_code$restore()
  opts_knit$restore()
  chunks <- knitr:::split_file(lines, patterns = all_patterns$md)

  translated_chunks <- character(length = length(chunks))
  t0 <- Sys.time()
  for (i in seq_along(chunks)) {
    chunk <- chunks[[i]]
    if (inherits(chunk, "inline") && !nzchar(trimws(chunk$input))) {
      # Blank text between chunks: nothing to translate, keep the spacing.
      translated_chunks[i] <- '\n'
    } else {
      translated_chunks[i] <- process_chunk(chunk)
    }
    elapsed <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
    cat(sprintf('\nprogress: %s/%s; time cost: %s secs', i, length(chunks),
                round(elapsed, 2)))
  }
  # The loop above already produced every result. The former second pass
  # (sapply over all chunks) repeated each API call and threw away the
  # blank-chunk handling, so it has been removed.

  # Write UTF-8 explicitly to match xfun::read_utf8() on the input side.
  writeLines(enc2utf8(translated_chunks), translated_file_path, useBytes = TRUE)
  cat(sprintf("\nwrite to file: %s", translated_file_path))
  invisible(translated_file_path)
}
# Script entry point: translate the placeholder document 'xxx.Rmd'.
translate_rmd("xxx.Rmd")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment