Skip to content

Instantly share code, notes, and snippets.

@badbye
Last active July 7, 2023 11:01
Show Gist options
  • Save badbye/7202980eb29b5feaad76ecda98e9317b to your computer and use it in GitHub Desktop.
Save badbye/7202980eb29b5feaad76ecda98e9317b to your computer and use it in GitHub Desktop.
translate Rmd files
library(httr2)
library(knitr)
# Azure OpenAI completions endpoint used for every translation request
# (the "xxx" host is a placeholder).
AZURE_API_URL <- "https://xxx.openai.azure.com/openai/deployments/davinci-003/completions?api-version=2022-12-01"

# Value sent in the "api-key" request header (placeholder).
AZURE_API_KEY <- "xxx"

# Few-shot prompt template. It contains exactly one "%s" slot, which is filled
# with the text to translate via sprintf().
PROMPT <- "你是一个优秀的文学家和工程师。你将逐行翻译 Human 给定的 markdown 格式的文本,返回流畅易懂的中文。如果给定的内容中某一行以`<--`开头,以`-->`结尾,则忽略该行不翻译。\n行内的文本遇到以下规则时,... 所表示的内容也不翻译:\n1. `r ...`\n2. \\ref(...)\n\nHuman: China is the greatest country in the world. ^[You have to admit it.]\n<-- The US may disagree -->\nsee table \\ref(tb), its population is `r 10**8`.\nAI: 中国是世界上最伟大的国家。 ^[你必须承认这个事实。]\n<-- The US may disagree -->\n见表 \\ref(tb), 它的人口数量是 `r 10**8`。\n\nHuman: %s\nAI: "
#' Wrap a line in an HTML comment unless it already is one.
#'
#' @param line A single line of text.
#' @return `line` unchanged when `is_commented(line)` is TRUE; otherwise the
#'   line surrounded by `<!-- ` and ` -->`.
commented <- function(line) {
  if (!is_commented(line)) {
    return(paste0("<!-- ", line, " -->"))
  }
  line
}
#' Test whether a single line is an HTML comment.
#'
#' Surrounding whitespace is ignored before checking the delimiters.
#'
#' @param line A single line of text.
#' @return TRUE when the trimmed line starts with `<!--` and ends with `-->`.
is_commented <- function(line) {
  stripped <- trimws(line)
  has_opener <- startsWith(stripped, "<!--")
  has_closer <- endsWith(stripped, "-->")
  has_opener && has_closer
}
#' Translate a block of text, skipping lines that are HTML comments.
#'
#' @param content A single string; lines are separated by "\n".
#' @return The value returned by `davinci_003_translate()` for the
#'   non-comment lines, re-joined with "\n".
translate <- function(content) {
  pieces <- strsplit(content, "\n")[[1]]
  # Flag each line individually; is_commented() only handles one line at a time.
  comment_flags <- vapply(pieces, is_commented, logical(1))
  to_translate <- pieces[!comment_flags]
  davinci_003_translate(paste(to_translate, collapse = "\n"))
}
#' Send text to the Azure OpenAI completions endpoint and return the completion.
#'
#' The text is inserted into `PROMPT` and posted as JSON to `AZURE_API_URL`
#' with `AZURE_API_KEY` in the "api-key" header. Token usage reported by the
#' service is printed with `cat()`.
#'
#' @param text The text to place in the prompt's `%s` slot.
#' @return The `text` field of the first element of `choices` in the response.
#' @details Stops with an error when the response contains no choices.
davinci_003_translate <- function(text) {
  payload <- list(
    prompt = sprintf(PROMPT, text),
    max_tokens = 2000,
    temperature = 0.9,
    frequency_penalty = 0,
    presence_penalty = 0,
    top_p = 1,
    best_of = 1,
    stop = c("Human:", "AI:")
  )

  resp <- request(AZURE_API_URL) %>%
    req_headers("Content-Type" = "application/json") %>%
    req_headers("api-key" = AZURE_API_KEY) %>%
    req_body_json(payload) %>%
    req_perform()

  # Parse the body once and reuse it for both the usage report and the result.
  json_resp <- resp_body_json(resp)
  cat(sprintf('; total_tokens usage: %s; prompt_tokens: %s; completion_tokens: %s',
              json_resp$usage$total_tokens,
              json_resp$usage$prompt_tokens,
              json_resp$usage$completion_tokens))

  # Fail with a clear message rather than an opaque "subscript out of bounds".
  if (length(json_resp$choices) == 0) {
    stop("completion response contained no choices", call. = FALSE)
  }
  json_resp$choices[[1]]$text
}
#' S3 generic: turn one parsed chunk into its translated text.
#'
#' Dispatches on the class of `x`; this file defines methods for the
#' `block` and `inline` classes.
#'
#' @param x A chunk object.
#' @return Whatever the selected method returns.
process_chunk <- function(x) {
  UseMethod("process_chunk")
}
#' Rebuild a fenced code chunk, translating it when its engine is `block2`.
#'
#' For `block2` chunks the original lines are kept but wrapped in HTML
#' comments, followed by the translation. Chunks with any other engine are
#' emitted with their code unchanged.
#'
#' @param x A `block` chunk; `x$params` and `x$params.src` are read.
#' @return A single string of the form "```{engine params.src}\ncode\n```".
process_chunk.block <- function(x) {
  params <- opts_chunk$merge(x$params)
  code <- knit_code$get(params$label)
  # The engine is one of the merged chunk options, so read it from `params`.
  engine <- tolower(params$engine)

  if (engine == "block2") {
    translated_code <- translate(paste(code, collapse = "\n"))
    # Keep the original text (commented out) above the translation.
    original_commented <- vapply(code, commented, character(1), USE.NAMES = FALSE)
    code <- c(original_commented, "\n", translated_code)
  }

  sprintf(
    "```{%s %s}\n%s\n```",
    engine,
    x$params.src,
    paste(code, collapse = "\n")
  )
}
#' Translate a prose (inline) chunk, keeping the original as HTML comments.
#'
#' @param x An `inline` chunk; its text is read from `x$input`.
#' @return A single string: every original line passed through `commented()`,
#'   then a newline, then the result of `translate()` on the original text.
process_chunk.inline <- function(x) {
  original <- x$input
  original_lines <- strsplit(original, "\n")[[1]]
  kept_as_comments <- paste(
    vapply(original_lines, commented, character(1)),
    collapse = "\n"
  )
  translated <- translate(original)
  sprintf("%s\n%s", kept_as_comments, translated)
}
#' Translate an R Markdown file chunk by chunk and write the result to disk.
#'
#' The file is split into chunks with knitr, each chunk is passed to
#' `process_chunk()`, and the results are written next to the input as
#' "<name>-trans.Rmd". Progress and elapsed time are printed after each chunk.
#'
#' @param file_path Path to the input .Rmd file.
#' @return The output file path, invisibly.
translate_rmd <- function(file_path) {
  # Match only a literal ".Rmd"/".rmd" suffix. An unanchored ".Rmd" pattern
  # treats "." as a wildcard and can rewrite text in the middle of the path.
  translated_file_path <- sub("\\.[Rr]md$", "-trans.Rmd", file_path)
  if (identical(translated_file_path, file_path)) {
    # No recognised suffix: still pick a distinct name so the input file is
    # never overwritten.
    translated_file_path <- paste0(file_path, "-trans.Rmd")
  }

  lines <- xfun::read_utf8(file_path)
  opts_chunk$restore()
  knit_code$restore()
  opts_knit$restore()
  chunks <- knitr:::split_file(lines, patterns = all_patterns$md)

  translated_chunks <- character(length = length(chunks))
  t0 <- Sys.time()
  # seq_along() is safe when there are no chunks (1:0 would iterate twice).
  for (i in seq_along(chunks)) {
    chunk <- chunks[[i]]
    if (inherits(chunk, "inline") && all(trimws(chunk$input) == "")) {
      # Blank prose needs no API call.
      translated_chunks[i] <- "\n"
    } else {
      translated_chunks[i] <- process_chunk(chunk)
    }
    elapsed <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
    cat(sprintf('\nprogress: %s/%s; time cost: %s secs', i, length(chunks),
                round(elapsed, 2)))
  }

  # The loop above already produced every chunk; processing them a second time
  # here would double the API calls and drop the blank-chunk shortcut.
  # Write as UTF-8 to match how the input was read.
  xfun::write_utf8(translated_chunks, translated_file_path)
  cat(sprintf("\nwrite to file: %s", translated_file_path))
  invisible(translated_file_path)
}
# Script entry point: translate the placeholder document "xxx.Rmd".
translate_rmd("xxx.Rmd")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment