Skip to content

Instantly share code, notes, and snippets.

@farach
Created July 8, 2024 22:04
Show Gist options
  • Save farach/3b83bd2f044c6b7506ea0fb034cf91fb to your computer and use it in GitHub Desktop.
# This R script generates synthetic data representing job functions in various languages, translates them to English using a local language model, and detects the original language. The output includes the original job function, the translated job function, and the detected language, and is saved to a CSV file for further use.
# Load necessary libraries
library(httr)
library(jsonlite)
library(textcat)
library(tidyverse)
library(glue)
library(here)
# Function to generate synthetic data
# Generate a data frame of n synthetic records, each with a job function
# drawn (with replacement) from a multilingual pool.
#
# Args:
#   n: Number of rows to generate (non-negative integer scalar).
#   job_functions: Character pool to sample from. Defaults to "Education"
#     in several languages; note "教育" uses the same glyphs in Chinese and
#     Japanese, so it appears twice and carries double sampling weight.
#
# Returns:
#   A data.frame with character columns `record_id` and `job_function`.
generate_synthetic_data <- function(n,
                                    job_functions = c(
                                      "Opetus", "Logistyk", "Educación",
                                      "Formation", "Образование", "教育",
                                      "교육", "教育"
                                    )) {
  stopifnot(is.numeric(n), length(n) == 1, n >= 0)
  data.frame(
    # seq_len() (not 1:n) so n = 0 yields zero rows instead of c("1", "0")
    record_id = as.character(seq_len(n)),
    job_function = sample(job_functions, n, replace = TRUE),
    stringsAsFactors = FALSE
  )
}
# URL of the local OpenAI-compatible inference server (e.g. LM Studio).
local_server_url <- "http://localhost:1234/v1/chat/completions"

# Fixed seed so the sampled job functions are reproducible across runs.
set.seed(123)
synthetic_data <- generate_synthetic_data(10)
# Function to translate job functions using the local model
# Translate a single job-function string to English via a local
# OpenAI-compatible chat-completions endpoint (reads `local_server_url`
# from the enclosing script).
#
# Args:
#   job_function: Character scalar to translate.
#
# Returns:
#   The parsed JSON response as a list; the translation text lives at
#   result$choices[[1]]$message$content.
#
# Raises:
#   An error (call. = FALSE) when the server returns a non-200 status.
translate_job_function <- function(job_function) {
  stopifnot(is.character(job_function), length(job_function) == 1)

  # Few-shot prompt with examples
  system_message <- glue(
    "You are an expert in language translation. Your task is to translate job functions or departments in the Education industry from various languages to English. ",
    "Provide only the direct translation without any additional text. Here are some examples:\n",
    "Input: 'Opetus' -> Output: 'Education'\n",
    "Input: 'Logistyk' -> Output: 'Logistics'\n",
    "Input: 'Educación' -> Output: 'Education'\n",
    "Input: 'Formation' -> Output: 'Education'\n",
    "Translate the following: '{job_function}'"
  )

  # Request payload for the local server
  payload <- list(
    model = "microsoft/Phi-3-mini-4k-instruct-gqf", # Update the model name if necessary
    messages = list(
      list(role = "system", content = system_message),
      list(role = "user", content = job_function)
    ),
    temperature = 0.7,
    max_tokens = 50,
    top_p = 0.9,
    frequency_penalty = 0.0,
    presence_penalty = 0.0
  )

  # encode = "json" makes httr serialize `payload` (with auto_unbox) and
  # set the Content-Type header itself — the original's manual toJSON()
  # plus hand-set header duplicated that work.
  response <- POST(local_server_url, body = payload, encode = "json")

  if (status_code(response) != 200) {
    stop(
      glue(
        "Error in API request: {status_code(response)}, ",
        "{content(response, 'text', encoding = 'UTF-8')}"
      ),
      call. = FALSE
    )
  }

  # Parse and return the JSON body as a list
  content(response, as = "parsed", type = "application/json")
}
# Update the function call in the processing loop
# Preallocate one result slot per input row.
translations <- vector("list", length(synthetic_data$job_function))
detected_languages <- vector("list", length(synthetic_data$job_function))

for (i in seq_along(synthetic_data$job_function)) {
  job_function_to_translate <- synthetic_data$job_function[i]

  # Capture the *value* of tryCatch() instead of assigning with <- inside
  # the error handler: the original handler wrote to copies local to the
  # handler function, so failed rows stayed NULL and were later silently
  # dropped by unlist(), misaligning the columns.
  translations[[i]] <- tryCatch(
    {
      result <- translate_job_function(job_function_to_translate)
      result$choices[[1]]$message$content
    },
    error = function(e) {
      message(glue("Error in row {i}: {e$message}"))
      NA_character_
    }
  )

  # Language detection is local (textcat) and independent of the API call,
  # so a failed translation no longer discards the detected language.
  detected_languages[[i]] <- tryCatch(
    textcat::textcat(job_function_to_translate),
    error = function(e) NA_character_
  )
}
# Flatten a list of per-row results into a character vector NULL-safely:
# vapply() keeps exactly one slot per row, whereas unlist() silently drops
# NULL entries and would crash mutate() with a length mismatch.
flatten_chr <- function(x) {
  vapply(
    x,
    function(v) {
      if (is.null(v) || length(v) == 0) NA_character_ else as.character(v)[1]
    },
    character(1)
  )
}

# Attach the translated job function and detected language to each record.
synthetic_data <- synthetic_data %>%
  mutate(
    translated_job_function = flatten_chr(translations),
    detected_language = flatten_chr(detected_languages)
  )

# Print the results to verify
print(synthetic_data)

# Persist the enriched data for downstream use.
write_csv(synthetic_data, "synthetic_data_translations.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment