Created
July 8, 2024 22:04
-
-
Save farach/3b83bd2f044c6b7506ea0fb034cf91fb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This R script generates synthetic data representing job functions in
# various languages, translates them to English using a local language
# model, and detects the original language. The output includes the original
# job function, the translated job function, and the detected language, and
# is saved to a CSV file for further use.
# Load necessary libraries | |
library(httr) | |
library(jsonlite) | |
library(textcat) | |
library(tidyverse) | |
library(glue) | |
library(here) | |
#' Generate a synthetic data set of job functions
#'
#' Draws `n` job-function labels at random (with replacement) from a fixed
#' multilingual pool and pairs each with a sequential record id.
#'
#' @param n Number of rows to generate. `0` yields an empty data frame.
#' @return A data frame with character columns `record_id` ("1", "2", ...)
#'   and `job_function`.
generate_synthetic_data <- function(n) {
  # NOTE(review): "教育" is listed twice (presumably once each for Chinese and
  # Japanese -- confirm), which doubles its sampling weight.
  job_functions <- c(
    "Opetus", "Logistyk", "Educación", "Formation",
    "Образование", "教育", "교육", "教育"
  )
  data.frame(
    # seq_len() rather than 1:n, because 1:0 is c(1, 0) and would make the
    # columns differ in length when n == 0.
    record_id = as.character(seq_len(n)),
    job_function = sample(job_functions, n, replace = TRUE),
    stringsAsFactors = FALSE
  )
}
# Fix the RNG seed so the sampled job functions are reproducible, then build
# a ten-row demo data set.
set.seed(123)
synthetic_data <- generate_synthetic_data(n = 10)

# Chat-completions endpoint of the locally running inference server
local_server_url <- "http://localhost:1234/v1/chat/completions"
#' Translate a job function to English using the local model
#'
#' Builds a few-shot prompt, POSTs it as JSON to a chat-completions endpoint
#' and returns the parsed response.
#'
#' @param job_function A single, non-missing character string to translate.
#' @param model Model identifier sent in the request payload. Update it if
#'   the server hosts a different model.
#' @param server_url URL of the chat-completions endpoint. Defaults to the
#'   script-level `local_server_url`.
#' @return The parsed JSON response as a list.
#' @details Signals an error if `job_function` is not a single string or if
#'   the server answers with a status other than 200.
translate_job_function <- function(job_function,
                                   model = "microsoft/Phi-3-mini-4k-instruct-gqf",
                                   server_url = local_server_url) {
  # glue() is vectorised, so anything other than one string would silently
  # produce a malformed payload; reject it up front.
  if (!is.character(job_function) || length(job_function) != 1L ||
    is.na(job_function)) {
    stop(
      "`job_function` must be a single non-missing character string",
      call. = FALSE
    )
  }

  # Few-shot prompt with examples
  system_message <- as.character(glue(
    "You are an expert in language translation. Your task is to translate job functions or departments in the Education industry from various languages to English. ",
    "Provide only the direct translation without any additional text. Here are some examples:\n",
    "Input: 'Opetus' -> Output: 'Education'\n",
    "Input: 'Logistyk' -> Output: 'Logistics'\n",
    "Input: 'Educación' -> Output: 'Education'\n",
    "Input: 'Formation' -> Output: 'Education'\n",
    "Translate the following: '{job_function}'"
  ))

  # Request payload for the chat-completions endpoint
  payload <- list(
    model = model,
    messages = list(
      list(role = "system", content = system_message),
      list(role = "user", content = job_function)
    ),
    temperature = 0.7,
    max_tokens = 50,
    top_p = 0.9,
    frequency_penalty = 0.0,
    presence_penalty = 0.0
  )
  json_body <- toJSON(payload, auto_unbox = TRUE)

  # The body is already serialised JSON, so it is sent as-is with an explicit
  # JSON content type instead of asking httr to encode it again.
  response <- POST(server_url, content_type_json(), body = json_body)

  status <- status_code(response)
  if (status != 200) {
    body_text <- content(response, as = "text", encoding = "UTF-8")
    if (is.null(body_text)) {
      body_text <- ""
    }
    stop(glue("Error in API request: {status}, {body_text}"))
  }

  content(response, as = "parsed", type = "application/json")
}
# Translate every job function and detect its source language.
#
# Results go into preallocated character vectors that default to NA. Each
# iteration's tryCatch() *returns* its outcome and the assignment happens
# here, in the loop's own scope: assigning inside the error handler would
# only modify copies local to the handler function, leaving gaps for failed
# rows and breaking the later unlist()/mutate() step on a length mismatch.
n_rows <- length(synthetic_data$job_function)
translations <- rep(NA_character_, n_rows)
detected_languages <- rep(NA_character_, n_rows)

for (i in seq_along(synthetic_data$job_function)) {
  job_function_to_translate <- synthetic_data$job_function[i]

  outcome <- tryCatch(
    {
      result <- translate_job_function(job_function_to_translate)
      print(result) # Debug output: the full parsed API response

      translation <- result$choices[[1]]$message$content
      # A missing or non-scalar translation is treated as a failed row rather
      # than being stored as-is.
      if (!is.character(translation) || length(translation) != 1L) {
        stop("Response did not contain a translation")
      }

      list(
        translation = translation,
        detected_language = as.character(
          textcat::textcat(job_function_to_translate)
        )
      )
    },
    error = function(e) {
      message(glue("Error in row {i}: {conditionMessage(e)}"))
      list(translation = NA_character_, detected_language = NA_character_)
    }
  )

  translations[i] <- outcome$translation
  detected_languages[i] <- outcome$detected_language
}
# Attach the translations and detected languages as new columns
synthetic_data <- mutate(
  synthetic_data,
  translated_job_function = unlist(translations),
  detected_language = unlist(detected_languages)
)

# Show the final table for a quick visual check
print(synthetic_data)

# Persist the results as CSV in the current working directory
write_csv(synthetic_data, "synthetic_data_translations.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment