Created
November 9, 2023 20:57
-
-
Save allaway/383602ded5c4e1e7f7c91c9658470572 to your computer and use it in GitHub Desktop.
Update NF Research Tools Central (temporary solution)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(synapser) | |
library(googledrive) | |
library(googlesheets4) | |
library(tidyverse) | |
library(jsonlite) | |
gs4_auth() | |
synLogin() | |
goog_dir <- 'https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk' | |
##save a version of each table with update date | |
# db_tables <- c("syn26486850", | |
# "syn26486843", | |
# "syn26486841", | |
# "syn26486840", | |
# "syn26486839", | |
# "syn26486836", | |
# "syn26486835", | |
# "syn26486834", | |
# "syn26486833", | |
# "syn26486832", | |
# "syn26486830", | |
# "syn26486829", | |
# "syn26486823", | |
# "syn26486821", | |
# "syn26486811", | |
# "syn26486808", | |
# "syn26486807" | |
# ) | |
# | |
# sapply(db_tables, synCreateSnapshotVersion, | |
# comment = "Snapshot prior to database update", | |
# label = as.character(Sys.Date())) | |
##function to do the heavy lifting | |
## input is a synapse id (one of db_tables above) | |
## second input is a google file id or url, or a vector of multiple google file ids | |
## function removes all of the rows in the synapse table (without deleting the table outright) | |
## if there is only one file, download it and save it as dataframe foo | |
## if there is more than one file id input, download all and concatenate into one file using the bind_rows function, and save it as dataframe foo | |
## then check if dataframe foo column names match column names in synapse table | |
## if they don't match, error | |
## if they do match, upload the rows in table foo to the synapse table | |
# Function to update Synapse tables with data from Google Sheets | |
update_synapse_table_from_google <- function(component, | |
synapse_id, | |
google_directory, | |
stringlist_cols = NULL, | |
naming_pattern = ".manifest") { | |
bar <- googledrive::drive_ls( | |
google_directory, | |
type = 'spreadsheet', | |
recursive = T) | |
google_file_ids <- dplyr::filter(bar, grepl(pattern = paste0(component, naming_pattern), bar$name)) %>% | |
purrr::pluck("id") | |
# Check if the input is a single file ID or a vector of multiple IDs | |
multiple_files <- length(google_file_ids) > 1 | |
# Get the Synapse table schema | |
schema <- synGet(synapse_id) | |
# Get the column names from the Synapse table schema | |
synapse_cols <- synGetTableColumns(schema)$asList() | |
synapse_col_names <- lapply(synapse_cols, purrr::pluck, 'name') %>% unlist() | |
# Initialize an empty dataframe to store the combined data | |
foo <- data.frame() | |
# Download« and combine Google Sheets data | |
if (multiple_files) { | |
# Loop through each file ID, download the data, and combine | |
for (file_id in google_file_ids) { | |
file_data <- read_sheet(file_id, col_types = 'c') | |
foo <- bind_rows(foo, file_data) | |
} | |
} else { | |
# Download data from the single Google Sheet | |
foo <- read_sheet(google_file_ids, col_types = 'c') | |
} | |
# Check if column names in dataframe foo match the Synapse table schema column names | |
foo_cols <- colnames(foo) | |
# Assuming foo_cols and synapse_col_names are vectors of column names | |
missing_in_foo <- setdiff(synapse_col_names, foo_cols) | |
missing_in_synapse <- setdiff(foo_cols, synapse_col_names) | |
# Remove "Component" from the missing_in_synapse list if it exists | |
missing_in_synapse <- setdiff(missing_in_synapse, "Component") | |
if (length(missing_in_foo) > 0 || length(missing_in_synapse) > 0) { | |
error_message <- "Column names in the dataframe do not match the Synapse table schema.\n" | |
if (length(missing_in_foo) > 0) { | |
error_message <- paste0(error_message, "Missing in dataframe: ", toString(missing_in_foo), ".\n") | |
} | |
if (length(missing_in_synapse) > 0) { | |
error_message <- paste0(error_message, "Missing in Synapse schema (except 'Component'): ", toString(missing_in_synapse), ".") | |
} | |
stop(error_message) | |
} | |
# Double check that component is correct | |
if (!all(unique(foo$Component) %in% schema$properties$name)) { | |
stop("Mismatched Component detected. Check the name of the table on synapse and the values in the Component column.") | |
} | |
# Reorder columns in foo to match the Synapse table schema | |
foo <- foo[, synapse_col_names, drop = FALSE] | |
# Delete the rows from the Synapse table | |
synDelete(synTableQuery(paste0("SELECT * FROM ", synapse_id))) | |
# convert columns to stringlist | |
if(!is.null(stringlist_cols)){ | |
foo <- convert_columns_to_json(foo, stringlist_cols) | |
} | |
# Upload the rows in dataframe foo to the Synapse table | |
synStore(Table(synapse_id, foo)) | |
} | |
# Helper function to convert specified columns to JSON arrays | |
convert_columns_to_json <- function(dataframe, columns) { | |
# Check if specified columns exist in the dataframe | |
if (!all(columns %in% names(dataframe))) { | |
stop("One or more specified columns do not exist in the dataframe.") | |
} | |
# Convert each specified column to a JSON array | |
for (column in columns) { | |
dataframe[[column]] <- sapply(dataframe[[column]], function(x) { | |
if (is.na(x)) { | |
# Keep NA as is | |
NA | |
} else { | |
# Split the string by comma, trim whitespace, and convert to JSON array | |
toJSON(trimws(unlist(strsplit(x, ",", fixed = TRUE))), auto_unbox = FALSE) | |
} | |
}) | |
} | |
return(dataframe) | |
} | |
update_synapse_table_from_google( | |
component = "AnimalModelDetails", | |
synapse_id = "syn26486808", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("animalModelOfManifestation", "animalModelGeneticDisorder") | |
) | |
update_synapse_table_from_google( | |
component = "AntibodyDetails", | |
synapse_id = "syn26486811", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = "reactiveSpecies" | |
) | |
update_synapse_table_from_google( | |
component = "BiobankDetails", | |
synapse_id = "syn26486821", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("specimenPreparationMethod","specimenType","tumorType", | |
"specimenFormat","specimenTissueType", "diseaseType") | |
) | |
update_synapse_table_from_google( | |
component = "CellLineDetails", | |
synapse_id = "syn26486823", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("cellLineManifestation","cellLineGeneticDisorder") | |
) | |
update_synapse_table_from_google( | |
component = "Development", | |
synapse_id = "syn26486807", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
) | |
update_synapse_table_from_google( | |
component = "Donor", | |
synapse_id = "syn26486829", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = 'species' | |
) | |
update_synapse_table_from_google( | |
component = "Funder", | |
synapse_id = "syn26486830", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
) | |
update_synapse_table_from_google( | |
component = "GeneticReagentDetails", | |
synapse_id = "syn26486832", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("insertSpecies","vectorType") | |
) | |
update_synapse_table_from_google( | |
component = "Investigator", | |
synapse_id = "syn26486833", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
) | |
update_synapse_table_from_google( | |
component = "Mutation", | |
synapse_id = "syn26486834", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
) | |
update_synapse_table_from_google( | |
component = "MutationDetails", | |
synapse_id = "syn26486835", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("alleleType","mutationMethod", "mutationType") | |
) | |
update_synapse_table_from_google( | |
component = "Observation", | |
synapse_id = "syn26486836", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("observationType") | |
) | |
update_synapse_table_from_google( | |
component = "Publication", | |
synapse_id = "syn26486839", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("authors") | |
) | |
update_synapse_table_from_google( | |
component = "Resource", | |
synapse_id = "syn26450069", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("usageRequirements","synonyms") | |
) | |
update_synapse_table_from_google( | |
component = "ResourceApplication", | |
synapse_id = "syn26486840", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
stringlist_cols = c("applications") | |
) | |
update_synapse_table_from_google( | |
component = "Usage", | |
synapse_id = "syn26486841", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
) | |
update_synapse_table_from_google( | |
component = "Vendor", | |
synapse_id = "syn26486850", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
) | |
update_synapse_table_from_google( | |
component = "VendorItem", | |
synapse_id = "syn26486843", | |
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk", | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a temporary solution until some blockers with schematic-db are resolved.