Skip to content

Instantly share code, notes, and snippets.

@allaway
Created November 9, 2023 20:57
Show Gist options
  • Save allaway/383602ded5c4e1e7f7c91c9658470572 to your computer and use it in GitHub Desktop.
Save allaway/383602ded5c4e1e7f7c91c9658470572 to your computer and use it in GitHub Desktop.
Update NF Research Tools Central (temporary solution)
library(synapser)
library(googledrive)
library(googlesheets4)
library(tidyverse)
library(jsonlite)
gs4_auth()
synLogin()
goog_dir <- 'https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk'
##save a version of each table with update date
# db_tables <- c("syn26486850",
# "syn26486843",
# "syn26486841",
# "syn26486840",
# "syn26486839",
# "syn26486836",
# "syn26486835",
# "syn26486834",
# "syn26486833",
# "syn26486832",
# "syn26486830",
# "syn26486829",
# "syn26486823",
# "syn26486821",
# "syn26486811",
# "syn26486808",
# "syn26486807"
# )
#
# sapply(db_tables, synCreateSnapshotVersion,
# comment = "Snapshot prior to database update",
# label = as.character(Sys.Date()))
##function to do the heavy lifting
## input is a synapse id (one of db_tables above)
## second input is a google file id or url, or a vector of multiple google file ids
## function removes all of the rows in the synapse table (without deleting the table outright)
## if there is only one file, download it and save it as dataframe foo
## if there is more than one file id input, download all and concatenate into one file using the bind_rows function, and save it as dataframe foo
## then check if dataframe foo column names match column names in synapse table
## if they don't match, error
## if they do match, upload the rows in table foo to the synapse table
# Function to update Synapse tables with data from Google Sheets
update_synapse_table_from_google <- function(component,
synapse_id,
google_directory,
stringlist_cols = NULL,
naming_pattern = ".manifest") {
bar <- googledrive::drive_ls(
google_directory,
type = 'spreadsheet',
recursive = T)
google_file_ids <- dplyr::filter(bar, grepl(pattern = paste0(component, naming_pattern), bar$name)) %>%
purrr::pluck("id")
# Check if the input is a single file ID or a vector of multiple IDs
multiple_files <- length(google_file_ids) > 1
# Get the Synapse table schema
schema <- synGet(synapse_id)
# Get the column names from the Synapse table schema
synapse_cols <- synGetTableColumns(schema)$asList()
synapse_col_names <- lapply(synapse_cols, purrr::pluck, 'name') %>% unlist()
# Initialize an empty dataframe to store the combined data
foo <- data.frame()
# Download« and combine Google Sheets data
if (multiple_files) {
# Loop through each file ID, download the data, and combine
for (file_id in google_file_ids) {
file_data <- read_sheet(file_id, col_types = 'c')
foo <- bind_rows(foo, file_data)
}
} else {
# Download data from the single Google Sheet
foo <- read_sheet(google_file_ids, col_types = 'c')
}
# Check if column names in dataframe foo match the Synapse table schema column names
foo_cols <- colnames(foo)
# Assuming foo_cols and synapse_col_names are vectors of column names
missing_in_foo <- setdiff(synapse_col_names, foo_cols)
missing_in_synapse <- setdiff(foo_cols, synapse_col_names)
# Remove "Component" from the missing_in_synapse list if it exists
missing_in_synapse <- setdiff(missing_in_synapse, "Component")
if (length(missing_in_foo) > 0 || length(missing_in_synapse) > 0) {
error_message <- "Column names in the dataframe do not match the Synapse table schema.\n"
if (length(missing_in_foo) > 0) {
error_message <- paste0(error_message, "Missing in dataframe: ", toString(missing_in_foo), ".\n")
}
if (length(missing_in_synapse) > 0) {
error_message <- paste0(error_message, "Missing in Synapse schema (except 'Component'): ", toString(missing_in_synapse), ".")
}
stop(error_message)
}
# Double check that component is correct
if (!all(unique(foo$Component) %in% schema$properties$name)) {
stop("Mismatched Component detected. Check the name of the table on synapse and the values in the Component column.")
}
# Reorder columns in foo to match the Synapse table schema
foo <- foo[, synapse_col_names, drop = FALSE]
# Delete the rows from the Synapse table
synDelete(synTableQuery(paste0("SELECT * FROM ", synapse_id)))
# convert columns to stringlist
if(!is.null(stringlist_cols)){
foo <- convert_columns_to_json(foo, stringlist_cols)
}
# Upload the rows in dataframe foo to the Synapse table
synStore(Table(synapse_id, foo))
}
# Helper function to convert specified columns to JSON arrays
convert_columns_to_json <- function(dataframe, columns) {
# Check if specified columns exist in the dataframe
if (!all(columns %in% names(dataframe))) {
stop("One or more specified columns do not exist in the dataframe.")
}
# Convert each specified column to a JSON array
for (column in columns) {
dataframe[[column]] <- sapply(dataframe[[column]], function(x) {
if (is.na(x)) {
# Keep NA as is
NA
} else {
# Split the string by comma, trim whitespace, and convert to JSON array
toJSON(trimws(unlist(strsplit(x, ",", fixed = TRUE))), auto_unbox = FALSE)
}
})
}
return(dataframe)
}
update_synapse_table_from_google(
component = "AnimalModelDetails",
synapse_id = "syn26486808",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("animalModelOfManifestation", "animalModelGeneticDisorder")
)
update_synapse_table_from_google(
component = "AntibodyDetails",
synapse_id = "syn26486811",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = "reactiveSpecies"
)
update_synapse_table_from_google(
component = "BiobankDetails",
synapse_id = "syn26486821",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("specimenPreparationMethod","specimenType","tumorType",
"specimenFormat","specimenTissueType", "diseaseType")
)
update_synapse_table_from_google(
component = "CellLineDetails",
synapse_id = "syn26486823",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("cellLineManifestation","cellLineGeneticDisorder")
)
update_synapse_table_from_google(
component = "Development",
synapse_id = "syn26486807",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
)
update_synapse_table_from_google(
component = "Donor",
synapse_id = "syn26486829",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = 'species'
)
update_synapse_table_from_google(
component = "Funder",
synapse_id = "syn26486830",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
)
update_synapse_table_from_google(
component = "GeneticReagentDetails",
synapse_id = "syn26486832",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("insertSpecies","vectorType")
)
update_synapse_table_from_google(
component = "Investigator",
synapse_id = "syn26486833",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
)
update_synapse_table_from_google(
component = "Mutation",
synapse_id = "syn26486834",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
)
update_synapse_table_from_google(
component = "MutationDetails",
synapse_id = "syn26486835",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("alleleType","mutationMethod", "mutationType")
)
update_synapse_table_from_google(
component = "Observation",
synapse_id = "syn26486836",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("observationType")
)
update_synapse_table_from_google(
component = "Publication",
synapse_id = "syn26486839",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("authors")
)
update_synapse_table_from_google(
component = "Resource",
synapse_id = "syn26450069",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("usageRequirements","synonyms")
)
update_synapse_table_from_google(
component = "ResourceApplication",
synapse_id = "syn26486840",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
stringlist_cols = c("applications")
)
update_synapse_table_from_google(
component = "Usage",
synapse_id = "syn26486841",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
)
update_synapse_table_from_google(
component = "Vendor",
synapse_id = "syn26486850",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
)
update_synapse_table_from_google(
component = "VendorItem",
synapse_id = "syn26486843",
google_directory = "https://drive.google.com/drive/u/1/folders/1yblPMk-kgMj5KJi7P0AFOobZ9vkl6Nkk",
)
@allaway
Copy link
Author

allaway commented Nov 9, 2023

This is a temporary solution until some blockers with schematic-db are resolved.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment