Index S3 files on Synapse when md5s are also deposited as text files in the bucket (using R)
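The script below assumes that each subfolder (prefix) under WholeGenomeSequencing/ in the bucket contains a checksum sidecar whose key ends in md5.txt, written in md5sum-style form: one checksum, a separator, and a file name per line (the separator is why the parser below names a throwaway "null" column). A hypothetical example of such a sidecar's contents (file names invented for illustration):

<md5 checksum>  sample_01.bam
<md5 checksum>  sample_01.bam.bai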
Sys.setenv(
  AWS_ACCESS_KEY_ID = "abcd",
  AWS_SECRET_ACCESS_KEY = "1a2b3c",
  AWS_SESSION_TOKEN = "rlylongstring"
)

library(paws)
library(dplyr)
library(nfportalutils)
library(reticulate)

synapse <- import('synapseclient')
syn_login()

s3 <- paws::s3()
bucket <- 'nf-syn23664726-s3-bucket-n9uakf7bowwd'
## look up the storage location id for the external S3 storage location attached
## to the target folder (note: this request may need to be authenticated)
storageLocation <- httr::GET('https://repo-prod.prod.sagebase.org/file/v1/entity/syn35221462/uploadDestination')$content %>%
  rawToChar() %>%
  jsonlite::fromJSON() %>%
  purrr::pluck('storageLocationId')

## list the top level of the bucket, then the subfolders (prefixes) under WholeGenomeSequencing/
foo <- s3$list_objects_v2(Bucket = bucket, Delimiter = "/")
folders <- s3$list_objects_v2(Bucket = bucket, Prefix = "WholeGenomeSequencing/", Delimiter = "/")
pfx <- folders$CommonPrefixes %>% unlist()
## find the md5 sidecar within a prefix, download it, and parse it into a table
## of (md5, filename, key). The sidecar is assumed to contain md5sum-style lines:
## checksum, separator, file name (hence the throwaway "null" column).
get_md5 <- function(bucket, prefix, pat = 'md5\\.txt$'){
  contents <- s3$list_objects_v2(Bucket = bucket, Prefix = prefix)$Contents
  keys <- sapply(contents, function(x) x$Key)
  key <- keys[grep(x = keys, pattern = pat)]
  bar <- s3$get_object(Bucket = bucket, Key = key)
  # this next part should probably be _outside_ the function to make it easier
  # to re-use in slightly different scenarios
  bar$Body %>%
    rawToChar() %>%
    readr::read_delim(col_names = c("md5", "null", "filename")) %>%
    dplyr::mutate(key = glue::glue("{prefix}{filename}")) %>%
    dplyr::mutate(prefix = prefix)
}
## get the size in bytes of a single object (the full key is used as its own prefix)
get_size <- function(key){
  message(key)
  s3$list_objects_v2(Bucket = bucket, Prefix = key)$Contents[[1]]$Size
}
## parse every md5 sidecar, combine into one table, and derive the parent folder
## name from the prefix (e.g. "WholeGenomeSequencing/foo/" -> "foo")
output <- lapply(pfx, function(x){
    get_md5(bucket = bucket, prefix = x)
  }) %>%
  bind_rows() %>%
  mutate(parent_folder = stringr::str_remove(prefix, "WholeGenomeSequencing/") %>%
           stringr::str_remove(pattern = "/"))
## some files that are mentioned in the md5 reference files appear not to exist
## in the bucket. let's check for this:
find_missing_files <- function(bucket, key){
  ret <- s3$list_objects_v2(Bucket = bucket, Prefix = key)
  ret$KeyCount == 0
}

missing <- sapply(output$key, find_missing_files, bucket = bucket)
missing_actually <- names(which(missing))

## there's one missing file, let's remove that and proceed. We can index that
## one file later.
output <- output[!output$key %in% missing_actually, ]
## create a Synapse folder for each parent_folder (commented out after running once)
# sapply(unique(output$parent_folder), function(x){
#   .syn$store(synapse$Folder(name = x, parentId = "syn35221462"))
# })

## get the Synapse ids of the folders created above so files can be parented correctly
parentids <- .syn$getChildren("syn35221462") %>%
  reticulate::iterate() %>%
  bind_rows() %>%
  select(name, id) %>%
  rename(parent_folder = name)

## add object sizes and join in the parent folder ids
output_2 <- mutate(output, size = sapply(key, get_size))
output_3 <- left_join(output_2, parentids, by = "parent_folder")
## given an existing S3 object, register an S3FileHandle that points at it and
## then create a Synapse File entity that references that file handle
make_and_store_file_handle <- function(bucket, key, filename, size, md5, storage_location, parent_id){
  fileHandle <- list(
    'concreteType' = 'org.sagebionetworks.repo.model.file.S3FileHandle',
    'fileName' = filename,
    'contentSize' = size,
    'contentType' = 'NOT SET',
    'contentMd5' = md5,
    'bucketName' = bucket,
    'key' = key,
    'storageLocationId' = storage_location
  )
  fileHandle <- .syn$restPOST('/externalFileHandle/s3',
                              jsonlite::toJSON(fileHandle, auto_unbox = T),
                              endpoint = .syn$fileHandleEndpoint)
  f <- synapse$File(name = filename, parentId = parent_id, dataFileHandleId = fileHandle[['id']])
  .syn$store(f)
}
## apply() coerces each row to character, hence the as.numeric() conversions below
apply(output_3, 1, function(x){
  print(x[["key"]])
  make_and_store_file_handle(bucket = bucket,
                             key = x[["key"]],
                             filename = x[["filename"]],
                             size = as.numeric(x[["size"]]),
                             md5 = x[["md5"]],
                             storage_location = as.numeric(storageLocation),
                             parent_id = x[["id"]])
})
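## ---------------------------------------------------------------------------
## Optional spot check (a minimal sketch added for illustration, not part of the
## original workflow): re-fetch one of the newly indexed files by name and
## confirm it exists under the expected folder with a populated file handle.
## Assumes the first row of output_3 was indexed successfully above.
row1 <- output_3[1, ]
check_id <- .syn$findEntityId(row1$filename, parent = row1$id)
check <- .syn$get(check_id, downloadFile = FALSE)
message(check$name, " -> file handle ", check$dataFileHandleId)
## the md5 posted above is recorded on the file handle; in synapseclient it is
## attached as a private attribute, e.g. check$`_file_handle`$contentMd5 should
## equal row1$md5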