
@allaway
Created November 11, 2022 00:19
Index S3 files on Synapse when md5s are also deposited as text files in the bucket (using R)
# Temporary AWS credentials for the bucket (placeholder values below)
Sys.setenv(
  AWS_ACCESS_KEY_ID = "abcd",
  AWS_SECRET_ACCESS_KEY = "1a2b3c",
  AWS_SESSION_TOKEN = "rlylongstring"
)
library(paws)
library(dplyr)
library(nfportalutils)
library(reticulate)

# nfportalutils wraps the Python synapseclient via reticulate;
# syn_login() populates the `.syn` client object used throughout below
synapse <- import('synapseclient')
syn_login()

s3 <- paws::s3()
bucket <- 'nf-syn23664726-s3-bucket-n9uakf7bowwd'
# Look up the storage location ID backing the Synapse folder syn35221462
storageLocation <- httr::GET('https://repo-prod.prod.sagebase.org/file/v1/entity/syn35221462/uploadDestination')$content %>%
  rawToChar() %>%
  jsonlite::fromJSON() %>%
  purrr::pluck('storageLocationId')
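## Note: the same lookup could also go through the already-authenticated
## synapseclient session instead of a raw httr call; a sketch, assuming `.syn`
## is the logged-in client from syn_login() (restGET and fileHandleEndpoint
## are the same client members used with restPOST further down):
# dest <- .syn$restGET('/entity/syn35221462/uploadDestination',
#                      endpoint = .syn$fileHandleEndpoint)
# storageLocation <- dest$storageLocationId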
# Top-level listing (exploratory), then the per-sample "folders" under WholeGenomeSequencing/
foo <- s3$list_objects_v2(Bucket = bucket, Delimiter = "/")
folders <- s3$list_objects_v2(Bucket = bucket, Prefix = "WholeGenomeSequencing/", Delimiter = "/")
pfx <- folders$CommonPrefixes %>% unlist()
# Read the md5 reference file under `prefix` and return a table of md5/filename/key
get_md5 <- function(bucket, prefix, pat = 'md5\\.txt$'){
  objects <- s3$list_objects_v2(Bucket = bucket, Prefix = prefix)$Contents
  key <- unlist(objects)[grep(x = unlist(objects), pattern = pat)]
  obj <- s3$get_object(Bucket = bucket, Key = key)
  # The listings appear to be md5sum-style output ("<md5><two spaces><filename>"),
  # so splitting on a single space yields an empty middle column, captured as "null".
  # This parsing step should probably live _outside_ the function to make it
  # easier to reuse in slightly different scenarios.
  obj$Body %>%
    rawToChar() %>%
    readr::read_delim(delim = " ", col_names = c("md5", "null", "filename")) %>%
    dplyr::mutate(key = glue::glue("{prefix}{filename}"),
                  prefix = prefix)
}
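## As the comment above suggests, the parsing step is easier to reuse if split
## out from the S3 fetch. A minimal sketch of that refactor (parse_md5_listing
## is a hypothetical name, not part of the original workflow):
parse_md5_listing <- function(raw_body, prefix){
  raw_body %>%
    rawToChar() %>%
    readr::read_delim(delim = " ", col_names = c("md5", "null", "filename")) %>%
    dplyr::mutate(key = glue::glue("{prefix}{filename}"),
                  prefix = prefix)
}
## get_md5() would then just fetch the object and hand its $Body to parse_md5_listing()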
# Size (in bytes) of a single object; one list_objects_v2 call per key
get_size <- function(key){
  message(key)
  s3$list_objects_v2(Bucket = bucket, Prefix = key)$Contents[[1]]$Size
}
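## Alternative: head_object asks S3 for exactly one key and also returns the
## size, avoiding a prefix listing per file. A sketch, assuming the same
## global s3 client and bucket as above:
get_size_head <- function(key){
  s3$head_object(Bucket = bucket, Key = key)$ContentLength
}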
# Build one table of md5s across all per-sample prefixes
output <- lapply(pfx, function(x) get_md5(bucket = bucket, prefix = x)) %>%
  bind_rows() %>%
  mutate(parent_folder = stringr::str_remove(prefix, "WholeGenomeSequencing/") %>%
           stringr::str_remove(pattern = "/"))
## Some files mentioned in the md5 reference files appear not to exist in the bucket. Check for this:
find_missing_files <- function(bucket, key){
  ret <- s3$list_objects_v2(Bucket = bucket, Prefix = key)
  ret$KeyCount == 0
}
missing <- sapply(output$key, find_missing_files, bucket = bucket)
missing_actually <- names(missing)[missing]
## There's one missing file; remove it and proceed. We can index that file later.
output <- output[!output$key %in% missing_actually, ]
# Create the per-sample folders on Synapse (commented out after the initial run):
# sapply(unique(output$parent_folder), function(x){
#   .syn$store(synapse$Folder(name = x, parentId = "syn35221462"))
# })

## Now fetch the Synapse IDs of the folders created above:
parentids <- .syn$getChildren("syn35221462") %>%
  reticulate::iterate() %>%
  bind_rows() %>%
  select(name, id) %>%
  rename(parent_folder = name)
# Look up sizes and join in the Synapse parent folder IDs
output_2 <- mutate(output, size = sapply(key, get_size))
output_3 <- left_join(output_2, parentids, by = "parent_folder")
# Register an existing S3 object as a Synapse S3FileHandle, then store a File entity pointing at it
make_and_store_file_handle <- function(bucket, key, filename, size, md5, storage_location, parent_id){
  fileHandle <- list(
    'concreteType' = 'org.sagebionetworks.repo.model.file.S3FileHandle',
    'fileName' = filename,
    'contentSize' = size,
    'contentType' = 'NOT SET',
    'contentMd5' = md5,
    'bucketName' = bucket,
    'key' = key,
    'storageLocationId' = storage_location
  )
  fileHandle <- .syn$restPOST('/externalFileHandle/s3',
                              jsonlite::toJSON(fileHandle, auto_unbox = TRUE),
                              endpoint = .syn$fileHandleEndpoint)
  f <- synapse$File(name = filename, parentId = parent_id, dataFileHandleId = fileHandle[['id']])
  .syn$store(f)
}
# apply() coerces each row to character, hence the as.numeric() conversions below
apply(output_3, 1, function(x){
  print(x[["key"]])
  make_and_store_file_handle(bucket = bucket,
                             key = x[["key"]],
                             filename = x[["filename"]],
                             size = as.numeric(x[["size"]]),
                             md5 = x[["md5"]],
                             storage_location = as.numeric(storageLocation),
                             parent_id = x[["id"]])
})
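## Quick sanity check after the run: list the children of one of the parent
## folders to confirm the new File entities show up (folder choice is arbitrary):
.syn$getChildren(parentids$id[[1]]) %>%
  reticulate::iterate() %>%
  bind_rows() %>%
  select(name, id)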