Created
March 15, 2024 13:26
-
-
Save vjcitn/e30bca744e28d9dbc4388ed57a7f6d04 to your computer and use it in GitHub Desktop.
Defines a function probe_lake() that launches a Shiny app for exploring the BiocBuildDB data lake.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# setup | |
library(aws.s3) | |
library(DBI) | |
library(dplyr) | |
library(duckdb) | |
library(shiny) | |
# get bucket content metadata into a data.frame "bb"
# max = Inf: by default the listing stops after 1000 objects, which would
# silently hide files once the bucket grows past that size
bb <- get_bucket_df("s3://bioc-builddb-mirror/buildResults", max = Inf)

# the file type is the second "-"-separated piece of the key, e.g.
# "info.csv.gz"; vapply() always yields a character vector (NA when a key
# has no "-"), even for an empty listing
type <- vapply(
  strsplit(bb$Key, "-"),
  function(parts) parts[2],
  character(1)
)
#table(type)
bb$type <- type
bb$repdate <- as.Date(bb$LastModified)

# for testing
devinf <- "buildResults/f9785dba87426695825cc6524dcb82c6-info.csv.gz"

# reuse an existing connection if one is already defined in the session
if (!exists("con")) con <- dbConnect(duckdb::duckdb(), read_only = TRUE)

# httpfs is what lets duckdb read s3:// paths
dbExecute(con, "install 'httpfs'")
dbExecute(con, "load 'httpfs'")

# file-name suffixes recognised by probe_file()
available_types <- c(
  "build_summary.csv.gz",
  "info.csv.gz",
  "propagation_status.csv.gz",
  "report.tgz"
)
# probe into a file for information

#' Probe a file in the bioc-builddb-mirror bucket
#'
#' Given a bucket file id (key) and a duckdb connection, read the
#' associated CSV through duckdb and collect summary information,
#' including the first 6 records of the table in `head`.
#'
#' @param key character(1) filename (object key) in bucket
#'   bioc-builddb-mirror
#' @param con duckdb db connection able to read s3:// paths
#' @return a list with a provisional S3 class corresponding
#'   to the file type, with elements fields, nrec, head, key,
#'   type, guess, branch. `guess` and `branch` are "" unless the file
#'   is an info.csv.gz table in which a marker package / git_branch
#'   column was found.
#' @export
probe_file <- function(key, con) {
  stopifnot(is.character(key), length(key) == 1L, !is.na(key))

  # double any single quote so the key cannot terminate the SQL literal
  pa <- sprintf(
    "s3://bioc-builddb-mirror/%s",
    gsub("'", "''", key, fixed = TRUE)
  )
  sqlstring <- sprintf("FROM read_csv('%s')", pa)

  # acquire the db and learn fields and number of records
  tmp <- dplyr::tbl(con, dplyr::sql(sqlstring))
  fields <- colnames(tmp)
  nrec <- as.numeric(dplyr::pull(dplyr::count(tmp), n))

  # get first 6 records in data.frame for 'probing'
  thead <- as.data.frame(head(tmp))

  # obtain the file type based on substring of key; fixed = TRUE so the
  # "." in the type names is matched literally, not as a regex wildcard
  hits <- vapply(
    available_types,
    function(pattern) grepl(pattern, key, fixed = TRUE),
    logical(1)
  )
  type <- names(hits)[hits]

  # guess the content type among experiment, software, annotation, book
  guess <- ""
  branch <- ""

  # specific guess code for info.csv; identical() is safe when the key
  # matched no known type (or more than one)
  if (identical(type, "info.csv.gz")) {
    # one marker package per content type
    guesses <- c(
      experiment = "affydata",
      software = "a4Reporting",
      annotation = "AHCytoBands",
      book = "csawBook"
    )
    markers <- unname(guesses)
    # a single query finds which markers occur, instead of one remote
    # scan per marker
    present <- tmp |>
      dplyr::filter(Package %in% !!markers) |>
      dplyr::distinct(Package) |>
      dplyr::pull(Package)
    matched <- names(guesses)[guesses %in% present]
    # keep guess a single string: "" when nothing matched, first match
    # otherwise
    if (length(matched) > 0L) {
      guess <- matched[1]
    }
    if ("git_branch" %in% fields) {
      branch <- thead$git_branch[1]
    }
  }

  # produce list and add class for some generics like print() and details()
  ans <- list(
    fields = fields, nrec = nrec, head = thead, key = key,
    type = type, guess = guess, branch = branch
  )
  class(ans) <- c(type, class(ans))
  ans
}
# print method for probes of info.csv.gz files: one summary line giving
# the guessed content type, the git branch and the record count
print.info.csv.gz <- function(x, ...) {
  summary_line <- sprintf(
    "%s info for %s branch with %d records\n",
    x$guess, x$branch, x$nrec
  )
  cat(summary_line)
}
# print method for probes of build_summary.csv.gz files: reports the
# record count on one line
print.build_summary.csv.gz <- function(x, ...) {
  summary_line <- sprintf("build_summary with %d records\n", x$nrec)
  cat(summary_line)
}
# print method for probes of propagation_status.csv.gz files: reports the
# record count on one line
print.propagation_status.csv.gz <- function(x, ...) {
  summary_line <- sprintf("propagation_status with %d records\n", x$nrec)
  cat(summary_line)
}
# S3 generic: dispatches on the class of x to a details.<class> method
details <- function(x) {
  UseMethod("details")
}
# details method for probes of info.csv.gz files: reports the git branch
# found in the first probed record together with the record count
details.info.csv.gz <- function(x) {
  first_branch <- x$head$git_branch[1]
  cat(sprintf("info.csv for %s branch, %d records.\n", first_branch, x$nrec))
}
# user interface: sidebar with date / mode / file selectors, main panel
# with the probe summary text and the first records of the chosen table
ui <- local({
  # inputs: report date, file mode, and the server-generated file picker
  controls <- sidebarPanel(
    helpText("probe BiocBuildDB bucket contents"),
    dateInput(
      inputId = "date", label = "date",
      value = "2024-03-07", min = "2024-02-29", max = "2024-03-15"
    ),
    radioButtons(
      inputId = "mode", label = "mode",
      choices = c("info", "propagation", "build_summary"),
      selected = "info"
    ),
    uiOutput("boxes")
  )

  # outputs: printed probe summary above the table of head records
  display <- mainPanel(
    tabsetPanel(
      tabPanel(
        "main",
        verbatimTextOutput("thedate"),
        DT::dataTableOutput("pick")
      )
    )
  )

  fluidPage(sidebarLayout(controls, display))
})
# Shiny server logic for the bucket explorer.
#
# Relies on objects defined at the top level of this script: the bucket
# data.frame `bb`, the duckdb connection `con`, and probe_file().
#
# @param input shiny input list (uses input$date, input$mode, input$tabs)
# @param output shiny output list (fills pick, thedate, boxes)
server <- function(input, output) {
  #
  # assuming bb is available as the bucket data.frame, confine
  # attention to records with a specific date and mode
  # (info, propagation, build_summary)
  #
  gettab <- reactive({
    validate(need(!is.null(input$date), "pick a date"))
    # logical mask: NA-safe date comparison, and the mode is matched
    # literally (fixed = TRUE) against the file type
    keep <- !is.na(bb$repdate) &
      bb$repdate == as.Date(input$date) &
      grepl(input$mode, bb$type, fixed = TRUE)
    tmp <- bb[keep, , drop = FALSE]
    # without this, the file picker would be built with no choices
    validate(need(nrow(tmp) > 0, "no files for this date and mode"))
    # row names double as the labels/values of the file picker
    tks <- make.names(paste(tmp$type, tmp$repdate, sep = ":"), unique = TRUE)
    rownames(tmp) <- tks
    tmp
  })
  #
  # use output of gettab to find a selected file (named in 'Key' field)
  # and probe it
  #
  do_probe <- reactive({
    tmp <- gettab()
    # input$tabs is NULL until the picker is rendered, and may briefly
    # hold a label from the previous date/mode after those change
    validate(need(
      isTruthy(input$tabs) && input$tabs %in% rownames(tmp),
      "waiting for tab"
    ))
    kk <- tmp[input$tabs, "Key"]
    validate(need(!is.na(kk) && nzchar(kk), "getting content"))
    probe_file(kk, con)
  })
  #
  # provide 6 records from selected table
  #
  output$pick <- DT::renderDataTable({
    do_probe()$head
  })
  # print() alone is enough: renderPrint captures what the print method
  # writes, and wrapping it in cat() would fail for any probe result
  # that falls back to the default list printing
  output$thedate <- renderPrint(print(do_probe()))
  #
  # generate buttons for all available files of given date and mode
  #
  output$boxes <- renderUI({
    rn <- rownames(gettab())
    radioButtons("tabs", "tabs", choices = rn, selected = rn[1])
  })
}
#
# run the app
#
# Launches the explorer by handing the script-level `ui` and `server`
# objects to runApp().
probe_lake <- function() {
  app_parts <- list(ui = ui, server = server)
  runApp(app_parts)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment