vjcitn · March 15, 2024 13:26
diff --git a/probe_lake.R b/probe_lake.R
 # setup

 library(aws.s3)
 library(DBI)
 library(dplyr)
 library(duckdb)
 library(shiny)

 # Bucket content metadata is fetched into the data.frame "bb".

 bb <- get_bucket_df("s3://bioc-builddb-mirror/buildResults")

 # The second "-"-separated piece of each key is recorded as the file type.
 type <- sapply(strsplit(bb$Key, "-"), "[", 2)
 #table(type)

 bb$type <- type
 # LastModified reduced to a Date; the app filters the listing on it
 bb$repdate <- as.Date(bb$LastModified)

 # A sample key, kept for testing.
 devinf <- "buildResults/f9785dba87426695825cc6524dcb82c6-info.csv.gz"

 # An existing object named "con" is reused; otherwise a duckdb connection
 # is opened. The httpfs extension is then installed and loaded.
 if (!exists("con")) {
   con <- dbConnect(duckdb::duckdb(), read_only = TRUE)
 }
 dbExecute(con, "install 'httpfs'")
 dbExecute(con, "load 'httpfs'")

 # File-type suffixes that probe_file() searches for in a key.
 available_types <- c(
   "build_summary.csv.gz",
   "info.csv.gz",
   "propagation_status.csv.gz",
   "report.tgz"
 )

 # probe into a file for information

 #' Probe one file of bucket bioc-builddb-mirror through duckdb
 #'
 #' Given a bucket file id (key) and a duckdb connection, read the
 #' associated CSV with duckdb's read_csv and collect its column names,
 #' its number of records and its first 6 records. For info.csv.gz
 #' files the content category (experiment, software, annotation, book)
 #' is guessed by looking for one marker package per category, and the
 #' git branch is taken from the first record.
 #'
 #' The file type is determined by searching key for each element of the
 #' global `available_types` as a literal substring; an error is raised
 #' when key does not match exactly one of them.
 #'
 #' @param key character(1) filename in bucket bioc-builddb-mirror
 #' @param con duckdb db connection
 #' @return a list with a provisional S3 class corresponding
 #' to the file type, with elements fields, nrec, head, key,
 #' type, guess, branch. guess and branch are "" unless the type is
 #' info.csv.gz; guess also stays "" when no marker package is found.
 #' @export
 probe_file <- function(key, con) {
   stopifnot(is.character(key), length(key) == 1L, !is.na(key))
   # obtain the file type based on a literal substring of key; done first
   # so that an unusable key fails before any remote read is attempted
   is_type <- vapply(available_types, grepl, logical(1), x = key, fixed = TRUE)
   type <- names(is_type)[is_type]
   if (length(type) != 1L) {
     stop(sprintf("cannot determine a unique file type for key '%s'", key),
       call. = FALSE)
   }
   pa <- sprintf("s3://bioc-builddb-mirror/%s", key)
   # double any single quote so the path stays one SQL string literal
   sqlstring <- sprintf("FROM read_csv('%s')", gsub("'", "''", pa, fixed = TRUE))
   # acquire the db and learn fields and number of records
   tmp <- dplyr::tbl(con, dplyr::sql(sqlstring))
   fields <- colnames(tmp)
   nrec <- tmp |>
     dplyr::count() |>
     as.data.frame() |>
     unlist() |>
     as.numeric()
   # get first 6 records in data.frame for 'probing'
   thead <- tmp |>
     head() |>
     as.data.frame()
   # guess the content type among experiment, software, annotation, book
   guess <- ""
   branch <- ""
   # specific guess code for info.csv
   if (type == "info.csv.gz") {
     guesses <- c(experiment = "affydata", software = "a4Reporting",
       annotation = "AHCytoBands", book = "csawBook")
     # number of records naming each marker package
     chks <- vapply(guesses, function(x) {
       tmp |>
         dplyr::select(Package) |>
         dplyr::filter(Package == x) |>
         dplyr::count() |>
         as.data.frame() |>
         unlist() |>
         as.numeric()
     }, numeric(1))
     hits <- names(guesses)[chks > 0]
     # keep the "" default when no marker package is present, so the
     # print method still has a value to format
     if (length(hits) > 0) {
       guess <- hits
     }
     branch <- thead$git_branch[1]
   }
   # produce list and add class for some generics like print() and details()
   ans <- list(fields = fields, nrec = nrec, head = thead, key = key,
     type = type, guess = guess, branch = branch)
   class(ans) <- c(type, class(ans))
   ans
 }

 #' Print method for probe_file() results of type info.csv.gz
 #'
 #' Writes one line with the guessed content category, the git branch
 #' and the number of records.
 #' @param x list with elements guess, branch and nrec
 #' @param ... unused
 #' @return the value of cat(), i.e. NULL
 print.info.csv.gz <- function(x, ...) {
   summary_line <- sprintf(
     "%s info for %s branch with %d records\n",
     x$guess, x$branch, x$nrec
   )
   cat(summary_line)
 }

 #' Print method for probe_file() results of type build_summary.csv.gz
 #'
 #' Writes one line giving the number of records.
 #' @param x list with element nrec
 #' @param ... unused
 #' @return the value of cat(), i.e. NULL
 print.build_summary.csv.gz <- function(x, ...) {
   summary_line <- sprintf("build_summary with %d records\n", x$nrec)
   cat(summary_line)
 }

 #' Print method for probe_file() results of type propagation_status.csv.gz
 #'
 #' Writes one line giving the number of records.
 #' @param x list with element nrec
 #' @param ... unused
 #' @return the value of cat(), i.e. NULL
 print.propagation_status.csv.gz <- function(x, ...) {
   summary_line <- sprintf("propagation_status with %d records\n", x$nrec)
   cat(summary_line)
 }

 # S3 generic: dispatches on the class of x to a details.<class> method.
 details <- function(x) {
   UseMethod("details")
 }

 #' details method for probe_file() results of type info.csv.gz
 #'
 #' Writes one line with the git branch of the first record in x$head
 #' and the number of records.
 #' @param x list with elements head (having a git_branch column) and nrec
 #' @return the value of cat(), i.e. NULL
 details.info.csv.gz <- function(x) {
   first_branch <- x$head$git_branch[1]
   cat(sprintf("info.csv for %s branch, %d records.\n", first_branch, x$nrec))
 }


 # Page layout: the sidebar holds the date picker, the mode selector and
 # the server-generated file selector ("boxes"); the main panel shows the
 # printed probe summary ("thedate") above the table of records ("pick").
 ui <- fluidPage(
   sidebarLayout(
     sidebarPanel = sidebarPanel(
       helpText("probe BiocBuildDB bucket contents"),
       dateInput(
         inputId = "date", label = "date",
         value = "2024-03-07", min = "2024-02-29", max = "2024-03-15"
       ),
       radioButtons(
         inputId = "mode", label = "mode",
         choices = c("info", "propagation", "build_summary"),
         selected = "info"
       ),
       uiOutput(outputId = "boxes")
     ),
     mainPanel = mainPanel(
       tabsetPanel(
         tabPanel(
           title = "main",
           verbatimTextOutput(outputId = "thedate"),
           DT::dataTableOutput(outputId = "pick")
         )
       )
     )
   )
 )

 # Shiny server function. Relies on three objects defined at file level:
 # the bucket listing `bb`, the duckdb connection `con` and probe_file().
 server <- function(input, output) {
   #
   # assuming bb is available as the bucket data.frame, confine
   # attention to records with a specific date and mode (info, propagation,
   # build_summary); row names are made unique so that they can serve as
   # the values of the file selector
   #
   gettab <- reactive({
     validate(need(!is.null(input$date), "pick a date"))
     on_date <- which(bb$repdate == as.Date(input$date))
     of_mode <- grep(input$mode, bb$type)
     tmp <- bb[intersect(on_date, of_mode), ]
     # nothing to choose from: report it here rather than rendering an
     # empty selector and leaving the main panel waiting for a choice
     validate(need(nrow(tmp) > 0, "no files for this date and mode"))
     rownames(tmp) <- make.names(paste(tmp$type, tmp$repdate, sep = ":"),
       unique = TRUE)
     tmp
   })
   #
   # use output of gettab to find a selected file (named in 'Key' field)
   # and probe it
   #
   do_probe <- reactive({
     tmp <- gettab()
     validate(need(nchar(input$tabs) > 0, "waiting for tab"))
     # a selection left over from a previous date/mode matches no row and
     # gives NA here, which the next validation rejects
     kk <- tmp[input$tabs, "Key"]
     validate(need(nchar(kk) > 0, "getting content"))
     probe_file(kk, con)
   })
   #
   # provide 6 records from selected table
   #
   output$pick <- DT::renderDataTable({
     do_probe()$head
   })
   # one-line summary written by the print method for the probed type
   output$thedate <- renderPrint(cat(print(do_probe())))
   #
   # generate buttons for all available files of given date and mode
   #
   output$boxes <- renderUI({
     rn <- rownames(gettab())
     radioButtons("tabs", "tabs", choices = rn, selected = rn[1])
   })
 }

 #
 # run the app
 #
 # Launch the app assembled from the file-level `ui` and `server` objects.
 probe_lake <- function() {
   app_parts <- list(ui = ui, server = server)
   runApp(app_parts)
 }
	# setup

	library(aws.s3)
	library(DBI)
	library(dplyr)
	library(duckdb)
	library(shiny)

	# Bucket content metadata is fetched into the data.frame "bb".

	bb <- get_bucket_df("s3://bioc-builddb-mirror/buildResults")

	# The second "-"-separated piece of each key is recorded as the file type.
	type <- sapply(strsplit(bb$Key, "-"), "[", 2)
	#table(type)

	bb$type <- type
	# LastModified reduced to a Date; the app filters the listing on it
	bb$repdate <- as.Date(bb$LastModified)

	# A sample key, kept for testing.
	devinf <- "buildResults/f9785dba87426695825cc6524dcb82c6-info.csv.gz"

	# An existing object named "con" is reused; otherwise a duckdb connection
	# is opened. The httpfs extension is then installed and loaded.
	if (!exists("con")) {
	  con <- dbConnect(duckdb::duckdb(), read_only = TRUE)
	}
	dbExecute(con, "install 'httpfs'")
	dbExecute(con, "load 'httpfs'")

	# File-type suffixes that probe_file() searches for in a key.
	available_types <- c(
	  "build_summary.csv.gz",
	  "info.csv.gz",
	  "propagation_status.csv.gz",
	  "report.tgz"
	)

	# probe into a file for information

	#' Probe one file of bucket bioc-builddb-mirror through duckdb
	#'
	#' Given a bucket file id (key) and a duckdb connection, read the
	#' associated CSV with duckdb's read_csv and collect its column names,
	#' its number of records and its first 6 records. For info.csv.gz
	#' files the content category (experiment, software, annotation, book)
	#' is guessed by looking for one marker package per category, and the
	#' git branch is taken from the first record.
	#'
	#' The file type is determined by searching key for each element of the
	#' global `available_types` as a literal substring; an error is raised
	#' when key does not match exactly one of them.
	#'
	#' @param key character(1) filename in bucket bioc-builddb-mirror
	#' @param con duckdb db connection
	#' @return a list with a provisional S3 class corresponding
	#' to the file type, with elements fields, nrec, head, key,
	#' type, guess, branch. guess and branch are "" unless the type is
	#' info.csv.gz; guess also stays "" when no marker package is found.
	#' @export
	probe_file <- function(key, con) {
	  stopifnot(is.character(key), length(key) == 1L, !is.na(key))
	  # obtain the file type based on a literal substring of key; done first
	  # so that an unusable key fails before any remote read is attempted
	  is_type <- vapply(available_types, grepl, logical(1), x = key, fixed = TRUE)
	  type <- names(is_type)[is_type]
	  if (length(type) != 1L) {
	    stop(sprintf("cannot determine a unique file type for key '%s'", key),
	      call. = FALSE)
	  }
	  pa <- sprintf("s3://bioc-builddb-mirror/%s", key)
	  # double any single quote so the path stays one SQL string literal
	  sqlstring <- sprintf("FROM read_csv('%s')", gsub("'", "''", pa, fixed = TRUE))
	  # acquire the db and learn fields and number of records
	  tmp <- dplyr::tbl(con, dplyr::sql(sqlstring))
	  fields <- colnames(tmp)
	  nrec <- tmp |>
	    dplyr::count() |>
	    as.data.frame() |>
	    unlist() |>
	    as.numeric()
	  # get first 6 records in data.frame for 'probing'
	  thead <- tmp |>
	    head() |>
	    as.data.frame()
	  # guess the content type among experiment, software, annotation, book
	  guess <- ""
	  branch <- ""
	  # specific guess code for info.csv
	  if (type == "info.csv.gz") {
	    guesses <- c(experiment = "affydata", software = "a4Reporting",
	      annotation = "AHCytoBands", book = "csawBook")
	    # number of records naming each marker package
	    chks <- vapply(guesses, function(x) {
	      tmp |>
	        dplyr::select(Package) |>
	        dplyr::filter(Package == x) |>
	        dplyr::count() |>
	        as.data.frame() |>
	        unlist() |>
	        as.numeric()
	    }, numeric(1))
	    hits <- names(guesses)[chks > 0]
	    # keep the "" default when no marker package is present, so the
	    # print method still has a value to format
	    if (length(hits) > 0) {
	      guess <- hits
	    }
	    branch <- thead$git_branch[1]
	  }
	  # produce list and add class for some generics like print() and details()
	  ans <- list(fields = fields, nrec = nrec, head = thead, key = key,
	    type = type, guess = guess, branch = branch)
	  class(ans) <- c(type, class(ans))
	  ans
	}

	#' Print method for probe_file() results of type info.csv.gz
	#'
	#' Writes one line with the guessed content category, the git branch
	#' and the number of records.
	#' @param x list with elements guess, branch and nrec
	#' @param ... unused
	#' @return the value of cat(), i.e. NULL
	print.info.csv.gz <- function(x, ...) {
	  summary_line <- sprintf(
	    "%s info for %s branch with %d records\n",
	    x$guess, x$branch, x$nrec
	  )
	  cat(summary_line)
	}

	#' Print method for probe_file() results of type build_summary.csv.gz
	#'
	#' Writes one line giving the number of records.
	#' @param x list with element nrec
	#' @param ... unused
	#' @return the value of cat(), i.e. NULL
	print.build_summary.csv.gz <- function(x, ...) {
	  summary_line <- sprintf("build_summary with %d records\n", x$nrec)
	  cat(summary_line)
	}

	#' Print method for probe_file() results of type propagation_status.csv.gz
	#'
	#' Writes one line giving the number of records.
	#' @param x list with element nrec
	#' @param ... unused
	#' @return the value of cat(), i.e. NULL
	print.propagation_status.csv.gz <- function(x, ...) {
	  summary_line <- sprintf("propagation_status with %d records\n", x$nrec)
	  cat(summary_line)
	}

	# S3 generic: dispatches on the class of x to a details.<class> method.
	details <- function(x) {
	  UseMethod("details")
	}

	#' details method for probe_file() results of type info.csv.gz
	#'
	#' Writes one line with the git branch of the first record in x$head
	#' and the number of records.
	#' @param x list with elements head (having a git_branch column) and nrec
	#' @return the value of cat(), i.e. NULL
	details.info.csv.gz <- function(x) {
	  first_branch <- x$head$git_branch[1]
	  cat(sprintf("info.csv for %s branch, %d records.\n", first_branch, x$nrec))
	}


	# Page layout: the sidebar holds the date picker, the mode selector and
	# the server-generated file selector ("boxes"); the main panel shows the
	# printed probe summary ("thedate") above the table of records ("pick").
	ui <- fluidPage(
	  sidebarLayout(
	    sidebarPanel = sidebarPanel(
	      helpText("probe BiocBuildDB bucket contents"),
	      dateInput(
	        inputId = "date", label = "date",
	        value = "2024-03-07", min = "2024-02-29", max = "2024-03-15"
	      ),
	      radioButtons(
	        inputId = "mode", label = "mode",
	        choices = c("info", "propagation", "build_summary"),
	        selected = "info"
	      ),
	      uiOutput(outputId = "boxes")
	    ),
	    mainPanel = mainPanel(
	      tabsetPanel(
	        tabPanel(
	          title = "main",
	          verbatimTextOutput(outputId = "thedate"),
	          DT::dataTableOutput(outputId = "pick")
	        )
	      )
	    )
	  )
	)

	# Shiny server function. Relies on three objects defined at file level:
	# the bucket listing `bb`, the duckdb connection `con` and probe_file().
	server <- function(input, output) {
	  #
	  # assuming bb is available as the bucket data.frame, confine
	  # attention to records with a specific date and mode (info, propagation,
	  # build_summary); row names are made unique so that they can serve as
	  # the values of the file selector
	  #
	  gettab <- reactive({
	    validate(need(!is.null(input$date), "pick a date"))
	    on_date <- which(bb$repdate == as.Date(input$date))
	    of_mode <- grep(input$mode, bb$type)
	    tmp <- bb[intersect(on_date, of_mode), ]
	    # nothing to choose from: report it here rather than rendering an
	    # empty selector and leaving the main panel waiting for a choice
	    validate(need(nrow(tmp) > 0, "no files for this date and mode"))
	    rownames(tmp) <- make.names(paste(tmp$type, tmp$repdate, sep = ":"),
	      unique = TRUE)
	    tmp
	  })
	  #
	  # use output of gettab to find a selected file (named in 'Key' field)
	  # and probe it
	  #
	  do_probe <- reactive({
	    tmp <- gettab()
	    validate(need(nchar(input$tabs) > 0, "waiting for tab"))
	    # a selection left over from a previous date/mode matches no row and
	    # gives NA here, which the next validation rejects
	    kk <- tmp[input$tabs, "Key"]
	    validate(need(nchar(kk) > 0, "getting content"))
	    probe_file(kk, con)
	  })
	  #
	  # provide 6 records from selected table
	  #
	  output$pick <- DT::renderDataTable({
	    do_probe()$head
	  })
	  # one-line summary written by the print method for the probed type
	  output$thedate <- renderPrint(cat(print(do_probe())))
	  #
	  # generate buttons for all available files of given date and mode
	  #
	  output$boxes <- renderUI({
	    rn <- rownames(gettab())
	    radioButtons("tabs", "tabs", choices = rn, selected = rn[1])
	  })
	}

	#
	# run the app
	#
	# Launch the app assembled from the file-level `ui` and `server` objects.
	probe_lake <- function() {
	  app_parts <- list(ui = ui, server = server)
	  runApp(app_parts)
	}