Skip to content

Instantly share code, notes, and snippets.

@allaway
Created November 9, 2023 21:00
Show Gist options
  • Save allaway/f1082f40adc17ba3a41a3ac0247fdecf to your computer and use it in GitHub Desktop.
Save allaway/f1082f40adc17ba3a41a3ac0247fdecf to your computer and use it in GitHub Desktop.
Plot expected data for NF Data Portal using IncomingData table
library(ggplot2)
library(synapser)
library(lubridate)
library(tidyverse)
##datetime in unix time, integer
after_date <- 1704067200L
synLogin()
incoming_data <- synTableQuery('SELECT * FROM syn51471723')$filepath %>%
read_csv() %>%
mutate(date_uploadestimate = lubridate::parse_date_time(date_uploadestimate, orders = 'mdy')) %>%
filter(date_uploadestimate > as_datetime(after_date))
summarized_data <- incoming_data %>%
group_by(assay, fundingAgency) %>%
summarize(estimated_samples = sum(estimatedMinNumSamples), estimated_files = sum(estimatedNumFiles)) %>%
mutate(estimated_samples = case_when(is.na(estimated_samples) ~ 0,
!is.na(estimated_samples) ~ estimated_samples)) %>%
mutate(estimated_files = case_when(is.na(estimated_files) ~ 0,
!is.na(estimated_files) ~ estimated_files)) %>%
ungroup() %>%
group_by(assay) %>%
mutate(estimated_samples_all = sum(estimated_samples), estimated_files_all = sum(estimated_files)) %>%
ungroup() %>%
mutate(assay_reordered_files = forcats::fct_reorder(assay, desc(estimated_files_all), .na_rm = T)) %>%
mutate(assay_reordered_samples = forcats::fct_reorder(assay, desc(estimated_samples_all), .na_rm = T)) %>%
slice_max(estimated_files_all, n =20)
max_date <- max(incoming_data$date_uploadestimate)
a <- ggplot(summarized_data) +
geom_bar(aes(x = assay_reordered_files, y = estimated_files, fill = fundingAgency), stat = 'identity') +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Assay", y = "Estimated number of Files",
title = paste0("Current expected data deposits between ",
lubridate::round_date(as_datetime(after_date), unit = 'day'),
' and ',
lubridate::round_date(as_datetime(max_date), unit = 'day')
))
b <- ggplot(summarized_data) +
geom_bar(aes(x = assay_reordered_files, y = estimated_samples, fill = fundingAgency), stat = 'identity') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(x = "Assay", y = "Estimated number of Samples\n(e.g patients, mice, biospecimens, etc.)",
title = paste0("Current expected data deposits between ",
lubridate::round_date(as_datetime(after_date), unit = 'day'),
' and ',
lubridate::round_date(as_datetime(max_date), unit = 'day')
))
cowplot::plot_grid(a, b, align="hv", axis = "l", ncol = 1)
@allaway
Copy link
Author

allaway commented Dec 1, 2023

Here's a related version of this that we used to produce some plots for the 2023 NTAP report with LLM comments ;)

# Loading required libraries
library(ggplot2)        # For data visualization
library(synapser)       # For Synapse API access
library(lubridate)      # For handling date and time data
library(tidyverse)      # Collection of data manipulation packages

# Setting a cutoff date for filtering data (in Unix time format)
after_date <- 1704067200L

# Logging into Synapse
synLogin()

# Querying a Synapse table and processing the data
incoming_data <- synTableQuery('SELECT * FROM syn51471723')$filepath %>% 
  read_csv() %>% 
  mutate(date_uploadestimate = lubridate::parse_date_time(date_uploadestimate, orders = 'mdy')) %>% 
  filter(date_uploadestimate > as_datetime(after_date)) %>% 
  filter(fundingAgency == "NTAP")

# Summarizing the data by assay and funding agency         
summarized_data <- incoming_data %>% 
  group_by(assay, fundingAgency) %>% 
  summarize(estimated_samples = sum(estimatedMinNumSamples), estimated_files = sum(estimatedNumFiles)) %>% 
  mutate(estimated_samples = case_when(is.na(estimated_samples) ~ 0,
                                       !is.na(estimated_samples) ~ estimated_samples)) %>%
  mutate(estimated_files = case_when(is.na(estimated_files) ~ 0,
                                     !is.na(estimated_files) ~ estimated_files)) %>%
  ungroup() %>% 
  group_by(assay) %>% 
  mutate(estimated_samples_all = sum(estimated_samples), estimated_files_all = sum(estimated_files)) %>% 
  ungroup() %>% 
  mutate(assay_reordered_files = forcats::fct_reorder(assay, desc(estimated_files_all), .na_rm = T)) %>% 
  mutate(assay_reordered_samples = forcats::fct_reorder(assay, desc(estimated_samples_all), .na_rm = T)) %>% 
  slice_max(estimated_files_all, n =10)

# Finding the maximum date in the incoming data
max_date <- max(incoming_data$date_uploadestimate)

# Plotting the summarized data
ggplot(summarized_data) +
  geom_bar(aes(x = assay_reordered_files, y = estimated_files, fill = assay), stat = 'identity') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Assay", y = "Estimated number of Files",
       title = paste0("Top 10 expected NTAP data deposits between ", 
                      lubridate::round_date(as_datetime(after_date), unit = 'day'),
                      ' and ',
                      lubridate::round_date(as_datetime(max_date), unit = 'day')
       )) +
  sagethemes::scale_fill_sage_d(level = '400')

# Querying another Synapse table for released data
released_data <- synTableQuery("SELECT id, assay FROM syn16858331 WHERE studyId IN ('syn11374349', 'syn11374345', 'syn11374354', 'syn8016599', 'syn26957643')
                               and resourceType = 'experimentalData'")$filepath %>% 
  read_csv()

# Summarizing the released data
summarized_data <- released_data %>% 
  count(assay) %>% 
  mutate(assay_reordered_files = forcats::fct_reorder(assay, desc(n), .na_rm = T)) 

# Plotting the summarized released data
ggplot(summarized_data) +
  geom_bar(aes(x = assay_reordered_files, y = n, fill = assay), stat = 'identity') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.margin = margin(l = 1, unit = 'cm')) +
  labs(x = "Assay", y = "Number of Files",
       title = paste0("NTAP data released in 2023"
       )) 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment