@seabbs
Last active July 24, 2019 08:57
Read in, munge and save a set of csvs + profiling information
# Get required packages - managed using pacman ---------------------------
# This installs packages if they are missing and otherwise loads them.
if (!require(pacman)) install.packages("pacman"); library(pacman)
p_load("tidyverse")
p_load("fs")
p_load("data.table")
p_load("lubridate")
p_load("purrr")
p_load("furrr")
# Get filenames -----------------------------------------------------------
get_filenames <- function(directory = NULL, sub_path = NULL) {
  folders <- fs::dir_ls(path = directory) %>%
    unname()
  ## Fill a vector with the names of the files
  ## Moved from a for loop to a vectorised approach (shorter + faster)
  filenames <- purrr::map_chr(folders, ~ fs::path(., sub_path))
  return(filenames)
}
## Modify filenames for your data.
filenames <- get_filenames("raw-data", sub_path = file.path("SheepScab", "Sheep", "I_0forsam.csv"))
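## Illustrative example (hypothetical paths): if "raw-data" contains one folder
## per simulation run, e.g. raw-data/run-1 and raw-data/run-2, the call above
## returns:
## "raw-data/run-1/SheepScab/Sheep/I_0forsam.csv"
## "raw-data/run-2/SheepScab/Sheep/I_0forsam.csv"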
# Read in (single) -----------------------------------------------------------------
## This function summarises a single file
## Currently data munging is implemented in dplyr + base + data.table
## This means that code clarity has been sacrificed for performance.
summarise_file <- function(file) {
  ## Read in the data using data.table
  df <- data.table::fread(file)
  ## Turn into a tibble
  df <- tibble::as_tibble(df)
  ## Time variables
  df$time <- lubridate::as_date(df$time)
  df$month <- lubridate::month(df$time)
  df$year <- lubridate::year(df$time)
  df$month_year <- lubridate::floor_date(df$time, "month")
  df <- dplyr::select(df, iteration, time, month, year, month_year, dplyr::everything())
  ## Summarise across farms
  ## Number infected
  df_mat <- as.matrix(df[, -(1:5)])
  df_mat <- sign(df_mat)
  df$no_inf_farms <- rowSums(df_mat)
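  ## Note: sign() maps any positive infection count to 1 (0 stays 0), so the
  ## row sums give the number of farms with at least one infected animal at
  ## each time step. A quick sanity check (illustrative):
  ## stopifnot(all(df$no_inf_farms <= ncol(df_mat)))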
  ## Pull variables of interest
  df_sum <- dplyr::select(df, iteration, time, month, year, month_year, no_inf_farms)
  ## Summarise within farms - highly compressed for speed
  ## Convert to data.table, dropping the across-farm summary column so it is
  ## not treated as a farm when reshaping
  df_long <- data.table::as.data.table(dplyr::select(df, -no_inf_farms))
  ## Go long from wide
  df_long <- data.table::melt.data.table(df_long,
                                         id.vars = c("iteration", "time", "month",
                                                     "year", "month_year"),
                                         variable.name = "farm", value.name = "infected")
  ## Find all farms and add a new ending iteration with zero infected cases
  ## (i.e. a row with 0 infected at the end of the simulation for every farm)
  all_farms <- df_long[iteration == max(iteration)][, `:=`(iteration = max(iteration) + 1,
                                                           infected = 0, type = "maximum")]
  ## Pull out new infections
  df_time_inf <- df_long[infected > 0 & data.table::shift(infected, 1, type = "lag") == 0][, type := "infected"]
  ## Pull out new recoveries
  df_time_rec <- df_long[infected == 0 & data.table::shift(infected, 1, type = "lag") > 0][, type := "recovered"]
  ## Join data frames together
  df_long <- rbind(df_time_inf, df_time_rec, all_farms)
  ## Find the time between infection and recovery
  df_long <- df_long[order(iteration)][,
    time_infected := data.table::shift(iteration, 1, type = "lead") - iteration,
    by = farm][type == "infected"]
  ## Make farm names human readable
  df_long <- df_long[, farm := data.table::last(data.table::tstrsplit(farm, "/", fixed = TRUE))]
  df_long <- tibble::as_tibble(df_long)
  out <- list(df_sum, df_long)
  return(out)
}
## Forces JIT compilation (should occur by default)
summarise_file <- compiler::cmpfun(summarise_file)
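## Example usage on a single file (illustrative; assumes `filenames` from above):
## res <- summarise_file(filenames[1])
## res[[1]] # per time step summary across farms
## res[[2]] # per farm times from infection to recovery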
## Profiling this function on a 1e6 row data frame gives an overall runtime of
## around 1200ms.
## See img/profile.png for the detailed breakdown.
# Summarise all files -----------------------------------------------------
## This uses parallelisation via the future package.
## If your datasets are large you may run out of RAM and this will crash.
## If this happens, stick with plan(sequential) + multi-threaded data.table
## (the default below) rather than the commented-out plan(multiprocess) +
## single-threaded data.table.
## For small jobs it may be faster to run sequentially as setting up
## parallelisation adds some overhead.
## As it stands it looks like using a sequential plan and allowing data.table
## to be parallel is fastest.
## future::plan(future::multiprocess, workers = future::availableCores() / 2)
## data.table::setDTthreads(threads = 1)
plan(sequential)
data.table::setDTthreads(threads = future::availableCores() / 2)
## Run summary function
sim_sum <- future_map(filenames, summarise_file, .progress = TRUE)
## Transpose the list so each summary type is grouped together
sim_sum <- purrr::transpose(sim_sum)
## Pull out and bind the across-farm summaries (number of infected farms)
disease_sum <- dplyr::bind_rows(sim_sum[[1]], .id = "id")
## Pull out and bind the within-farm summaries (time from infection to recovery)
farm_sum <- dplyr::bind_rows(sim_sum[[2]], .id = "id")
# Save results ------------------------------------------------------------
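## Create the output directory if it does not already exist
## (assumes the summaries should live in a local "data/" folder).
fs::dir_create("data")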
data.table::fwrite(disease_sum, "data/disease_sum.csv")
data.table::fwrite(farm_sum, "data/farm_sum.csv")
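## To read the summaries back in later (illustrative):
## disease_sum <- data.table::fread("data/disease_sum.csv")
## farm_sum <- data.table::fread("data/farm_sum.csv")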