Read in, munge and save a set of CSVs + profiling information
# Get required packages - managed using pacman ---------------------------
# This installs packages if they are missing and otherwise loads them.
if (!require(pacman)) install.packages("pacman"); library(pacman)
p_load("tidyverse")
p_load("fs")
p_load("data.table")
p_load("lubridate")
p_load("purrr")
p_load("furrr")
# Get filenames -----------------------------------------------------------
get_filenames <- function(directory = NULL, sub_path = NULL) {
  folders <- fs::dir_ls(path = directory) %>%
    unname()
  ## Fill a vector with the names of the files.
  ## Moved from a for loop to a vectorised approach (shorter + faster).
  filenames <- purrr::map_chr(folders, ~ fs::path(., sub_path))
  return(filenames)
}

## Modify filenames for your data.
filenames <- get_filenames("raw-data", sub_path = file.path("SheepScab", "Sheep", "I_0forsam.csv"))
# Read in (single) -----------------------------------------------------------------
## This function summarises a single file.
## The data munging mixes dplyr + base + data.table, so some code clarity
## has been sacrificed for performance; a clearer tidyverse-only sketch is
## commented out after the function definition below.
summarise_file <- function(file) {
  ## Read in the data using data.table
  df <- data.table::fread(file)
  ## Turn into a tibble
  df <- tibble::as_tibble(df)
  ## Time variables
  df$time <- lubridate::as_date(df$time)
  df$month <- lubridate::month(df$time)
  df$year <- lubridate::year(df$time)
  df$month_year <- lubridate::floor_date(df$time, "month")
  df <- dplyr::select(df, iteration, time, month, year, month_year, dplyr::everything())
  ## Summarise across farms
  ## Number infected: sign() maps any positive count to 1, so the row sums
  ## count farms with at least one infection.
  df_mat <- as.matrix(df[, -(1:5)])
  df_mat <- sign(df_mat)
  df$no_inf_farms <- rowSums(df_mat)
  ## Pull variables of interest
  df_sum <- dplyr::select(df, iteration, time, month, year, month_year, no_inf_farms)
  ## Summarise within farms - highly compressed for speed
  ## Convert to data.table, dropping the across-farm summary column so that
  ## melt does not treat it as a farm.
  df_long <- data.table::as.data.table(dplyr::select(df, -no_inf_farms))
  ## Go long from wide
  df_long <- data.table::melt(df_long,
                              id.vars = c("iteration", "time",
                                          "month", "year", "month_year"),
                              variable.name = "farm", value.name = "infected")
  ## Flag the previous timestep's state within each farm so that transitions
  ## are detected per farm rather than leaking across farm blocks.
  df_long[, prev_infected := data.table::shift(infected, 1, type = "lag", fill = 0),
          by = farm]
  ## Find all farms and add a new ending iteration with zero infected cases.
  all_farms <- df_long[iteration == max(iteration)][, `:=`(iteration = max(iteration) + 1,
                                                           infected = 0, type = "maximum")]
  ## Pull out new infections
  df_time_inf <- df_long[infected > 0 & prev_infected == 0][, type := "infected"]
  ## Pull out new recoveries
  df_time_rec <- df_long[infected == 0 & prev_infected > 0][, type := "recovered"]
  ## Join data frames together and drop the helper column
  df_long <- rbind(df_time_inf, df_time_rec, all_farms)
  df_long[, prev_infected := NULL]
  ## Find time between infection and recovery events
  df_long <- df_long[order(iteration)][,
    time_infected := data.table::shift(iteration, 1, type = "lead") - iteration,
    by = farm][type == "infected"]
  ## Make farm names human readable (keep only the last path component)
  df_long <- df_long[, farm := data.table::last(data.table::tstrsplit(farm, "/", fixed = TRUE))]
  df_long <- tibble::as_tibble(df_long)
  out <- list(df_sum, df_long)
  return(out)
}
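## For reference, a rough tidyverse-only equivalent of the within-farm
## transition detection inside summarise_file (an untested sketch, kept
## commented out: easier to read but noticeably slower than data.table):
## df_events <- df %>%
##   dplyr::select(-no_inf_farms) %>%
##   tidyr::gather(key = "farm", value = "infected",
##                 -iteration, -time, -month, -year, -month_year) %>%
##   dplyr::group_by(farm) %>%
##   dplyr::arrange(iteration, .by_group = TRUE) %>%
##   dplyr::mutate(prev_infected = dplyr::lag(infected, default = 0),
##                 type = dplyr::case_when(
##                   infected > 0 & prev_infected == 0 ~ "infected",
##                   infected == 0 & prev_infected > 0 ~ "recovered")) %>%
##   dplyr::filter(!is.na(type))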
## Byte-compile the function (R should already do this by default via the JIT)
summarise_file <- compiler::cmpfun(summarise_file)
## My profiling tests with a 1e6 row dataframe show an overall runtime of
## around 1200ms.
## See img/profile.png for a detailed breakdown.
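## To reproduce the profiling interactively (assumes the profvis package is
## available; commented out as it is not needed for the pipeline itself):
## p_load("profvis")
## profvis::profvis(summarise_file(filenames[1]))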
# Summarise all files -----------------------------------------------------
## This step can use parallelisation via the future package.
## As it stands a sequential plan with data.table running in parallel is used,
## as in my tests this combination was fastest.
## To parallelise across files instead, uncomment the plan(multiprocess) and
## setDTthreads(1) lines below and comment out the sequential equivalents.
## Note that with large datasets a multiprocess plan may run out of RAM and crash,
## and for small jobs it may be slower as setting up parallelisation adds overhead.
## A commented-out benchmarking sketch follows the setup below.
##future::plan(future::multiprocess, workers = max(1, floor(future::availableCores() / 2)))
##data.table::setDTthreads(threads = 1)
future::plan(future::sequential)
data.table::setDTthreads(threads = max(1, floor(future::availableCores() / 2)))
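## Illustrative comparison of the two plans on a couple of files (an untested
## sketch; assumes at least two files exist and timings depend on your data):
## system.time(future_map(filenames[1:2], summarise_file)) # sequential plan
## future::plan(future::multiprocess, workers = 2)
## data.table::setDTthreads(threads = 1)
## system.time(future_map(filenames[1:2], summarise_file)) # multiprocess plan
## future::plan(future::sequential)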
## Run the summary function over all files
sim_sum <- future_map(filenames, summarise_file, .progress = TRUE)
## Flip the list order so element 1 holds the across-farm summaries and
## element 2 holds the within-farm summaries
sim_sum <- purrr::transpose(sim_sum)
## Pull out and bind the across-farm (number of infected farms) summaries
disease_sum <- dplyr::bind_rows(sim_sum[[1]], .id = "id")
## Pull out and bind the within-farm infection timing summaries
farm_sum <- dplyr::bind_rows(sim_sum[[2]], .id = "id")
# Save results ------------------------------------------------------------
## fwrite will not create the output directory, so make sure it exists.
fs::dir_create("data")
data.table::fwrite(disease_sum, "data/disease_sum.csv")
data.table::fwrite(farm_sum, "data/farm_sum.csv")
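## Optional sanity check (commented out): read the saved summaries back in
## and eyeball the columns.
## dplyr::glimpse(data.table::fread("data/disease_sum.csv"))
## dplyr::glimpse(data.table::fread("data/farm_sum.csv"))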