# Example of parallelising R code using furrr and multidplyr
# load packages
library(multidplyr)
library(dplyr)
library(tidyr)
library(purrr)
library(future)
library(furrr)
library(tibble)
library(tictoc)
# I want to know which gene each of a bunch of SNPs falls in. This will show how to give different inputs to a function and how to parallelise the work.
# make a reference dataframe of gene positions; for our purposes all genes are back to back
genome <- tibble(gene = paste('gene', 1:100, sep = '_'),
                 start = seq(1, by = 10, length.out = 100),
                 end = seq(10, by = 10, length.out = 100))
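# for reference, the first rows of genome look like this (genes tile the interval 1-1000):
# gene    start   end
# gene_1      1    10
# gene_2     11    20
# gene_3     21    30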
# make a function to assign gene_name by position
get_gene_name <- function(snp_pos, ref_genome){
  temp <- dplyr::filter(ref_genome, start <= snp_pos & end >= snp_pos) %>%
    dplyr::pull(., gene)
  return(temp)
}
# test on a single case
get_gene_name(10, genome)
get_gene_name(12, genome)
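# given the genome above, position 10 sits in gene_1 (1-10) and position 12 in gene_2 (11-20),
# so these two calls should return "gene_1" and "gene_2" respectively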
# make up loads of different input values (here, SNP positions)
n_iter <- 100000
# as n_iter gets bigger, the benefit of the parallel approaches increases
all_snps <- tibble(snps = round(runif(n_iter, min = 1, max = 1000)),
                   iter = 1:n_iter)
head(all_snps)
# now the fun happens: we nest the data by iter and then run the function on each iteration of our variables
gene_names <- all_snps %>%
  nest(data = -iter)
# we iterate our function over our list column using purrr::map()
# add gene_name
tic()
gene_names$gene_name <- purrr::map(gene_names$data, ~get_gene_name(.x$snps, genome))
toc()
# now to parallelise using furrr
# multiprocess is deprecated in recent versions of future; multisession works on all platforms
plan(multisession)
tic()
gene_names$gene_name2 <- furrr::future_map(gene_names$data, ~get_gene_name(.x$snps, genome))
toc()
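# a suggested tidy-up (not in the original gist): switch back to sequential processing
# so that later code does not keep running on background R sessions
plan(sequential)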
# unnest output
gene_names <- unnest(gene_names, cols = c(data, gene_name, gene_name2))
head(gene_names)
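# quick sanity check (an addition to the gist): the purrr and furrr results should be
# identical, because only the execution strategy changed, not the computation
identical(gene_names$gene_name, gene_names$gene_name2)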
# now to parallelise using multidplyr
# great tutorial here http://www.business-science.io/code-tools/2016/12/18/multidplyr.html
tic()
# check available cores
cl <- availableCores()
# divide data into groups to send off to each core
group <- rep(1:cl, length.out = nrow(all_snps))
gene_names <- bind_cols(tibble(group), all_snps) %>%
  nest(data = -c(iter, group))
# create cluster (note: this uses the original GitHub-only multidplyr API; the CRAN release, multidplyr >= 0.1.0, renamed these functions, e.g. create_cluster() became new_cluster())
cluster <- create_cluster(cores = cl)
# partition by group
by_group <- gene_names %>%
  partition(group, cluster = cluster)
# assign values to the cluster
by_group %>%
  # assign libraries
  cluster_library('purrr') %>%
  cluster_library('dplyr') %>%
  cluster_library('tidyr') %>%
  # assign values
  cluster_assign_value('get_gene_name', get_gene_name) %>%
  cluster_assign_value('genome', genome)
# check assignments have occurred
cluster_eval(by_group, search())[[1]]
# run multidplyr code
d_mdplyr <- by_group %>% # use the by_group party_df
  mutate(., gene_info = purrr::map(data, ~get_gene_name(.x$snps, genome))) %>%
  collect() %>% # special collect() function to recombine the partitions
  tibble::as_tibble() %>%
  unnest(cols = c(data, gene_info)) %>%
  arrange(snps)
toc()
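# quick check (added here, not part of the original gist): the multidplyr result should
# contain one row per SNP, i.e. the same number of rows as iterations we started with
nrow(d_mdplyr) == n_iter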
# these approaches can be applied to multiple starting parameters as long as they are listed within a nested dataframe. I find multidplyr the fastest for my current use cases.