Last active
October 16, 2020 07:49
-
-
Save padpadpadpad/a22b86a658e63873ee45e8314f104c7b to your computer and use it in GitHub Desktop.
Example of parallelising R code using furrr and multidplyr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load packages | |
library(multidplyr) | |
library(dplyr) | |
library(tidyr) | |
library(purrr) | |
library(furrr) | |
library(tibble) | |
library(tictoc) | |
# Goal: work out which gene each of a bunch of SNPs falls in. This shows how
# to feed different inputs to a function and how to parallelise that work.

# Build a toy reference genome. For our purposes all genes are back to back:
# gene i covers positions ((i - 1) * gene_length + 1) to (i * gene_length),
# both ends inclusive.
n_genes <- 100
gene_length <- 10
genome <- tibble(
  gene = paste("gene", seq_len(n_genes), sep = "_"),
  start = seq(1, by = gene_length, length.out = n_genes),
  end = seq(gene_length, by = gene_length, length.out = n_genes)
)
# Assign a gene name to a SNP by its position.
#
# snp_pos:    a single numeric genome position.
# ref_genome: data frame / tibble with columns `gene`, `start` and `end`;
#             `start` and `end` are treated as inclusive bounds.
#
# Returns the `gene` values of every row whose [start, end] interval contains
# `snp_pos`; a zero-length vector when no interval matches.
get_gene_name <- function(snp_pos, ref_genome) {
  # Fail loudly on a malformed reference table instead of returning nothing.
  stopifnot(all(c("gene", "start", "end") %in% names(ref_genome)))

  # which() drops NA comparisons, so NA positions/bounds never match.
  hits <- which(
    ref_genome[["start"]] <= snp_pos & ref_genome[["end"]] >= snp_pos
  )
  ref_genome[["gene"]][hits]
}
# Test on single cases: position 10 is the last base of the first gene,
# position 12 falls inside the second gene.
get_gene_name(10, genome)
get_gene_name(12, genome)

# Make up loads of different input values (here they are SNP positions).
# As n_iter gets bigger, the benefit of the parallel approaches increases.
n_iter <- 100000

# Draw positions across the full span of the reference genome rather than a
# hard-coded range, so the SNPs always land inside a gene. `iter` gives every
# SNP its own id, which is later used to nest one row per iteration.
all_snps <- tibble(
  snps = round(runif(n_iter, min = min(genome$start), max = max(genome$end))),
  iter = seq_len(n_iter)
)
head(all_snps)
# Now the fun happens: nest the data by `iter`, so that each row holds the
# inputs of one iteration in the list column `data`, then run the function on
# each iteration of our variables.
gene_names <- all_snps %>%
  nest(data = -iter)

# Iterate our function over the list column using purrr::map() (serial
# baseline). The reference table passed to every call is the global `genome`.
tic()
gene_names$gene_name <- purrr::map(
  gene_names$data,
  ~ get_gene_name(.x$snps, genome)
)
toc()
# Now to parallelise using furrr. `multisession` runs the work in background
# R sessions; it replaces the deprecated `multiprocess` strategy.
plan(multisession)

tic()
gene_names$gene_name2 <- furrr::future_map(
  gene_names$data,
  ~ get_gene_name(.x$snps, genome)
)
toc()

# Unnest the output: the nested inputs plus the serial and parallel results
# are expanded side by side so they can be compared row by row.
gene_names <- unnest(gene_names, cols = c(data, gene_name, gene_name2))
head(gene_names)
# Now to multiprocess using multidplyr. This uses the interface of the
# released package (multidplyr >= 0.1.0): new_cluster(), cluster_library(),
# cluster_assign(), cluster_call() and group_by() %>% partition().
# Great tutorial here (NOTE: it appears to predate that interface, so function
# names may differ):
# http://www.business-science.io/code-tools/2016/12/18/multidplyr.html
tic()

# check available cores
cl <- availableCores()

# Divide the data into groups to send off to each core: rows are dealt out
# round-robin so every worker receives a similar share.
group <- rep(seq_len(cl), length.out = nrow(all_snps))
gene_names <- bind_cols(tibble(group), all_snps) %>%
  nest(data = -c(iter, group))

# create cluster with one worker per available core
cluster <- new_cluster(as.integer(cl))

# Attach the packages each worker needs and copy over the objects the mapped
# function refers to; workers start as empty R sessions.
cluster_library(cluster, c("purrr", "dplyr", "tidyr"))
cluster_assign(cluster, get_gene_name = get_gene_name, genome = genome)

# check assignments have occurred (search path of the first worker)
cluster_call(cluster, search())[[1]]

# Partition by group: all rows sharing a `group` value go to the same worker.
by_group <- gene_names %>%
  group_by(group) %>%
  partition(cluster)

# run multidplyr code
d_mdplyr <- by_group %>%
  mutate(gene_info = purrr::map(data, ~ get_gene_name(.x$snps, genome))) %>%
  collect() %>% # bring the partitions back into this session
  ungroup() %>% # drop the `group` grouping carried through partition()
  tibble::as_tibble() %>%
  unnest(cols = c(data, gene_info)) %>%
  arrange(snps)
toc()
# These approaches can be applied to multiple starting parameters, as long as they are listed within a nested data frame. I find multidplyr the fastest for the uses I have at the moment.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment