# Example of parallelising R code using furrr and multidplyr
# load packages
library(multidplyr)
library(dplyr)
library(tidyr)
library(purrr)
library(future)
library(furrr)
library(tibble)
library(tictoc)
# I want to know which gene each of a bunch of SNPs falls in. This will show how to give different inputs to a function and how to parallelise the work.
# make a reference dataframe of gene positions; for our purposes all genes are back to back
genome <- tibble(gene = paste('gene', 1:100, sep = '_'),
                 start = seq(1, by = 10, length.out = 100),
                 end = seq(10, by = 10, length.out = 100))
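# for reference, the first rows of genome look like this (genes tile the interval 1-1000):
# gene    start   end
# gene_1      1    10
# gene_2     11    20
# gene_3     21    30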
# make a function to assign gene_name by position
get_gene_name <- function(snp_pos, ref_genome){
  temp <- dplyr::filter(ref_genome, start <= snp_pos & end >= snp_pos) %>%
    dplyr::pull(., gene)
  return(temp)
}
# test on a single case
get_gene_name(10, genome)
get_gene_name(12, genome)
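# given the genome above, position 10 sits in gene_1 (1-10) and position 12 in gene_2 (11-20),
# so these two calls should return "gene_1" and "gene_2" respectively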
# make up loads of different input values (here, SNP positions)
n_iter <- 100000
# as n_iter gets bigger, the benefit of the parallel approaches increases
all_snps <- tibble(snps = round(runif(n_iter, min = 1, max = 1000)),
                   iter = 1:n_iter)
head(all_snps)
# now the fun happens: we nest the data by iter and then run the function on each iteration of our variables
gene_names <- all_snps %>%
  nest(data = -iter)
# we iterate our function over our list column using purrr::map()
# add gene_name
tic()
gene_names$gene_name <- purrr::map(gene_names$data, ~get_gene_name(.x$snps, genome))
toc()
# now to parallelise using furrr
# multiprocess is deprecated in recent versions of future; multisession works on all platforms
plan(multisession)
tic()
gene_names$gene_name2 <- furrr::future_map(gene_names$data, ~get_gene_name(.x$snps, genome))
toc()
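# a suggested tidy-up (not in the original gist): switch back to sequential processing
# so that later code does not keep running on background R sessions
plan(sequential)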
# unnest output
gene_names <- unnest(gene_names, cols = c(data, gene_name, gene_name2))
head(gene_names)
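# quick sanity check (an addition to the gist): the purrr and furrr results should be
# identical, because only the execution strategy changed, not the computation
identical(gene_names$gene_name, gene_names$gene_name2)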
# now to parallelise using multidplyr
# great tutorial here http://www.business-science.io/code-tools/2016/12/18/multidplyr.html
tic()
# check available cores
cl <- availableCores()
# divide data into groups to send off to each core
group <- rep(1:cl, length.out = nrow(all_snps))
gene_names <- bind_cols(tibble(group), all_snps) %>%
  nest(data = -c(iter, group))
# create cluster (note: this uses the original GitHub-only multidplyr API; the CRAN release, multidplyr >= 0.1.0, renamed these functions, e.g. create_cluster() became new_cluster())
cluster <- create_cluster(cores = cl)
# partition by group
by_group <- gene_names %>%
  partition(group, cluster = cluster)
# assign values to the cluster
by_group %>%
  # assign libraries
  cluster_library('purrr') %>%
  cluster_library('dplyr') %>%
  cluster_library('tidyr') %>%
  # assign values
  cluster_assign_value('get_gene_name', get_gene_name) %>%
  cluster_assign_value('genome', genome)
# check assignments have occurred
cluster_eval(by_group, search())[[1]]
# run multidplyr code
d_mdplyr <- by_group %>% # use the by_group party_df
  mutate(., gene_info = purrr::map(data, ~get_gene_name(.x$snps, genome))) %>%
  collect() %>% # special collect() function to recombine the partitions
  tibble::as_tibble() %>%
  unnest(cols = c(data, gene_info)) %>%
  arrange(snps)
toc()
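# quick check (added here, not part of the original gist): the multidplyr result should
# contain one row per SNP, i.e. the same number of rows as iterations we started with
nrow(d_mdplyr) == n_iter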
# these approaches can be applied to multiple starting parameters as long as they are listed within a nested dataframe. I find multidplyr the fastest for my current use cases.