# This example demonstrates running furrr code distributed on 2 AWS instances ("nodes").
# The instances have already been created.
library(future)
library(furrr)
# Two t2.micro AWS instances
# Created from http://www.louisaslett.com/RStudio_AMI/
public_ip <- c("34.205.155.182", "34.201.26.217")
# This is where my .pem file lives (essentially the password used to connect over SSH).
ssh_private_key_file <- "~/Desktop/programming/AWS/key-pair/dvaughan.pem"
# Connect!
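# Note: makeClusterPSOCK() originated in the future package; in recent versions
# it lives in the parallelly package (which future re-exports), so on newer
# installs you may need library(parallelly) or parallelly::makeClusterPSOCK().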
cl <- makeClusterPSOCK(

  ## Public IP number of EC2 instance
  public_ip,

  ## User name (always 'ubuntu')
  user = "ubuntu",

  ## Use private SSH key registered with AWS
  rshopts = c(
    "-o", "StrictHostKeyChecking=no",
    "-o", "IdentitiesOnly=yes",
    "-i", ssh_private_key_file
  ),

  ## Set up .libPaths() for the 'ubuntu' user and
  ## install future/purrr/furrr packages
  rscript_args = c(
    "-e", shQuote("local({p <- Sys.getenv('R_LIBS_USER'); dir.create(p, recursive = TRUE, showWarnings = FALSE); .libPaths(p)})"),
    "-e", shQuote("install.packages(c('future', 'purrr', 'furrr'))")
  ),

  dryrun = FALSE
)
# Set the plan to use the cluster workers!
plan(cluster, workers = cl)
# Run some code distributed evenly on the two workers!
x <- 1
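# (`x` is a global here; future detects it and exports it to each worker
# automatically, so it does not need to be sent over by hand.)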
future_map(1:5, ~{.x + x})
#> [[1]]
#> [1] 2
#>
#> [[2]]
#> [1] 3
#>
#> [[3]]
#> [1] 4
#>
#> [[4]]
#> [1] 5
#>
#> [[5]]
#> [1] 6
# Are we reaallllly running in parallel?
library(tictoc)
tic()
future_map(1:2, ~{ Sys.sleep(10) })
#> [[1]]
#> NULL
#>
#> [[2]]
#> NULL
toc()
#> 13.158 sec elapsed
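# Yes: two 10 second sleeps finished in ~13 seconds rather than ~20.
# As a further sanity check, you could ask each worker for its hostname to
# confirm that calls really fan out across both instances (a minimal sketch;
# the names printed depend on your instances):
future_map_chr(1:2, ~ Sys.info()[["nodename"]])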
# Shut down
parallel::stopCluster(cl)
An updated version that also uses reticulate (with boto3) to start and stop the instances from R.

# This example demonstrates running furrr code distributed on 2 AWS instances ("nodes").
# The instances have already been created.

# Two t2.micro AWS instances
# Created from http://www.louisaslett.com/RStudio_AMI/

library(reticulate)
library(future)
library(furrr)

# Import boto3 from python
boto <- import("boto3")

# Import EC2 client
ec2 <- boto$client('ec2')

# These are the instance IDs and Public IPs
instance_ids <- list("i-074e1c3e9bbccb7e5", "i-0a931747f6d57e794")
instance_ips <- c("54.172.113.213", "52.90.110.26")

# Start them up (after this you currently have to wait and manually check that they are ready before connecting)
ec2$start_instances(InstanceIds = instance_ids)
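# A programmatic alternative to waiting by hand: boto3 ships "waiters" that
# block until the instances reach a given state. A minimal sketch (the
# 'instance_running' waiter is part of the standard boto3 EC2 client API):
ec2$get_waiter("instance_running")$wait(InstanceIds = instance_ids)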

public_ip <- instance_ips

# This is where my .pem file lives (essentially the password used to connect over SSH).
ssh_private_key_file <- "~/Desktop/programming/AWS/key-pair/dvaughan.pem"

# Connect!
cl <- makeClusterPSOCK(
  
  ## Public IP number of EC2 instance
  public_ip,
  
  ## User name (always 'ubuntu')
  user = "ubuntu",
  
  ## Use private SSH key registered with AWS
  rshopts = c(
    "-o", "StrictHostKeyChecking=no",
    "-o", "IdentitiesOnly=yes",
    "-i", ssh_private_key_file
  ),
  
  ## Set up .libPaths() for the 'ubuntu' user and
  ## install future/purrr/furrr packages
  rscript_args = c(
    "-e", shQuote("local({p <- Sys.getenv('R_LIBS_USER'); dir.create(p, recursive = TRUE, showWarnings = FALSE); .libPaths(p)})"),
    "-e", shQuote("install.packages(c('future', 'purrr', 'furrr'))")
  ),
  
  dryrun = FALSE
)

# Set the plan to use the cluster workers!
plan(cluster, workers = cl)

# Run some code distributed evenly on the two workers!
x <- 1
future_map(1:5, ~{.x + x})

# Are we reaallllly running in parallel?
library(tictoc)
tic()
future_walk(1:2, ~{ Sys.sleep(10) })
toc()


# Shut down
parallel::stopCluster(cl)

# Stop the instances
ec2$stop_instances(InstanceIds = instance_ids)
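# Optionally, block until they have actually stopped (same waiter pattern as
# above; 'instance_stopped' is also a standard boto3 EC2 waiter):
ec2$get_waiter("instance_stopped")$wait(InstanceIds = instance_ids)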

Another version that does multi-level distribution. First, we distribute over 2 t2.xlarge AWS instances (a 2x speedup). Then, within each instance, we distribute over its 4 vCPUs, for roughly an 8x speedup in this simple example (2 instances x 4 vCPUs = 8 tasks running at once).

Using this, we go from ~80 seconds to ~13 seconds.

library(furrr)
#> Loading required package: future
library(tictoc)

instance_ips <- c("54.164.159.51", "52.23.226.59")

public_ip <- instance_ips

# This is where my .pem file lives (essentially the password used to connect over SSH).
ssh_private_key_file <- "~/Desktop/programming/AWS/key-pair/dvaughan.pem"

# Connect!
cl <- makeClusterPSOCK(

  ## Public IP number of EC2 instance
  public_ip,

  ## User name (always 'ubuntu')
  user = "ubuntu",

  ## Use private SSH key registered with AWS
  rshopts = c(
    "-o", "StrictHostKeyChecking=no",
    "-o", "IdentitiesOnly=yes",
    "-i", ssh_private_key_file
  ),

  ## Set up .libPaths() for the 'ubuntu' user and
  ## install future/purrr/furrr packages
  rscript_args = c(
    "-e", shQuote("local({p <- Sys.getenv('R_LIBS_USER'); dir.create(p, recursive = TRUE, showWarnings = FALSE); .libPaths(p)})"),
    "-e", shQuote("install.packages(c('future', 'purrr', 'furrr'))")
  ),

  dryrun = FALSE
)

# First let's try it sequentially
plan(sequential)

tic()
future_map(1:2, ~{
  future_map(1:4, ~Sys.sleep(10))
})
toc()
#> 80.851 sec elapsed

# Multi-distributed plan!
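# The outer level of the plan distributes across the cluster (the 2 EC2
# instances); the inner level runs up to 4 processes on each instance.
# Note: in current releases of future, `multiprocess` is deprecated in favor
# of `multisession` (or `multicore` on Unix), so on a newer install you may
# prefer tweak(multisession, workers = 4) for the inner level.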
plan(list(tweak(cluster, workers = cl), tweak(multiprocess, workers = 4)))

tic()

# First we distribute over our 2 EC2 instances
future_map(1:2, ~{
  # Then we distribute over the 4 vCPUs that each instance has
  future_map(1:4, ~Sys.sleep(10))
})

toc()
#> 13.001 sec elapsed

Created on 2018-05-31 by the reprex package (v0.2.0).
