Created January 4, 2019 20:53
Running notebooks in parallel on Azure Databricks
// define the name of the Azure Databricks notebook to run
val notebookToRun = ???

// define some way to generate a sequence of workloads to run
val jobArguments = ???

// define the number of workers per job
val workersPerJob = ???

import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

// look up the number of workers in the cluster
// (note: getExecutorMemoryStatus also counts the driver, so this is an upper bound)
val workersAvailable = sc.getExecutorMemoryStatus.size

// determine the number of jobs we can run, each with the desired worker count
val totalJobs = workersAvailable / workersPerJob

// look up the context required for the parallel run calls
val context = dbutils.notebook.getContext()

// create a threadpool for the parallel runs
implicit val executionContext = ExecutionContext.fromExecutorService(
  Executors.newFixedThreadPool(totalJobs))

try {
  val futures = jobArguments.zipWithIndex.map { case (args, i) =>
    Future {
      // ensure the thread knows about the databricks context
      dbutils.notebook.setContext(context)
      // assign each job to one of up to totalJobs separate scheduler pools
      sc.setLocalProperty("spark.scheduler.pool", s"pool${i % totalJobs}")
      // start the job in the scheduler pool
      dbutils.notebook.run(notebookToRun, timeoutSeconds = 0, args)
    }
  }
  // wait for all the jobs to finish processing
  Await.result(Future.sequence(futures), atMost = Duration.Inf)
} finally {
  // make sure to clean up the threadpool
  executionContext.shutdownNow()
}
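The three ??? placeholders above are intentionally left for the caller to fill in. As a rough illustration of what plugging them in might look like (the notebook path, argument keys, and values below are made up for this sketch, not part of the gist), one could fan ten parameter maps out across the cluster with two workers each:

// hypothetical placeholder values for the snippet above; adjust to your workspace
val notebookToRun = "./process-partition"                                // made-up notebook path
val jobArguments = (0 until 10).map(i => Map("partition" -> i.toString)) // one argument map per workload
val workersPerJob = 2                                                    // dedicate two workers to each run

Note that dbutils.notebook.run expects its arguments as a Map[String, String], and the per-thread sc.setLocalProperty("spark.scheduler.pool", ...) call only spreads the jobs across separate pools if the cluster runs with spark.scheduler.mode set to FAIR; under the default FIFO scheduler the jobs still run, just without fair sharing between pools.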
Well, I tried to change it, but for example when trying to update ctx.extraConfigs["notebook_path"] = "new_path", it complains that update is not implemented for that property, so simply changing it does not work. My idea would now be to create a completely new context and use it for dbutils.notebook.setContext(ctx), but I have no idea how to create a new instance of the ctx object (and I am even more concerned whether that would even work if, as you say, it's a singleton, which apparently we cannot update).
I haven't tried it (and haven't been too close to the Databricks world in a while) so the direct answer is: I don't know. However, I assume that it should be possible as long as the context doesn't rely on configurations specific to each future (it's a singleton and we can only have one active context per JVM). Could you try it out and report back?
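One cheap way to start trying it out (a minimal probe sketch, untested; it only assumes the context exposes extraConfigs as a Map, as the comment above suggests) is to check which concrete Map type backs the property, which should confirm whether in-place updates can ever work:

// probe the current notebook context (hypothetical, untested)
val ctx = dbutils.notebook.getContext()
println(ctx.extraConfigs)                  // dump the current extra configs
println(ctx.extraConfigs.getClass.getName) // an immutable Map here would explain the failing update

If the concrete type turns out to be an immutable Map, that would match the "update is not implemented" error reported earlier.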