Skip to content

Instantly share code, notes, and snippets.

@martinsotir
Last active March 14, 2018 18:48
Show Gist options
  • Save martinsotir/d16a26b975d90bfd10716fd9a0e0e769 to your computer and use it in GitHub Desktop.
Save martinsotir/d16a26b975d90bfd10716fd9a0e0e769 to your computer and use it in GitHub Desktop.
patchwork_test_for_valera_1
# Instructions (requires docker):
# docker build -t patchwork .
# docker run -it --rm patchwork
FROM openjdk:8
RUN apt-get update
# -y is required: `docker build` has no TTY, so an interactive confirmation
# prompt would stall or abort the build.
RUN apt-get install -y apt-transport-https
# Install sbt
RUN echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
RUN apt-get update
RUN apt-get install -y sbt
# Build patchwork. URL fixed: "github.com" was a link-obfuscation artifact;
# the real repository host is github.com.
# (The project's own build files pin the sbt version; no extra step needed here.)
RUN git clone https://github.com/crim-ca/patchwork
WORKDIR patchwork
RUN sbt compile
# Add test class
COPY test1.scala ./src/main/scala/example/test1.scala
RUN sbt compile
# Run test file locally:
CMD sbt "runMain Test1"
import ca.crim.spark.mllib.clustering._
import org.apache.spark.rdd.RDD
import org.apache.spark.{ SparkContext, SparkConf }
/**
 * Minimal smoke test for the PatchWork clustering algorithm: builds a tiny
 * 5-point / 10-dimensional dataset, trains a PatchWork model, and prints the
 * predicted cluster for each point plus per-cluster statistics.
 *
 * NOTE(review): a plain `main` method is used instead of `extends App`.
 * Spark's documentation explicitly warns that subclasses of scala.App may not
 * work correctly (delayed initialization can leave fields null inside
 * closures shipped to executors). `sbt "runMain Test1"` works unchanged.
 */
object Test1 {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Test1").setMaster("local[2]"))

    // Five sample points in a 10-dimensional space (Int literals widen to Double).
    val dataRDD: RDD[Array[Double]] = sc.parallelize(List(
      Array(10, 10, 1000, 1, 1, 1, 1, 1, 1, 1000),
      Array(20, 20, 1, 1, 1, 1, 1, 1, 1, 1),
      Array(20, 30, 1, 1, 1, 1, 1, 1, 1, 1),
      Array(30, 330, 1, 1, 1, 1, 1, 1, 1, 1),
      Array(330, 30, 444, 1, 1, 1, 1, 1, 1, 1)))

    // PatchWork parameters
    val epsilon = Array(10.1, 10.1, 1, 1, 1, 1, 1, 1, 1, 1) // grid cell size per dimension
    val minPts = 1           // minimum points for a cell to be considered dense
    val minCellInCluster = 1 // minimum number of cells a cluster must contain
    val ratio = 0.0

    // Training a model with the data; Utils.time also returns the elapsed time.
    val (patchworkModel, execTime) = Utils.time(
      new PatchWork(epsilon, minPts, ratio, minCellInCluster).run(dataRDD)
    )

    // Display the cluster ID assigned to each data point.
    dataRDD.collect().foreach { x =>
      println(x.mkString("\t") + "\t" + patchworkModel.predict(x).getID)
    }

    // Per-cluster cell counts, built functionally (replaces a var + index loop).
    val cs = patchworkModel.clusters
      .map(c => " cluster " + c.getID + " has " + c.cellsList.size + " cells \n")
      .mkString

    println("\n----------------------------------------- \n" +
      "number of points : " + dataRDD.count() + "\n" +
      "number of clusters : " + patchworkModel.clusters.size + "\n" +
      "----------------------------------------- \n" +
      cs +
      "----------------------------------------- \n" +
      "size of epsilon : [" + epsilon.mkString(",") + "] \n" +
      "min pts in each cell : " + minPts + "\n" +
      "time of training : " + execTime + " ms" + "\n----------------------------------------- \n")

    sc.stop() // side-effecting method: keep explicit parentheses
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment