Skip to content

Instantly share code, notes, and snippets.

@martinsotir
Last active March 14, 2018 18:48
Show Gist options
  • Save martinsotir/d16a26b975d90bfd10716fd9a0e0e769 to your computer and use it in GitHub Desktop.
Save martinsotir/d16a26b975d90bfd10716fd9a0e0e769 to your computer and use it in GitHub Desktop.
patchwork_test_for_valera_1
# Instructions (requires docker):
# docker build -t patchwork .
# docker run -it --rm patchwork
FROM openjdk:8
RUN apt-get update
# -y is required: `docker build` has no TTY, so an interactive confirmation
# prompt would stall or abort the build.
RUN apt-get install -y apt-transport-https
# Install sbt
RUN echo "deb https://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
RUN apt-get update
RUN apt-get install -y sbt
# Build patchwork. URL fixed: "github.com" was a link-obfuscation artifact;
# the real repository host is github.com.
# (The project's own build files pin the sbt version; no extra step needed here.)
RUN git clone https://github.com/crim-ca/patchwork
WORKDIR patchwork
RUN sbt compile
# Add test class
COPY test1.scala ./src/main/scala/example/test1.scala
RUN sbt compile
# Run test file locally:
CMD sbt "runMain Test1"
import ca.crim.spark.mllib.clustering._
import org.apache.spark.rdd.RDD
import org.apache.spark.{ SparkContext, SparkConf }
/**
 * Minimal smoke test for the PatchWork clustering algorithm: builds a tiny
 * 5-point / 10-dimensional dataset, trains a PatchWork model, and prints the
 * predicted cluster for each point plus per-cluster statistics.
 *
 * NOTE(review): a plain `main` method is used instead of `extends App`.
 * Spark's documentation explicitly warns that subclasses of scala.App may not
 * work correctly (delayed initialization can leave fields null inside
 * closures shipped to executors). `sbt "runMain Test1"` works unchanged.
 */
object Test1 {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Test1").setMaster("local[2]"))

    // Five sample points in a 10-dimensional space (Int literals widen to Double).
    val dataRDD: RDD[Array[Double]] = sc.parallelize(List(
      Array(10, 10, 1000, 1, 1, 1, 1, 1, 1, 1000),
      Array(20, 20, 1, 1, 1, 1, 1, 1, 1, 1),
      Array(20, 30, 1, 1, 1, 1, 1, 1, 1, 1),
      Array(30, 330, 1, 1, 1, 1, 1, 1, 1, 1),
      Array(330, 30, 444, 1, 1, 1, 1, 1, 1, 1)))

    // PatchWork parameters
    val epsilon = Array(10.1, 10.1, 1, 1, 1, 1, 1, 1, 1, 1) // grid cell size per dimension
    val minPts = 1           // minimum points for a cell to be considered dense
    val minCellInCluster = 1 // minimum number of cells a cluster must contain
    val ratio = 0.0

    // Training a model with the data; Utils.time also returns the elapsed time.
    val (patchworkModel, execTime) = Utils.time(
      new PatchWork(epsilon, minPts, ratio, minCellInCluster).run(dataRDD)
    )

    // Display the cluster ID assigned to each data point.
    dataRDD.collect().foreach { x =>
      println(x.mkString("\t") + "\t" + patchworkModel.predict(x).getID)
    }

    // Per-cluster cell counts, built functionally (replaces a var + index loop).
    val cs = patchworkModel.clusters
      .map(c => " cluster " + c.getID + " has " + c.cellsList.size + " cells \n")
      .mkString

    println("\n----------------------------------------- \n" +
      "number of points : " + dataRDD.count() + "\n" +
      "number of clusters : " + patchworkModel.clusters.size + "\n" +
      "----------------------------------------- \n" +
      cs +
      "----------------------------------------- \n" +
      "size of epsilon : [" + epsilon.mkString(",") + "] \n" +
      "min pts in each cell : " + minPts + "\n" +
      "time of training : " + execTime + " ms" + "\n----------------------------------------- \n")

    sc.stop() // side-effecting method: keep explicit parentheses
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment