package mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.util.MLUtils

object MyKmeans {
  def main(args: Array[String]) {
    val sparkMaster = "spark://192.168.35.10:7077"
    val inputPath = "hdfs://192.168.35.10:54310/kmeans/data2"

    // Cluster and tuning configuration: large executors, high default
    // parallelism, and consolidated shuffle files for a large input.
    val conf = new SparkConf()
    conf.setMaster(sparkMaster)
    conf.setAppName("Kmeans")
    conf.setJars(Seq("target/scala-2.10/tinyfish_2.10-1.0.jar"))
    conf.set("spark.worker.timeout", "300")
    conf.set("spark.akka.frameSize", "100")
    conf.set("spark.executor.memory", "70g")
    conf.set("spark.default.parallelism", "3000")
    conf.set("spark.storage.memoryFraction", "0.4")
    conf.set("spark.akka.threads", "22")
    conf.set("spark.shuffle.consolidateFiles", "true")

    val sc = new SparkContext(conf)

    // Load labeled points from HDFS, keep only the feature vectors,
    // and cache them since k-means makes one pass over the data per iteration.
    val data = MLUtils.loadLabeledData(sc, inputPath)
    val parsedData = data.map(_.features).cache()

    // Train a k-means model with k = 30 clusters and 3 iterations.
    val numIterations = 3
    val numClusters = 30
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    println("finished")
    sc.stop()
  }
}
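As a possible follow-up, one could sanity-check the trained model instead of only printing "finished". A minimal sketch, assuming it runs inside main above while parsedData is still cached:

// Sketch (assumption, not part of the original gist): evaluate the model.
// computeCost returns the within-set sum of squared errors (WSSSE);
// lower is better for a fixed number of clusters.
val wssse = clusters.computeCost(parsedData)
println(s"Within Set Sum of Squared Errors = $wssse")

// Inspect the learned cluster centers.
clusters.clusterCenters.foreach(println)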