yaravind · July 3, 2020 23:16
diff --git a/KMeansSparkMLToMLLib.scala b/KMeansSparkMLToMLLib.scala
 import org.apache.spark.mllib.clustering.BisectingKMeans
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.linalg.Vector

 //std_features col is of type vector
 scaledFeatures.select($"std_features").printSchema()

 val tempFeatureRdd = scaledFeatures.select($"std_features").rdd

 import scala.reflect.runtime.universe._
 def getType[T: TypeTag](value: T) = typeOf[T]
 println("-------BEFORE")
 println("Type of RDD: "+getType(tempFeatureRdd))
 println("Type of column: "+getType(tempFeatureRdd.first()))

 /**
 create a new df of type RDD[org.apache.spark.mllib.linalg.Vector] by mapping
 RDD[org.apache.spark.sql.Row] to RDD[org.apache.spark.mllib.linalg.Vector] 
 as BisectingKMeans works only with Vector type
 **/
 val input = scaledFeatures
                .select($"std_features")
                .rdd
                .map(v => Vectors.fromML(v.getAs[org.apache.spark.ml.linalg.Vector](0)))
                .cache() //important for ML algos to run faster
 println("-------AFTER")
 println("Type of RDD: "+getType(input))
 println("Type of column: "+getType(input.first()))

 println("Total rows: "+input.count())

 // Clustering the data into 9 clusters by BisectingKMeans.
 val bkm = new BisectingKMeans().setK(9)
 val model = bkm.run(input)

 println(s"Compute Cost: ${model.computeCost(input)}")
 model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
  println(s"Cluster Center ${idx}: ${center}")
 }
	import org.apache.spark.mllib.clustering.BisectingKMeans
	import org.apache.spark.mllib.linalg.Vectors
	import org.apache.spark.mllib.linalg.Vector

	//std_features col is of type vector
	scaledFeatures.select($"std_features").printSchema()

	val tempFeatureRdd = scaledFeatures.select($"std_features").rdd

	import scala.reflect.runtime.universe._
	def getType[T: TypeTag](value: T) = typeOf[T]
	println("-------BEFORE")
	println("Type of RDD: "+getType(tempFeatureRdd))
	println("Type of column: "+getType(tempFeatureRdd.first()))

	/**
	create a new df of type RDD[org.apache.spark.mllib.linalg.Vector] by mapping
	RDD[org.apache.spark.sql.Row] to RDD[org.apache.spark.mllib.linalg.Vector]
	as BisectingKMeans works only with Vector type
	**/
	val input = scaledFeatures
	.select($"std_features")
	.rdd
	.map(v => Vectors.fromML(v.getAs[org.apache.spark.ml.linalg.Vector](0)))
	.cache() //important for ML algos to run faster
	println("-------AFTER")
	println("Type of RDD: "+getType(input))
	println("Type of column: "+getType(input.first()))

	println("Total rows: "+input.count())

	// Clustering the data into 9 clusters by BisectingKMeans.
	val bkm = new BisectingKMeans().setK(9)
	val model = bkm.run(input)

	println(s"Compute Cost: ${model.computeCost(input)}")
	model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
	println(s"Cluster Center ${idx}: ${center}")
	}
No results found