// One-hot encoder for use with Spark DataFrames.
import scala.collection.JavaConverters._
import org.apache.spark.sql.types.{StructType,StructField,StringType}
import org.apache.spark.sql.Row
// Builds an n-by-n identity matrix encoded as strings: "1" on the main diagonal, "0" elsewhere.
// n is the number of rows and of columns; the result is an array of n rows, each of length n.
def identityMatrix(n: Int): Array[Array[String]] =
  Array.tabulate(n) { rowIdx =>
    // One row: only the cell whose column index equals the row index is "1".
    Array.tabulate(n) { colIdx =>
      if (rowIdx == colIdx) "1" else "0"
    }
  }
// One-hot (dummy) encodes a string column of a Spark DataFrame.
//
// Parameters:
//   table  - the DataFrame to encode.
//   column - name of the target column.
// Intended result (per the original author's note): a new DataFrame in which the target column
// has been replaced with a one-hot/dummy encoding.
//
// NOTE(review): this function's text appears to have lost fragments (see the inline notes on the
// `matrix`, `schema`, `row_rdd` and `joined` lines), and its closing brace and result expression
// are not visible here, so it does not parse as shown. Restore it from the original source
// before relying on it.
// NOTE(review): `sqlContext`, `sc` and `RDD` are neither defined nor imported in the visible
// lines; presumably the surrounding shell/notebook session provides them -- confirm.
def encodeStringOneHot(table:org.apache.spark.sql.DataFrame,column:String) = {
// Distinct values of the target column.
// NOTE(review): the query reads from a table literally named `temp`, not from the `table`
// argument; presumably the caller must have registered `table` under that name beforehand --
// confirm. `column` is interpolated into the SQL text unescaped.
val categories_table = sqlContext.sql(s"SELECT DISTINCT $column FROM temp")
// Number of distinct rows returned by the query above.
val n_categories:Int = categories_table.count().toInt
// The distinct rows, collected to the driver as a list.
val categories = categories_table.collectAsList()
// NOTE(review): the expression that should precede the `case` below is missing and the brace is
// never closed. The surviving `case (c,r) => c+:r` prepends `c` to `r`; presumably each category
// is paired with a row of identityMatrix(n_categories) -- TODO confirm against the original.
val matrix ={
case (c,r) => c+:r
// Distributes `matrix` as an RDD.
val matrix_rdd = sc.makeRDD(matrix)
// Schema whose first field is a nullable string column named after `column`.
// NOTE(review): `+:>` is not valid here and the parentheses are unbalanced; the part that
// produced the remaining StructField(c, ...) entries appears to be missing.
val schema = StructType(StructField(column,StringType,true) +:> StructField(c,StringType,true)))
// NOTE(review): the left-hand side of `=> Row.fromSeq(r)` is missing; presumably a map over
// `matrix_rdd` turning each sequence into a Row -- TODO confirm.
val row_rdd:RDD[Row] = => Row.fromSeq(r))
// Lookup DataFrame built from `row_rdd` with the schema above.
val table_oh = sqlContext.createDataFrame(row_rdd,schema)
// Left-outer joins the input with the lookup table on `column`, then drops `column`.
// NOTE(review): the trailing `{` looks like the remnant of a loop header that was lost
// (presumably iterating over the categories as `c`) -- TODO confirm.
var joined = table.join(table_oh,List(column),"left_outer").drop(column){
// Renames the column named `c` to `<column>_<c>`.
joined = joined.withColumnRenamed(c,column+"_"+c)
