marcovivero · July 7, 2015 23:46
diff --git a/gistfile1.scala b/gistfile1.scala
 /**
  * Builds `numFolds` train/test splits of `data` for k-fold style evaluation.
  *
  * Every row is assigned to a fold from the hash of its value in column `colCol`,
  * so all rows sharing the same `colCol` value land in the same fold. For each
  * fold `k` in `0 until numFolds`, the rows outside fold `k` are wrapped in an
  * [[AssociatedData]] (training side) and the rows inside fold `k` are passed to
  * `prepareTest` (test side).
  *
  * @param sqlContext forwarded to the [[AssociatedData]] constructor
  * @param data       the full data set to split
  * @param rowCol     column name forwarded to [[AssociatedData]] and `prepareTest`
  * @param colCol     column name whose value decides the fold of a row; also
  *                   forwarded to [[AssociatedData]] and `prepareTest`
  * @param tokenizer  forwarded to the [[AssociatedData]] constructor
  * @param idf        forwarded to the [[AssociatedData]] constructor
  * @param numFolds   number of folds; the result has one element per fold and is
  *                   empty when `numFolds` is not positive
  * @return one `(training data, test observations)` pair per fold, ordered by fold index
  */
 def splitDataTest (
    sqlContext : SQLContext,
    data : DataFrame,
    rowCol : String,
    colCol : String,
    tokenizer : String => Array[String],
    idf : Boolean = true,
    numFolds : Int
  ) : Seq[(AssociatedData, RDD[TestObservation])] = {

    // `##` is the null-safe hash code and may be negative, and `%` keeps the sign of
    // its left operand. Shift negative remainders up so that every row maps to
    // exactly one fold in [0, numFolds). Kept as a function value (not a local def)
    // so the UDF closures capture only `numFolds`, never the enclosing instance.
    val foldOf : String => Int = s => {
      val remainder = s.## % numFolds
      if (remainder < 0) remainder + numFolds else remainder
    }

    // For fold k: (_1 keeps the training rows, _2 keeps the test rows).
    val modUDF : Int => (UserDefinedFunction, UserDefinedFunction) = {
      k => (
        functions.udf((s : String) => foldOf(s) != k),
        functions.udf((s : String) => foldOf(s) == k)
      )
    }

    // Train / Test. Fold indices start at 0 to match the range produced by `foldOf`;
    // iterating 1..numFolds would leave the last fold without any test rows and
    // would never test the rows whose remainder is 0.
    (0 until numFolds).map { k =>
      val (isTrain, isTest) = modUDF(k)
      (
        new AssociatedData(sqlContext, data.filter(isTrain(data(colCol))), rowCol, colCol, tokenizer, idf),
        prepareTest(data.filter(isTest(data(colCol))), rowCol, colCol)
      )
    }
  }
	/**
	 * Builds `numFolds` train/test splits of `data` for k-fold style evaluation.
	 *
	 * Every row is assigned to a fold from the hash of its value in column `colCol`,
	 * so all rows sharing the same `colCol` value land in the same fold. For each
	 * fold `k` in `0 until numFolds`, the rows outside fold `k` are wrapped in an
	 * [[AssociatedData]] (training side) and the rows inside fold `k` are passed to
	 * `prepareTest` (test side).
	 *
	 * @param sqlContext forwarded to the [[AssociatedData]] constructor
	 * @param data       the full data set to split
	 * @param rowCol     column name forwarded to [[AssociatedData]] and `prepareTest`
	 * @param colCol     column name whose value decides the fold of a row; also
	 *                   forwarded to [[AssociatedData]] and `prepareTest`
	 * @param tokenizer  forwarded to the [[AssociatedData]] constructor
	 * @param idf        forwarded to the [[AssociatedData]] constructor
	 * @param numFolds   number of folds; the result has one element per fold and is
	 *                   empty when `numFolds` is not positive
	 * @return one `(training data, test observations)` pair per fold, ordered by fold index
	 */
	def splitDataTest (
		sqlContext : SQLContext,
		data : DataFrame,
		rowCol : String,
		colCol : String,
		tokenizer : String => Array[String],
		idf : Boolean = true,
		numFolds : Int
	) : Seq[(AssociatedData, RDD[TestObservation])] = {

		// `##` is the null-safe hash code and may be negative, and `%` keeps the sign of
		// its left operand. Shift negative remainders up so that every row maps to
		// exactly one fold in [0, numFolds). Kept as a function value (not a local def)
		// so the UDF closures capture only `numFolds`, never the enclosing instance.
		val foldOf : String => Int = s => {
			val remainder = s.## % numFolds
			if (remainder < 0) remainder + numFolds else remainder
		}

		// For fold k: (_1 keeps the training rows, _2 keeps the test rows).
		val modUDF : Int => (UserDefinedFunction, UserDefinedFunction) = {
			k => (
				functions.udf((s : String) => foldOf(s) != k),
				functions.udf((s : String) => foldOf(s) == k)
			)
		}

		// Train / Test. Fold indices start at 0 to match the range produced by `foldOf`;
		// iterating 1..numFolds would leave the last fold without any test rows and
		// would never test the rows whose remainder is 0.
		(0 until numFolds).map { k =>
			val (isTrain, isTest) = modUDF(k)
			(
				new AssociatedData(sqlContext, data.filter(isTrain(data(colCol))), rowCol, colCol, tokenizer, idf),
				prepareTest(data.filter(isTest(data(colCol))), rowCol, colCol)
			)
		}
	}