Skip to content

Instantly share code, notes, and snippets.

@realfirst
Last active August 23, 2017 02:27
Show Gist options
  • Save realfirst/1bcb68ea7ee21eea41245318a3a2e8e1 to your computer and use it in GitHub Desktop.
Save realfirst/1bcb68ea7ee21eea41245318a3a2e8e1 to your computer and use it in GitHub Desktop.
// ch5
// Read the raw training file as an RDD of lines (one record per line).
val rawData = sc.textFile("file:///search/dje/spark-ml/train_noheader.tsv")
// Break every line into its tab-separated fields.
val records = rawData.map(_.split("\t"))
// Peek at the first parsed record to sanity-check the split.
records.first()
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
// Turn each raw record into a LabeledPoint:
//   - double quotes are stripped from every field,
//   - the last field is parsed as the integer label,
//   - fields from index 4 up to (not including) the last one become the
//     features, with the placeholder "?" replaced by 0.0.
val data = records.map { fields =>
  val lastIdx = fields.size - 1
  val cleaned = fields.map(_.replaceAll("\"", ""))
  val label = cleaned(lastIdx).toInt
  val features = cleaned.slice(4, lastIdx).map {
    case "?"   => 0.0
    case value => value.toDouble
  }
  LabeledPoint(label, Vectors.dense(features))
}
// Mark the dataset for caching; it is reused by several trainers below.
data.cache()
// Number of parsed records.
val numData = data.count()
// Same parsing as for the general dataset, but every negative feature value
// is replaced by 0.0 (presumably because MLlib's NaiveBayes expects
// non-negative feature values -- confirm against the MLlib docs).
val nbData = records.map { fields =>
  val lastIdx = fields.size - 1
  val cleaned = fields.map(_.replaceAll("\"", ""))
  val label = cleaned(lastIdx).toInt
  val features = cleaned.slice(4, lastIdx).map { raw =>
    // "?" is the placeholder for a missing value and becomes 0.0.
    val parsed = if (raw == "?") 0.0 else raw.toDouble
    if (parsed < 0) 0.0 else parsed
  }
  LabeledPoint(label, Vectors.dense(features))
}
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy
// Hyper-parameters shared by the trainers below.
val numIterations = 10 // iteration count passed to both SGD-based trainers
val maxTreeDepth = 5   // maximum depth passed to the decision tree trainer

// Linear models trained with SGD on the general dataset.
val lrModel = LogisticRegressionWithSGD.train(data, numIterations)
val svmModel = SVMWithSGD.train(data, numIterations)
// Naive Bayes is trained on `nbData`, the variant with negative feature
// values replaced by 0.0. The original referenced the undefined identifier
// `nbdata`, which does not compile. The val name `nbMobel` (sic) is kept
// unchanged so that any existing references to it keep working.
val nbMobel = NaiveBayes.train(nbData)
// Decision tree classifier using entropy as the impurity measure.
val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment