http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#news20.binary
# take the first 1,000 lines and rewrite the labels +1/-1 as 1/0
head -1000 news20.binary | sed 's/+1/1/g' | sed 's/-1/0/g' > news20.binary.1000
# shuffle the whole file, then take 1,000 random lines with the same label rewrite
sort -R news20.binary > news20.random
head -1000 news20.random | sed 's/+1/1/g' | sed 's/-1/0/g' > news20.random.1000
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
//val training = MLUtils.loadLibSVMFile(sc, "hdfs://dm01:8020/dataset/news20-binary/raw/news20.binary.1000", multiclass=false)
val training = MLUtils.loadLibSVMFile(sc, "hdfs://dm01:8020/dataset/news20-binary/raw/news20.random.1000", multiclass=false)
// val training = MLUtils.loadLibSVMFile(sc, "hdfs://dm01:8020/dataset/news20-binary/raw/news20.random.1000", multiclass=false, numFeatures = 1354731 , minPartitions = 32)
val numFeatures = training.take(1)(0).features.size
// numFeatures: Int = 178560  for news20.binary.1000
// numFeatures: Int = 1354731 for news20.random.1000
// (presumably because the feature dimension is inferred from the largest
//  feature index present, and the shuffled sample spans far more of the
//  vocabulary than the head of the original file)
val model = LogisticRegressionWithSGD.train(training, numIterations=1)
The aggregate at GradientDescent.scala:178 never finishes:
https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala#L178
We confirmed that GC is not happening during the aggregate.
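For what it's worth, here is a minimal sketch of the numbers involved, run in the same spark-shell session. It reuses the training RDD and numFeatures from above; the names totalNonZeros/avgNonZeros/denseGradientMB are ours, and it assumes the cumulative gradient in that aggregate is held as a dense vector of length numFeatures (which is what the linked source appears to do).

import org.apache.spark.mllib.linalg.SparseVector

// Size of the accumulator each task would have to build and ship for the
// aggregate, assuming one dense Double per feature.
val denseGradientMB = numFeatures.toLong * 8 / (1024.0 * 1024.0)

// Average number of non-zero features per example in the loaded sample.
val totalNonZeros = training.map { lp =>
  lp.features match {
    case sv: SparseVector => sv.indices.length.toLong
    case v => v.toArray.count(_ != 0.0).toLong
  }
}.reduce(_ + _)
val avgNonZeros = totalNonZeros.toDouble / training.count()

println(s"feature dim           = $numFeatures")
println(s"dense gradient / task = $denseGradientMB MB")
println(s"avg non-zeros/example = $avgNonZeros")
println(s"partitions            = ${training.partitions.length}")

With numFeatures = 1354731 the dense accumulator works out to roughly 10 MB per task, versus about 1.4 MB for the 178560-dimensional news20.binary.1000 sample.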