
@eliasah
Forked from aditya1702/Data used for training SVM model
Last active November 30, 2016 15:50

Hello, I am using a linear SVM (SVMWithSGD) to train a model and learn a separating line for my data. However, the model always predicts 1 for every feature vector. Here is my code:

print data_rdd.take(5)
# [LabeledPoint(1.0, [1.9643,4.5957]), LabeledPoint(1.0, [2.2753,3.8589]), LabeledPoint(1.0, [2.9781,4.5651]), LabeledPoint(1.0, [2.932,3.5519]), LabeledPoint(1.0, [3.5772,2.856])]


from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

data_rdd = x_df.map(lambda x: LabeledPoint(x[1], x[0]))

model = SVMWithSGD.train(data_rdd, iterations=1000, regParam=1.0)

X = x_df.map(lambda x: x[0]).collect()
Y = x_df.map(lambda x: x[1]).collect()


pred = []
for i in X:
    pred.append(model.predict(i))
print pred

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
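A likely cause: SVMWithSGD trains with `intercept=False` by default, so the decision boundary is forced through the origin. Since every feature in this data is positive, any hyperplane through the origin with non-negative weights scores every point on the same side. A minimal plain-Python sketch (the weight vector here is hypothetical, not the actual trained model's):

```python
# Sketch: why a no-intercept linear model can collapse to one class
# when all features are positive. Weights are illustrative only.
points = [(1.9643, 4.5957), (2.2753, 3.8589), (1.5841, 3.3575), (0.962, 2.682)]
w = (0.8, 0.5)  # hypothetical non-negative weight vector

def predict(x, w, intercept=0.0):
    # Linear decision rule: sign of w . x + b
    score = sum(wi * xi for wi, xi in zip(w, x)) + intercept
    return 1 if score > 0 else 0

print([predict(p, w) for p in points])                   # [1, 1, 1, 1]: origin hyperplane fails
print([predict(p, w, intercept=-3.8) for p in points])   # [1, 0, 0, 0]: intercept shifts the boundary
```

This is why adding `intercept=True` (as in the answer below) changes the behavior.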

# code tested with pyspark
# pyspark --packages com.databricks:spark-csv_2.10:1.5.0
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
# read data
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('data.csv')
# prepare data
assembler = VectorAssembler(inputCols=["X", "Y"], outputCol="features")
data = assembler.transform(df)
# create RDD[LabeledPoint]
rdd = data.map(lambda row: LabeledPoint(row.label, row.features))
# Train SVM With SGD model
model = SVMWithSGD.train(rdd, iterations=1000, regParam=1.0, intercept=True, step=0.1)
# Create unlabeled data
unlabeled_data = data.map(lambda x : x.features)
# Make prediction on unlabeled data and collect
model.predict(unlabeled_data).collect()
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
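To confirm the retrained model actually does better than the all-ones model, you can compare the collected predictions against the true labels. A minimal plain-Python helper (the `labels`/`preds` lists below are illustrative, not the actual collected output):

```python
def accuracy(labels, preds):
    """Fraction of positions where label and prediction agree."""
    assert len(labels) == len(preds)
    return sum(l == p for l, p in zip(labels, preds)) / float(len(labels))

# e.g. with lists collected from the RDDs:
labels = [1, 1, 0, 0, 1]
preds = [1, 0, 0, 0, 1]
print(accuracy(labels, preds))  # 0.8
```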
X Y label
1.9643 4.5957 1
2.2753 3.8589 1
2.9781 4.5651 1
2.932 3.5519 1
3.5772 2.856 1
4.015 3.1937 1
3.3814 3.4291 1
3.9113 4.1761 1
2.7822 4.0431 1
2.5518 4.6162 1
3.3698 3.9101 1
3.1048 3.0709 1
1.9182 4.0534 1
2.2638 4.3706 1
2.6555 3.5008 1
3.1855 4.2888 1
3.6579 3.8692 1
3.9113 3.4291 1
3.6002 3.1221 1
3.0357 3.3165 1
1.5841 3.3575 0
2.0103 3.2039 0
1.9527 2.7843 0
2.2753 2.7127 0
2.3099 2.9584 0
2.8283 2.6309 0
3.0473 2.2931 0
2.4827 2.0373 0
2.5057 2.3853 0
1.8721 2.0577 0
2.0103 2.3546 0
1.2269 2.3239 0
1.8951 2.9174 0
1.561 3.0709 0
1.5495 2.6923 0
1.6878 2.4057 0
1.4919 2.0271 0
0.962 2.682 0
1.1693 2.9276 0
0.8122 2.9992 0
0.9735 3.3881 0
1.25 3.1937 0
1.3191 3.5109 0
2.2292 2.201 0
2.4482 2.6411 0
2.7938 1.9656 0
2.091 1.6177 0
2.5403 2.8867 0
0.9044 3.0198 0
0.76615 2.5899 0
0.086405 4.1045 1

eliasah commented Oct 25, 2016

I didn't specify a threshold, but you should be careful when splitting the data: perform stratified sampling so you don't end up with only 1 labels in one split and only 0s in the other.
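The stratified split can be sketched in plain Python (in Spark itself, `RDD.sampleByKey` or `DataFrame.sampleBy` serve the same purpose); the data and fractions below are illustrative:

```python
import random

def stratified_split(points, test_frac=0.3, seed=42):
    """Split (label, features) pairs so each label contributes roughly
    the same proportion to the train and test splits."""
    rnd = random.Random(seed)
    by_label = {}
    for p in points:
        by_label.setdefault(p[0], []).append(p)
    train, test = [], []
    for group in by_label.values():
        rnd.shuffle(group)
        cut = int(len(group) * test_frac)
        test.extend(group[:cut])
        train.extend(group[cut:])
    return train, test

data = [(1, (1.9, 4.5)), (1, (2.2, 3.8)), (1, (2.9, 4.5)), (1, (2.9, 3.5)),
        (0, (1.5, 3.3)), (0, (2.0, 3.2)), (0, (1.9, 2.7)), (0, (2.2, 2.7))]
train, test = stratified_split(data, test_frac=0.25)
# each label contributes ~25% of its points to the test split
```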
