@dvgodoy
Last active March 9, 2019 12:30
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.pipeline import Pipeline
# Let's generate a transformer to make both imputations we specified earlier
imputer = hdf_fenced.transformers.imputer()
# And a transformer to fence outliers as well
fencer = hdf_fenced.transformers.fencer()
# We pick only 4 numeric features (so we don't need to encode categorical features)
assem = VectorAssembler(inputCols=['Fare', 'Age', 'SibSp', 'Parch'], outputCol='features')
# Then we build a simple RF classifier with 20 trees
rf = RandomForestClassifier(featuresCol='features', labelCol='Survived', numTrees=20)
# And put all four stages into a nice pipeline
pipeline = Pipeline(stages=[imputer, fencer, assem, rf])
# Now we fit the model and use transform to get the predictions
# Thanks to handyspark, now stratified imputations and fencing outliers are
# also part of the pipeline! :-)
model = pipeline.fit(sdf)
predictions = model.transform(sdf)