Skip to content

Instantly share code, notes, and snippets.

These code snippets are from the GitHub gists of Daniel Voigt Godoy (dvgodoy).
# Compare samples drawn from three weight-initialization schemes for a
# square 100x100 layer: plain normal, truncated normal, and Keras'
# VarianceScaling (all using the same fan-in-based standard deviation).
from deepreplay.datasets.ball import load_data
# X, y are loaded for the surrounding tutorial; they are not used in this excerpt.
X, y = load_data(n_dims=10)
import numpy as np
from keras import backend as K
from keras.initializers import VarianceScaling
fan_in = fan_out = 100
# LeCun-style std dev: sqrt(1 / fan_in)
stddev = np.sqrt(1. / fan_in)
# K.eval materializes the symbolic tensors; .ravel() flattens to 1-D so the
# three sample sets can be compared/plotted as plain distributions.
normal_values = K.eval(K.random_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
truncated_values = K.eval(K.truncated_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
# VarianceScaling(mode='fan_in') with its defaults (scale=1.0, truncated
# normal) should match the hand-computed stddev above — that is the point
# of the comparison. NOTE(review): confirm defaults against the installed
# Keras version.
var_scaling_values = K.eval(VarianceScaling(mode='fan_in')(shape=(fan_in, fan_out))).ravel()
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Tiny 1-D toy problem: ten scalar inputs with binary labels.
x = np.array([-2.2, -1.4, -.8, .2, .4, .8, 1.2, 2.2, 2.9, 4.6])
y = np.array([0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Fit a plain logistic regression. scikit-learn expects a 2-D feature
# matrix, so the 1-D vector is promoted to a single-feature column.
logr = LogisticRegression(solver='lbfgs')
logr.fit(x[:, np.newaxis], y)
### CLEANING UP DATA
### IMPUTING MISSING VALUES
# Fills missing values for Age, stratifying it by Pclass and Sex
# NOTE(review): 'hdf' is a handyspark HandyFrame created in an earlier
# snippet (not shown here). stratify() groups by the given columns so the
# mean used for imputation is computed per (Pclass, Sex) group rather
# than globally.
hdf_filled = hdf.stratify(['Pclass', 'Sex']).fill(continuous=['Age'], strategy=['mean'])
### HANDLING OUTLIERS
# Fences outlier values for Fare
# NOTE(review): k=3 is the fence multiplier — presumably Tukey-style
# clipping to [Q1 - k*IQR, Q3 + k*IQR]; confirm against handyspark docs.
hdf_fenced = hdf_filled.fence(['Fare'], k=3)
# Build the reusable pieces of an ML pipeline from the cleaning steps
# recorded on the HandyFrame above plus a standard Spark feature assembler.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.pipeline import Pipeline
# Let's generate a transformer to make both imputations we specified earlier
# (handyspark replays the stratified fill as a pipeline-compatible transformer)
imputer = hdf_fenced.transformers.imputer()
# And a transformer to fence outliers as well
fencer = hdf_fenced.transformers.fencer()
# We choose only 3 numeric features (so we don't need to encode categorical features)
# NOTE(review): the list actually names four columns (Fare, Age, SibSp,
# Parch) — the "3" in the comment above is from the original gist.
assem = VectorAssembler(inputCols=['Fare', 'Age', 'SibSp', 'Parch'], outputCol='features')
# Creates instance of extended version of BinaryClassificationMetrics
# using a DataFrame and its probability and label columns, as the output
# from the classifier
# NOTE(review): BinaryClassificationMetrics here is handyspark's extension
# of the pyspark.mllib class; 'predictions' is the DataFrame produced by a
# fitted model's transform() in an earlier snippet (not shown here).
bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='Survived')
# We still can get the same metrics as the evaluator...
print("Area under ROC Curve: {:.4f}".format(bcm.areaUnderROC))
print("Area under PR Curve: {:.4f}".format(bcm.areaUnderPR))
# But now we can PLOT both ROC and PR curves!
# Notebook bootstrap: locate the Spark installation and start (or reuse)
# a SparkSession.
import findspark
from pyspark.sql import SparkSession
from handyspark import *
from matplotlib import pyplot as plt
# NOTE(review): '%matplotlib inline' is an IPython/Jupyter magic, not valid
# plain-Python syntax — this snippet only runs inside a notebook cell.
%matplotlib inline
# findspark.init() must run before creating the session so pyspark is importable.
findspark.init()
spark = SparkSession.builder.getOrCreate()
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The stock Spark evaluator for binary classifiers — it can only report
# scalar areas under the ROC and PR curves (no plotting).
evaluator = BinaryClassificationEvaluator(labelCol='Survived')

# Evaluate both supported metrics on the predictions DataFrame, selecting
# each one via the evaluator's metricName param.
scores = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("areaUnderROC", "areaUnderPR")
}
auroc = scores["areaUnderROC"]
auprc = scores["areaUnderPR"]

print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))