Skip to content

Instantly share code, notes, and snippets.

These code snippets are from the GitHub gists of Daniel Voigt Godoy (dvgodoy).
# Compare samples drawn from three weight-initialization schemes for a
# square 100x100 layer: plain normal, truncated normal, and Keras'
# VarianceScaling (all using the same fan-in-based standard deviation).
from deepreplay.datasets.ball import load_data
# X, y are loaded for the surrounding tutorial; they are not used in this excerpt.
X, y = load_data(n_dims=10)
import numpy as np
from keras import backend as K
from keras.initializers import VarianceScaling
fan_in = fan_out = 100
# LeCun-style std dev: sqrt(1 / fan_in)
stddev = np.sqrt(1. / fan_in)
# K.eval materializes the symbolic tensors; .ravel() flattens to 1-D so the
# three sample sets can be compared/plotted as plain distributions.
normal_values = K.eval(K.random_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
truncated_values = K.eval(K.truncated_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
# VarianceScaling(mode='fan_in') with its defaults (scale=1.0, truncated
# normal) should match the hand-computed stddev above — that is the point
# of the comparison. NOTE(review): confirm defaults against the installed
# Keras version.
var_scaling_values = K.eval(VarianceScaling(mode='fan_in')(shape=(fan_in, fan_out))).ravel()
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Tiny 1-D toy problem: ten scalar inputs with binary labels.
x = np.array([-2.2, -1.4, -.8, .2, .4, .8, 1.2, 2.2, 2.9, 4.6])
y = np.array([0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Fit a plain logistic regression. scikit-learn expects a 2-D feature
# matrix, so the 1-D vector is promoted to a single-feature column.
logr = LogisticRegression(solver='lbfgs')
logr.fit(x[:, np.newaxis], y)
### CLEANING UP DATA
### IMPUTING MISSING VALUES
# Fills missing values for Age, stratifying it by Pclass and Sex
# NOTE(review): 'hdf' is a handyspark HandyFrame created in an earlier
# snippet (not shown here). stratify() groups by the given columns so the
# mean used for imputation is computed per (Pclass, Sex) group rather
# than globally.
hdf_filled = hdf.stratify(['Pclass', 'Sex']).fill(continuous=['Age'], strategy=['mean'])
### HANDLING OUTLIERS
# Fences outlier values for Fare
# NOTE(review): k=3 is the fence multiplier — presumably Tukey-style
# clipping to [Q1 - k*IQR, Q3 + k*IQR]; confirm against handyspark docs.
hdf_fenced = hdf_filled.fence(['Fare'], k=3)
# Build the reusable pieces of an ML pipeline from the cleaning steps
# recorded on the HandyFrame above plus a standard Spark feature assembler.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.pipeline import Pipeline
# Let's generate a transformer to make both imputations we specified earlier
# (handyspark replays the stratified fill as a pipeline-compatible transformer)
imputer = hdf_fenced.transformers.imputer()
# And a transformer to fence outliers as well
fencer = hdf_fenced.transformers.fencer()
# We choose only 3 numeric features (so we don't need to encode categorical features)
# NOTE(review): the list actually names four columns (Fare, Age, SibSp,
# Parch) — the "3" in the comment above is from the original gist.
assem = VectorAssembler(inputCols=['Fare', 'Age', 'SibSp', 'Parch'], outputCol='features')
# Creates instance of extended version of BinaryClassificationMetrics
# using a DataFrame and its probability and label columns, as the output
# from the classifier
# NOTE(review): BinaryClassificationMetrics here is handyspark's extension
# of the pyspark.mllib class; 'predictions' is the DataFrame produced by a
# fitted model's transform() in an earlier snippet (not shown here).
bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='Survived')
# We still can get the same metrics as the evaluator...
print("Area under ROC Curve: {:.4f}".format(bcm.areaUnderROC))
print("Area under PR Curve: {:.4f}".format(bcm.areaUnderPR))
# But now we can PLOT both ROC and PR curves!
# Notebook bootstrap: locate the Spark installation and start (or reuse)
# a SparkSession.
import findspark
from pyspark.sql import SparkSession
from handyspark import *
from matplotlib import pyplot as plt
# NOTE(review): '%matplotlib inline' is an IPython/Jupyter magic, not valid
# plain-Python syntax — this snippet only runs inside a notebook cell.
%matplotlib inline
# findspark.init() must run before creating the session so pyspark is importable.
findspark.init()
spark = SparkSession.builder.getOrCreate()
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The stock Spark evaluator for binary classifiers — it can only report
# scalar areas under the ROC and PR curves (no plotting).
evaluator = BinaryClassificationEvaluator(labelCol='Survived')

# Evaluate both supported metrics on the predictions DataFrame, selecting
# each one via the evaluator's metricName param.
scores = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("areaUnderROC", "areaUnderPR")
}
auroc = scores["areaUnderROC"]
auprc = scores["areaUnderPR"]

print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))