# Loads deepreplay's toy dataset: points drawn from a 10-dimensional ball
from deepreplay.datasets.ball import load_data
X, y = load_data(n_dims=10)
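In deepreplay, the loaded data is typically handed to a ReplayData callback during training, so the whole process can be replayed afterwards. A minimal sketch of that next step (the filename and group name are placeholders):

from deepreplay.callbacks import ReplayData

# Records inputs, targets and model states at every epoch for later replay
replay_data = ReplayData(X, y, filename='training_data.h5', group_name='part1')
# ... then pass replay_data in the callbacks list of model.fit()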
import numpy as np
from keras import backend as K
from keras.initializers import VarianceScaling

# A hidden layer with 100 inputs and 100 outputs
fan_in = fan_out = 100
# Standard deviation for the "1 / fan_in" initialization scheme
stddev = np.sqrt(1. / fan_in)
# Draws weights from a plain normal, a truncated normal, and Keras' VarianceScaling
normal_values = K.eval(K.random_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
truncated_values = K.eval(K.truncated_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
var_scaling_values = K.eval(VarianceScaling(mode='fan_in')(shape=(fan_in, fan_out))).ravel()
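Keras' truncated normal resamples any draw falling more than two standard deviations from the mean, so its values end up with a visibly smaller spread (a factor of roughly 0.88). A quick check of the three sets of draws:

# Compare the empirical statistics of the three initialization schemes
for name, values in [('normal', normal_values),
                     ('truncated normal', truncated_values),
                     ('variance scaling', var_scaling_values)]:
    print('{:>16}: mean = {: .4f}, std = {:.4f}'.format(name, values.mean(), values.std()))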
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import numpy as np

x = np.array([-2.2, -1.4, -.8, .2, .4, .8, 1.2, 2.2, 2.9, 4.6])
y = np.array([0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Fits a logistic regression to the ten points (reshape turns x into a column vector)
logr = LogisticRegression(solver='lbfgs')
logr.fit(x.reshape(-1, 1), y)
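Since log_loss is imported but never used above, a short follow-up can score the fitted model with the very loss it minimizes:

# Predicted probabilities for both classes, then the average binary cross-entropy
probs = logr.predict_proba(x.reshape(-1, 1))
print("Log Loss: {:.4f}".format(log_loss(y, probs)))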
### CLEANING UP DATA
### IMPUTING MISSING VALUES
# Fills missing Age values with per-group means, stratified by Pclass and Sex
hdf_filled = hdf.stratify(['Pclass', 'Sex']).fill(continuous=['Age'], strategy=['mean'])

### HANDLING OUTLIERS
# Fences outlier values for Fare, capping them at k = 3 IQRs beyond the quartiles
hdf_fenced = hdf_filled.fence(['Fare'], k=3)
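Both operations keep track of the values they computed, so the exact same cleaning can be reproduced later. A quick check, assuming HandySpark exposes them through the statistics_ and fences_ attributes (verify against your version):

# Per-stratum means used to impute Age, and the fence thresholds computed for Fare
print(hdf_filled.statistics_)
print(hdf_fenced.fences_)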
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.pipeline import Pipeline

# Let's generate a transformer to make both imputations we specified earlier
imputer = hdf_fenced.transformers.imputer()
# And a transformer to fence outliers as well
fencer = hdf_fenced.transformers.fencer()
# We choose only four numeric features (so we don't need to encode categorical features)
assem = VectorAssembler(inputCols=['Fare', 'Age', 'SibSp', 'Parch'], outputCol='features')
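To produce the predictions DataFrame used in the metrics snippet below, the two transformers can be chained with the assembler and classifier in a regular Spark pipeline. A sketch, assuming hypothetical sdf_train / sdf_test splits of the original DataFrame:

rf = RandomForestClassifier(labelCol='Survived', featuresCol='features')
pipeline = Pipeline(stages=[imputer, fencer, assem, rf])
model = pipeline.fit(sdf_train)          # sdf_train / sdf_test are assumed splits
predictions = model.transform(sdf_test)  # yields 'probability' and 'Survived' columns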
# Creates an instance of HandySpark's extended BinaryClassificationMetrics,
# using the predictions DataFrame and its probability and label columns,
# as output by the classifier
bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='Survived')

# We can still get the same metrics as the evaluator...
print("Area under ROC Curve: {:.4f}".format(bcm.areaUnderROC))
print("Area under PR Curve: {:.4f}".format(bcm.areaUnderPR))
# But now we can PLOT both ROC and PR curves!
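A minimal sketch of that plotting step, assuming HandySpark's plot_roc_curve / plot_pr_curve methods on the extended metrics object (check the API of your version):

fig, axs = plt.subplots(1, 2, figsize=(12, 4))
bcm.plot_roc_curve(ax=axs[0])
bcm.plot_pr_curve(ax=axs[1])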
import findspark
from pyspark.sql import SparkSession
from handyspark import *
from matplotlib import pyplot as plt
%matplotlib inline

# Locates the local Spark installation, then starts (or reuses) a session
findspark.init()
spark = SparkSession.builder.getOrCreate()
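The HandyFrame (hdf) used in the cleaning steps would then come from wrapping a regular Spark DataFrame; a sketch, assuming the Titanic training set sits in a local train.csv:

# Reads the Titanic data and wraps it in a HandyFrame via HandySpark's toHandy()
sdf = spark.read.csv('train.csv', header=True, inferSchema=True)
hdf = sdf.toHandy()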
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Let's use the run-of-the-mill evaluator
evaluator = BinaryClassificationEvaluator(labelCol='Survived')
# We have only two choices: area under ROC and PR curves :-(
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))