# Loads a 10-dimensional "ball" dataset from DeepReplay
from deepreplay.datasets.ball import load_data

X, y = load_data(n_dims=10)
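For context, here is an illustrative sketch (not taken from the original code) of how such a dataset could be fed to a small Keras network using the VarianceScaling initializer discussed next; the layer sizes, activation and training settings are assumptions.

from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import VarianceScaling

# Illustrative architecture: 10 inputs -> one hidden layer -> binary output
init = VarianceScaling(scale=1., mode='fan_in', distribution='normal')
model = Sequential([
    Dense(100, input_dim=10, activation='tanh', kernel_initializer=init),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])
model.fit(X, y, epochs=5, batch_size=16)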
import numpy as np
from keras import backend as K
from keras.initializers import VarianceScaling

fan_in = fan_out = 100
stddev = np.sqrt(1. / fan_in)

# Draws values from a plain normal, a truncated normal and the
# VarianceScaling initializer, all targeting the same standard deviation
normal_values = K.eval(K.random_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
truncated_values = K.eval(K.truncated_normal(shape=(fan_in, fan_out), stddev=stddev)).ravel()
var_scaling_values = K.eval(VarianceScaling(mode='fan_in')(shape=(fan_in, fan_out))).ravel()
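As a quick illustrative check (not part of the original snippet), comparing the empirical standard deviations of the three samples makes the effect of truncation visible:

# Truncated sampling discards draws beyond two standard deviations,
# so those samples end up with a smaller empirical std than the plain normal
for name, values in [('normal', normal_values),
                     ('truncated', truncated_values),
                     ('variance scaling', var_scaling_values)]:
    print("{:>16s}: std = {:.4f}".format(name, values.std()))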
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

x = np.array([-2.2, -1.4, -.8, .2, .4, .8, 1.2, 2.2, 2.9, 4.6])
y = np.array([0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Fits a plain logistic regression to the 1-D toy dataset
logr = LogisticRegression(solver='lbfgs')
logr.fit(x.reshape(-1, 1), y)
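The snippet imports log_loss but stops before using it; a minimal follow-up, assuming the goal is the binary cross-entropy of the fitted model's predicted probabilities, could be:

# Predicted probability of the positive class for each sample
y_prob = logr.predict_proba(x.reshape(-1, 1))[:, 1]
# Binary cross-entropy (log loss) of those predictions
print("Log loss: {:.4f}".format(log_loss(y, y_prob)))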
### CLEANING UP DATA

### IMPUTING MISSING VALUES
# Fills missing values for Age, stratifying it by Pclass and Sex
hdf_filled = hdf.stratify(['Pclass', 'Sex']).fill(continuous=['Age'], strategy=['mean'])

### HANDLING OUTLIERS
# Fences (caps) outlier values for Fare
hdf_fenced = hdf_filled.fence(['Fare'], k=3)
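These cleaning steps operate on a HandyFrame named hdf that is assumed to have been created beforehand; a minimal setup sketch (the CSV path is a placeholder for the Titanic training set) could look like this:

# Reads the Titanic training data and wraps the Spark DataFrame into a HandyFrame
sdf = spark.read.csv('train.csv', header=True, inferSchema=True)
hdf = sdf.toHandy()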
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

# A transformer that applies the imputations we specified earlier
imputer = hdf_fenced.transformers.imputer()
# And a transformer to fence outliers as well
fencer = hdf_fenced.transformers.fencer()
# We use only four numeric features, so we don't need to encode categorical features
assem = VectorAssembler(inputCols=['Fare', 'Age', 'SibSp', 'Parch'], outputCol='features')
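RandomForestClassifier and Pipeline are imported above but not used in the excerpt; since the next snippets rely on a predictions DataFrame, here is a hedged sketch of how the stages could be wired together (the classifier settings, the split ratio and the reuse of the sdf DataFrame from the setup sketch above are assumptions):

# Assembles the full pipeline: imputation, fencing, feature assembly, classification
rf = RandomForestClassifier(labelCol='Survived', featuresCol='features')
pipeline = Pipeline(stages=[imputer, fencer, assem, rf])

# Fits on a training split and scores a held-out split
train, test = sdf.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)
predictions = model.transform(test)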
# Creates an instance of HandySpark's extended version of BinaryClassificationMetrics,
# built directly from the predictions DataFrame and its probability and label columns
bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='Survived')

# We can still get the same metrics as the evaluator...
print("Area under ROC Curve: {:.4f}".format(bcm.areaUnderROC))
print("Area under PR Curve: {:.4f}".format(bcm.areaUnderPR))

# But now we can also PLOT both ROC and PR curves!
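The plotting calls themselves are not shown in the excerpt; assuming the plot_roc_curve and plot_pr_curve methods exposed by HandySpark's extended metrics object, they could look like this:

# Draws ROC and PR curves side by side on regular matplotlib axes
from matplotlib import pyplot as plt
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
bcm.plot_roc_curve(ax=axs[0])
bcm.plot_pr_curve(ax=axs[1])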
import findspark
# findspark must be initialized before pyspark is imported, so pyspark can be located
findspark.init()

from pyspark.sql import SparkSession
from handyspark import *
from matplotlib import pyplot as plt
%matplotlib inline

spark = SparkSession.builder.getOrCreate()
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Let's use the run-of-the-mill evaluator
evaluator = BinaryClassificationEvaluator(labelCol='Survived')

# We have only two choices: area under the ROC and PR curves :-(
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))