Last active
July 22, 2018 02:57
-
-
Save accessnash/c329252828d9cc7eb5bffdb5fc2ce28d to your computer and use it in GitHub Desktop.
PIMA Indians dataset obtained from the UCI Machine Learning Repository - to predict whether or not a given female patient will contract diabetes based on features such as BMI, age, and number of pregnancies (Supervised Learning with scikit learn at Datacamp)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Import necessary modules | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import classification_report | |
| from sklearn.metrics import confusion_matrix | |
| from sklearn.linear_model import LogisticRegression | |
| # Create training and test set | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42) | |
| # Instantiate a k-NN classifier: knn | |
| knn = KNeighborsClassifier(n_neighbors=6) | |
| # Fit the classifier to the training data | |
| knn.fit(X_train, y_train) | |
| # Predict the labels of the test data: y_pred | |
| y_pred = knn.predict(X_test) | |
| # Generate the confusion matrix and classification report | |
| print(confusion_matrix(y_test, y_pred)) | |
| print(classification_report(y_test, y_pred)) | |
| # Logistic regression for binary classification | |
| # Create training and test sets | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42) | |
| # Create the classifier: logreg | |
| logreg = LogisticRegression() | |
| # Fit the classifier to the training data | |
| logreg.fit(X_train, y_train) | |
| # Predict the labels of the test set: y_pred | |
| y_pred = logreg.predict(X_test) | |
| # Compute and print the confusion matrix and classification report | |
| print(confusion_matrix(y_test, y_pred)) | |
| print(classification_report(y_test, y_pred)) | |
| #Plotting an ROC curve | |
| from sklearn.metrics import roc_curve | |
| # Compute predicted probabilities: y_pred_prob | |
| y_pred_prob = logreg.predict_proba(X_test)[:,1] | |
| # Generate ROC curve values: fpr, tpr, thresholds | |
| fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob) | |
| # Plot ROC curve | |
| plt.plot([0, 1], [0, 1], 'k--') | |
| plt.plot(fpr, tpr) | |
| plt.xlabel('False Positive Rate') | |
| plt.ylabel('True Positive Rate') | |
| plt.title('ROC Curve') | |
| plt.show() | |
| # Area Under Curve computation | |
| # Import necessary modules | |
| from sklearn.metrics import roc_auc_score | |
| from sklearn.model_selection import cross_val_score | |
| # Compute predicted probabilities: y_pred_prob | |
| y_pred_prob = logreg.predict_proba(X_test)[:,1] | |
| # Compute and print AUC score | |
| print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob))) | |
| # Compute cross-validated AUC scores: cv_auc | |
| cv_auc = cross_val_score(logreg, X, y, cv=5,scoring='roc_auc') | |
| # Print list of AUC scores | |
| print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc)) | |
| #use GridSearchCV and logistic regression to find the optimal C in the hyperparameter space | |
| from sklearn.model_selection import GridSearchCV | |
| # Setup the hyperparameter grid | |
| c_space = np.logspace(-5, 8, 15) | |
| param_grid = {'C': c_space} | |
| # Instantiate a logistic regression classifier: logreg | |
| logreg = LogisticRegression() | |
| # Instantiate the GridSearchCV object: logreg_cv | |
| logreg_cv = GridSearchCV(logreg, param_grid, cv=5) | |
| # Fit it to the data | |
| logreg_cv.fit(X,y) | |
| # Print the tuned parameters and score | |
| print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) | |
| print("Best score is {}".format(logreg_cv.best_score_)) | |
| # Hyperparameter tuning with RandomizedSearchCV | |
| # Import necessary modules | |
| from scipy.stats import randint | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.model_selection import RandomizedSearchCV | |
| # Setup the parameters and distributions to sample from: param_dist | |
| param_dist = {"max_depth": [3, None], | |
| "max_features": randint(1, 9), | |
| "min_samples_leaf": randint(1, 9), | |
| "criterion": ["gini", "entropy"]} | |
| # Instantiate a Decision Tree classifier: tree | |
| tree = DecisionTreeClassifier() | |
| # Instantiate the RandomizedSearchCV object: tree_cv | |
| tree_cv = RandomizedSearchCV(tree, param_dist, cv=5) | |
| # Fit it to the data | |
| tree_cv.fit(X, y) | |
| # Print the tuned parameters and score | |
| print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_)) | |
| print("Best score is {}".format(tree_cv.best_score_)) | |
| # Hold-out set in Classification | |
| # Import necessary modules | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import GridSearchCV | |
| # Create the hyperparameter grid | |
| c_space = np.logspace(-5, 8, 15) | |
| param_grid = {'C':c_space, 'penalty': ['l1', 'l2']} | |
| # Instantiate the logistic regression classifier: logreg | |
| logreg = LogisticRegression() | |
| # Create train and test sets | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42) | |
| # Instantiate the GridSearchCV object: logreg_cv | |
| logreg_cv = GridSearchCV(logreg, param_grid, cv=5) | |
| # Fit it to the training data | |
| logreg_cv.fit(X_train, y_train) | |
| # Print the optimal parameters and best score | |
| print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_)) | |
| print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_)) | |
| # Hold-out set in Regression | |
| from sklearn.linear_model import ElasticNet | |
| from sklearn.metrics import mean_squared_error | |
| # Create train and test sets | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42) | |
| # Create the hyperparameter grid | |
| l1_space = np.linspace(0, 1, 30) | |
| param_grid = {'l1_ratio': l1_space} | |
| # Instantiate the ElasticNet regressor: elastic_net | |
| elastic_net = ElasticNet() | |
| # Setup the GridSearchCV object: gm_cv | |
| gm_cv = GridSearchCV(elastic_net, param_grid, cv=5) | |
| # Fit it to the training data | |
| gm_cv.fit(X_train, y_train) | |
| # Predict on the test set and compute metrics | |
| y_pred = gm_cv.predict(X_test) | |
| r2 = gm_cv.score(X_test, y_test) | |
| mse = mean_squared_error(y_test, y_pred) | |
| print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_)) | |
| print("Tuned ElasticNet R squared: {}".format(r2)) | |
| print("Tuned ElasticNet MSE: {}".format(mse)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment