Last active
July 24, 2018 13:21
-
-
Save accessnash/9bfdffe64c98697b3393de8aae594515 to your computer and use it in GitHub Desktop.
Preprocessing data using Gapminder data - Supervised Learning with scikit learn at Datacamp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Import pandas | |
| import pandas as pd | |
| # Read 'gapminder.csv' into a DataFrame: df | |
| df = pd.read_csv('gapminder.csv') | |
| # Create a boxplot of life expectancy per region | |
| df.boxplot('life', 'Region', rot=60) | |
| # Show the plot | |
| plt.show() | |
| # Create dummy variables: df_region | |
| df_region = pd.get_dummies(df) | |
| # Print the columns of df_region | |
| print(df_region.columns) | |
| # Create dummy variables with drop_first=True: df_region | |
| df_region = pd.get_dummies(df, drop_first = True) | |
| # Print the new columns of df_region | |
| print(df_region.columns) | |
| # Import necessary modules | |
| from sklearn.linear_model import Ridge | |
| from sklearn.model_selection import cross_val_score | |
| # Instantiate a ridge regressor: ridge | |
| ridge = Ridge(alpha = 0.5, normalize = True) | |
| # Perform 5-fold cross-validation: ridge_cv | |
| ridge_cv = cross_val_score(ridge, X, y, cv =5) | |
| # Print the cross-validated scores | |
| print(ridge_cv) | |
| # Convert '?' to NaN | |
| df[df == '?'] = np.nan | |
| # Print the number of NaNs | |
| print(df.isnull().sum()) | |
| # Print shape of original DataFrame | |
| print("Shape of Original DataFrame: {}".format(df.shape)) | |
| # Drop missing values and print shape of new DataFrame | |
| df = df.dropna() | |
| # Print shape of new DataFrame | |
| print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(df.shape)) | |
| # Imputing missing data in a ML Pipeline | |
| # Import the Imputer module | |
| from sklearn.preprocessing import Imputer | |
| from sklearn.svm import SVC | |
| # Setup the Imputation transformer: imp | |
| imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0) | |
| # Instantiate the SVC classifier: clf | |
| clf = SVC() | |
| # Setup the pipeline with the required steps: steps | |
| steps = [('imputation', imp), | |
| ('SVM', clf)] | |
| # Import necessary modules | |
| from sklearn.preprocessing import Imputer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.svm import SVC | |
| # Setup the pipeline steps: steps | |
| steps = [('imputation', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)), | |
| ('SVM', SVC())] | |
| # Create the pipeline: pipeline | |
| pipeline = Pipeline(steps) | |
| # Create training and test sets | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42) | |
| # Fit the pipeline to the train set | |
| pipeline.fit(X_train, y_train) | |
| # Predict the labels of the test set | |
| y_pred = pipeline.predict(X_test) | |
| # Compute metrics | |
| print(classification_report(y_test, y_pred)) | |
| # Centering and scaling the data | |
| # Import scale | |
| from sklearn.preprocessing import scale | |
| # Scale the features: X_scaled | |
| X_scaled = scale(X) | |
| # Print the mean and standard deviation of the unscaled features | |
| print("Mean of Unscaled Features: {}".format(np.mean(X))) | |
| print("Standard Deviation of Unscaled Features: {}".format(np.std(X))) | |
| # Print the mean and standard deviation of the scaled features | |
| print("Mean of Scaled Features: {}".format(np.mean(X_scaled))) | |
| print("Standard Deviation of Scaled Features: {}".format(np.std(X_scaled))) | |
| # Import the necessary modules | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.pipeline import Pipeline | |
| # Setup the pipeline steps: steps | |
| steps = [('scaler', StandardScaler()), | |
| ('knn', KNeighborsClassifier())] | |
| # Create the pipeline: pipeline | |
| pipeline = Pipeline(steps) | |
| # Create train and test sets | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42) | |
| # Fit the pipeline to the training set: knn_scaled | |
| knn_scaled = pipeline.fit(X_train, y_train) | |
| # Instantiate and fit a k-NN classifier to the unscaled data | |
| knn_unscaled = KNeighborsClassifier().fit(X_train, y_train) | |
| # Compute and print metrics | |
| print('Accuracy with Scaling: {}'.format(knn_scaled.score(X_test, y_test))) | |
| print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test))) | |
| # Pipeline for classification | |
| # Setup the pipeline | |
| steps = [('scaler', StandardScaler()), | |
| ('SVM', SVC())] | |
| pipeline = Pipeline(steps) | |
| # Specify the hyperparameter space | |
| parameters = {'SVM__C':[1, 10, 100], | |
| 'SVM__gamma':[0.1, 0.01]} | |
| # Create train and test sets | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=21) | |
| # Instantiate the GridSearchCV object: cv | |
| cv = GridSearchCV(pipeline, parameters, cv=3) | |
| # Fit to the training set | |
| cv.fit(X_train, y_train) | |
| # Predict the labels of the test set: y_pred | |
| y_pred = cv.predict(X_test) | |
| # Compute and print metrics | |
| print("Accuracy: {}".format(cv.score(X_test, y_test))) | |
| print(classification_report(y_test, y_pred)) | |
| print("Tuned Model Parameters: {}".format(cv.best_params_)) | |
| # Pipeline for Regression | |
| # Setup the pipeline steps: steps | |
| steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)), | |
| ('scaler', StandardScaler()), | |
| ('elasticnet', ElasticNet())] | |
| # Create the pipeline: pipeline | |
| pipeline = Pipeline(steps) | |
| # Specify the hyperparameter space | |
| parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)} | |
| # Create train and test sets | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42) | |
| # Create the GridSearchCV object: gm_cv | |
| gm_cv = GridSearchCV(pipeline, param_grid=parameters, cv=3) | |
| # Fit to the training set | |
| gm_cv.fit(X_train, y_train) | |
| from sklearn import metrics | |
| # Compute and print the metrics | |
| r2 = gm_cv.score(X_test,y_test) | |
| print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_)) | |
| print("Tuned ElasticNet R squared: {}".format(r2)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment