Created
December 5, 2017 13:46
-
-
Save pierdom/eeddea2977f0146a12e32f67bb764e33 to your computer and use it in GitHub Desktop.
[Common data-preparation techniques in Scikit-learn] missing values and categorical attributes. From "Hands-on Machine Learning with Scikit-Learn and TensorFlow" #python #datascience #scikit #machinelearning
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # imput dataset | |
| housing_num = pd.DataFrame(...) | |
| # Dealing with missing values (replace with median for each attribute) | |
| from sklearn.preprocessing import Imputer # use the imputer estimator | |
| imputer = Imputer(strategy="median") # tell it which strategy to use | |
| housing_num = housing.drop("ocean_proximity", axis=1) # remove categorical attributes | |
| imputer.fit(housing_num) # train the estimator with data | |
| imputer.statistics_ # show statistics (check if ok) | |
| X = imputer.transform(housing_num) # apply transf. (get numpy arr) | |
| housing_tr = pd.DataFrame(X, columns=housing_num.columns) # put it back to new dataframe | |
| # Handling text and categorical attributes (strategy 1: map each category to a numerical value) | |
| from sklearn.preprocessing import LabelEncoder | |
| encoder = LabelEncoder() # create encoder object | |
| housing_cat = housing["ocean_proximity"] # it's the text attribute | |
| housing_cat_encoded = encoder.fit_transform(housing_cat) # fit and apply transf. in one call (get numpy arr) | |
| encoder.classes_ # show mappings (the categories the encoder learned) | |
| # Handling text and categorical attributes (strategy 2: create new attribute per category) | |
| from sklearn.preprocessing import OneHotEncoder | |
| encoder = OneHotEncoder() # note: rebinds `encoder`, replacing the LabelEncoder from strategy 1 | |
| housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1)) # reshape the integer codes into a single-column 2-D array first | |
| housing_cat_1hot # show new attributes (sparse matrix) | |
| housing_cat_1hot.toarray() # convert to dense NumPy array | |
| # Apply both transformations (cat-to-int, int-to-one-hot) in one shot | |
| from sklearn.preprocessing import LabelBinarizer | |
| encoder = LabelBinarizer() # note: rebinds `encoder` once more | |
| housing_cat_1hot = encoder.fit_transform(housing_cat) # fed the text attribute directly, no separate integer-encoding step | |
| housing_cat_1hot # show dense (by default) numpy array | |
| array([[0, 1, 0, 0, 0], | |
| [0, 1, 0, 0, 0], | |
| [0, 0, 0, 0, 1], | |
| ..., | |
| [0, 1, 0, 0, 0], | |
| [1, 0, 0, 0, 0], | |
| [0, 0, 0, 1, 0]]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment