pierdom · December 5, 2017 13:46
diff --git a/data_prep.py b/data_prep.py
 # Input dataset (placeholder: replace the `...` with the real data source).
 # Bound to `housing` because every step below reads `housing`.
 housing = pd.DataFrame(...)

 # Dealing with missing values (replace with median for each attribute)
 # `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
 # removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. Prefer
 # the new class and fall back to the old one so this runs on either version.
 try:
     from sklearn.impute import SimpleImputer as Imputer     # scikit-learn >= 0.20
 except ImportError:
     from sklearn.preprocessing import Imputer               # older scikit-learn
 imputer = Imputer(strategy="median")                        # tell it which strategy to use
 housing_num = housing.drop("ocean_proximity", axis=1)       # remove categorical attributes
 imputer.fit(housing_num)                                    # train the estimator with data
 imputer.statistics_                                         # show statistics (check if ok)
 X = imputer.transform(housing_num)                          # apply transf. (get numpy arr)
 housing_tr = pd.DataFrame(X, columns=housing_num.columns)   # put it back to new dataframe

 # Handling text and categorical attributes (strategy 1: map category to numerical value)
 from sklearn.preprocessing import LabelEncoder
 encoder = LabelEncoder()                                    # create encoder object
 housing_cat = housing["ocean_proximity"]                    # it's the text attribute
 housing_cat_encoded = encoder.fit_transform(housing_cat)    # apply transf. (get numpy arr)
 encoder.classes_                                            # show mappings

 # Handling text and categorical attributes (strategy 2: create new attribute per category)
 from sklearn.preprocessing import OneHotEncoder
 encoder = OneHotEncoder()
 # reshape(-1, 1) turns the 1-D code array into a single-column 2-D array for the encoder
 housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
 housing_cat_1hot                                            # show new attributes (sparse matrix)
 housing_cat_1hot.toarray()                                  # convert to dense NumPy array

 # Apply both transformations (text-to-int, int-to-one-hot) in one shot
 from sklearn.preprocessing import LabelBinarizer
 encoder = LabelBinarizer()
 housing_cat_1hot = encoder.fit_transform(housing_cat)
 housing_cat_1hot                                            # show dense (by default) numpy array
 # Example output displayed for the expression above (kept as a comment so the
 # script stays executable; `array` is not a name defined in this script):
 # array([[0, 1, 0, 0, 0],
 #        [0, 1, 0, 0, 0],
 #        [0, 0, 0, 0, 1],
 #        ...,
 #        [0, 1, 0, 0, 0],
 #        [1, 0, 0, 0, 0],
 #        [0, 0, 0, 1, 0]])
	# Input dataset (placeholder: replace the `...` with the real data source).
	# Bound to `housing` because every step below reads `housing`.
	housing = pd.DataFrame(...)

	# Dealing with missing values (replace with median for each attribute)
	# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
	# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. Prefer
	# the new class and fall back to the old one so this runs on either version.
	try:
		from sklearn.impute import SimpleImputer as Imputer # scikit-learn >= 0.20
	except ImportError:
		from sklearn.preprocessing import Imputer # older scikit-learn
	imputer = Imputer(strategy="median") # tell it which strategy to use
	housing_num = housing.drop("ocean_proximity", axis=1) # remove categorical attributes
	imputer.fit(housing_num) # train the estimator with data
	imputer.statistics_ # show statistics (check if ok)
	X = imputer.transform(housing_num) # apply transf. (get numpy arr)
	housing_tr = pd.DataFrame(X, columns=housing_num.columns) # put it back to new dataframe

	# Handling text and categorical attributes (strategy 1: map category to numerical value)
	from sklearn.preprocessing import LabelEncoder
	encoder = LabelEncoder() # create encoder object
	housing_cat = housing["ocean_proximity"] # it's the text attribute
	housing_cat_encoded = encoder.fit_transform(housing_cat) # apply transf. (get numpy arr)
	encoder.classes_ # show mappings

	# Handling text and categorical attributes (strategy 2: create new attribute per category)
	from sklearn.preprocessing import OneHotEncoder
	encoder = OneHotEncoder()
	# reshape(-1, 1) turns the 1-D code array into a single-column 2-D array for the encoder
	housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
	housing_cat_1hot # show new attributes (sparse matrix)
	housing_cat_1hot.toarray() # convert to dense NumPy array

	# Apply both transformations (text-to-int, int-to-one-hot) in one shot
	from sklearn.preprocessing import LabelBinarizer
	encoder = LabelBinarizer()
	housing_cat_1hot = encoder.fit_transform(housing_cat)
	housing_cat_1hot # show dense (by default) numpy array
	# Example output displayed for the expression above (kept as a comment so the
	# script stays executable; `array` is not a name defined in this script):
	# array([[0, 1, 0, 0, 0],
	# [0, 1, 0, 0, 0],
	# [0, 0, 0, 0, 1],
	# ...,
	# [0, 1, 0, 0, 0],
	# [1, 0, 0, 0, 0],
	# [0, 0, 0, 1, 0]])