Created
November 23, 2018 20:56
-
-
Save rom1504/51b014d922acca8d891b3d2998e1761a to your computer and use it in GitHub Desktop.
Neural network (Keras) for the house price regression problem
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import sqrt

import pandas as pd
from scipy import stats
from scipy.special import boxcox1p
from scipy.stats import norm, skew
from sklearn.model_selection import KFold, cross_val_score
# --- Load the raw data sets --------------------------------------------------
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the identifiers aside: test_ID is needed to write the submission file.
train_ID = train['Id']
test_ID = test['Id']

# The Id column carries no information for training, so drop it from both sets.
train = train.drop(columns="Id")
test = test.drop(columns="Id")
train.head()  # bare expression: has no visible effect when run as a script

# Remove the rows with a very large living area but a low sale price.
train = train[~((train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000))]

# Row counts; train_shape is used later to split the combined frame again.
train_shape = len(train)
test_shape = len(test)
y_train = train['SalePrice'].values

# Stack train and test so both receive identical preprocessing, then remove
# the target column (it is absent from the test rows).
total_data = (
    pd.concat((train, test), sort=True)
    .reset_index(drop=True)
    .drop(columns=['SalePrice'])
)
# --- Missing-value handling on the combined train+test frame -----------------

# These columns hold category codes stored as numbers. Casting them to str
# turns them into object columns, so pd.get_dummies (further down) one-hot
# encodes them instead of leaving them as numeric features.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    total_data[col] = total_data[col].astype(str)

# Numeric gaps are filled with the column median.
for col in ('LotFrontage', 'MasVnrArea'):
    total_data[col] = total_data[col].fillna(total_data[col].median())

# Categorical gaps are filled with the mode (most frequent value), the
# categorical counterpart of the median. Each column is filled exactly once;
# a second identical pass would find nothing left to fill.
for col in (
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtFinType2', 'BsmtExposure', 'BsmtCond', 'BsmtFinType1', 'BsmtQual',
    'Electrical', 'MSZoning', 'KitchenQual', 'Exterior1st', 'Exterior2nd',
    'SaleType',
):
    total_data[col] = total_data[col].fillna(total_data[col].mode()[0])

# For these features a missing value is meaningful: e.g. a null PoolQC means
# the house has no pool. A default category would be wrong, so use 'None'.
for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'MasVnrType'):
    total_data[col] = total_data[col].fillna('None')

# Same idea for numeric features: a missing value is filled with 0.
for col in (
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageArea', 'GarageCars',
):
    total_data[col] = total_data[col].fillna(0)

# Per the original author's note, about 99% of Utilities is 'AllPub', so the
# column gives the model no useful signal and is dropped.
total_data = total_data.drop(['Utilities'], axis=1)

# The data description says "(Assume typical unless deductions are warranted)",
# hence NaN is treated as 'Typ'.
total_data["Functional"] = total_data["Functional"].fillna('Typ')

# Engineered feature: total surface = basement + first floor + second floor.
total_data['TotalSF'] = (
    total_data['TotalBsmtSF'] + total_data['1stFlrSF'] + total_data['2ndFlrSF']
)

# Summary of what is still missing after imputation (count and ratio).
total = total_data.isnull().sum().sort_values(ascending=False)
pourcentage = (
    total_data.isnull().sum() / total_data.isnull().count()
).sort_values(ascending=False)
donnees_manquantes = pd.concat([total, pourcentage], axis=1, keys=['Total', 'Pourcentage'])
donnees_manquantes.head(20)  # bare expression: has no visible effect when run as a script
import numpy as np

# The model is trained on log(SalePrice); the predictions are mapped back with
# np.exp before the submission file is written.
y_train = np.log(train["SalePrice"])

colonnes_numeriques = total_data.dtypes[total_data.dtypes != "object"].index

# Skewness of every numeric feature (NaNs ignored), largest first.
colonnes_asymetriques = (
    total_data[colonnes_numeriques]
    .apply(lambda x: skew(x.dropna()))
    .sort_values(ascending=False)
)
print("\nAsymétrie dans les données numériques: \n")
asymetrie = pd.DataFrame({'Asymetrie': colonnes_asymetriques})
asymetrie.head(10)  # bare expression: has no visible effect when run as a script

# Keep only the strongly skewed features. The filter has to be applied on the
# column (a Series): masking the DataFrame with a boolean DataFrame, as in
# `asymetrie[abs(asymetrie) > 0.75]`, only replaces cells by NaN and keeps
# every row, which would transform all numeric columns.
asymetrie = asymetrie[asymetrie['Asymetrie'].abs() > 0.75]
colonnes_asymetriques = asymetrie.index

# Box-Cox transform of (1 + x) with a fixed lambda to reduce the skew.
lam = 0.15
for col in colonnes_asymetriques:
    total_data[col] = boxcox1p(total_data[col], lam)

# One-hot encode the categorical columns. dtype=float keeps the dummy columns
# numeric so that `.values` yields a float matrix even on pandas versions
# whose get_dummies would otherwise emit bool columns.
total_data = pd.get_dummies(total_data, dtype=float)

# Split the combined frame back into its train and test parts.
train = total_data[:train_shape]
test = total_data[train_shape:]
def rmsle_cv(model):
    """Return the per-fold RMSE of *model* under 5-fold cross-validation.

    The model is evaluated on the module-level ``train`` features against
    ``y_train``. Because ``y_train`` is the log of the sale price, the returned
    values are root-mean-squared errors on the log scale.

    Args:
        model: a scikit-learn compatible estimator.

    Returns:
        Array with one RMSE value per fold.
    """
    # Pass the splitter itself. Calling ``.get_n_splits()`` would reduce it to
    # the integer 5, and cross_val_score would then use its default unshuffled
    # K-fold, silently ignoring shuffle=True and random_state=42.
    kf = KFold(5, shuffle=True, random_state=42)
    scores = cross_val_score(
        model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf
    )
    # The scorer returns negated MSE values; flip the sign before the root.
    return np.sqrt(-scores)
from sklearn.model_selection import train_test_split

# Plain NumPy feature matrices for Keras (training part and Kaggle test part).
keras_x, keras_x_test = train.values, test.values

# Target reshaped into a single-column 2-D array.
keras_y = y_train.values.reshape(-1, 1)

# Hold out 20% of the training rows for evaluation (fixed seed).
Xtrain, Xtest, ytrain, ytest = train_test_split(
    keras_x,
    keras_y,
    test_size=0.20,
    random_state=42,
)
from tensorflow import keras
import numpy as np
# NOTE(review): `from keras...` resolves to the top-level `keras` package, not
# to the `tensorflow.keras` module bound just above; confirm which is intended.
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential

# Fully connected regressor: ten hidden tanh layers of 256 units each, followed
# by a single linear output unit.
n_hidden_layers = 10
n_hidden_units = 256

# The input width is taken from the feature matrix rather than hard-coded: the
# number of one-hot columns depends on the categories present in the data.
layers = [
    Dense(n_hidden_units, input_shape=(keras_x.shape[1],)),
    Activation('tanh'),
]
for _ in range(n_hidden_layers - 1):
    layers.append(Dense(n_hidden_units))
    layers.append(Activation('tanh'))
layers.append(Dense(1))

model = Sequential(layers)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
# Train in several rounds of 10 epochs each, reporting the RMSE on the training
# split and on the held-out split after every round. The targets are log
# prices, so both RMSE values are on the log scale.
n_rounds = 10
for i in range(n_rounds):
    # Report a 1-based round number against the real number of rounds.
    print("iteration {}/{}".format(i + 1, n_rounds))
    # `epochs` is the current keyword (the legacy spelling was `nb_epoch`).
    # The batch size grows by 16 per round (16, 32, 48) and is capped at 64.
    model.fit(Xtrain, ytrain, epochs=10, batch_size=min(16 * (i + 1), 64))
    predictions = model.predict(Xtrain)
    print("training set rmse {}".format(sqrt(np.mean(np.square(predictions - ytrain)))))
    prediction = model.predict(Xtest)
    print("prediction set rmse {}".format(sqrt(np.mean(np.square(prediction - ytest)))))
# Predict on the Kaggle test features and show the raw (log-scale) output.
prediction_test = model.predict(keras_x_test)
print(prediction_test)

# Flatten to one dimension and undo the log transform applied to the target.
predictions_soumi = np.exp(prediction_test.flatten())

# One row per test Id with its predicted price, written without the index.
colonnes_soumission = {'Id': test_ID, 'SalePrice': predictions_soumi}
soumission = pd.DataFrame(colonnes_soumission)
soumission.to_csv('soumission.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment