Skip to content

Instantly share code, notes, and snippets.

@rom1504
Created November 23, 2018 20:56
Show Gist options
  • Save rom1504/51b014d922acca8d891b3d2998e1761a to your computer and use it in GitHub Desktop.
Save rom1504/51b014d922acca8d891b3d2998e1761a to your computer and use it in GitHub Desktop.
# Neural network for the house sale-price regression problem.
from math import sqrt
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p
import pandas as pd
import numpy as np

# --- Loading ---------------------------------------------------------------
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the ids aside (the test ids are needed for the submission file), then
# drop the Id column, which is not useful for training.
train_ID = train['Id']
test_ID = test['Id']
train = train.drop(columns='Id')
test = test.drop(columns='Id')

# Drop the training rows with a living area above 4000 and a sale price
# below 300000.
valeurs_aberrantes = train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index
train = train.drop(valeurs_aberrantes)

train_shape = train.shape[0]
test_shape = test.shape[0]

# The regression target is the natural log of the sale price.
y_train = np.log(train["SalePrice"])

# Pre-process train and test together so both get identical columns.
total_data = pd.concat((train, test), sort=True).reset_index(drop=True)
total_data = total_data.drop(columns=['SalePrice'])

# --- Type fixes --------------------------------------------------------------
# Stored as numbers in the CSV, but converted to strings here so that
# get_dummies() one-hot encodes them as categories.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    total_data[col] = total_data[col].astype(str)

# --- Missing values ----------------------------------------------------------
# Numeric columns: fill the nulls with the column median.
for col in ('LotFrontage', 'MasVnrArea'):
    total_data[col] = total_data[col].fillna(total_data[col].median())

# Categorical columns: fill the nulls with the mode, i.e. the most frequent
# value (the categorical counterpart of the median).
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtFinType2', 'BsmtExposure', 'BsmtCond', 'BsmtFinType1', 'BsmtQual',
            'Electrical', 'MSZoning', 'KitchenQual',
            'Exterior1st', 'Exterior2nd', 'SaleType'):
    total_data[col] = total_data[col].fillna(total_data[col].mode()[0])

# These columns are informative, so they cannot be filled with a default
# value. For PoolQC, for example, a null means the house has no pool, hence
# the explicit 'None' category.
for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'MasVnrType'):
    total_data[col] = total_data[col].fillna('None')

# Same idea, but these are already numeric, so 0 instead of 'None'.
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            'BsmtFullBath', 'BsmtHalfBath',
            'GarageYrBlt', 'GarageArea', 'GarageCars'):
    total_data[col] = total_data[col].fillna(0)

# Original author's note: Utilities is 99% 'AllPub', so it gives the
# algorithm no useful information.
total_data = total_data.drop(['Utilities'], axis=1)

# From the data set description: "(Assume typical unless deductions are
# warranted)", so NaN means Typical.
total_data["Functional"] = total_data["Functional"].fillna('Typ')

# --- Feature engineering -----------------------------------------------------
total_data['TotalSF'] = total_data['TotalBsmtSF'] + total_data['1stFlrSF'] + total_data['2ndFlrSF']

# Summary of what is still missing after imputation (count and ratio per column).
manquants = total_data.isnull()
total = manquants.sum().sort_values(ascending=False)
pourcentage = (manquants.sum() / manquants.count()).sort_values(ascending=False)
donnees_manquantes = pd.concat([total, pourcentage], axis=1, keys=['Total', 'Pourcentage'])

# --- Skewness correction -----------------------------------------------------
colonnes_numeriques = total_data.dtypes[total_data.dtypes != "object"].index
# Check the skew of all numerical features.
colonnes_asymetriques = (
    total_data[colonnes_numeriques]
    .apply(lambda x: skew(x.dropna()))
    .sort_values(ascending=False)
)
print("\nAsymétrie dans les données numériques: \n")
asymetrie = pd.DataFrame({'Asymetrie': colonnes_asymetriques})
# A bare `.head()` shows nothing when run as a script, so print it explicitly.
print(asymetrie.head(10))

# Keep only the strongly skewed columns. The mask must be built from the
# column (a Series): masking the DataFrame with a boolean DataFrame only
# turns cells into NaN and keeps every row, so nothing would be filtered.
asymetrie = asymetrie[asymetrie['Asymetrie'].abs() > 0.75]
colonnes_asymetriques = asymetrie.index

# Box-Cox transform of 1 + x, with this lambda, on the skewed columns.
lam = 0.15
for col in colonnes_asymetriques:
    total_data[col] = boxcox1p(total_data[col], lam)

# --- Encoding and split ------------------------------------------------------
# dtype=float keeps the frame purely numeric; recent pandas versions default
# to bool dummies, which makes `.values` an object array.
total_data = pd.get_dummies(total_data, dtype=float)
train = total_data[:train_shape]
test = total_data[train_shape:]
def rmsle_cv(model):
    """Return the per-fold RMSE of ``model`` under 5-fold cross-validation.

    The model is evaluated on the module-level ``train`` features against
    the module-level ``y_train`` target. ``y_train`` holds log prices, so
    the returned values are root-mean-square errors on the log scale.

    Args:
        model: an estimator accepted by ``cross_val_score``.

    Returns:
        Array with one RMSE value per fold.
    """
    # Imported locally: the file's top-level imports do not provide these.
    from sklearn.model_selection import KFold, cross_val_score

    # Pass the splitter itself. Calling .get_n_splits() on it returns the
    # plain integer 5, which silently drops shuffle and random_state.
    kf = KFold(5, shuffle=True, random_state=42)
    neg_mse = cross_val_score(
        model, train.values, y_train,
        scoring="neg_mean_squared_error", cv=kf,
    )
    # The scorer returns negated MSE; flip the sign before the square root.
    return np.sqrt(-neg_mse)
from sklearn.model_selection import train_test_split
from tensorflow import keras
import numpy as np
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential


def _rmse(predicted, expected):
    """Return the root-mean-square error between two equally shaped arrays."""
    return sqrt(np.mean(np.square(predicted - expected)))


# Plain float arrays for Keras. The explicit cast prevents an object-dtype
# array when the encoded frame mixes bool and float columns.
keras_x = train.values.astype("float32")
keras_x_test = test.values.astype("float32")
# Column vector, matching the single output unit of the network.
keras_y = y_train.values.reshape(-1, 1)

# Hold out 20% of the training rows to monitor generalisation.
Xtrain, Xtest, ytrain, ytest = train_test_split(keras_x, keras_y, test_size=0.20, random_state=42)

# Ten Dense(256) + tanh blocks followed by one linear output unit. The input
# width is taken from the data because the number of one-hot columns depends
# on the categories present in the CSV files.
N_HIDDEN = 10
HIDDEN_UNITS = 256
network_layers = [Dense(HIDDEN_UNITS, input_shape=(keras_x.shape[1],)), Activation('tanh')]
for _ in range(N_HIDDEN - 1):
    network_layers += [Dense(HIDDEN_UNITS), Activation('tanh')]
network_layers.append(Dense(1))
model = Sequential(network_layers)

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

# Train in rounds of 10 epochs; the batch size grows from 16 up to a cap of
# 64 over the first rounds. After each round, report the RMSE (on the log
# target) for both the fitted split and the held-out split.
N_ROUNDS = 10
for i in range(N_ROUNDS):
    print("iteration {}/{}".format(i + 1, N_ROUNDS))
    # `epochs` is the current keyword; `nb_epoch` is the obsolete Keras 1 name.
    model.fit(Xtrain, ytrain, epochs=10, batch_size=min(16 * (i + 1), 64))
    predictions = model.predict(Xtrain)
    print("training set rmse {}".format(_rmse(predictions, ytrain)))
    prediction = model.predict(Xtest)
    print("prediction set rmse {}".format(_rmse(prediction, ytest)))

# Predict on the real test set; the model outputs log prices, so exponentiate
# to get back to prices before writing the submission file.
prediction_test = model.predict(keras_x_test)
print(prediction_test)
predictions_soumi = np.exp(prediction_test).flatten()
soumission = pd.DataFrame({'Id': test_ID, 'SalePrice': predictions_soumi})
soumission.to_csv('soumission.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment