Created
June 19, 2019 21:14
-
-
Save thiagomarzagao/8242740bab41faf5851b98b26cfa6412 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import forestci as fci | |
import matplotlib.pyplot as plt | |
from sklearn.utils import shuffle | |
from sklearn.model_selection import train_test_split | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.model_selection import cross_val_predict | |
# set seed | |
random_state = 42 | |
# load data | |
df = pd.read_csv('wimoveis.csv') | |
# keep only certain features | |
df = df[[ | |
'preco_total', | |
'area_util', | |
'local', | |
'iptu', | |
'condominio', | |
'quartos', | |
'suites', | |
'banheiros', | |
'vagas', | |
'churrasqueira', | |
'idade', | |
'brinquedoteca', | |
'tv', | |
'piscina', | |
'playground', | |
'sauna', | |
'academia', | |
'portaria', | |
'jogos', | |
'festas', | |
'andares', | |
]] | |
# impute median values for missing condo fees and property tax | |
for var in ['iptu', 'condominio']: | |
median = df[df[var] > 0][var].median() | |
df[var] = df[var].fillna(median) | |
# drop outliers | |
df = df[df['area_util'] < 50000] | |
df = df[df['preco_total'] < 10000000] | |
df = df[df['condominio'] < 1500000] | |
df = df[df['iptu'] < 20000] | |
df = df[df['vagas'] < 10] | |
# take the log of all quantitative variables | |
for var in ['preco_total', 'area_util', 'iptu', 'condominio']: | |
df[var] = df[var].map(lambda x: np.log(x)) | |
# dummify location | |
locais = pd.get_dummies(df['local'], prefix = 'local', drop_first = True) | |
for col in locais.columns: | |
df[col] = locais[col] | |
del df['local'] | |
# split X and Y | |
y = df['preco_total'].values | |
del df['preco_total'] | |
X = df.values | |
# shuffle sample order | |
X, y = shuffle(X, y, random_state = random_state) | |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = random_state) | |
# instantiate model | |
model = RandomForestRegressor(n_estimators = 1000, criterion = 'mse', n_jobs = -1, random_state = random_state) | |
# train model | |
yhat = cross_val_predict(model, X_train, y_train, cv = 10) | |
# put estimated prices back in R$ | |
yhat_reais = np.exp(yhat) | |
# put observed prices back in R$ | |
y_reais = np.exp(y_train) | |
# compute errors | |
errors = yhat_reais - y_reais | |
# compute median absolute error | |
median_abs_error = np.median(np.absolute(errors)) | |
print('median absolute error (in R$):', median_abs_error) | |
# compute proportional error (error / asking price) | |
proportional_errors = errors / y_reais | |
median_prop_error = np.median(np.absolute(proportional_errors)) | |
mean_prop_error = np.mean(np.absolute(proportional_errors)) | |
print('median absolute error (in %):', median_prop_error) | |
print('mean absolute error (in %):', mean_prop_error) | |
''' | |
df_output = pd.DataFrame([]) | |
df_output['yhat'] = yhat | |
df_output['yhat_reais'] = yhat_reais | |
df_output['y_reais'] = y_reais | |
df_output['errors'] = errors | |
df_output['area_util'] = np.exp(X_train[:,0]) | |
df_output['iptu'] = np.exp(X_train[:,1]) | |
df_output['condominio'] = np.exp(X_train[:,2]) | |
df_output.plot.scatter('y_reais', 'errors') | |
plt.show() | |
df_output.plot.scatter('area_util', 'errors') | |
plt.show() | |
df_output.plot.scatter('iptu', 'errors') | |
plt.show() | |
df_output.plot.scatter('condominio', 'errors') | |
plt.show() | |
''' | |
# test the model | |
model = RandomForestRegressor(n_estimators = 1000, criterion = 'mse', n_jobs = -1, random_state = random_state) | |
model.fit(X_train, y_train) | |
yhat = model.predict(X_test) | |
# put estimated prices back in R$ | |
yhat_reais = np.exp(yhat) | |
# put observed prices back in R$ | |
y_reais = np.exp(y_test) | |
# compute errors | |
errors = yhat_reais - y_reais | |
# compute median absolute error | |
median_abs_error = np.median(np.absolute(errors)) | |
print('median absolute error (in R$):', median_abs_error) | |
# compute proportional error (error / asking price) | |
proportional_errors = errors / y_reais | |
median_prop_error = np.median(np.absolute(proportional_errors)) | |
mean_prop_error = np.mean(np.absolute(proportional_errors)) | |
print('median absolute error (in %):', median_prop_error) | |
print('mean absolute error (in %):', mean_prop_error) | |
# estimate uncertainty | |
variances = fci.random_forest_error(model, X_train, X_test) | |
plt.errorbar(y_test, yhat, yerr = np.sqrt(variances), fmt = 'o', ecolor = 'red') | |
plt.plot([10, 16], [10, 16], 'k--') | |
plt.xlabel('actual price, in log(R$)') | |
plt.ylabel('predicted price, in log(R$)') | |
plt.show() | |
# check interval predictions | |
lower = yhat - np.sqrt(variances) | |
upper = yhat + np.sqrt(variances) | |
corrects = 0 | |
for y_i, l, u in zip(y_test, lower, upper): | |
if l <= y_i <= u: | |
corrects += 1 | |
print(corrects, 'corrects out of', len(yhat)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment