Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Created June 19, 2019 21:14
Show Gist options
  • Save thiagomarzagao/8242740bab41faf5851b98b26cfa6412 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/8242740bab41faf5851b98b26cfa6412 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import forestci as fci
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
# set seed
random_state = 42
# load data
df = pd.read_csv('wimoveis.csv')
# keep only certain features
df = df[[
'preco_total',
'area_util',
'local',
'iptu',
'condominio',
'quartos',
'suites',
'banheiros',
'vagas',
'churrasqueira',
'idade',
'brinquedoteca',
'tv',
'piscina',
'playground',
'sauna',
'academia',
'portaria',
'jogos',
'festas',
'andares',
]]
# impute median values for missing condo fees and property tax
for var in ['iptu', 'condominio']:
median = df[df[var] > 0][var].median()
df[var] = df[var].fillna(median)
# drop outliers
df = df[df['area_util'] < 50000]
df = df[df['preco_total'] < 10000000]
df = df[df['condominio'] < 1500000]
df = df[df['iptu'] < 20000]
df = df[df['vagas'] < 10]
# take the log of all quantitative variables
for var in ['preco_total', 'area_util', 'iptu', 'condominio']:
df[var] = df[var].map(lambda x: np.log(x))
# dummify location
locais = pd.get_dummies(df['local'], prefix = 'local', drop_first = True)
for col in locais.columns:
df[col] = locais[col]
del df['local']
# split X and Y
y = df['preco_total'].values
del df['preco_total']
X = df.values
# shuffle sample order
X, y = shuffle(X, y, random_state = random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = random_state)
# instantiate model
model = RandomForestRegressor(n_estimators = 1000, criterion = 'mse', n_jobs = -1, random_state = random_state)
# train model
yhat = cross_val_predict(model, X_train, y_train, cv = 10)
# put estimated prices back in R$
yhat_reais = np.exp(yhat)
# put observed prices back in R$
y_reais = np.exp(y_train)
# compute errors
errors = yhat_reais - y_reais
# compute median absolute error
median_abs_error = np.median(np.absolute(errors))
print('median absolute error (in R$):', median_abs_error)
# compute proportional error (error / asking price)
proportional_errors = errors / y_reais
median_prop_error = np.median(np.absolute(proportional_errors))
mean_prop_error = np.mean(np.absolute(proportional_errors))
print('median absolute error (in %):', median_prop_error)
print('mean absolute error (in %):', mean_prop_error)
'''
df_output = pd.DataFrame([])
df_output['yhat'] = yhat
df_output['yhat_reais'] = yhat_reais
df_output['y_reais'] = y_reais
df_output['errors'] = errors
df_output['area_util'] = np.exp(X_train[:,0])
df_output['iptu'] = np.exp(X_train[:,1])
df_output['condominio'] = np.exp(X_train[:,2])
df_output.plot.scatter('y_reais', 'errors')
plt.show()
df_output.plot.scatter('area_util', 'errors')
plt.show()
df_output.plot.scatter('iptu', 'errors')
plt.show()
df_output.plot.scatter('condominio', 'errors')
plt.show()
'''
# test the model
model = RandomForestRegressor(n_estimators = 1000, criterion = 'mse', n_jobs = -1, random_state = random_state)
model.fit(X_train, y_train)
yhat = model.predict(X_test)
# put estimated prices back in R$
yhat_reais = np.exp(yhat)
# put observed prices back in R$
y_reais = np.exp(y_test)
# compute errors
errors = yhat_reais - y_reais
# compute median absolute error
median_abs_error = np.median(np.absolute(errors))
print('median absolute error (in R$):', median_abs_error)
# compute proportional error (error / asking price)
proportional_errors = errors / y_reais
median_prop_error = np.median(np.absolute(proportional_errors))
mean_prop_error = np.mean(np.absolute(proportional_errors))
print('median absolute error (in %):', median_prop_error)
print('mean absolute error (in %):', mean_prop_error)
# estimate uncertainty
variances = fci.random_forest_error(model, X_train, X_test)
plt.errorbar(y_test, yhat, yerr = np.sqrt(variances), fmt = 'o', ecolor = 'red')
plt.plot([10, 16], [10, 16], 'k--')
plt.xlabel('actual price, in log(R$)')
plt.ylabel('predicted price, in log(R$)')
plt.show()
# check interval predictions
lower = yhat - np.sqrt(variances)
upper = yhat + np.sqrt(variances)
corrects = 0
for y_i, l, u in zip(y_test, lower, upper):
if l <= y_i <= u:
corrects += 1
print(corrects, 'corrects out of', len(yhat))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment