Last active
September 13, 2024 10:40
-
-
Save sachinsdate/9dc2f1751656fa20fde041bb69c5fa7f to your computer and use it in GitHub Desktop.
An illustration of estimation bias. CSV files referenced in the code can be available as gists
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import pandas as pd | |
import numpy as np | |
from patsy import dmatrices | |
import statsmodels.api as sm | |
import scipy.stats | |
from sklearn.model_selection import train_test_split | |
import matplotlib.pyplot as plt | |
from matplotlib.ticker import StrMethodFormatter | |
import seaborn as sns | |
############################################ | |
####### Load the M & M weights data ######## | |
############################################ | |
df_mm = pd.read_csv(filepath_or_buffer='m_and_m_sample_weights.csv', header=0) | |
df_mm['Mean_Sample_Weight'] = df_mm['Weight_In_GMS']/15 | |
plt.figure(figsize=(8, 6)) | |
############################################ | |
######### Plot the 60 sample means ######### | |
############################################ | |
sns.scatterplot(x=['Mean Sample Weight']*60, y=df_mm['Mean_Sample_Weight'], color='cornflowerblue', label='Mean Sample Weight') | |
# Plot the mean of the 'population' of 453 M&Ms | |
sns.scatterplot(x=['Mean Sample Weight'], y=2.29792, color='orange', label=f'Population mean', s=100) | |
plt.title('Mean Weights of 60 Random Samples') | |
plt.xlabel('Samples') | |
plt.ylabel('Weight in Grams') | |
plt.legend() | |
plt.grid(True) | |
plt.show() | |
############################################ | |
#### Load the Taipei house prices data ##### | |
############################################ | |
df_hp = pd.read_csv(filepath_or_buffer='taipei_real_estate_prices.csv', header=0) | |
########################################################## | |
# Plot House price versus number of convenenience stores # | |
########################################################## | |
plt.figure(figsize=(10, 6)) | |
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue') | |
plt.xlabel('Number of Nearby Convenience Stores') | |
plt.ylabel('House Price per Unit Area') | |
plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores') | |
plt.legend() | |
# Show the plot | |
plt.show() | |
############################################ | |
########## Fit the linear model ########### | |
############################################ | |
reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores' | |
y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe') | |
olsr_model_results = sm.OLS(y, X).fit() | |
olsr_model_results.params | |
############################################ | |
########## Plot the fitted model ########### | |
############################################ | |
plt.figure(figsize=(10, 6)) | |
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue') | |
# Get the predicted y values | |
predicted_y = olsr_model_results.predict(X) | |
sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted') | |
plt.xlabel('Number of Nearby Convenience Stores') | |
plt.ylabel('House Price per Unit Area') | |
plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores') | |
plt.legend() | |
plt.show() | |
################################################ | |
# Plot 50 fitted models on bootstrapped samples | |
################################################ | |
num_c_stores_unique = df_hp['number_of_convenience_stores'].unique() | |
num_c_stores_unique.sort() | |
columns = [f'num_c_stores_{int(val)}' for val in num_c_stores_unique] | |
predicted_means_df = pd.DataFrame(columns=columns) | |
predicted_means_df['beta_0'] = np.nan | |
predicted_means_df['beta_1'] = np.nan | |
plt.figure(figsize=(10, 6)) | |
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue') | |
# Run the simulation 50 times | |
for i in range(50): | |
# Pull out a random sample of 50 rows | |
sample_df = df_hp.sample(n=50, random_state=i) | |
y, X = dmatrices('house_price_of_unit_area ~ number_of_convenience_stores', data=sample_df, return_type='dataframe') | |
model = sm.OLS(y, X).fit() | |
predicted_y = model.predict(X) | |
means = [] | |
for val in num_c_stores_unique: | |
mean_val = predicted_y[sample_df['number_of_convenience_stores'] == val].mean() | |
means.append(mean_val) | |
predicted_means_df.loc[i, columns] = means | |
predicted_means_df.loc[i, 'beta_0'] = model.params['Intercept'] | |
predicted_means_df.loc[i, 'beta_1'] = model.params['number_of_convenience_stores'] | |
x_vals = np.array([df_hp['number_of_convenience_stores'].min(), df_hp['number_of_convenience_stores'].max()]) | |
y_vals = model.params['Intercept'] + model.params['number_of_convenience_stores'] * x_vals | |
sns.lineplot(x=x_vals, y=y_vals, color='orange', linestyle='--', alpha=0.3) | |
plt.title('OLS Regression Lines Fitted on 50 Random Samples with the Full Dataset in the Background') | |
plt.xlabel('Number of Nearby Convenience Stores') | |
plt.ylabel('House Price per Unit Area') | |
plt.show() | |
####################################################### | |
# Plot beta_0 and beta_1 from the bootstrapped models # | |
####################################################### | |
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7)) | |
ax1.set_title("Estimated β_0 and population β_0", fontsize=14) | |
ax1.set_ylabel("β_0", color='black', fontsize=14) | |
ax1.scatter(x=['beta_0']*50, y=predicted_means_df['beta_0'], marker='o', label="Sample β_0", color='cornflowerblue') | |
ax1.scatter(x=['beta_0'], y=olsr_model_results.params['Intercept'], marker='o', s=100, label="Population β_0", color='orange') | |
ax1.tick_params(axis='both', which='major', labelsize=14) | |
ax2.set_title("Estimated β_1 and population β_1", fontsize=14) | |
ax2.scatter(x=['beta_1']*50, y=predicted_means_df['beta_1'], marker='o', label="β_1", color='cornflowerblue') | |
ax2.scatter(x=['beta_1'], y=olsr_model_results.params['number_of_convenience_stores'], marker='o', s=100, label="Population β_1", color='orange') | |
ax2.set_ylabel("β_1", color='black', fontsize=14) | |
ax2.tick_params(axis='both', which='major', labelsize=14) | |
plt.show() | |
################################################################################### | |
# Fit a linear model on a sample that leaves out the top 5% most expensive houses # | |
################################################################################### | |
#The original model | |
reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores' | |
y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe') | |
olsr_model_results = sm.OLS(y, X).fit() | |
print(olsr_model_results.params) | |
#Biased sample | |
df_hp_95p = df_hp[df_hp['house_price_of_unit_area'] < 59.525] | |
#The biased model | |
reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores' | |
y_95p, X_95p = dmatrices(reg_expr, data=df_hp_95p, return_type='dataframe') | |
olsr_model_results_95p = sm.OLS(y_95p, X_95p).fit() | |
print(olsr_model_results_95p.params) | |
################################################ | |
# Display the original and the biased models # | |
################################################ | |
# Create a scatter plot | |
plt.figure(figsize=(10, 6)) | |
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue') | |
predicted_y = olsr_model_results.predict(X) | |
sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted') | |
predicted_y_95p = olsr_model_results_95p.predict(X) | |
sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y_95p), color='red', marker='o', label='OLS Predicted (Biased Coeffs)') | |
plt.xlabel('Number of Nearby Convenience Stores') | |
plt.ylabel('House Price per Unit Area') | |
plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores') | |
plt.legend() | |
# Show the plot | |
plt.show() | |
################################################ | |
# Plot House Price versus distance to MRT stop # | |
################################################ | |
plt.figure(figsize=(10, 6)) | |
sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue') | |
plt.title('House Price per Unit Area vs Distance to Nearest MRT Station') | |
plt.xlabel('Distance to the Nearest MRT Station') | |
plt.ylabel('House Price per Unit Area') | |
plt.grid(True) | |
# Show plot | |
plt.show() | |
#################################################### | |
####### Fit a log-linear model on this data ######## | |
#################################################### | |
# Compute the log of house_price_of_unit_area | |
df_hp['log_house_price_of_unit_area'] = np.log(df_hp['house_price_of_unit_area']) | |
reg_expr = 'log_house_price_of_unit_area ~ distance_to_the_nearest_mrt_station' | |
y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe') | |
log_linear_olsr_model_results = sm.OLS(y, X).fit() | |
log_linear_olsr_model_results.params | |
################################################ | |
########## Plot the log-linear model ########### | |
################################################ | |
plt.figure(figsize=(10, 6)) | |
sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='log_house_price_of_unit_area', data=df_hp, color='cornflowerblue') | |
predicted_y = log_linear_olsr_model_results.predict(X) | |
sns.lineplot(x=np.array(X['distance_to_the_nearest_mrt_station']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted') | |
plt.title('Natural Log of House Price per Unit Area vs Distance to Nearest MRT Station') | |
plt.xlabel('Distance to the Nearest MRT Station') | |
plt.ylabel('Natural Log of House Price per Unit Area') | |
plt.grid(True) | |
plt.show() | |
################################################ | |
# Fit linear models on 50 bootstrapped samples # | |
################################################ | |
estimated_params_log_linear_ols_mrt_model_df = pd.DataFrame(columns=['gamma_0', 'gamma_1']) | |
regression_expr = 'house_price_of_unit_area ~ distance_to_the_nearest_mrt_station' | |
for i in range(50): | |
sample_df = df_hp.sample(n=50, random_state=np.random.randint(0, 10000)) | |
y, X = dmatrices(regression_expr, data=sample_df, return_type='dataframe') | |
sample_model_results = sm.OLS(y, X).fit() | |
gamma_0 = sample_model_results.params['Intercept'] | |
gamma_1 = sample_model_results.params['distance_to_the_nearest_mrt_station'] | |
estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_0'] = gamma_0 | |
estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_1'] = gamma_1 | |
############################################################################ | |
# Plot the Beta_0 and Beta_1 params from the 50 bootstrapped linear models # | |
############ and the 'population' mean from the log-linear model ########### | |
############################################################################ | |
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7)) | |
ax1.set_title("Estimated γ_0 from the linear model and\nthe population e^γ_0 from the log-linear model", fontsize='14') | |
ax1.set_ylabel("γ_0", color='black', fontsize=14) | |
ax1.tick_params(axis='both', which='major', labelsize=14) | |
ax1.scatter(x=['gamma_0']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_0'], marker='o', label="Sample γ_0", color='cornflowerblue') | |
ax1.scatter(x=['gamma_0'], y=np.exp(log_linear_olsr_model_results.params['Intercept']), marker='o', s=100, label="Population γ_0", color='orange') | |
ax2.set_title("Estimated γ_1 from the linear model and\nthe population γ_1 from the log-linear model", fontsize='14') | |
ax2.set_ylabel("γ_1", color='black', fontsize=14) | |
ax2.tick_params(axis='both', which='major', labelsize=14) | |
ax2.scatter(x=['gamma_1']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_1'], marker='o', label="γ_1", color='cornflowerblue') | |
ax2.scatter(x=['gamma_1'], y=log_linear_olsr_model_results.params['distance_to_the_nearest_mrt_station'], marker='o', s=100, label="Population γ_1", color='orange') | |
plt.show() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment