Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active September 13, 2024 10:40
Show Gist options
  • Save sachinsdate/9dc2f1751656fa20fde041bb69c5fa7f to your computer and use it in GitHub Desktop.
Save sachinsdate/9dc2f1751656fa20fde041bb69c5fa7f to your computer and use it in GitHub Desktop.
An illustration of estimation bias. CSV files referenced in the code can be available as gists
import math
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
import scipy.stats
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
############################################
####### Load the M & M weights data ########
############################################
df_mm = pd.read_csv(filepath_or_buffer='m_and_m_sample_weights.csv', header=0)
df_mm['Mean_Sample_Weight'] = df_mm['Weight_In_GMS']/15
plt.figure(figsize=(8, 6))
############################################
######### Plot the 60 sample means #########
############################################
sns.scatterplot(x=['Mean Sample Weight']*60, y=df_mm['Mean_Sample_Weight'], color='cornflowerblue', label='Mean Sample Weight')
# Plot the mean of the 'population' of 453 M&Ms
sns.scatterplot(x=['Mean Sample Weight'], y=2.29792, color='orange', label=f'Population mean', s=100)
plt.title('Mean Weights of 60 Random Samples')
plt.xlabel('Samples')
plt.ylabel('Weight in Grams')
plt.legend()
plt.grid(True)
plt.show()
############################################
#### Load the Taipei house prices data #####
############################################
df_hp = pd.read_csv(filepath_or_buffer='taipei_real_estate_prices.csv', header=0)
##########################################################
# Plot House price versus number of convenenience stores #
##########################################################
plt.figure(figsize=(10, 6))
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')
plt.xlabel('Number of Nearby Convenience Stores')
plt.ylabel('House Price per Unit Area')
plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
plt.legend()
# Show the plot
plt.show()
############################################
########## Fit the linear model ###########
############################################
reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'
y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')
olsr_model_results = sm.OLS(y, X).fit()
olsr_model_results.params
############################################
########## Plot the fitted model ###########
############################################
plt.figure(figsize=(10, 6))
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')
# Get the predicted y values
predicted_y = olsr_model_results.predict(X)
sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')
plt.xlabel('Number of Nearby Convenience Stores')
plt.ylabel('House Price per Unit Area')
plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
plt.legend()
plt.show()
################################################
# Plot 50 fitted models on bootstrapped samples
################################################
num_c_stores_unique = df_hp['number_of_convenience_stores'].unique()
num_c_stores_unique.sort()
columns = [f'num_c_stores_{int(val)}' for val in num_c_stores_unique]
predicted_means_df = pd.DataFrame(columns=columns)
predicted_means_df['beta_0'] = np.nan
predicted_means_df['beta_1'] = np.nan
plt.figure(figsize=(10, 6))
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')
# Run the simulation 50 times
for i in range(50):
# Pull out a random sample of 50 rows
sample_df = df_hp.sample(n=50, random_state=i)
y, X = dmatrices('house_price_of_unit_area ~ number_of_convenience_stores', data=sample_df, return_type='dataframe')
model = sm.OLS(y, X).fit()
predicted_y = model.predict(X)
means = []
for val in num_c_stores_unique:
mean_val = predicted_y[sample_df['number_of_convenience_stores'] == val].mean()
means.append(mean_val)
predicted_means_df.loc[i, columns] = means
predicted_means_df.loc[i, 'beta_0'] = model.params['Intercept']
predicted_means_df.loc[i, 'beta_1'] = model.params['number_of_convenience_stores']
x_vals = np.array([df_hp['number_of_convenience_stores'].min(), df_hp['number_of_convenience_stores'].max()])
y_vals = model.params['Intercept'] + model.params['number_of_convenience_stores'] * x_vals
sns.lineplot(x=x_vals, y=y_vals, color='orange', linestyle='--', alpha=0.3)
plt.title('OLS Regression Lines Fitted on 50 Random Samples with the Full Dataset in the Background')
plt.xlabel('Number of Nearby Convenience Stores')
plt.ylabel('House Price per Unit Area')
plt.show()
#######################################################
# Plot beta_0 and beta_1 from the bootstrapped models #
#######################################################
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))
ax1.set_title("Estimated β_0 and population β_0", fontsize=14)
ax1.set_ylabel("β_0", color='black', fontsize=14)
ax1.scatter(x=['beta_0']*50, y=predicted_means_df['beta_0'], marker='o', label="Sample β_0", color='cornflowerblue')
ax1.scatter(x=['beta_0'], y=olsr_model_results.params['Intercept'], marker='o', s=100, label="Population β_0", color='orange')
ax1.tick_params(axis='both', which='major', labelsize=14)
ax2.set_title("Estimated β_1 and population β_1", fontsize=14)
ax2.scatter(x=['beta_1']*50, y=predicted_means_df['beta_1'], marker='o', label="β_1", color='cornflowerblue')
ax2.scatter(x=['beta_1'], y=olsr_model_results.params['number_of_convenience_stores'], marker='o', s=100, label="Population β_1", color='orange')
ax2.set_ylabel("β_1", color='black', fontsize=14)
ax2.tick_params(axis='both', which='major', labelsize=14)
plt.show()
###################################################################################
# Fit a linear model on a sample that leaves out the top 5% most expensive houses #
###################################################################################
#The original model
reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'
y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')
olsr_model_results = sm.OLS(y, X).fit()
print(olsr_model_results.params)
#Biased sample
df_hp_95p = df_hp[df_hp['house_price_of_unit_area'] < 59.525]
#The biased model
reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'
y_95p, X_95p = dmatrices(reg_expr, data=df_hp_95p, return_type='dataframe')
olsr_model_results_95p = sm.OLS(y_95p, X_95p).fit()
print(olsr_model_results_95p.params)
################################################
# Display the original and the biased models #
################################################
# Create a scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')
predicted_y = olsr_model_results.predict(X)
sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')
predicted_y_95p = olsr_model_results_95p.predict(X)
sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y_95p), color='red', marker='o', label='OLS Predicted (Biased Coeffs)')
plt.xlabel('Number of Nearby Convenience Stores')
plt.ylabel('House Price per Unit Area')
plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
plt.legend()
# Show the plot
plt.show()
################################################
# Plot House Price versus distance to MRT stop #
################################################
plt.figure(figsize=(10, 6))
sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')
plt.title('House Price per Unit Area vs Distance to Nearest MRT Station')
plt.xlabel('Distance to the Nearest MRT Station')
plt.ylabel('House Price per Unit Area')
plt.grid(True)
# Show plot
plt.show()
####################################################
####### Fit a log-linear model on this data ########
####################################################
# Compute the log of house_price_of_unit_area
df_hp['log_house_price_of_unit_area'] = np.log(df_hp['house_price_of_unit_area'])
reg_expr = 'log_house_price_of_unit_area ~ distance_to_the_nearest_mrt_station'
y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')
log_linear_olsr_model_results = sm.OLS(y, X).fit()
log_linear_olsr_model_results.params
################################################
########## Plot the log-linear model ###########
################################################
plt.figure(figsize=(10, 6))
sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='log_house_price_of_unit_area', data=df_hp, color='cornflowerblue')
predicted_y = log_linear_olsr_model_results.predict(X)
sns.lineplot(x=np.array(X['distance_to_the_nearest_mrt_station']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')
plt.title('Natural Log of House Price per Unit Area vs Distance to Nearest MRT Station')
plt.xlabel('Distance to the Nearest MRT Station')
plt.ylabel('Natural Log of House Price per Unit Area')
plt.grid(True)
plt.show()
################################################
# Fit linear models on 50 bootstrapped samples #
################################################
estimated_params_log_linear_ols_mrt_model_df = pd.DataFrame(columns=['gamma_0', 'gamma_1'])
regression_expr = 'house_price_of_unit_area ~ distance_to_the_nearest_mrt_station'
for i in range(50):
sample_df = df_hp.sample(n=50, random_state=np.random.randint(0, 10000))
y, X = dmatrices(regression_expr, data=sample_df, return_type='dataframe')
sample_model_results = sm.OLS(y, X).fit()
gamma_0 = sample_model_results.params['Intercept']
gamma_1 = sample_model_results.params['distance_to_the_nearest_mrt_station']
estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_0'] = gamma_0
estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_1'] = gamma_1
############################################################################
# Plot the Beta_0 and Beta_1 params from the 50 bootstrapped linear models #
############ and the 'population' mean from the log-linear model ###########
############################################################################
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))
ax1.set_title("Estimated γ_0 from the linear model and\nthe population e^γ_0 from the log-linear model", fontsize='14')
ax1.set_ylabel("γ_0", color='black', fontsize=14)
ax1.tick_params(axis='both', which='major', labelsize=14)
ax1.scatter(x=['gamma_0']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_0'], marker='o', label="Sample γ_0", color='cornflowerblue')
ax1.scatter(x=['gamma_0'], y=np.exp(log_linear_olsr_model_results.params['Intercept']), marker='o', s=100, label="Population γ_0", color='orange')
ax2.set_title("Estimated γ_1 from the linear model and\nthe population γ_1 from the log-linear model", fontsize='14')
ax2.set_ylabel("γ_1", color='black', fontsize=14)
ax2.tick_params(axis='both', which='major', labelsize=14)
ax2.scatter(x=['gamma_1']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_1'], marker='o', label="γ_1", color='cornflowerblue')
ax2.scatter(x=['gamma_1'], y=log_linear_olsr_model_results.params['distance_to_the_nearest_mrt_station'], marker='o', s=100, label="Population γ_1", color='orange')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment