sachinsdate · September 13, 2024 10:40
diff --git a/estimator_bias.py b/estimator_bias.py
 import math
 import pandas as pd
 import numpy as np
 from patsy import dmatrices
 import statsmodels.api as sm
 import scipy.stats
 from sklearn.model_selection import train_test_split
 import matplotlib.pyplot as plt
 from matplotlib.ticker import StrMethodFormatter
 import seaborn as sns


 ############################################
 ####### Load the M & M weights data ########
 ############################################
 df_mm = pd.read_csv(filepath_or_buffer='m_and_m_sample_weights.csv', header=0)

 df_mm['Mean_Sample_Weight'] = df_mm['Weight_In_GMS']/15 

 plt.figure(figsize=(8, 6))

 ############################################
 ######### Plot the 60 sample means #########
 ############################################
 sns.scatterplot(x=['Mean Sample Weight']*60, y=df_mm['Mean_Sample_Weight'], color='cornflowerblue', label='Mean Sample Weight')

 # Plot the mean of the 'population' of 453 M&Ms
 sns.scatterplot(x=['Mean Sample Weight'], y=2.29792, color='orange', label=f'Population mean', s=100)

 plt.title('Mean Weights of 60 Random Samples')
 plt.xlabel('Samples')
 plt.ylabel('Weight in Grams')
 plt.legend()
 plt.grid(True)

 plt.show()

 ############################################
 #### Load the Taipei house prices data #####
 ############################################
 df_hp = pd.read_csv(filepath_or_buffer='taipei_real_estate_prices.csv', header=0)

 ##########################################################
 # Plot House price versus number of convenenience stores #
 ##########################################################
 plt.figure(figsize=(10, 6))
 sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

 plt.xlabel('Number of Nearby Convenience Stores')
 plt.ylabel('House Price per Unit Area')
 plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
 plt.legend()

 # Show the plot
 plt.show()

 ############################################
 ########## Fit the linear model  ###########
 ############################################
 reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'
 y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')
 olsr_model_results = sm.OLS(y, X).fit()
 olsr_model_results.params

 ############################################
 ########## Plot the fitted model ###########
 ############################################
 plt.figure(figsize=(10, 6))
 sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

 # Get the predicted y values
 predicted_y = olsr_model_results.predict(X)

 sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')

 plt.xlabel('Number of Nearby Convenience Stores')
 plt.ylabel('House Price per Unit Area')
 plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
 plt.legend()

 plt.show()

 ################################################
 # Plot 50 fitted models on bootstrapped samples 
 ################################################

 num_c_stores_unique = df_hp['number_of_convenience_stores'].unique()
 num_c_stores_unique.sort()

 columns = [f'num_c_stores_{int(val)}' for val in num_c_stores_unique]
 predicted_means_df = pd.DataFrame(columns=columns)
 predicted_means_df['beta_0'] = np.nan
 predicted_means_df['beta_1'] = np.nan

 plt.figure(figsize=(10, 6))
 sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

 # Run the simulation 50 times
 for i in range(50):
    # Pull out a random sample of 50 rows
    sample_df = df_hp.sample(n=50, random_state=i)

    y, X = dmatrices('house_price_of_unit_area ~ number_of_convenience_stores', data=sample_df, return_type='dataframe')

    model = sm.OLS(y, X).fit()

    predicted_y = model.predict(X)

    means = []
    for val in num_c_stores_unique:
        mean_val = predicted_y[sample_df['number_of_convenience_stores'] == val].mean()
        means.append(mean_val)
    
    predicted_means_df.loc[i, columns] = means

    predicted_means_df.loc[i, 'beta_0'] = model.params['Intercept']
    predicted_means_df.loc[i, 'beta_1'] = model.params['number_of_convenience_stores']

    x_vals = np.array([df_hp['number_of_convenience_stores'].min(), df_hp['number_of_convenience_stores'].max()])
    y_vals = model.params['Intercept'] + model.params['number_of_convenience_stores'] * x_vals
    sns.lineplot(x=x_vals, y=y_vals, color='orange', linestyle='--', alpha=0.3)

 plt.title('OLS Regression Lines Fitted on 50 Random Samples with the Full Dataset in the Background')
 plt.xlabel('Number of Nearby Convenience Stores')
 plt.ylabel('House Price per Unit Area')
 plt.show()


 #######################################################
 # Plot beta_0 and beta_1 from the bootstrapped models #
 #######################################################

 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))

 ax1.set_title("Estimated β_0 and population β_0", fontsize=14)
 ax1.set_ylabel("β_0", color='black', fontsize=14)

 ax1.scatter(x=['beta_0']*50, y=predicted_means_df['beta_0'], marker='o', label="Sample β_0", color='cornflowerblue')
 ax1.scatter(x=['beta_0'], y=olsr_model_results.params['Intercept'], marker='o', s=100, label="Population β_0", color='orange')
 ax1.tick_params(axis='both', which='major', labelsize=14)

 ax2.set_title("Estimated β_1 and population β_1", fontsize=14)
 ax2.scatter(x=['beta_1']*50, y=predicted_means_df['beta_1'], marker='o', label="β_1", color='cornflowerblue')
 ax2.scatter(x=['beta_1'], y=olsr_model_results.params['number_of_convenience_stores'], marker='o', s=100, label="Population β_1", color='orange')
 ax2.set_ylabel("β_1", color='black', fontsize=14)

 ax2.tick_params(axis='both', which='major', labelsize=14)

 plt.show()

 ###################################################################################
 # Fit a linear model on a sample that leaves out the top 5% most expensive houses #
 ###################################################################################

 #The original model
 reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'

 y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')

 olsr_model_results = sm.OLS(y, X).fit()

 print(olsr_model_results.params)

 #Biased sample
 df_hp_95p = df_hp[df_hp['house_price_of_unit_area'] < 59.525]

 #The biased model
 reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'

 y_95p, X_95p = dmatrices(reg_expr, data=df_hp_95p, return_type='dataframe')

 olsr_model_results_95p = sm.OLS(y_95p, X_95p).fit()

 print(olsr_model_results_95p.params)

 ################################################
 #  Display the original and the biased models  #
 ################################################
 # Create a scatter plot
 plt.figure(figsize=(10, 6))
 sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

 predicted_y = olsr_model_results.predict(X)

 sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')

 predicted_y_95p = olsr_model_results_95p.predict(X)

 sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y_95p), color='red', marker='o', label='OLS Predicted (Biased Coeffs)')

 plt.xlabel('Number of Nearby Convenience Stores')
 plt.ylabel('House Price per Unit Area')
 plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
 plt.legend()

 # Show the plot
 plt.show()


 ################################################
 # Plot House Price versus distance to MRT stop #
 ################################################

 plt.figure(figsize=(10, 6))

 sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

 plt.title('House Price per Unit Area vs Distance to Nearest MRT Station')
 plt.xlabel('Distance to the Nearest MRT Station')
 plt.ylabel('House Price per Unit Area')
 plt.grid(True)

 # Show plot
 plt.show()

 ####################################################
 ####### Fit a log-linear model on this data ########
 ####################################################

 # Compute the log of house_price_of_unit_area
 df_hp['log_house_price_of_unit_area'] = np.log(df_hp['house_price_of_unit_area'])

 reg_expr = 'log_house_price_of_unit_area ~ distance_to_the_nearest_mrt_station'

 y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')

 log_linear_olsr_model_results = sm.OLS(y, X).fit()

 log_linear_olsr_model_results.params

 ################################################
 ########## Plot the log-linear model ###########
 ################################################

 plt.figure(figsize=(10, 6))

 sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='log_house_price_of_unit_area', data=df_hp, color='cornflowerblue')

 predicted_y = log_linear_olsr_model_results.predict(X)

 sns.lineplot(x=np.array(X['distance_to_the_nearest_mrt_station']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')

 plt.title('Natural Log of House Price per Unit Area vs Distance to Nearest MRT Station')
 plt.xlabel('Distance to the Nearest MRT Station')
 plt.ylabel('Natural Log of House Price per Unit Area')
 plt.grid(True)

 plt.show()


 ################################################
 # Fit linear models on 50 bootstrapped samples #
 ################################################
 estimated_params_log_linear_ols_mrt_model_df = pd.DataFrame(columns=['gamma_0', 'gamma_1'])

 regression_expr = 'house_price_of_unit_area ~ distance_to_the_nearest_mrt_station'

 for i in range(50):
    sample_df = df_hp.sample(n=50, random_state=np.random.randint(0, 10000))

    y, X = dmatrices(regression_expr, data=sample_df, return_type='dataframe')

    sample_model_results = sm.OLS(y, X).fit()

    gamma_0 = sample_model_results.params['Intercept']
    gamma_1 = sample_model_results.params['distance_to_the_nearest_mrt_station']
    
    estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_0'] = gamma_0
    estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_1'] = gamma_1


 ############################################################################
 # Plot the Beta_0 and Beta_1 params from the 50 bootstrapped linear models #
 ############ and the 'population' mean from the log-linear model ###########
 ############################################################################

 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))

 ax1.set_title("Estimated γ_0 from the linear model and\nthe population e^γ_0 from the log-linear model", fontsize='14')
 ax1.set_ylabel("γ_0", color='black', fontsize=14)
 ax1.tick_params(axis='both', which='major', labelsize=14)

 ax1.scatter(x=['gamma_0']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_0'], marker='o', label="Sample γ_0", color='cornflowerblue')
 ax1.scatter(x=['gamma_0'], y=np.exp(log_linear_olsr_model_results.params['Intercept']), marker='o', s=100, label="Population γ_0", color='orange')

 ax2.set_title("Estimated γ_1 from the linear model and\nthe population γ_1 from the log-linear model", fontsize='14')
 ax2.set_ylabel("γ_1", color='black', fontsize=14)
 ax2.tick_params(axis='both', which='major', labelsize=14)

 ax2.scatter(x=['gamma_1']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_1'], marker='o', label="γ_1", color='cornflowerblue')
 ax2.scatter(x=['gamma_1'], y=log_linear_olsr_model_results.params['distance_to_the_nearest_mrt_station'], marker='o', s=100, label="Population γ_1", color='orange')

 plt.show()
	import math
	import pandas as pd
	import numpy as np
	from patsy import dmatrices
	import statsmodels.api as sm
	import scipy.stats
	from sklearn.model_selection import train_test_split
	import matplotlib.pyplot as plt
	from matplotlib.ticker import StrMethodFormatter
	import seaborn as sns


	############################################
	####### Load the M & M weights data ########
	############################################
	df_mm = pd.read_csv(filepath_or_buffer='m_and_m_sample_weights.csv', header=0)

	df_mm['Mean_Sample_Weight'] = df_mm['Weight_In_GMS']/15

	plt.figure(figsize=(8, 6))

	############################################
	######### Plot the 60 sample means #########
	############################################
	sns.scatterplot(x=['Mean Sample Weight']*60, y=df_mm['Mean_Sample_Weight'], color='cornflowerblue', label='Mean Sample Weight')

	# Plot the mean of the 'population' of 453 M&Ms
	sns.scatterplot(x=['Mean Sample Weight'], y=2.29792, color='orange', label=f'Population mean', s=100)

	plt.title('Mean Weights of 60 Random Samples')
	plt.xlabel('Samples')
	plt.ylabel('Weight in Grams')
	plt.legend()
	plt.grid(True)

	plt.show()

	############################################
	#### Load the Taipei house prices data #####
	############################################
	df_hp = pd.read_csv(filepath_or_buffer='taipei_real_estate_prices.csv', header=0)

	##########################################################
	# Plot House price versus number of convenenience stores #
	##########################################################
	plt.figure(figsize=(10, 6))
	sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

	plt.xlabel('Number of Nearby Convenience Stores')
	plt.ylabel('House Price per Unit Area')
	plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
	plt.legend()

	# Show the plot
	plt.show()

	############################################
	########## Fit the linear model ###########
	############################################
	reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'
	y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')
	olsr_model_results = sm.OLS(y, X).fit()
	olsr_model_results.params

	############################################
	########## Plot the fitted model ###########
	############################################
	plt.figure(figsize=(10, 6))
	sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

	# Get the predicted y values
	predicted_y = olsr_model_results.predict(X)

	sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')

	plt.xlabel('Number of Nearby Convenience Stores')
	plt.ylabel('House Price per Unit Area')
	plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
	plt.legend()

	plt.show()

	################################################
	# Plot 50 fitted models on bootstrapped samples
	################################################

	num_c_stores_unique = df_hp['number_of_convenience_stores'].unique()
	num_c_stores_unique.sort()

	columns = [f'num_c_stores_{int(val)}' for val in num_c_stores_unique]
	predicted_means_df = pd.DataFrame(columns=columns)
	predicted_means_df['beta_0'] = np.nan
	predicted_means_df['beta_1'] = np.nan

	plt.figure(figsize=(10, 6))
	sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

	# Run the simulation 50 times
	for i in range(50):
	# Pull out a random sample of 50 rows
	sample_df = df_hp.sample(n=50, random_state=i)

	y, X = dmatrices('house_price_of_unit_area ~ number_of_convenience_stores', data=sample_df, return_type='dataframe')

	model = sm.OLS(y, X).fit()

	predicted_y = model.predict(X)

	means = []
	for val in num_c_stores_unique:
	mean_val = predicted_y[sample_df['number_of_convenience_stores'] == val].mean()
	means.append(mean_val)

	predicted_means_df.loc[i, columns] = means

	predicted_means_df.loc[i, 'beta_0'] = model.params['Intercept']
	predicted_means_df.loc[i, 'beta_1'] = model.params['number_of_convenience_stores']

	x_vals = np.array([df_hp['number_of_convenience_stores'].min(), df_hp['number_of_convenience_stores'].max()])
	y_vals = model.params['Intercept'] + model.params['number_of_convenience_stores'] * x_vals
	sns.lineplot(x=x_vals, y=y_vals, color='orange', linestyle='--', alpha=0.3)

	plt.title('OLS Regression Lines Fitted on 50 Random Samples with the Full Dataset in the Background')
	plt.xlabel('Number of Nearby Convenience Stores')
	plt.ylabel('House Price per Unit Area')
	plt.show()


	#######################################################
	# Plot beta_0 and beta_1 from the bootstrapped models #
	#######################################################

	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))

	ax1.set_title("Estimated β_0 and population β_0", fontsize=14)
	ax1.set_ylabel("β_0", color='black', fontsize=14)

	ax1.scatter(x=['beta_0']*50, y=predicted_means_df['beta_0'], marker='o', label="Sample β_0", color='cornflowerblue')
	ax1.scatter(x=['beta_0'], y=olsr_model_results.params['Intercept'], marker='o', s=100, label="Population β_0", color='orange')
	ax1.tick_params(axis='both', which='major', labelsize=14)

	ax2.set_title("Estimated β_1 and population β_1", fontsize=14)
	ax2.scatter(x=['beta_1']*50, y=predicted_means_df['beta_1'], marker='o', label="β_1", color='cornflowerblue')
	ax2.scatter(x=['beta_1'], y=olsr_model_results.params['number_of_convenience_stores'], marker='o', s=100, label="Population β_1", color='orange')
	ax2.set_ylabel("β_1", color='black', fontsize=14)

	ax2.tick_params(axis='both', which='major', labelsize=14)

	plt.show()

	###################################################################################
	# Fit a linear model on a sample that leaves out the top 5% most expensive houses #
	###################################################################################

	#The original model
	reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'

	y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')

	olsr_model_results = sm.OLS(y, X).fit()

	print(olsr_model_results.params)

	#Biased sample
	df_hp_95p = df_hp[df_hp['house_price_of_unit_area'] < 59.525]

	#The biased model
	reg_expr = 'house_price_of_unit_area ~ number_of_convenience_stores'

	y_95p, X_95p = dmatrices(reg_expr, data=df_hp_95p, return_type='dataframe')

	olsr_model_results_95p = sm.OLS(y_95p, X_95p).fit()

	print(olsr_model_results_95p.params)

	################################################
	# Display the original and the biased models #
	################################################
	# Create a scatter plot
	plt.figure(figsize=(10, 6))
	sns.scatterplot(x='number_of_convenience_stores', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

	predicted_y = olsr_model_results.predict(X)

	sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')

	predicted_y_95p = olsr_model_results_95p.predict(X)

	sns.lineplot(x=np.array(X['number_of_convenience_stores']), y=np.array(predicted_y_95p), color='red', marker='o', label='OLS Predicted (Biased Coeffs)')

	plt.xlabel('Number of Nearby Convenience Stores')
	plt.ylabel('House Price per Unit Area')
	plt.title('House Price per Unit Area vs. Number of Nearby Convenience Stores')
	plt.legend()

	# Show the plot
	plt.show()


	################################################
	# Plot House Price versus distance to MRT stop #
	################################################

	plt.figure(figsize=(10, 6))

	sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='house_price_of_unit_area', data=df_hp, color='cornflowerblue')

	plt.title('House Price per Unit Area vs Distance to Nearest MRT Station')
	plt.xlabel('Distance to the Nearest MRT Station')
	plt.ylabel('House Price per Unit Area')
	plt.grid(True)

	# Show plot
	plt.show()

	####################################################
	####### Fit a log-linear model on this data ########
	####################################################

	# Compute the log of house_price_of_unit_area
	df_hp['log_house_price_of_unit_area'] = np.log(df_hp['house_price_of_unit_area'])

	reg_expr = 'log_house_price_of_unit_area ~ distance_to_the_nearest_mrt_station'

	y, X = dmatrices(reg_expr, data=df_hp, return_type='dataframe')

	log_linear_olsr_model_results = sm.OLS(y, X).fit()

	log_linear_olsr_model_results.params

	################################################
	########## Plot the log-linear model ###########
	################################################

	plt.figure(figsize=(10, 6))

	sns.scatterplot(x='distance_to_the_nearest_mrt_station', y='log_house_price_of_unit_area', data=df_hp, color='cornflowerblue')

	predicted_y = log_linear_olsr_model_results.predict(X)

	sns.lineplot(x=np.array(X['distance_to_the_nearest_mrt_station']), y=np.array(predicted_y), color='orange', marker='o', label='OLS Predicted')

	plt.title('Natural Log of House Price per Unit Area vs Distance to Nearest MRT Station')
	plt.xlabel('Distance to the Nearest MRT Station')
	plt.ylabel('Natural Log of House Price per Unit Area')
	plt.grid(True)

	plt.show()


	################################################
	# Fit linear models on 50 bootstrapped samples #
	################################################
	estimated_params_log_linear_ols_mrt_model_df = pd.DataFrame(columns=['gamma_0', 'gamma_1'])

	regression_expr = 'house_price_of_unit_area ~ distance_to_the_nearest_mrt_station'

	for i in range(50):
	sample_df = df_hp.sample(n=50, random_state=np.random.randint(0, 10000))

	y, X = dmatrices(regression_expr, data=sample_df, return_type='dataframe')

	sample_model_results = sm.OLS(y, X).fit()

	gamma_0 = sample_model_results.params['Intercept']
	gamma_1 = sample_model_results.params['distance_to_the_nearest_mrt_station']

	estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_0'] = gamma_0
	estimated_params_log_linear_ols_mrt_model_df.at[i, 'gamma_1'] = gamma_1


	############################################################################
	# Plot the Beta_0 and Beta_1 params from the 50 bootstrapped linear models #
	############ and the 'population' mean from the log-linear model ###########
	############################################################################

	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))

	ax1.set_title("Estimated γ_0 from the linear model and\nthe population e^γ_0 from the log-linear model", fontsize='14')
	ax1.set_ylabel("γ_0", color='black', fontsize=14)
	ax1.tick_params(axis='both', which='major', labelsize=14)

	ax1.scatter(x=['gamma_0']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_0'], marker='o', label="Sample γ_0", color='cornflowerblue')
	ax1.scatter(x=['gamma_0'], y=np.exp(log_linear_olsr_model_results.params['Intercept']), marker='o', s=100, label="Population γ_0", color='orange')

	ax2.set_title("Estimated γ_1 from the linear model and\nthe population γ_1 from the log-linear model", fontsize='14')
	ax2.set_ylabel("γ_1", color='black', fontsize=14)
	ax2.tick_params(axis='both', which='major', labelsize=14)

	ax2.scatter(x=['gamma_1']*50, y=estimated_params_log_linear_ols_mrt_model_df['gamma_1'], marker='o', label="γ_1", color='cornflowerblue')
	ax2.scatter(x=['gamma_1'], y=log_linear_olsr_model_results.params['distance_to_the_nearest_mrt_station'], marker='o', s=100, label="Population γ_1", color='orange')

	plt.show()