Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Created July 31, 2024 10:26
Show Gist options
  • Save sachinsdate/e413c378dcac12a93f37eefdba6f046e to your computer and use it in GitHub Desktop.
Save sachinsdate/e413c378dcac12a93f37eefdba6f046e to your computer and use it in GitHub Desktop.
import math
import pandas as pd
from patsy import dmatrices
import numpy as np
import scipy.stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
#Read the autos dataset into a Pandas Dataframe (automobiles.data.csv can be downloaded from the Gists section)
filename = 'automobiles.data.csv'
df = pd.read_csv(filepath_or_buffer=filename, header=0)
#Remove rows with NaNs
df = df.dropna()
print(df)
#Create a patsy expression of a linear model that regresses city_mpg on curb_weight and horsepower without an intercept
expr = 'city_mpg ~ curb_weight + horsepower -1'
#Create training and testing datasets in a 80:20 format
df_train, df_test = train_test_split(df, test_size=0.2)
#Carve out the y and X matrices
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')
#Print the correlation matrix of X with y
corr_df_auto = pd.concat([y_train, X_train], axis=1).corr()
print(corr_df_auto)
#Create the OLS model, train it, and print the training summary
olsr_model_auto = sm.OLS(endog=y_train, exog=X_train)
olsr_results_auto= olsr_model_auto.fit()
print(olsr_results_auto.summary())
#Create the truncated model, train it, and print the training summary
olsr_model_auto_truncated = sm.OLS(endog=y_train, exog=X_train[['curb_weight']])
olsr_results_auto_tuncated= olsr_model_auto_truncated.fit()
print(olsr_results_auto_tuncated.summary())
#Calculate the omitted variable bias manually
#Get the variance-covariance matrix of X:
cov_mat = df_train[['curb_weight', 'horsepower']].cov(ddof=1)
print('cov(curb_weight, horsepower)=',cov_mat['curb_weight']['horsepower'], 'var(curb_weight)=',cov_mat['curb_weight']['curb_weight'])
#Coeffs of fitted full model
gamma_1_cap = olsr_results_auto.params.at['curb_weight']
gamma_2_cap = olsr_results_auto.params.at['horsepower']
print(gamma_1_cap, gamma_2_cap)
#Omitted Variable Bias on E(beta_1_cap)
bias_beta_1 = gamma_2_cap*(cov_mat['curb_weight']['horsepower'])/cov_mat['curb_weight']['curb_weight']
print('bias_beta_1=',bias_beta_1)
#Print the biased expectation of beta_1_cap
print('Biased E(beta_1_cap)=',gamma_1_cap + bias_beta_1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment