# Linear Regression Code Template
# GitHub gist yongkangc/ee2926298d96aa0fc04229bd6ef42eb0 by @yongkangc,
# created December 10, 2021 15:16.
#importing the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
def normalize_z(dfin):
    """Standardise each column of ``dfin`` to zero mean and unit spread.

    Args:
        dfin: Object exposing ``mean(axis=0)`` and ``std(axis=0)`` and
            supporting element-wise arithmetic (the rest of this file passes
            pandas DataFrames).

    Returns:
        ``(dfin - mean) / std`` computed column-wise. Columns whose standard
        deviation is exactly zero (constant columns) come back as all zeros
        instead of NaN.
    """
    centred = dfin - dfin.mean(axis=0)
    spread = dfin.std(axis=0)
    constant = spread == 0
    if np.any(constant):
        # A constant column has zero spread; dividing by it would turn the
        # whole column into NaN and poison every later matrix product.
        # Dividing by 1 leaves the (already all-zero) centred values as is.
        spread = np.where(constant, 1.0, spread)
    return centred / spread
def normalize_minmax(dfin):
    """Rescale each column of ``dfin`` linearly onto the range [0, 1].

    Args:
        dfin: Object exposing ``min(axis=0)`` and ``max(axis=0)`` and
            supporting element-wise arithmetic (the rest of this file passes
            pandas DataFrames).

    Returns:
        ``(dfin - min) / (max - min)`` computed column-wise. Columns whose
        minimum equals their maximum (constant columns) come back as all
        zeros instead of NaN.
    """
    lowest = dfin.min(axis=0)
    span = dfin.max(axis=0) - lowest
    shifted = dfin - lowest
    constant = span == 0
    if np.any(constant):
        # A constant column has zero range, so the original 0/0 produced NaN.
        # Dividing by 1 keeps the (already all-zero) shifted values.
        span = np.where(constant, 1.0, span)
    return shifted / span
def get_features_targets(df, feature_names, target_names):
    """Split ``df`` column-wise into a feature frame and a target frame.

    Args:
        df: Source DataFrame.
        feature_names: Column labels to select as features.
        target_names: Column labels to select as targets.

    Returns:
        Tuple ``(features, targets)``, each obtained with
        ``df.loc[:, names]`` (all rows, the named columns).
    """
    return df.loc[:, feature_names], df.loc[:, target_names]
def prepare_feature(df_feature):
    """Build a design matrix: a leading column of ones plus the features.

    Args:
        df_feature: Two-dimensional object with ``shape`` and ``to_numpy()``
            (a pandas DataFrame elsewhere in this file).

    Returns:
        NumPy array with one more column than ``df_feature``; column 0 is
        all ones so the first coefficient acts as the intercept.
    """
    feature_values = df_feature.to_numpy()
    intercept_column = np.ones((df_feature.shape[0], 1))
    return np.concatenate((intercept_column, feature_values), axis=1)
def prepare_target(df_feature):
    """Return the values of the given frame as a NumPy array.

    Args:
        df_feature: Object exposing ``to_numpy()``; despite the parameter
            name, the script in this file passes the target frame here.

    Returns:
        The result of ``df_feature.to_numpy()``.
    """
    target_values = df_feature.to_numpy()
    return target_values
def predict(df_feature, beta):
    """Predict targets for raw features using coefficients ``beta``.

    The features are passed through ``normalize_z`` (so they are scaled with
    the statistics of ``df_feature`` itself), extended with an intercept
    column by ``prepare_feature``, and multiplied by ``beta``.

    Args:
        df_feature: Un-normalised feature frame.
        beta: Coefficient array, intercept first.

    Returns:
        The product of the prepared design matrix and ``beta``.
    """
    design_matrix = prepare_feature(normalize_z(df_feature))
    return predict_norm(design_matrix, beta)
def predict_norm(X, beta):
    """Return the matrix product ``X @ beta`` for a prepared design matrix.

    Args:
        X: Design matrix (already normalised and carrying its intercept
            column, as produced by ``prepare_feature``).
        beta: Coefficient array.

    Returns:
        ``np.matmul(X, beta)``.
    """
    predictions = np.matmul(X, beta)
    return predictions
def split_data(df_feature, df_target, random_state=None, test_size=0.5):
    """Randomly split features and targets into train and test subsets.

    Rows are chosen by *position*, so the split works for any index
    (non-default, non-integer or duplicated labels included).

    Args:
        df_feature: Feature DataFrame.
        df_target: Target DataFrame with the same number of rows, in the
            same order, as ``df_feature``.
        random_state: Seed handed to ``np.random.seed``; ``None`` reseeds
            from OS entropy. Note that this reseeds NumPy's *global* random
            generator.
        test_size: Fraction of rows placed in the test set; the count is
            ``int(test_size * len(df_feature))`` (rounded down).

    Returns:
        Tuple ``(df_feature_train, df_feature_test, df_target_train,
        df_target_test)``. The training rows are returned in shuffled order.
    """
    n_rows = len(df_feature)
    np.random.seed(random_state)
    n_test = int(test_size * n_rows)

    # Row positions (0..n_rows-1) drawn without replacement for the test set.
    test_positions = np.random.choice(n_rows, size=n_test, replace=False)

    # Everything not drawn for testing, in ascending position order. Working
    # with positions rather than index labels keeps the later ``iloc`` calls
    # valid, and setdiff1d avoids a quadratic "label in array" scan.
    remaining_positions = np.setdiff1d(np.arange(n_rows), test_positions)

    # Shuffle the training rows (a full-size draw without replacement).
    train_positions = np.random.choice(
        remaining_positions, size=len(remaining_positions), replace=False
    )

    df_feature_train = df_feature.iloc[train_positions]
    df_target_train = df_target.iloc[train_positions]
    df_feature_test = df_feature.iloc[test_positions]
    df_target_test = df_target.iloc[test_positions]
    return df_feature_train, df_feature_test, df_target_train, df_target_test
def poly_features(df_feature, colname, colname_transformed, degree=2):
    """Add a column holding ``colname`` raised to ``degree``.

    The new column is written into ``df_feature`` itself (the input frame is
    modified in place) and the same frame is returned.

    Args:
        df_feature: Feature DataFrame to extend.
        colname: Name of the existing column to transform.
        colname_transformed: Name of the column that receives the result.
        degree: Exponent applied element-wise; defaults to 2.

    Returns:
        ``df_feature`` with the additional (or overwritten) column.
    """
    powered = np.power(df_feature[colname], degree)
    df_feature[colname_transformed] = powered
    return df_feature
class LinearRegression:
    """Linear regression trained with batch gradient descent.

    Attributes:
        beta: Coefficient array learned by ``fit`` (intercept first);
            ``None`` until ``fit`` has been called.
        J_storage: Array with the cost recorded after every gradient step of
            the most recent ``fit`` call; ``None`` until ``fit`` has run.
    """

    def __init__(self):
        self.beta = None
        self.J_storage = None

    def fit(self, X, y, iterations, alpha):
        """Fit the model to the data using gradient descent.

        Args:
            X: Design matrix of shape ``(n, p)`` that already contains the
                intercept column (see ``prepare_feature``).
            y: Target values with ``n`` rows; a flat ``(n,)`` array is
                treated as a single column.
            iterations: Number of gradient-descent steps.
            alpha: Learning rate.

        Returns:
            The fitted coefficients, also stored in ``self.beta``.

        Raises:
            ValueError: If ``y`` does not have one row per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        n, p = X.shape
        y = np.asarray(y, dtype=float)
        if y.shape[0] != n:
            raise ValueError(
                f"X has {n} rows but y has {y.shape[0]}; they must match"
            )
        # Force a column layout: subtracting a flat (n,) vector from the
        # (n, 1) predictions would silently broadcast to an (n, n) matrix.
        y = y.reshape(n, -1)

        self.J_storage = np.zeros(iterations)
        beta = np.random.randn(p, 1) / np.sqrt(n)  # small random start
        for step in range(iterations):
            error = self.predict_norm(X, beta) - y
            beta = beta - (alpha / n) * np.matmul(X.T, error)
            # The cost is recorded for the coefficients *after* the update.
            self.J_storage[step] = self.compute_cost(X, y, beta)
        self.beta = beta
        return self.beta

    def predict(self, df_feature, normalisation=None):
        """Predict target values for a feature frame.

        Args:
            df_feature: Feature DataFrame without the intercept column.
            normalisation: ``'standard'`` applies ``normalize_z`` and
                ``'min-max'`` applies ``normalize_minmax`` to ``df_feature``
                before predicting; any other value (default ``None``) uses
                the features as given.

        Returns:
            Predicted target values (design matrix times ``self.beta``).
        """
        # Normalise the raw features first and add the intercept afterwards:
        # normalising the design matrix would rescale the constant column of
        # ones too, turning it into 0/0 = NaN.
        if normalisation == 'standard':
            df_feature = normalize_z(df_feature)
        elif normalisation == 'min-max':
            df_feature = normalize_minmax(df_feature)
        X = self.prepare_feature(df_feature)
        return self.predict_norm(X, self.beta)

    def predict_norm(self, X, beta):
        """Return the matrix product of design matrix ``X`` and ``beta``."""
        return np.matmul(X, beta)

    def compute_cost(self, X, y, beta):
        """Return the halved mean squared error of ``beta`` on ``(X, y)``.

        Computes ``(1 / (2 m)) * error.T @ error`` with
        ``error = X @ beta - y`` and returns its ``[0][0]`` entry. A flat
        ``(m,)`` target is treated as a single column.
        """
        X = np.asarray(X, dtype=float)
        m = X.shape[0]
        y = np.asarray(y, dtype=float).reshape(m, -1)
        error = np.matmul(X, beta) - y
        J = (1 / (2 * m)) * np.matmul(error.T, error)
        return J[0][0]

    def prepare_feature(self, df_feature):
        """Return ``df_feature`` as an array with a leading column of ones."""
        n = df_feature.shape[0]
        ones = np.ones((n, 1))
        return np.concatenate((ones, df_feature.to_numpy()), axis=1)
class Evaluate:
    """Regression metrics for one set of predictions against its targets.

    Args:
        target: True target values (NumPy array or pandas object).
        prediction: Predicted values with the same number of elements.
        n_features: Number of predictors used by the model, needed for the
            adjusted R². When omitted, the number of target columns is used
            (1 for a flat target), which matches the previous behaviour.
    """

    def __init__(self, target, prediction, n_features=None):
        self.target = target
        self.prediction = prediction
        self.n_features = n_features

    def _aligned(self):
        """Return target and prediction as float arrays of identical shape."""
        target = np.asarray(self.target, dtype=float)
        # Reshaping prevents a flat (n,) array from silently broadcasting
        # against an (n, 1) column into an (n, n) matrix.
        prediction = np.asarray(self.prediction, dtype=float).reshape(target.shape)
        return target, prediction

    def r2_score(self):
        """Return the coefficient of determination, ``1 - RSS / TSS``."""
        target, prediction = self._aligned()
        rss = np.sum((prediction - target) ** 2)
        tss = np.sum((target - target.mean(axis=0)) ** 2)
        return 1 - (rss / tss)

    def mean_absolute_error(self):
        """Return the summed absolute error divided by the number of rows."""
        target, prediction = self._aligned()
        return np.sum(np.abs(target - prediction)) / target.shape[0]

    def adjusted_r2_score(self):
        """Return R² adjusted for the number of predictors ``k``.

        Uses ``1 - (1 - R²) * (n - 1) / (n - k - 1)`` where ``k`` is
        ``n_features`` when it was supplied.
        """
        target, _ = self._aligned()
        n = target.shape[0]
        if self.n_features is not None:
            k = self.n_features
        elif target.ndim > 1:
            k = target.shape[1]
        else:
            k = 1
        return 1 - (1 - self.r2_score()) * (n - 1) / (n - k - 1)

    def _metrics(self):
        """Compute every metric once and return them in a dict."""
        return {
            "mae": self.mean_absolute_error(),
            "r2": self.r2_score(),
            "adjusted_r2": self.adjusted_r2_score(),
        }

    @staticmethod
    def _format(metrics):
        """Render the metrics in the layout printed by ``evaluate``."""
        return (
            f"mae : {metrics['mae']}\n\n"
            f"r2 : {metrics['r2']}\n"
            f"adjusted r2 : {metrics['adjusted_r2']}"
        )

    def evaluate(self):
        """Print MAE, R² and adjusted R² and return them as a dict.

        Returns:
            Dict with keys ``"mae"``, ``"r2"`` and ``"adjusted_r2"``.
        """
        metrics = self._metrics()
        print(self._format(metrics))
        return metrics

    def __str__(self) -> str:
        # Must return text; delegating to evaluate() returned a non-string.
        return self._format(self._metrics())
#importing the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# The data preprocessing for this is done manually
df = pd.read_csv(
    "https://raw.githubusercontent.com/ExtremelySunnyYK/Covid-Modelling/task_1/data/2D%20Dataset.csv"
)
df.head()

# Feature columns used by the model; the target column is "Deaths per day".
columns = [
    'Days from start',
    'Confirmed Cases on date',
    'Active Cases on date',
    'Continent__Africa',
    'Continent__Asia',
    'Continent__Europe',
    'Continent__North America',
    'Continent__South America',
    'Country__Argentina',
    'Country__Bangladesh',
    'Country__Belgium',
    'Country__Brazil',
    'Country__Canada',
    'Country__Chile',
    'Country__Colombia',
    'Country__Czechia',
    'Country__Ecuador',
    'Country__France',
    'Country__Germany',
    'Country__Hungary',
    'Country__India',
    'Country__Indonesia',
    'Country__Iran',
    'Country__Italy',
    'Country__Malaysia',
    'Country__Mexico',
    'Country__Pakistan',
    'Country__Peru',
    'Country__Philippines',
    'Country__Poland',
    'Country__Romania',
    'Country__Russia',
    'Country__South Africa',
    'Country__Spain',
    'Country__Turkey',
    'Country__US',
    'Country__Ukraine',
    'Country__United Kingdom',
    'GDP Per Capita',
    'Cases per day',
]
df_features, df_target = get_features_targets(df, columns, ["Deaths per day"])

# Z-normalise the features (statistics are taken over the full data set,
# before the train/test split).
df_features = normalize_z(df_features)

(
    df_features_train,
    df_features_test,
    df_target_train,
    df_target_test,
) = split_data(df_features, df_target, random_state=100, test_size=0.3)

# Convert to NumPy; prepare_feature prepends the intercept column of ones.
X = prepare_feature(df_features_train)
target = prepare_target(df_target_train)

# Gradient-descent hyper-parameters.
iterations = 100
alpha = 0.01

model = LinearRegression()
model.fit(X, target, iterations, alpha)
print(f"Beta : {model.beta}")

# Predict on the held-out rows and plot the cost recorded during training.
pred = model.predict(df_features_test)
plt.plot(model.J_storage)

score = Evaluate(df_target_test, pred).evaluate()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment