Created
December 10, 2021 15:16
-
-
Save yongkangc/ee2926298d96aa0fc04229bd6ef42eb0 to your computer and use it in GitHub Desktop.
Linear Regression Code Template
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required packages
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import statsmodels.api as sm | |
def normalize_z(dfin):
    """Standardise every column to zero mean and unit standard deviation.

    Parameters
    ----------
    dfin : object exposing ``mean(axis=0)`` and ``std(axis=0)``
        Data to normalise (e.g. a pandas DataFrame). Statistics are taken
        down axis 0, i.e. one mean / standard deviation per column.

    Returns
    -------
    ``(dfin - mean) / std``. A column whose standard deviation is exactly 0
    (a constant column) is divided by 1 instead, so it comes out as 0 rather
    than NaN.
    """
    mean = dfin.mean(axis=0)
    std = dfin.std(axis=0)
    # Adding the boolean mask bumps a zero spread up to 1 and leaves every
    # other value untouched, which avoids the 0/0 division for constant columns.
    std = std + (std == 0)
    return (dfin - mean) / std
def normalize_minmax(dfin):
    """Rescale every column linearly so its minimum maps to 0 and maximum to 1.

    Parameters
    ----------
    dfin : object exposing ``min(axis=0)`` and ``max(axis=0)``
        Data to normalise (e.g. a pandas DataFrame). Extremes are taken down
        axis 0, i.e. one minimum / maximum per column.

    Returns
    -------
    ``(dfin - min) / (max - min)``. A column whose range is exactly 0
    (a constant column) is divided by 1 instead, so it comes out as 0 rather
    than NaN.
    """
    low = dfin.min(axis=0)
    span = dfin.max(axis=0) - low
    # Adding the boolean mask bumps a zero range up to 1 and leaves every
    # other value untouched, which avoids the 0/0 division for constant columns.
    span = span + (span == 0)
    return (dfin - low) / span
def get_features_targets(df, feature_names, target_names):
    """Split *df* column-wise into a feature selection and a target selection.

    Parameters
    ----------
    df : frame supporting ``.loc[:, labels]``
    feature_names : column labels to use as features
    target_names : column labels to use as targets

    Returns
    -------
    Tuple ``(features, targets)``, each holding all rows of *df* restricted
    to the requested columns.
    """
    return df.loc[:, feature_names], df.loc[:, target_names]
def prepare_feature(df_feature):
    """Build the design matrix: a leading column of ones followed by the features.

    Parameters
    ----------
    df_feature : object exposing ``shape`` and ``to_numpy()``
        Feature table with one row per sample.

    Returns
    -------
    NumPy array whose first column is all ones (the intercept term) and whose
    remaining columns are the values of *df_feature*.
    """
    values = df_feature.to_numpy()
    intercept = np.ones((df_feature.shape[0], 1))
    return np.concatenate((intercept, values), axis=1)
def prepare_target(df_feature):
    """Convert the target table into a NumPy array.

    Parameters
    ----------
    df_feature : object exposing ``to_numpy()``
        Despite the parameter name, this is the table holding the target values.

    Returns
    -------
    The result of ``df_feature.to_numpy()``.
    """
    target_values = df_feature.to_numpy()
    return target_values
def predict(df_feature, beta):
    """Predict targets for raw (un-normalised) features.

    The features are z-normalised with ``normalize_z`` (so the statistics come
    from *df_feature* itself), turned into a design matrix with
    ``prepare_feature`` and multiplied by *beta* via ``predict_norm``.

    Parameters
    ----------
    df_feature : feature table, one row per sample
    beta : coefficient array, intercept first

    Returns
    -------
    The matrix product of the design matrix and *beta*.
    """
    design = prepare_feature(normalize_z(df_feature))
    return predict_norm(design, beta)
def predict_norm(X, beta):
    """Return the linear-model output ``X · beta``.

    Parameters
    ----------
    X : design matrix (already normalised and carrying the intercept column)
    beta : coefficient array

    Returns
    -------
    The matrix product computed by ``np.matmul``.
    """
    y_hat = np.matmul(X, beta)
    return y_hat
def split_data(df_feature, df_target, random_state=None, test_size=0.5):
    """Randomly split features and targets into shuffled train and test sets.

    Parameters
    ----------
    df_feature : feature table supporting ``len()`` and ``.iloc``
    df_target : target table with the same row order as *df_feature*
    random_state : seed passed to ``np.random.seed`` (``None`` reseeds
        unpredictably); note this reseeds NumPy's global generator
    test_size : fraction of rows assigned to the test set; the row count is
        ``int(test_size * len(df_feature))``

    Returns
    -------
    ``(df_feature_train, df_feature_test, df_target_train, df_target_test)``
    """
    np.random.seed(random_state)
    n_rows = len(df_feature)
    n_test = int(test_size * n_rows)

    test_pos = np.random.choice(n_rows, size=n_test, replace=False)
    # Work with row POSITIONS throughout: the rows are picked with .iloc, so
    # comparing against index labels would break on any non-default index.
    # setdiff1d yields the leftover positions in ascending order.
    remaining = np.setdiff1d(np.arange(n_rows), test_pos)
    # Shuffle the training rows as well.
    train_pos = np.random.choice(remaining, size=len(remaining), replace=False)

    return (
        df_feature.iloc[train_pos],
        df_feature.iloc[test_pos],
        df_target.iloc[train_pos],
        df_target.iloc[test_pos],
    )
def poly_features(df_feature, colname, colname_transformed, degree=2):
    """Add a power-transformed copy of one column to *df_feature*.

    Parameters
    ----------
    df_feature : table to extend; it is modified in place
    colname : label of the existing column to transform
    colname_transformed : label under which the new column is stored
    degree : exponent applied element-wise (default 2)

    Returns
    -------
    The same *df_feature* object, now carrying the extra column.
    """
    df_feature[colname_transformed] = np.power(df_feature[colname], degree)
    return df_feature
class LinearRegression:
    """Linear regression fitted with batch gradient descent.

    Attributes
    ----------
    beta : coefficient array from the last ``fit`` call (``None`` before that)
    J_storage : cost recorded after every gradient-descent iteration
        (``None`` before the first ``fit`` call)
    """

    def __init__(self):
        self.beta = None
        self.J_storage = None

    def fit(self, X, y, iterations, alpha):
        """
        Fit the linear model to the data using gradient descent.

        Parameters
        X : Training design matrix of shape (n, p), intercept column included.
        y : Target values, shape (n, 1); a flat (n,) array is accepted too.
        iterations : Number of gradient-descent steps.
        alpha : Learning rate.

        Returns
        beta : Fitted coefficients of shape (p, 1), also stored on ``self.beta``.
        """
        n, p = X.shape

        # A flat target would broadcast (n, 1) - (n,) into an (n, n) error
        # matrix and silently corrupt the update, so force a column vector.
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        self.J_storage = np.zeros(iterations)
        beta = np.random.randn(p, 1) / np.sqrt(n)  # Weight initialization

        for i in range(iterations):
            error = self.predict_norm(X, beta) - y
            beta = beta - (alpha / n) * np.matmul(X.T, error)
            # Cost is recorded AFTER the update of this iteration.
            self.J_storage[i] = self.compute_cost(X, y, beta)

        self.beta = beta
        return self.beta

    def predict(self, df_feature, normalisation=None):
        """
        Predict the target values.

        Parameters
        df_feature : Test data (raw features, without the intercept column).
        normalisation : Normalisation applied to the test data before
            prediction: 'standard' (``normalize_z``), 'min-max'
            (``normalize_minmax``); any other value, including the default
            None, applies no normalisation.

        Returns
        y_pred : Predicted target values.
        """
        # Normalise the raw features BEFORE adding the intercept column;
        # doing it afterwards would also rescale the constant column of ones
        # and destroy the intercept term.
        if normalisation == 'standard':
            df_feature = normalize_z(df_feature)
        elif normalisation == 'min-max':
            df_feature = normalize_minmax(df_feature)

        X = self.prepare_feature(df_feature)
        return self.predict_norm(X, self.beta)

    def predict_norm(self, X, beta):
        """Return the matrix product of design matrix *X* and *beta*."""
        return np.matmul(X, beta)

    def compute_cost(self, X, y, beta):
        """Return the halved mean squared error of *beta* on (X, y) as a scalar."""
        m = X.shape[0]
        error = np.matmul(X, beta) - y
        J = (1 / (2 * m)) * np.matmul(error.T, error)
        return J[0][0]

    def prepare_feature(self, df_feature):
        """Return the values of *df_feature* with a leading column of ones."""
        n = df_feature.shape[0]
        ones = np.ones((n, 1))
        return np.concatenate((ones, df_feature.to_numpy()), axis=1)
class Evaluate:
    """Regression metrics (R², adjusted R², MAE) for a target/prediction pair.

    Parameters
    ----------
    target : observed values, one row per sample
    prediction : predicted values with the same shape as *target*
    n_features : number of predictors used by the model, needed for a
        meaningful adjusted R². When omitted, the number of columns of
        *target* is used (1 for a one-dimensional target).
    """

    def __init__(self, target, prediction, n_features=None):
        self.target = target
        self.prediction = prediction
        self.n_features = n_features

    def r2_score(self):
        """Return the coefficient of determination, ``1 - RSS / TSS``."""
        rss = np.sum((self.prediction - self.target) ** 2)
        tss = np.sum((self.target - self.target.mean()) ** 2)
        return 1 - (rss / tss)

    def mean_absolute_error(self):
        """Return the summed absolute error divided by the number of rows."""
        n = self.target.shape[0]
        return np.sum(abs(self.target - self.prediction)) / n

    def adjusted_r2_score(self):
        """Return R² adjusted for the number of predictors ``k``.

        ``k`` is ``n_features`` when it was supplied to the constructor;
        otherwise it falls back to the number of target columns.
        """
        r2 = self.r2_score()
        n = self.target.shape[0]
        if self.n_features is not None:
            k = self.n_features
        else:
            shape = self.target.shape
            # A one-dimensional target has no second axis to read.
            k = shape[1] if len(shape) > 1 else 1
        return 1 - (1 - r2) * (n - 1) / (n - k - 1)

    def _report(self):
        """Build the text report shared by ``evaluate`` and ``__str__``."""
        r2 = self.r2_score()
        adjusted_r2 = self.adjusted_r2_score()
        mae = self.mean_absolute_error()
        return f"mae : {mae}\n\nr2 : {r2}\nadjusted r2 : {adjusted_r2}"

    def evaluate(self):
        """Print the MAE, R² and adjusted R²; returns ``None``."""
        print(self._report())

    def __str__(self) -> str:
        # __str__ must return a string; delegating to evaluate() returned
        # None and made str()/print() on an instance raise TypeError.
        return self._report()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the required packages
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import statsmodels.api as sm | |
# Load the dataset (its preprocessing was done by hand beforehand).
df = pd.read_csv("https://raw.githubusercontent.com/ExtremelySunnyYK/Covid-Modelling/task_1/data/2D%20Dataset.csv")
df.head()

# Columns used as model features; "Deaths per day" is the target.
columns = [
    'Days from start',
    'Confirmed Cases on date',
    'Active Cases on date',
    'Continent__Africa',
    'Continent__Asia',
    'Continent__Europe',
    'Continent__North America',
    'Continent__South America',
    'Country__Argentina',
    'Country__Bangladesh',
    'Country__Belgium',
    'Country__Brazil',
    'Country__Canada',
    'Country__Chile',
    'Country__Colombia',
    'Country__Czechia',
    'Country__Ecuador',
    'Country__France',
    'Country__Germany',
    'Country__Hungary',
    'Country__India',
    'Country__Indonesia',
    'Country__Iran',
    'Country__Italy',
    'Country__Malaysia',
    'Country__Mexico',
    'Country__Pakistan',
    'Country__Peru',
    'Country__Philippines',
    'Country__Poland',
    'Country__Romania',
    'Country__Russia',
    'Country__South Africa',
    'Country__Spain',
    'Country__Turkey',
    'Country__US',
    'Country__Ukraine',
    'Country__United Kingdom',
    'GDP Per Capita',
    'Cases per day',
]
df_features, df_target = get_features_targets(df, columns, ["Deaths per day"])

# Z-normalise the features over the whole dataset, then split 70/30.
df_features = normalize_z(df_features)
(
    df_features_train,
    df_features_test,
    df_target_train,
    df_target_test,
) = split_data(df_features, df_target, random_state=100, test_size=0.3)

# Convert to NumPy; the design matrix gets a leading ones column for the intercept.
X = prepare_feature(df_features_train)
target = prepare_target(df_target_train)

# Train with batch gradient descent.
model = LinearRegression()
iterations = 100
alpha = 0.01
model.fit(X, target, iterations, alpha)
print(f"Beta : {model.beta}")

# The test features are already normalised, so predict without further scaling.
pred = model.predict(df_features_test)

# Cost per iteration, to check convergence.
plt.plot(model.J_storage)

# evaluate() prints the metrics; it returns None, so that is what score holds.
score = Evaluate(df_target_test, pred).evaluate()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment