yongkangc · December 10, 2021 15:16
diff --git a/Model.py b/Model.py
 #importing the required packages
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 import statsmodels.api as sm

 def normalize_z(dfin):
    """
    Standardise every column with a z-score transformation.

    Parameters
        dfin : Data whose columns are scaled; its own ``mean`` and ``std``
               methods are evaluated down the rows (``axis=0``).

    Returns
        dfout : ``dfin`` with each column's mean subtracted and the result
                divided by that column's standard deviation.
    """
    column_mean = dfin.mean(axis=0)
    column_std = dfin.std(axis=0)
    centred = dfin - column_mean
    return centred / column_std

 def normalize_minmax(dfin):
    """
    Rescale every column with a min-max transformation.

    Parameters
        dfin : Data whose columns are scaled; minima and maxima are taken
               down the rows (``axis=0``).

    Returns
        dfout : ``dfin`` shifted by each column's minimum and divided by
                that column's range (maximum minus minimum).
    """
    column_min = dfin.min(axis=0)
    column_range = dfin.max(axis=0) - column_min
    shifted = dfin - column_min
    return shifted / column_range

 def get_features_targets(df, feature_names, target_names):
    """
    Split a frame into its feature columns and its target columns.

    Parameters
        df : Source data.
        feature_names : Column label(s) selected as features.
        target_names : Column label(s) selected as targets.

    Returns
        df_feature, df_target : The two ``df.loc[:, ...]`` selections.
    """
    return df.loc[:, feature_names], df.loc[:, target_names]

 def prepare_feature(df_feature):
    """
    Build the design matrix: a leading column of ones followed by the features.

    Parameters
        df_feature : Feature data exposing ``shape`` and ``to_numpy()``.

    Returns
        X : NumPy array whose first column is all ones (the intercept term)
            and whose remaining columns are the feature values.
    """
    feature_matrix = df_feature.to_numpy()
    intercept_column = np.ones((df_feature.shape[0], 1))
    return np.concatenate((intercept_column, feature_matrix), axis=1)

 def prepare_target(df_feature):
    """
    Convert the target data to a NumPy array.

    Parameters
        df_feature : Target data exposing ``to_numpy()`` (the parameter
                     name is historical; the usage script passes targets).

    Returns
        The result of ``df_feature.to_numpy()``.
    """
    target_array = df_feature.to_numpy()
    return target_array

 def predict(df_feature, beta):
    """
    Predict targets from raw features.

    The features are z-score normalised with ``normalize_z``, extended with
    an intercept column by ``prepare_feature`` and multiplied by ``beta``.

    Parameters
        df_feature : Un-normalised feature data.
        beta : Coefficient vector, intercept first.

    Returns
        The product of the prepared design matrix and ``beta``.
    """
    design_matrix = prepare_feature(normalize_z(df_feature))
    return predict_norm(design_matrix, beta)

 def predict_norm(X, beta):
    """
    Apply the linear model to an already prepared design matrix.

    Parameters
        X : Design matrix.
        beta : Coefficient vector.

    Returns
        The matrix product ``np.matmul(X, beta)``.
    """
    y_hat = np.matmul(X, beta)
    return y_hat

 def split_data(df_feature, df_target, random_state=None, test_size=0.5):
    """
    Randomly split features and targets into train and test subsets.

    Parameters
        df_feature : Feature data.
        df_target : Target data with the same row order as ``df_feature``.
        random_state : Seed handed to ``np.random.seed``; None reseeds from
                       fresh entropy.
        test_size : Fraction of the rows placed in the test subset
                    (``int(test_size * n_rows)`` rows are drawn).

    Returns
        df_feature_train, df_feature_test, df_target_train, df_target_test
    """
    # The global NumPy generator is seeded (as it always was) so existing
    # random_state values keep producing exactly the same split.
    np.random.seed(random_state)
    n_rows = len(df_feature)
    n_test = int(test_size * n_rows)
    test_positions = np.random.choice(n_rows, size=n_test, replace=False)

    # Stay in row *positions* throughout: rows are picked with .iloc below,
    # so index labels must never be mixed in. Labels and positions only
    # coincide when the index happens to be 0..n-1.
    is_train = np.ones(n_rows, dtype=bool)
    is_train[test_positions] = False
    remaining_positions = np.flatnonzero(is_train)

    # Drawing every remaining row without replacement shuffles the train set.
    train_positions = np.random.choice(
        remaining_positions, size=len(remaining_positions), replace=False
    )

    df_feature_train = df_feature.iloc[train_positions]
    df_target_train = df_target.iloc[train_positions]
    df_feature_test = df_feature.iloc[test_positions]
    df_target_test = df_target.iloc[test_positions]
    return df_feature_train, df_feature_test, df_target_train, df_target_test

 def poly_features(df_feature, colname, colname_transformed, degree=2):
    """
    Add a polynomial term of one column to the feature frame.

    Parameters
        df_feature : Feature frame; it is modified in place.
        colname : Existing column to raise to a power.
        colname_transformed : Name of the column that receives the result.
        degree : Exponent applied with ``np.power``. Default is 2.

    Returns
        The same ``df_feature`` object, now containing the new column.
    """
    df_feature[colname_transformed] = np.power(df_feature[colname], degree)
    return df_feature

 class LinearRegression:
    """
    Linear regression fitted with batch gradient descent.

    Attributes
        beta : Coefficient column vector of shape (p, 1); None until
               ``fit`` has run.
        J_storage : Cost recorded after each iteration; None until ``fit``
                    has run.
    """

    def __init__(self):
        self.beta = None
        self.J_storage = None

    def fit(self,X,y,iterations,alpha):
        """
        Fit Linear model with the datasets using gradient descent.

        Parameters
            X : Training data as a 2-D array of shape (n, p). No intercept
                column is added here, so pass the output of
                ``prepare_feature`` if an intercept is wanted.

            y : Target Values, of shape (n, 1) or (n,).

            iterations : Number of gradient descent steps.

            alpha : Learning rate.

        Returns
            beta : The fitted coefficients, also stored on ``self.beta``.
        """
        # A 1-D target would broadcast against the (n, 1) predictions into
        # an (n, n) error matrix, so force it into a column first.
        y = self._as_column(y)
        n,p = X.shape
        self.J_storage = np.zeros(iterations)
        beta = np.random.randn(p,1) / np.sqrt(n) #Weight Initialization
        for i in range(iterations):
            error = self.predict_norm(X,beta) - y
            beta = beta - (alpha/n) * np.matmul(X.T,error)
            # The cost is recorded after the update of this iteration.
            self.J_storage[i] = self.compute_cost(X,y,beta)
        self.beta = beta
        return self.beta

    def predict(self,df_feature,normalisation=None):
        """
        Predict the target values

        Parameters
            df_feature : Test data.
            normalisation : Normalisation method for the test data before
                            prediction: 'standard', 'min-max' or None.
                            Default is None. Any other value is treated
                            like None. The scaling statistics are computed
                            from ``df_feature`` itself.

        Returns
            y_pred : Predicted target values.

        """
        # Normalise BEFORE adding the intercept column: a column of ones has
        # zero spread, so scaling it yields 0/0 = NaN and every prediction
        # would become NaN.
        if normalisation == 'standard':
            df_feature = normalize_z(df_feature)
        elif normalisation == 'min-max':
            df_feature = normalize_minmax(df_feature)
        X = self.prepare_feature(df_feature)
        return self.predict_norm(X, self.beta)

    def predict_norm(self,X, beta):
        """Return the matrix product of the design matrix and ``beta``."""
        return np.matmul(X,beta)

    def compute_cost(self,X, y, beta):
        """
        Compute the halved mean squared error of ``beta`` on (X, y).

        Parameters
            X : Design matrix of shape (m, p).
            y : Target values, of shape (m, 1) or (m,).
            beta : Coefficient column vector of shape (p, 1).

        Returns
            J : The scalar cost ``sum(error**2) / (2*m)``.
        """
        y = self._as_column(y)
        m = X.shape[0]
        error = np.matmul(X,beta) - y
        J = (1/(2*m))*np.matmul(error.T,error)
        return J[0][0]

    def prepare_feature(self,df_feature):
        """Return the features as an array with a leading column of ones."""
        n = df_feature.shape[0]
        ones = np.ones(n).reshape(n,1)
        return np.concatenate((ones,df_feature.to_numpy()),axis = 1)

    @staticmethod
    def _as_column(y):
        """Return ``y`` as an array, reshaping a 1-D vector to (n, 1)."""
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        return y
    
 class Evaluate:
    """
    Regression metrics for a set of predictions.

    Parameters
        target : Observed target values.
        prediction : Predicted values, shaped like ``target``.
        n_features : Number of predictors used by the adjusted R² score.
                     Default is None, which keeps the earlier behaviour of
                     using the number of target columns (1 for a 1-D
                     target).
    """

    def __init__(self,target,prediction,n_features=None):
        self.target = target
        self.prediction = prediction
        self.n_features = n_features

    def r2_score(self):
        """Return the coefficient of determination, 1 - RSS/TSS."""
        rss = np.sum((self.prediction - self.target) ** 2)
        tss = np.sum((self.target-self.target.mean()) ** 2)
        r2 = 1 - (rss / tss)
        return r2

    def mean_absolute_error(self):
        """Return the summed absolute error divided by the row count."""
        n = self.target.shape[0]
        error = abs(self.target-self.prediction)
        mae = np.sum(error)/n
        return mae

    def adjusted_r2_score(self):
        """
        Return R² adjusted for the number of predictors k:
        1 - (1 - R²)(n - 1)/(n - k - 1).
        """
        r2 = self.r2_score()
        n = self.target.shape[0]
        if self.n_features is not None:
            k = self.n_features
        elif len(self.target.shape) > 1:
            k = self.target.shape[1]
        else:
            # A 1-D target has no second axis to read a column count from.
            k = 1
        adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
        return adj_r2

    def _report_lines(self):
        """Return the formatted metric lines shared by evaluate and __str__."""
        r2 = self.r2_score()
        adjusted_r2 = self.adjusted_r2_score()
        mae = self.mean_absolute_error()
        return [
            f"mae : {mae}\n",
            f"r2 : {r2}",
            f"adjusted r2 : {adjusted_r2}",
        ]

    def evaluate(self):
        """Print the MAE, R² and adjusted R²; returns None."""
        for line in self._report_lines():
            print(line)

    def __str__(self) -> str:
        # __str__ must return a string; evaluate() only prints and returns
        # None, which made str(obj) raise TypeError.
        return "\n".join(self._report_lines())
diff --git a/usage.py b/usage.py
 #importing the required packages
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 import statsmodels.api as sm

 # End-to-end example: load the dataset, normalise, split, fit by gradient
 # descent and report the metrics on the held-out rows.

 # The model helpers are defined in Model.py; without this import every call
 # below raises NameError when usage.py is run on its own.
 from Model import (
     Evaluate,
     LinearRegression,
     get_features_targets,
     normalize_z,
     prepare_feature,
     prepare_target,
     split_data,
 )

 # The data preprocessing for this is done manually
 df = pd.read_csv("https://raw.githubusercontent.com/ExtremelySunnyYK/Covid-Modelling/task_1/data/2D%20Dataset.csv")
 df.head()

 # Extract feature and target
 columns = ['Days from start', 'Confirmed Cases on date',
       'Active Cases on date', 'Continent__Africa', 'Continent__Asia',
       'Continent__Europe', 'Continent__North America',
       'Continent__South America', 'Country__Argentina', 'Country__Bangladesh',
       'Country__Belgium', 'Country__Brazil', 'Country__Canada',
       'Country__Chile', 'Country__Colombia', 'Country__Czechia',
       'Country__Ecuador', 'Country__France', 'Country__Germany',
       'Country__Hungary', 'Country__India', 'Country__Indonesia',
       'Country__Iran', 'Country__Italy', 'Country__Malaysia',
       'Country__Mexico', 'Country__Pakistan', 'Country__Peru',
       'Country__Philippines', 'Country__Poland', 'Country__Romania',
       'Country__Russia', 'Country__South Africa', 'Country__Spain',
       'Country__Turkey', 'Country__US', 'Country__Ukraine',
       'Country__United Kingdom', 'GDP Per Capita', 'Cases per day']

 df_features, df_target = get_features_targets(df,columns,["Deaths per day"])

 # normalize features
 # (done once on the full frame, so predict() below is called without a
 # normalisation argument)
 df_features = normalize_z(df_features)

 df_features_train, df_features_test, df_target_train, df_target_test = split_data(df_features, df_target, random_state=100, test_size=0.3)

 # change to numpy array and append column for feature
 X = prepare_feature(df_features_train) # concatenating for the y intercept
 target = prepare_target(df_target_train)

 model = LinearRegression()
 iterations = 100
 alpha = 0.01
 model.fit(X,target,iterations,alpha)
 print(f"Beta : {model.beta}")
 pred = model.predict(df_features_test)
 # Cost per iteration, to inspect convergence.
 plt.plot(model.J_storage)
 # evaluate() prints the metrics and returns None, so score is None.
 score = Evaluate(df_target_test, pred).evaluate()
	#importing the required packages
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import statsmodels.api as sm

	def normalize_z(dfin):
	    """
	    Standardise every column with a z-score transformation.

	    Parameters
	        dfin : Data whose columns are scaled; its own ``mean`` and ``std``
	               methods are evaluated down the rows (``axis=0``).

	    Returns
	        dfout : ``dfin`` with each column's mean subtracted and the result
	                divided by that column's standard deviation.
	    """
	    dfout = (dfin - dfin.mean(axis=0))/dfin.std(axis=0)
	    return dfout

	def normalize_minmax(dfin):
	    """
	    Rescale every column with a min-max transformation.

	    Parameters
	        dfin : Data whose columns are scaled; minima and maxima are taken
	               down the rows (``axis=0``).

	    Returns
	        dfout : ``dfin`` shifted by each column's minimum and divided by
	                that column's range (maximum minus minimum).
	    """
	    dfout = (dfin - dfin.min(axis=0))/(dfin.max(axis=0)-dfin.min(axis=0))
	    return dfout

	def get_features_targets(df, feature_names, target_names):
	    """
	    Split a frame into its feature columns and its target columns.

	    Parameters
	        df : Source data.
	        feature_names : Column label(s) selected as features.
	        target_names : Column label(s) selected as targets.

	    Returns
	        df_feature, df_target : The two ``df.loc[:, ...]`` selections.
	    """
	    df_feature = df.loc[:,feature_names]
	    df_target = df.loc[:,target_names]
	    return df_feature, df_target

	def prepare_feature(df_feature):
	    """
	    Build the design matrix: a leading column of ones followed by the features.

	    Parameters
	        df_feature : Feature data exposing ``shape`` and ``to_numpy()``.

	    Returns
	        X : NumPy array whose first column is all ones (the intercept term)
	            and whose remaining columns are the feature values.
	    """
	    n = df_feature.shape[0]
	    ones = np.ones(n).reshape(n,1)
	    return np.concatenate((ones,df_feature.to_numpy()),axis = 1)

	def prepare_target(df_feature):
	    """
	    Convert the target data to a NumPy array.

	    Parameters
	        df_feature : Target data exposing ``to_numpy()``.

	    Returns
	        The result of ``df_feature.to_numpy()``.
	    """
	    return df_feature.to_numpy()

	def predict(df_feature, beta):
	    """
	    Predict targets from raw features.

	    The features are z-score normalised with ``normalize_z``, extended with
	    an intercept column by ``prepare_feature`` and multiplied by ``beta``.

	    Parameters
	        df_feature : Un-normalised feature data.
	        beta : Coefficient vector, intercept first.

	    Returns
	        The product of the prepared design matrix and ``beta``.
	    """
	    df_feature = normalize_z(df_feature)
	    X = prepare_feature(df_feature)
	    return predict_norm(X, beta)

	def predict_norm(X, beta):
	    """
	    Apply the linear model to an already prepared design matrix.

	    Parameters
	        X : Design matrix.
	        beta : Coefficient vector.

	    Returns
	        The matrix product ``np.matmul(X, beta)``.
	    """
	    return np.matmul(X,beta)

	def split_data(df_feature, df_target, random_state=None, test_size=0.5):
	    """
	    Randomly split features and targets into train and test subsets.

	    Parameters
	        df_feature : Feature data.
	        df_target : Target data with the same row order as ``df_feature``.
	        random_state : Seed handed to ``np.random.seed``; None reseeds from
	                       fresh entropy.
	        test_size : Fraction of the rows placed in the test subset
	                    (``int(test_size * n_rows)`` rows are drawn).

	    Returns
	        df_feature_train, df_feature_test, df_target_train, df_target_test
	    """
	    # The global NumPy generator is seeded so a given random_state always
	    # produces the same split.
	    np.random.seed(random_state)
	    n_rows = len(df_feature)
	    n_test = int(test_size * n_rows)
	    test_positions = np.random.choice(n_rows, size=n_test, replace=False)

	    # Stay in row *positions* throughout: rows are picked with .iloc below,
	    # so index labels must never be mixed in. Labels and positions only
	    # coincide when the index happens to be 0..n-1.
	    is_train = np.ones(n_rows, dtype=bool)
	    is_train[test_positions] = False
	    remaining_positions = np.flatnonzero(is_train)

	    # Drawing every remaining row without replacement shuffles the train set.
	    train_positions = np.random.choice(
	        remaining_positions, size=len(remaining_positions), replace=False
	    )

	    df_feature_train = df_feature.iloc[train_positions]
	    df_target_train = df_target.iloc[train_positions]
	    df_feature_test = df_feature.iloc[test_positions]
	    df_target_test = df_target.iloc[test_positions]
	    return df_feature_train, df_feature_test, df_target_train, df_target_test

	def poly_features(df_feature, colname, colname_transformed, degree=2):
	    """
	    Add a polynomial term of one column to the feature frame.

	    Parameters
	        df_feature : Feature frame; it is modified in place.
	        colname : Existing column to raise to a power.
	        colname_transformed : Name of the column that receives the result.
	        degree : Exponent applied with ``np.power``. Default is 2.

	    Returns
	        The same ``df_feature`` object, now containing the new column.
	    """
	    col = df_feature[colname]
	    df_feature[colname_transformed] = np.power(col, degree)
	    return df_feature

	class LinearRegression:
	    """
	    Linear regression fitted with batch gradient descent.

	    Attributes
	        beta : Coefficient column vector of shape (p, 1); None until
	               ``fit`` has run.
	        J_storage : Cost recorded after each iteration; None until ``fit``
	                    has run.
	    """

	    def __init__(self):
	        self.beta = None
	        self.J_storage = None

	    def fit(self,X,y,iterations,alpha):
	        """
	        Fit Linear model with the datasets using gradient descent.

	        Parameters
	            X : Training data as a 2-D array of shape (n, p). No intercept
	                column is added here, so pass the output of
	                ``prepare_feature`` if an intercept is wanted.

	            y : Target Values, of shape (n, 1) or (n,).

	            iterations : Number of gradient descent steps.

	            alpha : Learning rate.

	        Returns
	            beta : The fitted coefficients, also stored on ``self.beta``.
	        """
	        # A 1-D target would broadcast against the (n, 1) predictions into
	        # an (n, n) error matrix, so force it into a column first.
	        y = self._as_column(y)
	        n,p = X.shape
	        self.J_storage = np.zeros(iterations)
	        beta = np.random.randn(p,1) / np.sqrt(n) #Weight Initialization
	        for i in range(iterations):
	            error = self.predict_norm(X,beta) - y
	            beta = beta - (alpha/n) * np.matmul(X.T,error)
	            # The cost is recorded after the update of this iteration.
	            self.J_storage[i] = self.compute_cost(X,y,beta)
	        self.beta = beta
	        return self.beta

	    def predict(self,df_feature,normalisation=None):
	        """
	        Predict the target values

	        Parameters
	            df_feature : Test data.
	            normalisation : Normalisation method for the test data before
	                            prediction: 'standard', 'min-max' or None.
	                            Default is None. Any other value is treated
	                            like None. The scaling statistics are computed
	                            from ``df_feature`` itself.

	        Returns
	            y_pred : Predicted target values.

	        """
	        # Normalise BEFORE adding the intercept column: a column of ones has
	        # zero spread, so scaling it yields 0/0 = NaN and every prediction
	        # would become NaN.
	        if normalisation == 'standard':
	            df_feature = normalize_z(df_feature)
	        elif normalisation == 'min-max':
	            df_feature = normalize_minmax(df_feature)
	        X = self.prepare_feature(df_feature)
	        return self.predict_norm(X, self.beta)

	    def predict_norm(self,X, beta):
	        """Return the matrix product of the design matrix and ``beta``."""
	        return np.matmul(X,beta)

	    def compute_cost(self,X, y, beta):
	        """
	        Compute the halved mean squared error of ``beta`` on (X, y).

	        Parameters
	            X : Design matrix of shape (m, p).
	            y : Target values, of shape (m, 1) or (m,).
	            beta : Coefficient column vector of shape (p, 1).

	        Returns
	            J : The scalar cost ``sum(error**2) / (2*m)``.
	        """
	        y = self._as_column(y)
	        m = X.shape[0]
	        error = np.matmul(X,beta) - y
	        # The multiplication signs were lost in the pasted text.
	        J = (1/(2*m))*np.matmul(error.T,error)
	        return J[0][0]

	    def prepare_feature(self,df_feature):
	        """Return the features as an array with a leading column of ones."""
	        n = df_feature.shape[0]
	        ones = np.ones(n).reshape(n,1)
	        return np.concatenate((ones,df_feature.to_numpy()),axis = 1)

	    @staticmethod
	    def _as_column(y):
	        """Return ``y`` as an array, reshaping a 1-D vector to (n, 1)."""
	        y = np.asarray(y)
	        if y.ndim == 1:
	            y = y.reshape(-1, 1)
	        return y

	class Evaluate:
	    """
	    Regression metrics for a set of predictions.

	    Parameters
	        target : Observed target values.
	        prediction : Predicted values, shaped like ``target``.
	        n_features : Number of predictors used by the adjusted R² score.
	                     Default is None, in which case the number of target
	                     columns is used (1 for a 1-D target).
	    """

	    def __init__(self,target,prediction,n_features=None):
	        self.target = target
	        self.prediction = prediction
	        self.n_features = n_features

	    def r2_score(self):
	        """Return the coefficient of determination, 1 - RSS/TSS."""
	        rss = np.sum((self.prediction - self.target) ** 2)
	        tss = np.sum((self.target-self.target.mean()) ** 2)
	        r2 = 1 - (rss / tss)
	        return r2

	    def mean_absolute_error(self):
	        """Return the summed absolute error divided by the row count."""
	        n = self.target.shape[0]
	        error = abs(self.target-self.prediction)
	        mae = np.sum(error)/n
	        return mae

	    def adjusted_r2_score(self):
	        """
	        Return R² adjusted for the number of predictors k:
	        1 - (1 - R²)(n - 1)/(n - k - 1).
	        """
	        r2 = self.r2_score()
	        n = self.target.shape[0]
	        if self.n_features is not None:
	            k = self.n_features
	        elif len(self.target.shape) > 1:
	            k = self.target.shape[1]
	        else:
	            # A 1-D target has no second axis to read a column count from.
	            k = 1
	        adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
	        return adj_r2

	    def _report_lines(self):
	        """Return the formatted metric lines shared by evaluate and __str__."""
	        r2 = self.r2_score()
	        adjusted_r2 = self.adjusted_r2_score()
	        mae = self.mean_absolute_error()
	        return [
	            f"mae : {mae}\n",
	            f"r2 : {r2}",
	            f"adjusted r2 : {adjusted_r2}",
	        ]

	    def evaluate(self):
	        """Print the MAE, R² and adjusted R²; returns None."""
	        for line in self._report_lines():
	            print(line)

	    def __str__(self) -> str:
	        # __str__ must return a string; evaluate() only prints and returns
	        # None, which would make str(obj) raise TypeError.
	        return "\n".join(self._report_lines())