Skip to content

Instantly share code, notes, and snippets.

@pb111
Created November 16, 2018 07:52
Show Gist options
  • Save pb111/1a9c3759804ba14e01b65063d3ef3edd to your computer and use it in GitHub Desktop.
Save pb111/1a9c3759804ba14e01b65063d3ef3edd to your computer and use it in GitHub Desktop.
SLR
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
"# Select matplotlib's 'inline' backend so figures are rendered directly below the cell that draws them.\n",
"# The magic is written without a space after '%': the spaced form is not accepted by newer IPython releases."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Import necessary libraries\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Import the data\n",
"\n",
"# The dataset location can be overridden with the SALES_DATA_PATH environment variable;\n",
"# the original hard-coded path stays as the default so existing setups keep working.\n",
"import os\n",
"\n",
"url = os.environ.get(\"SALES_DATA_PATH\", \"C:/project_datasets/SALES.txt\")\n",
"\n",
"# The file is tab-separated and has no header row, so pandas labels the columns 0 and 1.\n",
"df = pd.read_csv(url, sep='\\t', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(36, 2)\n"
]
}
],
"source": [
"# Exploratory data analysis\n",
"\n",
"# Report the dimensions of df as (rows, columns)\n",
"\n",
"n_rows, n_cols = df.shape\n",
"print((n_rows, n_cols))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0 1\n",
"0 12.0 15.0\n",
"1 20.5 16.0\n",
"2 21.0 18.0\n",
"3 15.5 27.0\n",
"4 15.3 21.0\n"
]
}
],
"source": [
"# Preview the first five rows of df\n",
"\n",
"print(df.head(n=5))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Rename columns of df dataframe\n",
"\n",
"# Replace the auto-generated labels 0 and 1 with descriptive names\n",
"column_names = ['Sales', 'Advertising']\n",
"df.columns = column_names"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Sales Advertising\n",
"0 12.0 15.0\n",
"1 20.5 16.0\n",
"2 21.0 18.0\n",
"3 15.5 27.0\n",
"4 15.3 21.0\n"
]
}
],
"source": [
"# Preview the first five rows again, now that the columns carry descriptive names\n",
"\n",
"print(df.head(n=5))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 36 entries, 0 to 35\n",
"Data columns (total 2 columns):\n",
"Sales 36 non-null float64\n",
"Advertising 36 non-null float64\n",
"dtypes: float64(2)\n",
"memory usage: 656.0 bytes\n",
"None\n"
]
}
],
"source": [
"# View dataframe summary\n",
"\n",
"# info() writes its report to stdout itself and returns None, so it is called without print()\n",
"# to avoid a stray \"None\" line at the end of the output.\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Sales Advertising\n",
"count 36.000000 36.000000\n",
"mean 24.255556 28.527778\n",
"std 6.185118 18.777625\n",
"min 12.000000 1.000000\n",
"25% 20.300000 15.750000\n",
"50% 24.250000 23.000000\n",
"75% 28.600000 41.000000\n",
"max 36.500000 65.000000\n"
]
}
],
"source": [
"# View descriptive statistics\n",
"\n",
"# count, mean, std, min, quartiles and max for each numeric column\n",
"summary_stats = df.describe()\n",
"print(summary_stats)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Declare feature variable and target array\n",
"\n",
"# X holds the Sales values (feature) and y holds the Advertising values (target).\n",
"# The .values attribute of a pandas column returns the underlying NumPy array.\n",
"\n",
"X, y = df['Sales'].values, df['Advertising'].values"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(36,)\n",
"(36,)\n"
]
}
],
"source": [
"# Print the dimensions of X and y (one shape per line)\n",
"\n",
"print(X.shape, y.shape, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Reshape X and y\n",
"\n",
"# We are working with a single feature variable, so the 1-D arrays are reshaped into one-column 2-D arrays.\n",
"# The first dimension is given as -1, meaning \"unspecified\":\n",
"# NumPy infers it from the length of the array and the remaining dimension.\n",
"\n",
"X = np.reshape(X, (-1, 1))\n",
"y = np.reshape(y, (-1, 1))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(36, 1)\n",
"(36, 1)\n"
]
}
],
"source": [
"# Print the dimensions of X and y after reshaping\n",
"\n",
"# Compare with the shapes printed before reshaping: each array now has an explicit second dimension.\n",
"# Getting the feature and target arrays into the right format for scikit-learn\n",
"# is an important precursor to model building.\n",
"\n",
"print(X.shape, y.shape, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Visualizing the relationship between X and y by scatterplot\n",
"\n",
"# Draw Sales (X) against Advertising (y) on a single, explicitly created Axes\n",
"\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(X, y, color='blue', label='Scatter Plot')\n",
"ax.set_title('Relationship between Sales and Advertising')\n",
"ax.set_xlabel('Sales')\n",
"ax.set_ylabel('Advertising')\n",
"ax.legend(loc=4)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Split X and y into training and test data sets\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, random_state=42, test_size=0.33\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(24, 1)\n",
"(24, 1)\n",
"(12, 1)\n",
"(12, 1)\n"
]
}
],
"source": [
"# Print the dimensions of X_train, y_train, X_test and y_test (one shape per line, in that order)\n",
"\n",
"print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Fit the linear model\n",
"\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"# Instantiate the linear regression object lm and train it on the training data sets in one step\n",
"# (fit() returns the estimator itself).\n",
"lm = LinearRegression().fit(X_train, y_train)\n",
"\n",
"# Predict on the test data\n",
"y_pred = lm.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE value: 11.2273\n"
]
}
],
"source": [
"# Calculate and print Root Mean Square Error (RMSE)\n",
"\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"# RMSE is the square root of the mean squared error between actual and predicted test values\n",
"mse = mean_squared_error(y_true=y_test, y_pred=y_pred)\n",
"rmse = np.sqrt(mse)\n",
"print(\"RMSE value: {:.4f}\".format(rmse))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R2 Score value: 0.5789\n"
]
}
],
"source": [
"# Calculate and print r2_score\n",
"\n",
"from sklearn.metrics import r2_score\n",
"\n",
"# Coefficient of determination of the test-set predictions\n",
"r2 = r2_score(y_test, y_pred)\n",
"print(\"R2 Score value: {:.4f}\".format(r2))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Estimated model slope, a: [[1.60509347]]\n",
"Estimated model intercept, b: (array([-11.16003616]),)\n"
]
}
],
"source": [
"# Compute model slope and intercept\n",
"\n",
"# coef_ and intercept_ are arrays here because y was reshaped to a single column before fitting.\n",
"a = lm.coef_\n",
"b = lm.intercept_  # no trailing comma: it would wrap the intercept in a one-element tuple\n",
"print(\"Estimated model slope, a:\" , a)\n",
"print(\"Estimated model intercept, b:\" , b)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# So, our fitted regression line is\n",
"\n",
"# y = 1.60509347 * x - 11.16003616\n",
"\n",
"# where x is Sales and y is Advertising. That is our linear model."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 8.10108551],\n",
" [21.74438002],\n",
" [22.54692675],\n",
" [13.71891266],\n",
" [13.39789396]])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Predicting Advertising values\n",
"\n",
"# Predict Advertising for every Sales value, then display the first five predictions.\n",
"\n",
"all_predictions = lm.predict(X)\n",
"all_predictions[:5]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[27.36220717]]\n"
]
}
],
"source": [
"# To make an individual prediction using the linear regression model.\n",
"\n",
"# predict() expects a 2-D array of shape (n_samples, n_features), so the single Sales value 24\n",
"# is passed as one row with one column; a bare scalar is rejected by current scikit-learn.\n",
"print(lm.predict([[24]]))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plot the Regression Line\n",
"\n",
"# Overlay the model's test-set predictions (black line) on the scatter of all observations\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(X, y, color='blue', label='Scatter Plot')\n",
"ax.plot(X_test, y_pred, color='black', linewidth=3, label='Regression Line')\n",
"ax.set_title('Relationship between Sales and Advertising')\n",
"ax.set_xlabel('Sales')\n",
"ax.set_ylabel('Advertising')\n",
"ax.legend(loc=4)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plotting residual errors\n",
"\n",
"# Predict once per split and reuse the result for both axes.\n",
"train_pred = lm.predict(X_train)\n",
"test_pred = lm.predict(X_test)\n",
"\n",
"# Residuals are plotted as predicted minus actual, against the predicted value.\n",
"plt.scatter(train_pred, train_pred - y_train, color = 'red', label = 'Train data')\n",
"plt.scatter(test_pred, test_pred - y_test, color = 'blue', label = 'Test data')\n",
"\n",
"# Zero-error reference line spanning the full axes width instead of a hard-coded 0-50 range.\n",
"plt.axhline(y = 0, color = 'black', linewidth = 3)\n",
"plt.title('Residual errors')\n",
"plt.xlabel('Predicted Advertising')\n",
"plt.ylabel('Residual (predicted - actual)')\n",
"plt.legend(loc = 4)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training set score: 0.2861\n",
"Test set score: 0.5789\n"
]
}
],
"source": [
"# Checking for Overfitting or Underfitting the data\n",
"\n",
"# Compare the model's score on the data it was trained on with its score on held-out data.\n",
"train_score = lm.score(X_train, y_train)\n",
"test_score = lm.score(X_test, y_test)\n",
"\n",
"print(\"Training set score: {:.4f}\".format(train_score))\n",
"\n",
"print(\"Test set score: {:.4f}\".format(test_score))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['lm_regressor.pkl']"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Save model for future use\n",
"\n",
"# scikit-learn no longer ships joblib under sklearn.externals, so import the standalone package first\n",
"# and fall back to the bundled copy only on old installations.\n",
"try:\n",
"    import joblib\n",
"except ImportError:\n",
"    from sklearn.externals import joblib\n",
"\n",
"joblib.dump(lm, 'lm_regressor.pkl')\n",
"\n",
"# To load the model (only load files you trust: unpickling can execute arbitrary code)\n",
"\n",
"# lm2=joblib.load('lm_regressor.pkl')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment