Skip to content

Instantly share code, notes, and snippets.

@Ogaday
Created September 4, 2025 16:27
Show Gist options
  • Save Ogaday/5ec1145fe80aebcf1c938d19f4381e82 to your computer and use it in GitHub Desktop.
Save Ogaday/5ec1145fe80aebcf1c938d19f4381e82 to your computer and use it in GitHub Desktop.
Example models
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b53e9afe",
"metadata": {},
"outputs": [],
"source": [
"from typing import Optional\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"from model import make_train_test_data, LinearModel, NeighboursModel, train_model"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8199fb44",
"metadata": {},
"outputs": [],
"source": [
"SEED = 0\n",
"BIAS = 1.0\n",
"NOISE = 10.0\n",
"K_NEIGHBOURS = 5"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "532e8c7e",
"metadata": {},
"outputs": [],
"source": [
"def fit_lm(X: np.ndarray, y: np.ndarray) -> LinearModel:\n",
" intercept = y.mean()\n",
" gradient = (y.max() - y.min()) / (X.max() - X.min())\n",
" model = LinearModel(intercept=intercept, gradient=gradient).fit(X, y)\n",
" return model\n",
"\n",
"\n",
"def fit_knn(X: np.ndarray, y: np.ndarray, k_neighbours: int = 1) -> NeighboursModel:\n",
" model = NeighboursModel(k_neighbours=k_neighbours).fit(X, y)\n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1cb96f69",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = make_train_test_data(seed=SEED, bias=BIAS, noise=NOISE)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "edb6bf95",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def plot_model(\n",
" model,\n",
" X: np.ndarray,\n",
" y: np.ndarray,\n",
" *,\n",
" name: Optional[str] = None,\n",
" x_test: Optional[np.ndarray] = None,\n",
" y_test: Optional[np.ndarray] = None,\n",
" ax=None\n",
"):\n",
" if ax is None:\n",
" fig, ax = plt.subplots()\n",
" train_plot = ax.scatter(X, y)\n",
" xx = np.linspace(X.min(), X.max()).reshape(-1, 1)\n",
" model_line, *_ = ax.plot(xx, model.predict(xx))\n",
" if name:\n",
" ax.set_title(name)\n",
" if x_test is not None and y_test is not None:\n",
" test_plot = ax.scatter(x_test, y_test)\n",
" score = model.score(x_test, y_test)\n",
" ax.legend([train_plot, test_plot, model_line], [\"Train set\", \"Test set\", f\"Predictions (score: {score:.3f})\"])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "aca8ced3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"lm = fit_lm(X_train, y_train)\n",
"knn = fit_knn(X_train, y_train, k_neighbours=K_NEIGHBOURS)\n",
"\n",
"fig, axs = plt.subplots(1, 2, figsize=(18, 6), sharex=True, sharey=True)\n",
"plot_model(lm, X_train, y_train, name=\"Linear Model\", x_test=X_test, y_test=y_test, ax=axs[0])\n",
"plot_model(knn, X_train, y_train, name=\"Neighbours Model (k=5)\", x_test=X_test, y_test=y_test, ax=axs[1])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "76c2a6ba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9308834744932162"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "dca4a561",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9146345869852457"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"knn.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c3003190",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9308834744932162"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_model(model_type=\"linear_regression\", lr_intercept=lm.intercept, lr_gradient=lm.gradient, bias=BIAS, seed=SEED)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "013322ff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9146345869852457"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_model(model_type=\"nearest_neighbour\", k_neighbours=K_NEIGHBOURS, bias=BIAS, seed=SEED)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
from typing import Literal, Optional
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
def make_train_test_data(
    n_samples: int = 100, bias: float = 0.0, noise: float = 0.0, seed: Optional[int] = None
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build a single-feature regression dataset and split it into train and test parts.

    A single ``RandomState`` created from ``seed`` is handed to both the data
    generator and the splitter, so a fixed seed pins down the generated samples
    as well as the way they are divided.

    Parameters
    ----------
    n_samples
        Total number of rows generated before splitting (train and test together).
    bias
        Offset of the generated data; forwarded as ``bias`` to ``make_regression``.
    noise
        Random error of the generated data; forwarded as ``noise`` to ``make_regression``.
    seed
        Seed for the random state, used to control reproducibility.

    Returns
    -------
    X_train, X_test, y_train, y_test: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
        The split features and targets, in that order.
    """
    # One shared generator: it is consumed first by make_regression, then by the split.
    rng = np.random.RandomState(seed)
    features, targets = make_regression(
        n_samples=n_samples,
        n_features=1,
        n_informative=1,
        bias=bias,
        noise=noise,
        random_state=rng,
    )
    train_features, test_features, train_targets, test_targets = train_test_split(
        features, targets, random_state=rng
    )
    return train_features, test_features, train_targets, test_targets
class LinearModel(RegressorMixin, BaseEstimator):
    """Simple 1D linear regression model with fixed intercept and gradient.

    Nothing is learned from data: predictions are always
    ``X * gradient + intercept`` using the values given at construction.

    Attributes
    ----------
    intercept
        The intercept, or offset of the model
    gradient
        The gradient, or slope of the model
    """

    def __init__(self, intercept: float = 0, gradient: float = 0):
        """Create a new LinearModel instance.

        Parameters
        ----------
        intercept
            The intercept, or offset of the model
        gradient
            The gradient, or slope of the model
        """
        self.intercept = intercept
        self.gradient = gradient

    def fit(self, X, y) -> "LinearModel":
        """Fit the linear model.

        This is a no-op, as intercept and gradient are supplied on initialisation.
        ``X`` and ``y`` are accepted only so the method has the usual estimator
        signature; neither is read.

        Parameters
        ----------
        X
            1D training features, with shape (n_samples, 1).
        y
            1D training targets, with shape (n_samples,).

        Returns
        -------
        LinearModel
            Linear model instance.
        """
        # Trailing-underscore attribute marks the estimator as fitted.
        self.is_fitted_ = True
        return self

    def predict(self, X) -> np.ndarray:
        """Produce predictions.

        Parameters
        ----------
        X
            1D features, with shape (n_samples, 1). Any array-like is accepted.

        Returns
        -------
        np.ndarray
            Predicted targets, flattened to one dimension.
        """
        # Coerce first: on a plain Python list, `X * gradient` would repeat the
        # list (int gradient) or raise TypeError (float gradient) instead of
        # doing element-wise arithmetic.
        features = np.asarray(X)
        return np.asarray(features * self.gradient + self.intercept).ravel()
class NeighboursModel(RegressorMixin, BaseEstimator):
    """Simple k-nearest neighbours model.

    ``fit`` stores the training data; ``predict`` averages the targets of the
    ``k_neighbours`` training rows closest to each query row.

    Attributes
    ----------
    k_neighbours
        The number of neighbours.
    """

    def __init__(self, k_neighbours: int = 1):
        """Create a new NeighboursModel instance.

        Parameters
        ----------
        k_neighbours
            The number of neighbours.
        """
        self.k_neighbours = k_neighbours

    def fit(self, X, y) -> "NeighboursModel":
        """Fit the neighbours model.

        Parameters
        ----------
        X
            1D training features, with shape (n_samples, 1).
        y
            1D training targets, with shape (n_samples,).

        Returns
        -------
        NeighboursModel
            k-nn model instance.

        Raises
        ------
        ValueError
            If ``k_neighbours`` is smaller than 1.
        """
        # A zero k would average an empty selection (NaN) and a negative k would
        # silently drop the farthest neighbours instead of keeping the nearest.
        if self.k_neighbours < 1:
            raise ValueError(f"k_neighbours must be at least 1, got {self.k_neighbours}")
        # Stored as arrays so that predict() can index the targets with an
        # integer index array even when lists or other array-likes were passed.
        self._X = np.asarray(X)
        self._y = np.asarray(y)
        # Trailing-underscore flag, consistent with LinearModel.
        self.is_fitted_ = True
        # Original attribute name, retained for any caller that reads it.
        self.is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Produce predictions.

        Parameters
        ----------
        X
            1D features, with shape (n_samples, 1).

        Returns
        -------
        np.ndarray
            Predicted targets: for each row of ``X``, the mean target of its
            ``k_neighbours`` closest training rows. If ``k_neighbours`` exceeds
            the number of training rows, all training rows are averaged.
        """
        # Pairwise distances: one row per query point, one column per training point.
        distances = cdist(X, self._X)
        # Column indices of the k closest training points for every query point.
        nearest = distances.argsort(axis=1)[:, : self.k_neighbours]
        return self._y[nearest].mean(axis=1)
def train_model(
    model_type: Literal["linear_regression", "nearest_neighbour"],
    lr_intercept: float = 0.0,
    lr_gradient: float = 0.0,
    k_neighbours: int = 1,
    n_samples: int = 100,
    bias: float = 10,
    noise: float = 10,
    seed: int = 42,
) -> float:
    """Train a model with the supplied parameters.

    Generates a dataset with the bias, noise and seed parameters, and fits and scores the model.

    Parameters
    ----------
    model_type
        Which model to build: ``"linear_regression"`` for a ``LinearModel`` or
        ``"nearest_neighbour"`` for a ``NeighboursModel``.
    lr_intercept
        Intercept of the linear model. Only used for ``"linear_regression"``.
    lr_gradient
        Gradient of the linear model. Only used for ``"linear_regression"``.
    k_neighbours
        Number of neighbours. Only used for ``"nearest_neighbour"``.
    n_samples
        The number of total rows in the combined dataset (train & test).
    bias
        The offset used to generate the dataset.
    noise
        The random error used to generate the dataset.
    seed
        The random seed used to generate the dataset.

    Returns
    -------
    score: float
        The R2 score of the model on the test set.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the recognised names.
    """
    # Build the model first so an unknown model_type fails before any data is generated.
    if model_type == "linear_regression":
        model = LinearModel(intercept=lr_intercept, gradient=lr_gradient)
    elif model_type == "nearest_neighbour":
        model = NeighboursModel(k_neighbours=k_neighbours)
    else:
        raise ValueError(f"Unrecognised model_type: '{model_type}'")

    X_train, X_test, y_train, y_test = make_train_test_data(
        n_samples=n_samples, bias=bias, noise=noise, seed=seed
    )
    return model.fit(X_train, y_train).score(X_test, y_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment