Skip to content

Instantly share code, notes, and snippets.

@piyush01123
Last active July 8, 2025 04:52
Show Gist options
  • Save piyush01123/5229e673b4be04f58281d15905df228d to your computer and use it in GitHub Desktop.
Naive Bayes classification of MNIST images http://web.iitd.ac.in/~bspanda/BY.pdf
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Load the 8x8 scikit-learn digits dataset and hold out 20% for evaluation.
digits = datasets.load_digits()
trainX, testX, trainY, testY = train_test_split(
    digits.images, digits.target, test_size=0.2, random_state=42
)
# Alternative: full 28x28 MNIST via OpenML (much slower to download).
# mnist = datasets.fetch_openml('mnist_784', version=1, as_frame=False)
# X, y = mnist['data'], mnist['target'].astype(int)
# trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-class Gaussian parameters: pixel-wise mean and std for each digit 0-9.
conditional_stats = []
for digit in range(10):
    digit_data = trainX[trainY == digit]
    mean = digit_data.mean(axis=0)
    std = digit_data.std(axis=0)
    # Constant pixels have zero variance; clamp so the Gaussian log-density
    # is defined. More robust fix than adding 1e-6 to every std.
    std[std == 0] = 1e-3
    conditional_stats.append({"mean": mean, "std": std})

# Class priors P(digit) estimated from training-label frequencies.
prior_probs = [(trainY == digit).sum() / len(trainY) for digit in range(10)]
log_prior = np.log(prior_probs)
# Predict function using the Gaussian log-likelihood per class.
def predict(X, stats=None, log_priors=None):
    """Classify each sample in X with Gaussian Naive Bayes.

    Parameters
    ----------
    X : iterable of arrays
        Samples to classify; each element must broadcast against the
        per-class mean/std arrays.
    stats : list of dict, optional
        Ten entries, one per digit, each with "mean" and "std" arrays.
        Defaults to the module-level ``conditional_stats``.
    log_priors : array-like, optional
        Log prior probability per digit. Defaults to the module-level
        ``log_prior``.

    Returns
    -------
    np.ndarray of int
        Predicted digit (argmax of log posterior) for each sample.
    """
    if stats is None:
        stats = conditional_stats
    if log_priors is None:
        log_priors = log_prior
    predictions = []
    for x in X:
        log_probs = []
        for digit in range(10):
            mean = stats[digit]["mean"]
            std = stats[digit]["std"]
            # log N(x | mean, std^2), summed over pixels (naive
            # conditional-independence assumption).
            log_likelihood = -0.5 * np.sum(
                np.log(2 * np.pi * std**2) + ((x - mean) ** 2) / (std**2)
            )
            log_probs.append(log_likelihood + log_priors[digit])
        predictions.append(np.argmax(log_probs))
    return np.array(predictions)
# Evaluate the classifier on the held-out test split.
test_preds = predict(testX)
print("Accuracy:", accuracy_score(testY, test_preds))
print("\nClassification Report:\n", classification_report(testY, test_preds))
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import scipy.stats
# Second implementation: same Naive Bayes classifier, this time computing
# class-conditional densities with scipy.stats.norm.
digits = datasets.load_digits()
trainX, testX, trainY, testY = train_test_split(digits.images, digits.target, test_size=.2)
# Class frequencies (unnormalized priors) and per-class pixel statistics.
Y_freq = {y: sum(trainY == y) for y in range(10)}
X_stats = {
    y: {
        "X_mean": np.mean(trainX[trainY == y], axis=0),
        "X_std": np.std(trainX[trainY == y], axis=0),
    }
    for y in range(10)
}
# Score every test sample against every class: P(x | y) * freq(y).
P = np.zeros((testX.shape[0], 10))
for row, x in enumerate(testX):
    probs = []
    for y in range(10):
        mean, std = X_stats[y]["X_mean"], X_stats[y]["X_std"]
        # Pixel-wise Gaussian density of x under class y.
        A = scipy.stats.norm(mean, std).pdf(x)
        # Zero-variance pixels make the density ill-defined; patch by hand:
        # an exact match contributes factor 1, a mismatch a small penalty.
        B = np.where(np.all([x == mean, std == 0], axis=0), 1, A)
        C = np.where(np.all([x != mean, std == 0], axis=0), 1e-4, B)
        # np.product was deprecated and removed in NumPy 2.0; np.prod is the
        # supported spelling.
        probs.append(np.prod(C) * Y_freq[y])
    P[row] = probs
# Predicted class is the highest-scoring column; report plain accuracy.
pred = np.argmax(P, axis=1)
acc = np.sum(pred == testY) / testY.shape[0]
print(acc)
@piyush01123
Copy link
Author

Note: The accuracy obtained is 88.88%.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment