@kastnerkyle
Last active March 27, 2021
Matrix factorization code related to matrix completion
# (C) Kyle Kastner, June 2014
# License: BSD 3 clause
import numpy as np
from scipy import sparse


def minibatch_indices(X, minibatch_size):
    # Build (start, end) index pairs that partition range(len(X)) into
    # chunks of minibatch_size; the final chunk may be shorter
    edges = np.arange(0, len(X), minibatch_size)
    edges = np.asarray(list(edges) + [len(X)])
    start_indices = edges[:-1]
    end_indices = edges[1:]
    # Wrap in list() so the result can be iterated and len()'d on Python 3
    return list(zip(start_indices, end_indices))
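

# Illustration (not in the original gist): for a length-5 array and
# minibatch_size=2, minibatch_indices returns [(0, 2), (2, 4), (4, 5)],
# i.e. (start, end) slice pairs covering the data, with a short final batch.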


def shuffle_in_unison(a, b):
    """
    Shuffle a and b in-place with the same permutation.

    http://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
    """
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(rng_state)
    np.random.shuffle(b)
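

# Illustration (not in the original gist): with a = [1, 2, 3], b = [4, 5, 6],
# one possible outcome is a = [3, 1, 2], b = [6, 4, 5]; both arrays receive
# the same permutation, so a[k] and b[k] stay paired.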


def PMF(X, rank=10, learning_rate=0.001, momentum=0.8,
        regularization=0.25, minibatch_size=1000, max_epoch=1000,
        nan_value=0, status_percentage=0.1, random_state=None):
    """
    Python implementation of Probabilistic Matrix Factorization (PMF).

    Parameters
    ----------
    X: numpy array or scipy.sparse coo matrix, shape (n_users, n_items)
        Input data. If a dense array is passed in, it will be converted to a
        sparse matrix by looking for all `nan_value` numbers and treating them
        as empty.
    rank: int, optional (default=10)
        Rank of the low-rank factor matrices. A higher rank should result in a
        better approximation, at the cost of more memory and slower
        computation.
    learning_rate: float, optional (default=0.001)
        Learning rate for minibatch gradient descent.
    momentum: float, optional (default=0.8)
        Momentum for minibatch gradient descent.
    regularization: float, optional (default=0.25)
        L2 regularization penalty for minibatch gradient descent.
    minibatch_size: int, optional (default=1000)
        The size of each minibatch. If this is larger than the size of the
        dataset, it will default to running over the whole dataset.
    max_epoch: int, optional (default=1000)
        The maximum number of epochs.
    nan_value: int, optional (default=0)
        This value will be masked out of the input for calculations. It
        should match the value considered "not rated" in the dataset X.
    status_percentage: float in (0, 1), optional (default=0.1)
        How often status is printed, as a fraction of `max_epoch`. For
        example, 0.1 is every 10%, 0.01 is every 1%, and so on. For the
        default values of max_epoch=1000, status_percentage=0.1 this is
        equivalent to a status print every 100 epochs.
    random_state: RandomState, int, or None, optional (default=None)
        Random state to pass in. Can be an int, None, or np.random.RandomState
        object.

    Returns
    -------
    U: array-like, shape (X.shape[0], rank)
        Row basis for reconstruction.
        Usage:
            reconstruction = np.dot(U, V.T) + X_mean
    V: array-like, shape (X.shape[1], rank)
        Column basis for reconstruction.
        Usage:
            reconstruction = np.dot(U, V.T) + X_mean
    X_mean: float
        Global mean prediction, needed for reconstruction.
        Usage:
            reconstruction = np.dot(U, V.T) + X_mean

    Notes
    -----
    Based on code from Ruslan Salakhutdinov
    http://www.cs.toronto.edu/~rsalakhu/code_BPMF/pmf.m

    Probabilistic Matrix Factorization, R. Salakhutdinov and A. Mnih,
    Advances in Neural Information Processing Systems 20, 2008
    """
    if not sparse.isspmatrix_coo(X):
        val_index = np.where(X != nan_value)
        # Pass shape explicitly so trailing all-empty rows/columns are kept
        X = sparse.coo_matrix((X[val_index[0], val_index[1]],
                               (val_index[0], val_index[1])),
                              shape=X.shape)
    # Simplest prediction is the global mean (note that the sparse mean
    # divides by the full matrix size, so unobserved entries count as zeros)
    X_mean = X.mean()
    lr = learning_rate
    reg = regularization
    mom = momentum
    if random_state is None or type(random_state) is int:
        random_state = np.random.RandomState(random_state)
    N, M = X.shape
    U = 0.1 * random_state.randn(N, rank)
    V = 0.1 * random_state.randn(M, rank)
    U_inc = np.zeros_like(U)
    V_inc = np.zeros_like(V)
    dU = np.zeros_like(U)
    dV = np.zeros_like(V)
    epoch = 0
    status_inc = int(np.ceil(max_epoch * status_percentage))
    print("Printing updates every %i epochs" % status_inc)
    status_points = list(range(0, max_epoch, status_inc)) + [max_epoch - 1]
    # Need lil format in order to index
    X_s = X.tolil()
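    # What the loop below minimizes (note added for clarity, not in the
    # original gist): the squared error on observed, mean-centered entries,
    # (U[i] . V[j] - (X[i, j] - X_mean)) ** 2, plus an L2 penalty whose
    # gradient contributes reg * U[i] and reg * V[j] per observed pair.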
    while epoch < max_epoch:
        # Get indices for all non-empty values
        r, c = X.nonzero()
        # minibatch_indices only uses the length, so passing r is enough
        mb_indices = minibatch_indices(r, minibatch_size)
        n_batches = len(mb_indices)
        # Shuffle r and c with the same permutation so pairs stay aligned
        shuffle_in_unison(r, c)
        mean_abs_err = 0.
        for i, j in mb_indices:
            # Reset derivative matrices each minibatch
            dU[:, :] = 0.
            dV[:, :] = 0.
            # Slice out row and column indices
            r_i = r[i:j]
            c_i = c[i:j]
            # Get data corresponding to the row and column indices
            X_i = X_s[r_i, c_i].toarray().ravel() - X_mean
            # Compute predictions
            pred = np.sum(U[r_i] * V[c_i], axis=1)
            # Compute how the algorithm is doing
            mean_abs_err += np.sum(np.abs(pred - X_i)) / (n_batches * (j - i))
            # Loss has a tendency to be unstable, but is the "right thing"
            # to monitor instead of mean_abs_err
            # pred_loss = (pred - X_i) ** 2
            # Compute gradients of the squared loss plus L2 penalty
            grad_loss = 2 * (pred - X_i)
            grad_U = grad_loss[:, None] * V[c_i] + reg * U[r_i]
            grad_V = grad_loss[:, None] * U[r_i] + reg * V[c_i]
            # Note: plain assignment keeps only the last gradient when an
            # index repeats within a minibatch; np.add.at would accumulate
            dU[r_i] = grad_U
            dV[c_i] = grad_V
            # Momentum update: blend the previous step with the new gradient
            U_inc = mom * U_inc + lr * dU
            V_inc = mom * V_inc + lr * dV
            U -= U_inc
            V -= V_inc
        if epoch in status_points:
            print("Epoch %i of %i" % (epoch + 1, max_epoch))
            print("Mean absolute error %f" % mean_abs_err)
        epoch += 1
    return U, V, X_mean
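

# A minimal sketch (not from the original gist) of scoring a fit on the
# observed entries only, assuming U, V, X_mean come from PMF above and X is
# the original ratings array with nan_value == 0:
#
#     X_sp = sparse.coo_matrix(X)
#     pred = np.sum(U[X_sp.row] * V[X_sp.col], axis=1) + X_mean
#     rmse = np.sqrt(np.mean((pred - X_sp.data) ** 2))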


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    R = np.array([[5, 3, 0, 1],
                  [4, 0, 0, 1],
                  [1, 1, 0, 5],
                  [1, 0, 0, 4],
                  [0, 1, 5, 4]], dtype=float)
    U, V, m = PMF(R, learning_rate=0.001, momentum=0.95,
                  minibatch_size=2, rank=5, max_epoch=250, random_state=1999)
    R2 = np.dot(U, V.T) + m
    # Mask out the unrated (zero) entries when plotting
    plt.matshow(R * (R > 0))
    plt.title("Ground truth ratings")
    plt.matshow(R2 * (R > 0))
    plt.title("Predicted ratings")
    plt.show()
@hl8086 commented Mar 27, 2021

thanks for your post