Skip to content

Instantly share code, notes, and snippets.

@charanpald
Created January 26, 2016 13:34
Show Gist options
  • Save charanpald/ce73d9511994bc9f472f to your computer and use it in GitHub Desktop.
Save charanpald/ce73d9511994bc9f472f to your computer and use it in GitHub Desktop.
Generate MovieLens recommendations using the SVD
# Run some recommendation experiments using MovieLens 100K
import pandas
import numpy
import scipy.sparse
import scipy.sparse.linalg
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
# Location of the MovieLens 100K files and the (rows, columns) shape of the
# rating matrix; rows are users (see the per-user mean centering further down).
data_dir = "data/ml-100k/"
data_shape = (943, 1682)


def _load_ratings(path, shape):
    """Read a tab-separated ratings file into a sparse CSR matrix.

    Column 0 of the file is used as the row id, column 1 as the column id and
    column 2 as the stored value. Ids in the file are 1-based, so they are
    shifted down by one to become 0-based matrix indices.

    Args:
        path: Path of the tab-separated file (no header row).
        shape: (n_rows, n_cols) of the resulting matrix.

    Returns:
        A ``scipy.sparse.csr_matrix`` of float64 ratings with the given shape.
    """
    # header=None (not -1) is how pandas is told the file has no header row;
    # negative header values are rejected by current pandas releases.
    values = pandas.read_csv(path, sep="\t", header=None).values
    row_ids = values[:, 0] - 1
    col_ids = values[:, 1] - 1
    # numpy.float was only an alias of the builtin float and has been removed
    # from NumPy; numpy.float64 is the same dtype under a name that still exists.
    return scipy.sparse.csr_matrix(
        (values[:, 2], (row_ids, col_ids)), dtype=numpy.float64, shape=shape
    )


X_train = _load_ratings(data_dir + "ua.base", data_shape)
X_test = _load_ratings(data_dir + "ua.test", data_shape)
# Centre every rating on the mean training rating of its row (user), then
# convert both matrices to dense arrays for the SVD below.
train_rows, train_cols = X_train.nonzero()
test_rows, test_cols = X_test.nonzero()

# NOTE: despite its name, X_row_sum holds the *number* of nonzero training
# ratings in each row; the name is kept for compatibility with the original.
X_row_sum = numpy.bincount(train_rows, minlength=data_shape[0]).astype(float)
# Sum of the nonzero training ratings per row, turned into a mean in place.
X_row_mean = numpy.asarray(X_train.sum(axis=1)).ravel()
# Note that (X_row_sum == 0) is required to prevent divide by zero: rows with
# no training ratings are divided by 1 and keep a mean of 0.
X_row_mean /= X_row_sum + (X_row_sum == 0)

# Densify first so the subtraction is a single vectorised update instead of
# one sparse-matrix item assignment per rating.
X_train = X_train.toarray()
X_test = X_test.toarray()

# Subtract mean rating for each user. Only observed entries are shifted, and
# the test ratings are centred with the *training* means.
X_train[train_rows, train_cols] -= X_row_mean[train_rows]
X_test[test_rows, test_cols] -= X_row_mean[test_rows]
# Evaluate the rank-k truncated SVD reconstruction of the centred training
# matrix for every k in ks, recording the mean absolute error on the observed
# training entries and on the observed test entries.
ks = numpy.arange(2, 50)
train_mae = numpy.zeros(ks.shape[0])
test_mae = numpy.zeros(ks.shape[0])

# Centred ratings at the observed positions; these are the regression targets.
train_scores = X_train[train_rows, train_cols]
test_scores = X_test[test_rows, test_cols]

# Now take SVD of X_train (computed once; each k only truncates the factors).
U, s, Vt = numpy.linalg.svd(X_train, full_matrices=False)

for j, k in enumerate(ks):
    # Scaling the first k columns of U by the first k singular values is the
    # same product as U_k . diag(s_k), without building the diagonal matrix.
    X_pred = (U[:, :k] * s[:k]).dot(Vt[:k, :])

    pred_train_scores = X_pred[train_rows, train_cols]
    pred_test_scores = X_pred[test_rows, test_cols]

    train_mae[j] = mean_absolute_error(train_scores, pred_train_scores)
    test_mae[j] = mean_absolute_error(test_scores, pred_test_scores)
    print(k, train_mae[j], test_mae[j])
# Draw the train (black) and test (red) MAE curves against the rank k.
mae_curves = (
    (train_mae, 'k', "Train"),
    (test_mae, 'r', "Test"),
)
for curve, line_format, curve_label in mae_curves:
    plt.plot(ks, curve, line_format, label=curve_label)

# Label the axes, add the legend and display the figure.
plt.xlabel("k")
plt.ylabel("MAE")
plt.legend()
plt.show()
@charanpald
Copy link
Author

Unfortunately, your statement "The empty values in R will still be empty in the reconstruction." is wrong. Are you able to prove it for k < rank(R)?

@denis-bz
Copy link

NumPy's SVD is quite different from the "Funk SVD" that you need in recommender systems.
Why? See
https://github.com/aaw/IncrementalSVD.jl#great-but-julia-already-has-an-svd-function-ill-just-use-that
(admirably clear)

cheers

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment