In [1]: import mf_rec_test as mfrt
In [2]: mfrt.run(method='python', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 2.9021
Factorization RMSE: 0.087
In [3]: mfrt.run(method='cython', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0028
Factorization RMSE: 0.081
In [4]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0652
Factorization RMSE: 0.088
In [5]: mfrt.run(method='numba', num_factors=8, num_users=30, num_items=60)
Recommendation test time in minutes: 0.0031
Factorization RMSE: 0.084
Last active
September 21, 2016 12:30
-
-
Save jhemann/5584536 to your computer and use it in GitHub Desktop.
Code to evaluate speed improvements using numba versus cython on a matrix factorization problem used in the setting of a recommendation system.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
#!/usr/bin/python | |
''' | |
Adapted from example code by Albert Au Yeung (2010) | |
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code | |
An implementation of matrix factorization... | |
@INPUT: | |
R : a matrix to be factorized, dimension N x M | |
P : an initial matrix of dimension N x K | |
Q : an initial matrix of dimension M x K | |
K : the number of latent features | |
steps : the maximum number of steps to perform the optimisation | |
alpha : the learning rate | |
beta : the regularization parameter | |
@OUTPUT: | |
the final matrices P and Q, steps taken and error | |
''' | |
import numpy | |
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02): | |
half_beta = beta / 2.0 | |
for step in xrange(steps): | |
for i in xrange(len(R)): | |
for j in xrange(len(R[i])): | |
if R[i,j] > 0: | |
eij = R[i,j] - numpy.dot(P[i,:],Q[j,:]) | |
for k in xrange(K): | |
P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k]) | |
Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k]) | |
e = 0 | |
for i in xrange(len(R)): | |
for j in xrange(len(R[i])): | |
if R[i,j] > 0: | |
e += (R[i,j] - numpy.dot(P[i,:],Q[j,:]))**2 | |
for k in xrange(K): | |
e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k]) | |
if e < 0.001: | |
break | |
return P, Q, step, e | |
def matrix_factorization_original(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02): | |
Q = Q.T | |
for step in xrange(steps): | |
for i in xrange(len(R)): | |
for j in xrange(len(R[i])): | |
if R[i][j] > 0: | |
eij = R[i][j] - numpy.dot(P[i,:],Q[:,j]) | |
for k in xrange(K): | |
P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k]) | |
Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j]) | |
e = 0 | |
for i in xrange(len(R)): | |
for j in xrange(len(R[i])): | |
if R[i][j] > 0: | |
e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2) | |
for k in xrange(K): | |
e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) ) | |
if e < 0.001: | |
break | |
return P, Q.T, step, e | |
############################################################################### | |
if __name__ == "__main__": | |
R = [ | |
[5,3,0,1], | |
[4,0,0,1], | |
[1,1,0,5], | |
[1,0,0,4], | |
[0,1,5,4], | |
] | |
R = numpy.array(R) | |
N = len(R) | |
M = len(R[0]) | |
K = 2 | |
P = numpy.random.rand(N,K) | |
Q = numpy.random.rand(M,K) | |
nP, nQ = matrix_factorization(R, P, Q, K) | |
nR = numpy.dot(nP, nQ.T) | |
print nR |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
#!/usr/bin/python | |
''' | |
Adapted from example code by Albert Au Yeung (2010) | |
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code | |
An implementation of matrix factorization... | |
@INPUT: | |
R : a matrix to be factorized, dimension N x M | |
P : an initial matrix of dimension N x K | |
Q : an initial matrix of dimension M x K | |
K : the number of latent features | |
steps : the maximum number of steps to perform the optimisation | |
alpha : the learning rate | |
beta : the regularization parameter | |
@OUTPUT: | |
the final matrices P and Q, steps taken and error | |
''' | |
from __future__ import division | |
cimport cython | |
cimport numpy as np | |
@cython.boundscheck(False) | |
@cython.wraparound(False) | |
def matrix_factorization(np.ndarray[double, ndim=2] R, | |
np.ndarray[double, ndim=2] P, | |
np.ndarray[double, ndim=2] Q, | |
int K, | |
int steps=5000, | |
double alpha=0.0002, | |
double beta=0.02): | |
cdef double half_beta = beta / 2.0 | |
cdef int N = R.shape[0] | |
cdef int M = R.shape[1] | |
cdef double eij = 0.0 | |
cdef double e = 0.0 | |
cdef double two_eij = 0.0 | |
cdef int step = 0 | |
cdef int i = 0 | |
cdef int j = 0 | |
cdef int k = 0 | |
cdef double temp = 0.0 | |
for step in xrange(steps): | |
for i in xrange(N): | |
for j in xrange(M): | |
if R[i,j] > 0: | |
#Convert the line | |
# eij = R[i,j] - numpy.dot(P[i,:],Q[j,:]) | |
#in mf.py to avoid using numpy such that numba can optimize | |
#the calculation (once execution is moved into C, e.g. via | |
#numpy, numba cannot optimize the corresponding code). This | |
#code is now equivalent to mf_numba.py for comparisons | |
eij = R[i,j] | |
for p in xrange(K): | |
eij -= P[i,p] * Q[j,p] | |
for k in xrange(K): | |
P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k]) | |
Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k]) | |
e = 0.0 | |
for i in xrange(N): | |
for j in xrange(M): | |
if R[i,j] > 0: | |
temp = R[i,j] | |
for p in xrange(K): | |
temp -= P[i,p] * Q[j,p] | |
e = e + temp * temp | |
for k in xrange(K): | |
e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k]) | |
if e < 0.001: | |
break | |
return P, Q, step, e | |
if __name__ == "__main__": | |
R = [ | |
[5,3,0,1], | |
[4,0,0,1], | |
[1,1,0,5], | |
[1,0,0,4], | |
[0,1,5,4], | |
] | |
R = np.array(R) | |
N = len(R) | |
M = len(R[0]) | |
K = 2 | |
P = np.random.rand(N,K) | |
Q = np.random.rand(M,K) | |
nP, nQ = matrix_factorization(R, P, Q, K) | |
nR = np.dot(nP, nQ.T) | |
print nR |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
#!/usr/bin/python | |
''' | |
Adapted from example code by Albert Au Yeung (2010) | |
http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/#source-code | |
An implementation of matrix factorization... | |
@INPUT: | |
R : a matrix to be factorized, dimension N x M | |
P : an initial matrix of dimension N x K | |
Q : an initial matrix of dimension M x K | |
K : the number of latent features | |
steps : the maximum number of steps to perform the optimisation | |
alpha : the learning rate | |
beta : the regularization parameter | |
@OUTPUT: | |
the final matrices P and Q, steps taken and error | |
''' | |
from operator import mul | |
from itertools import imap, izip, starmap | |
import numpy as np | |
from numba import typeof, double, int_ | |
from numba.decorators import autojit, jit | |
@autojit(locals={'step': int_, 'e': double, 'err': double}) | |
def matrix_factorization(R, P, Q, K): | |
steps = 5000 | |
alpha = 0.0002 | |
beta = 0.02 | |
half_beta = beta / 2.0 | |
N, M = R.shape | |
e = 0.0 | |
for step in xrange(steps): | |
for i in xrange(N): | |
for j in xrange(M): | |
if R[i,j] > 0: | |
#Convert the line | |
# eij = R[i,j] - numpy.dot(P[i,:],Q[j,:]) | |
#in mf.py to avoid using numpy such that numba can optimize | |
#the calculation (once execution is moved into C, e.g. via | |
#numpy, numba cannot optimize the corresponding code) | |
eij = R[i,j] | |
for p in xrange(K): | |
eij -= P[i,p] * Q[j,p] | |
for k in xrange(K): | |
P[i,k] += alpha * (2 * eij * Q[j,k] - beta * P[i,k]) | |
Q[j,k] += alpha * (2 * eij * P[i,k] - beta * Q[j,k]) | |
e = 0.0 | |
for i in xrange(N): | |
for j in xrange(M): | |
if R[i,j] > 0: | |
temp = R[i,j] | |
for p in xrange(K): | |
temp -= P[i,p] * Q[j,p] | |
e = e + temp * temp | |
for k in xrange(K): | |
e += half_beta * (P[i,k]*P[i,k] + Q[j,k]*Q[j,k]) | |
if e < 0.001: | |
break | |
return P, Q, step, e | |
@jit(argtypes=[double[:,:], double[:,:], double[:,:], int_], | |
restype=typeof((double[:,:], double[:,:], int_, double)), | |
locals={'step': int_, 'e': double, 'err': double}) | |
def matrix_factorization2(R, P, Q, K): | |
steps = 5000 | |
alpha = 0.0002 | |
beta = 0.02 | |
half_beta = beta / 2.0 | |
Q = Q.T | |
N, M = R.shape | |
dot = np.dot | |
for step in xrange(steps): | |
for i in xrange(N): | |
for j in xrange(M): | |
if R[i,j] > 0: | |
temp = double(dot(P[i,:], Q[:,j])) | |
#temp = sum(map(mul, P[i,:], Q[:,j])) | |
#temp = sum(imap(mul, P[i,:], Q[:,j])) | |
#temp = sum(starmap(mul, izip(P[i,:], Q[:,j]))) | |
#temp = 0 | |
#for p in xrange(M): | |
# temp = temp + P[i,p] * Q[p,j] | |
eij = R[i,j] - temp | |
for k in xrange(K): | |
P[i,k] += alpha * (2 * eij * Q[k,j] - beta * P[i,k]) | |
Q[k,j] += alpha * (2 * eij * P[i,k] - beta * Q[k,j]) | |
e = 0.0 | |
for i in xrange(N): | |
for j in xrange(M): | |
if R[i,j] > 0: | |
temp = double(dot(P[i,:], Q[:,j])) | |
#temp = sum(map(mul, P[i,:], Q[:,j])) | |
#temp = sum(imap(mul, P[i,:], Q[:,j])) | |
#temp = sum(starmap(mul, izip(P[i,:], Q[:,j]))) | |
#temp = 0 | |
#for p in xrange(M): | |
# temp = temp + P[i,p] * Q[p,j] | |
e = e + (R[i,j] - temp)**2 | |
for k in xrange(K): | |
e = e + half_beta * (P[i,k]**2 + Q[k,j]**2) | |
if e < 0.001: | |
break | |
return P, Q.T, step, e |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import shelve | |
import time | |
import numpy as np | |
import matplotlib.pyplot as plt | |
def make_ratings_data(num_users, num_items, seed=8675309): | |
'''Returns a num_users x num_items array of fake ratings, where the ratings | |
can be on a 1-5 star scale, with half-stars allowed. Assume a value of 0 | |
indicates that the associated user has not rated the associated item. Like | |
most ratings data sets, the array will be sparse | |
''' | |
valid_ratings = np.arange(1, 5.5, 0.5) | |
valid_ratings = np.insert(valid_ratings, 0, 0) | |
#Make ~80% of the data be "missing", with the other 20% having a uniform | |
#distribution over the 1-5 star rating scale | |
probs = [0.20 / 9] * 9 | |
probs.insert(0, 0.80) | |
pseudo_RNG = np.random.RandomState(seed) | |
ratings_matrix = pseudo_RNG.choice(valid_ratings, replace=True, | |
size=(num_users, num_items), | |
p=probs) | |
return ratings_matrix | |
def run(hist=False, method='cython', seed=1234567, num_users=100, | |
num_items=200, num_factors=8): | |
start_time = time.time() | |
R = make_ratings_data(num_users, num_items, seed=None) | |
N, M = R.shape | |
K = num_factors | |
pseudo_RNG = np.random.RandomState(seed) | |
P = pseudo_RNG.rand(N, K) | |
Q = pseudo_RNG.rand(M, K) | |
if method == 'cython': | |
import mf_cython | |
matrix_factorization = mf_cython.matrix_factorization | |
if method == 'numba': | |
import mf_numba | |
matrix_factorization = mf_numba.matrix_factorization | |
if method == 'python': | |
import mf | |
matrix_factorization = mf.matrix_factorization | |
nP, nQ, steps, error = matrix_factorization(R, P, Q, K) | |
print 'Convergence in %i steps' % steps | |
nR = np.dot(nP, nQ.T) | |
R_actual = np.ma.masked_equal(R, 0) | |
missing_mask = np.ma.getmaskarray(R_actual) | |
nR_actual = np.ma.masked_array(nR, mask=missing_mask) | |
fit_error = nR_actual - R_actual | |
fit_error_filled = fit_error.filled(-999) | |
actual_ratings = np.where(fit_error_filled > -999) | |
fit_diffs = np.asarray(fit_error[actual_ratings]) | |
fit_RMSE = np.sqrt(np.sum(fit_diffs**2) / fit_diffs.size) | |
if hist: | |
plt.hist(fit_diffs, bins=20) | |
plt.xlabel = 'Rating Error (ratings on 1-5, 1/2 point scale)' | |
plt.ylabel = 'Number of Consumers' | |
plt.show() | |
#Save the output for later if we want to analyze results without re-running | |
#the code... | |
s = shelve.open('mf_rec_test_results_%s.db' % method) | |
s['nP'] = nP | |
s['nQ'] = nQ | |
s['nR'] = nR | |
s['R_actual'] = R_actual | |
s['nR_actual'] = nR_actual | |
s['fit_error'] = fit_error | |
s.close() | |
print 'Recommendation test time in minutes: %.4f' \ | |
% ((time.time() - start_time) / 60.0) | |
print 'Factorization RMSE: %.3f' % fit_RMSE | |
return None |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Sun Apr 01 23:49:22 2012 | |
@author: JoshuaHemann | |
""" | |
# Run with: | |
# | |
# vs_compiler | |
# python setup_mf_cython.py build_ext --inplace | |
# | |
# to compile Cython extensions in-place (useful during development) | |
from distutils.core import setup | |
from Cython.Distutils import build_ext | |
from Cython.Build import cythonize | |
import numpy as np | |
setup(cmdclass={'build_ext': build_ext}, | |
include_dirs=[np.get_include()], | |
ext_modules=cythonize('mf_cython.pyx', annotate=True)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment