Created
October 3, 2011 21:36
-
-
Save gsakkis/1260324 to your computer and use it in GitHub Desktop.
Slope One
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, csv, time | |
from collections import defaultdict | |
from itertools import islice | |
import orig_slopeone, slopeone, numpy_slopeone | |
classes = [ | |
orig_slopeone.SlopeOne, | |
#orig_slopeone.SlopeOneNumpy, # XXX: dog slow | |
slopeone.SlopeOne, | |
slopeone.SlopeOne2, | |
slopeone.SlopeOneSym, | |
slopeone.SlopeOneSym2, | |
numpy_slopeone.SlopeOneDense, | |
numpy_slopeone.SlopeOneSparse, | |
] | |
class ItemsStorage(object): | |
def __init__(self, maxsize=None): | |
self.items = [] | |
self.item2id = {} | |
self.maxsize = maxsize | |
def add_item(self, item): | |
id = self.item2id.get(item) | |
if id is None: | |
n = len(self.items) | |
if self.maxsize is None or n < self.maxsize: | |
self.item2id[item] = n | |
self.items.append(item) | |
id = n | |
return id | |
def id(self, item): | |
return self.item2id[item] | |
def item(self, id): | |
return self.items[id] | |
def __len__(self): | |
return len(self.items) | |
def load_data(max_num_items=None): | |
users_ratings = defaultdict(dict) | |
items_storage = ItemsStorage(max_num_items) | |
reader = csv.reader(open('BX-Book-Ratings.csv'), delimiter=';') | |
reader.next() # skip header | |
for user_id, isbn, rating in reader: | |
item_id = items_storage.add_item(isbn) | |
rating = int(rating) | |
if item_id is None or rating == 0: | |
continue | |
users_ratings[user_id][item_id] = rating | |
return len(items_storage), users_ratings | |
def main(max_num_items, num_users, top=5): | |
num_items, user_ratings = load_data(max_num_items) | |
print >> sys.stderr, '%d items / %d users / %d ratings' % ( | |
num_items, len(user_ratings), | |
sum(len(r) for r in user_ratings.itervalues())) | |
for cls in classes: | |
name = '%s.%s' % (cls.__module__, cls.__name__) | |
print >> sys.stderr, name | |
start = time.time() | |
slopone = cls(num_items) | |
slopone.update(user_ratings.itervalues()) | |
print >> sys.stderr, ' training time: %s' % (time.time() - start) | |
out = open('%s.out' % name, 'w') | |
if 0: | |
slopone.dump(out) | |
else: | |
start = time.time() | |
for itemid2rating in islice(user_ratings.itervalues(), num_users): | |
print >> out, slopone.recomendations(itemid2rating, top) | |
print >> sys.stderr, ' prediction time (top %d items for %d users): %s' % ( | |
top, num_users, time.time() - start) | |
out.close() | |
if __name__ == '__main__': | |
main(*map(int, sys.argv[1:])) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from heapq import nlargest | |
from itertools import izip | |
from operator import itemgetter | |
import numpy as np | |
from scipy import sparse | |
class SlopeOneDense(object): | |
def __init__(self, num_items): | |
self.num_items = num_items | |
def update(self, users): | |
# collect the freqs and dicts in dict-of-dicts | |
freqdict = defaultdict(lambda: defaultdict(int)) | |
diffdict = defaultdict(lambda: defaultdict(int)) | |
for user in users: | |
itemid_ratings = sorted(user.iteritems(), key=itemgetter(0)) | |
while itemid_ratings: | |
i, ri = itemid_ratings.pop() | |
freqs_i = freqdict[i] | |
diffs_i = diffdict[i] | |
for j, rj in itemid_ratings: | |
freqs_i[j] += 1 | |
diffs_i[j] += ri - rj | |
self.freqsmat = freqsmat = np.zeros((self.num_items, self.num_items), dtype=int) | |
self.diffsmat = diffsmat = np.zeros((self.num_items, self.num_items), dtype=float) | |
while freqdict: | |
i, freqdict_i = freqdict.popitem() | |
freqsmat[i].put(freqdict_i.keys(), freqdict_i.values()) | |
diffdict_i = diffdict[i] | |
diffsmat[i].put(diffdict_i.keys(), diffdict_i.values()) | |
del diffdict[i] | |
nonzero = freqsmat.nonzero() | |
# scale the diffs by freq | |
diffsmat[nonzero] /= freqsmat[nonzero] | |
# freqsmat is symmetric | |
freqsmat.T[nonzero] = freqsmat[nonzero] | |
# diffsmat is antisymmetric | |
diffsmat.T[nonzero] = -diffsmat[nonzero] | |
def dump(self, fileobj): | |
freqsmat, diffsmat = self.freqsmat, self.diffsmat | |
for i, j in izip(*freqsmat.nonzero()): | |
diff = diffsmat[i,j] | |
# convert -0.0 to 0.0 | |
if diff == 0: diff = np.abs(diff) | |
print >> fileobj, i, j, freqsmat[i,j], diff | |
def recomendations(self, user, top=None): | |
# ids and ratings of items already rated by user | |
user_itemids = user.keys() | |
# take the diff and freq columns corresponding to the user itemids | |
user_diffs_mat = self.diffsmat[:,user_itemids] | |
user_freqs_mat = self.freqsmat[:,user_itemids] | |
# for each candidate item compute the sum of co-occurences with all user | |
# rated items | |
user_freqs_vec = user_freqs_mat.sum(axis=1) | |
# consider only the items with non-zero total freq excluding the already | |
# rated ones | |
candidate_ids = np.setdiff1d(user_freqs_vec.nonzero()[0], user_itemids) | |
if candidate_ids.size == 0: | |
return [] | |
# memory-efficient and slightly faster version of: | |
# predictions_vec = (user_freqs_mat[candidate_ids] * | |
# (user_ratings_vec + user_diffs_mat[candidate_ids]) | |
# ).sum(axis=1) / user_freqs_vec[candidate_ids] | |
predictions_mat = user_diffs_mat[candidate_ids] | |
predictions_mat += user.values() | |
predictions_mat *= user_freqs_mat[candidate_ids] | |
predictions_vec = predictions_mat.sum(axis=1) | |
predictions_vec /= user_freqs_vec[candidate_ids] | |
# translate from numpy to python-land and sort or find top largest | |
predictions = izip(predictions_vec, candidate_ids) | |
if top is not None: | |
return nlargest(top, predictions) | |
else: | |
return sorted(predictions, reverse=True) | |
class SlopeOneSparse(object): | |
def __init__(self, num_items): | |
self.num_items = num_items | |
def update(self, users): | |
# collect the freqs and dicts in dict-of-dicts | |
freqdict = defaultdict(lambda: defaultdict(int)) | |
diffdict = defaultdict(lambda: defaultdict(int)) | |
for user in users: | |
itemid_ratings = sorted(user.iteritems(), key=itemgetter(0)) | |
while itemid_ratings: | |
i, ri = itemid_ratings.pop() | |
freqs_i = freqdict[i] | |
diffs_i = diffdict[i] | |
for j, rj in itemid_ratings: | |
freqs_i[j] += 1 | |
diffs_i[j] += ri - rj | |
# double the size of nonzero entries to account for the symmetric ones | |
nnz = 2 * sum(len(v) for v in freqdict.itervalues()) | |
# (i, j) rows for nonzero freqs | |
ij = np.empty((2, nnz), dtype=int) | |
# nonzero freqs | |
freqs = np.empty(nnz, dtype=int) | |
# diffs for nonzero freqs | |
diffs = np.empty(nnz, dtype=float) | |
# populate the lower triangle | |
start = 0 | |
while freqdict: | |
i, freqdict_i = freqdict.popitem() | |
end = start + len(freqdict_i) | |
ij[0, start:end] = i | |
ij[1, start:end] = freqdict_i.keys() | |
freqs[start:end] = freqdict_i.values() | |
diffs[start:end] = diffdict[i].values() | |
del diffdict[i] | |
start = end | |
# populate the upper triangle | |
nnz2 = nnz / 2 | |
ij[:, nnz2:] = ij[[1,0],:nnz2] | |
freqs[nnz2:] = freqs[:nnz2] | |
# scale the diffs by freq | |
diffs[:nnz2] /= freqs[:nnz2] | |
# diffs is antisymmetric | |
diffs[nnz2:] = -diffs[:nnz2] | |
shape = (self.num_items, self.num_items) | |
self.freqsmat = sparse.csr_matrix((freqs, ij), shape=shape) | |
# negate the diffs because we'll be selecting itemids by rows instead | |
# of columns | |
self.diffsmat = sparse.csr_matrix((-diffs, ij), shape=shape) | |
def dump(self, fileobj): | |
freqsmat, diffsmat = self.freqsmat, self.diffsmat | |
for i, j in sorted(izip(*freqsmat.nonzero())): | |
print >> fileobj, i, j, freqsmat[i,j], float(diffsmat[i,j]) | |
def recomendations(self, user, top=None): | |
# ids and ratings of items already rated by user | |
user_itemids = user.keys() | |
# take the diff and freq rows corresponding to the user itemids | |
user_diffs_mat = self.diffsmat[user_itemids] | |
user_freqs_mat = self.freqsmat[user_itemids] | |
# for each candidate item compute the sum of co-occurences with all user | |
# rated items | |
user_freqs_vec = user_freqs_mat.sum(axis=0) | |
# consider only the items with non-zero total freq excluding the already | |
# rated ones | |
candidate_ids = np.setdiff1d(np.asarray(user_freqs_vec.nonzero()[1]).ravel(), | |
user_itemids) | |
if candidate_ids.size == 0: | |
return [] | |
# compute the prediction vector | |
predictions_mat = np.asarray(user_diffs_mat[:, candidate_ids].todense()) | |
predictions_mat += np.array(user.values()).reshape((len(user), 1)) | |
predictions_mat *= user_freqs_mat[:, candidate_ids].todense() | |
predictions_vec = predictions_mat.sum(axis=0) | |
predictions_vec /= np.asarray(user_freqs_vec[:, candidate_ids]).ravel() | |
# translate from numpy to python-land and sort or find top largest | |
predictions = izip(predictions_vec, candidate_ids) | |
if top is not None: | |
return nlargest(top, predictions) | |
else: | |
return sorted(predictions, reverse=True) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from heapq import nlargest | |
from operator import itemgetter | |
from numpy import int32, float32 | |
from scipy.sparse import dok_matrix | |
class SlopeOne(object): | |
def __init__(self, num_items): | |
self.diffs = {} | |
self.freqs = {} | |
def recomendations(self, useritems, top=None): | |
preds, freqs = {}, {} | |
useritems = dict(useritems) | |
for item, rating in useritems.iteritems(): | |
for diffitem, diffratings in self.diffs.iteritems(): | |
try: | |
freq = self.freqs[diffitem][item] | |
except KeyError: | |
continue | |
preds.setdefault(diffitem, 0.0) | |
freqs.setdefault(diffitem, 0) | |
preds[diffitem] += freq * (diffratings[item] + rating) | |
freqs[diffitem] += freq | |
predlist = [(value / freqs[item], item) | |
for item, value in preds.iteritems() | |
if item not in useritems and freqs[item] > 0] | |
if top is not None: | |
predlist = nlargest(top, predlist) | |
else: | |
predlist.sort(key=lambda x: (-x[0], -x[1])) | |
return predlist | |
def update(self, userdata): | |
for userratings in userdata: | |
for item1, rating1 in userratings.iteritems(): | |
self.freqs.setdefault(item1, {}) | |
self.diffs.setdefault(item1, {}) | |
for item2, rating2 in userratings.iteritems(): | |
if item2 == item1: | |
continue | |
self.freqs[item1].setdefault(item2, 0) | |
self.diffs[item1].setdefault(item2, 0.0) | |
self.freqs[item1][item2] += 1 | |
self.diffs[item1][item2] += rating1 - rating2 | |
for item1, ratings in self.diffs.iteritems(): | |
for item2 in ratings: | |
ratings[item2] /= self.freqs[item1][item2] | |
def dump(self, fileobj=sys.stdout): | |
diffs = self.diffs | |
key = itemgetter(0) | |
for i, rows in sorted(self.freqs.iteritems(), key=key): | |
diff_i = diffs[i] | |
for j, freq in sorted(rows.iteritems(), key=key): | |
print >> fileobj, i, j, freq, float(diff_i[j]) | |
class SlopeOneNumpy(object): | |
def __init__(self, num_items): | |
self.freqs = dok_matrix((num_items, num_items), dtype=int32) | |
self.diffs = dok_matrix((num_items, num_items), dtype=float32) | |
def recomendations(self, useritems, top=None): | |
resultpreds, resultfreqs = {}, {} | |
useritems = dict(useritems) | |
for id, rating in useritems.iteritems(): | |
freqrow = self.freqs.getrow(id) | |
for freq, indice in zip(freqrow.data, freqrow.indices): | |
resultpreds.setdefault(indice, 0.0) | |
resultfreqs.setdefault(indice, 0) | |
resultpreds[indice] += freq * (self.diffs[indice, id] + rating) | |
resultfreqs[indice] += freq | |
predlist = [(value / resultfreqs[item], item) | |
for item, value in resultpreds.iteritems() | |
if item not in useritems and resultfreqs[item] > 0] | |
if top is not None: | |
predlist = nlargest(top, predlist) | |
else: | |
predlist.sort(key=lambda x: (-x[0], -x[1])) | |
return predlist | |
def update(self, userdata): | |
for userratings in userdata: | |
for item1, rating1 in userratings.iteritems(): | |
for item2, rating2 in userratings.iteritems(): | |
self.freqs[item1, item2] += 1 | |
delta = float32(rating1 - rating2) | |
if delta != 0: | |
# there is a bug in scipy when assign a 0 value for a already zero entry | |
self.diffs[item1, item2] += delta | |
for key, rating in self.diffs.iteritems(): | |
self.diffs[key] = rating/self.freqs[key] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import sys | |
from collections import defaultdict | |
from heapq import nlargest | |
from operator import itemgetter | |
class SlopeOne(object): | |
def __init__(self, num_items): | |
self.freqs = defaultdict(lambda: defaultdict(int)) | |
self.diffs = defaultdict(lambda: defaultdict(int)) | |
def update(self, users): | |
freqs, diffs = self.freqs, self.diffs | |
for user in users: | |
item_ratings = sorted(user.iteritems(), key=itemgetter(0)) | |
while item_ratings: | |
item1, rating1 = item_ratings.pop() | |
item1_freqs = freqs[item1] | |
item1_diffs = diffs[item1] | |
for item2, rating2 in item_ratings: | |
item1_freqs[item2] += 1 | |
freqs[item2][item1] += 1 | |
diff = rating1 - rating2 | |
if diff: | |
item1_diffs[item2] += diff | |
diffs[item2][item1] -= diff | |
for item1, ratings in diffs.iteritems(): | |
item1_freqs = freqs[item1] | |
for item2 in ratings: | |
ratings[item2] /= item1_freqs[item2] | |
def recomendations(self, item2rating, top=None): | |
freqs, diffs = self.freqs, self.diffs | |
predfreqs = defaultdict(int) | |
preds = defaultdict(int) | |
for olditem in freqs: | |
if olditem in item2rating: | |
continue | |
freqs_olditem = freqs[olditem] | |
for item in item2rating: | |
freq = freqs_olditem.get(item) | |
if freq is not None: | |
predfreqs[olditem] += freq | |
diffrating = diffs[olditem][item] | |
preds[olditem] += freq * (item2rating[item] + diffrating) | |
results = ((preds[item]/predfreqs[item], item) for item in preds) | |
if top is not None: | |
results = nlargest(top, results) | |
else: | |
results = sorted(results, reverse=True) | |
return results | |
def dump(self, fileobj=sys.stdout): | |
diffs = self.diffs | |
key = itemgetter(0) | |
for i, rows in sorted(self.freqs.iteritems(), key=key): | |
diff_i = diffs[i] | |
for j, freq in sorted(rows.iteritems(), key=key): | |
print >> fileobj, i, j, freq, float(diff_i[j]) | |
class SlopeOne2(SlopeOne): | |
def update(self, users): | |
freqs, diffs = self.freqs, self.diffs | |
for user in users: | |
for item1, rating1 in user.iteritems(): | |
item1_freqs = freqs[item1] | |
item1_diffs = diffs[item1] | |
for item2, rating2 in user.iteritems(): | |
item1_freqs[item2] += 1 | |
item1_diffs[item2] += rating1 - rating2 | |
for item1, ratings in diffs.iteritems(): | |
item1_freqs = freqs[item1] | |
for item2 in ratings: | |
ratings[item2] /= item1_freqs[item2] | |
class SlopeOneSym(SlopeOne): | |
def update(self, users): | |
freqs, diffs = self.freqs, self.diffs | |
for user in users: | |
item_ratings = sorted(user.iteritems(), key=itemgetter(0)) | |
while item_ratings: | |
item1, rating1 = item_ratings.pop() | |
item1_freqs = freqs[item1] | |
item1_diffs = diffs[item1] | |
for item2, rating2 in item_ratings: | |
item1_freqs[item2] += 1 | |
item1_diffs[item2] += rating1 - rating2 | |
for item1, ratings in diffs.iteritems(): | |
item1_freqs = freqs[item1] | |
for item2 in ratings: | |
ratings[item2] /= item1_freqs[item2] | |
def recomendations(self, item2rating, top=None): | |
freqs, diffs = self.freqs, self.diffs | |
predfreqs = defaultdict(int) | |
preds = defaultdict(int) | |
for olditem in freqs: | |
if olditem in item2rating: | |
continue | |
freqs_olditem = freqs[olditem] | |
for item in item2rating: | |
if olditem > item: | |
freq = freqs_olditem.get(item) | |
else: | |
freq = freqs[item].get(olditem) | |
if freq is not None: | |
predfreqs[olditem] += freq | |
if olditem > item: | |
diffrating = diffs[olditem][item] | |
else: | |
diffrating = -diffs[item][olditem] | |
preds[olditem] += freq * (item2rating[item] + diffrating) | |
results = ((preds[item]/predfreqs[item], item) for item in preds) | |
if top is not None: | |
results = nlargest(top, results) | |
else: | |
results = sorted(results, reverse=True) | |
return results | |
class SlopeOneSym2(SlopeOneSym): | |
def update(self, users): | |
freqs, diffs = self.freqs, self.diffs | |
for user in users: | |
for item1, rating1 in user.iteritems(): | |
item1_freqs = freqs[item1] | |
item1_diffs = diffs[item1] | |
for item2 in user: | |
if item2 >= item1: | |
continue | |
item1_freqs[item2] += 1 | |
item1_diffs[item2] += rating1 - user[item2] | |
for item1, ratings in diffs.iteritems(): | |
item1_freqs = freqs[item1] | |
for item2 in ratings: | |
ratings[item2] /= item1_freqs[item2] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
import orig_slopeone, slopeone, numpy_slopeone | |
from benchmark import ItemsStorage | |
class SlopeOneTest(unittest.TestCase): | |
users = [ | |
dict(b1=10, b2=5, b3=7, b4=0, b5=8), | |
dict(b1=3, b3=6, b5=9, b7=1), | |
dict(b2=4, b4=6, b6=8), | |
dict(b5=2, b6=3, b7=4, b8=5, b9=6), | |
dict(b8=10, b9=2, b10=8), | |
dict(b4=3, b5=5, b6=7), | |
] | |
recommendations = [ | |
[('b9', 12), ('b8', 11), ('b6', 6.8), ('b7', 5)], | |
[('b9', 8), ('b8', 7), ('b6', 7), ('b2', 8/3.), ('b4', 0)], | |
[('b1', 12.5), ('b9', 11), ('b8', 10), ('b3', 9.5), ('b7', 9.0), ('b5', 8.4)], | |
[('b10', 7.5), ('b3', 3), ('b1', 2), ('b2', -1), ('b4', -1.5)], | |
[('b7', 4.5), ('b6', 3.5), ('b5', 2.5)], | |
[('b9', 9.5), ('b8', 8.5), ('b1', 19/3.), ('b3', 16/3.), ('b7', 4), ('b2', 3.5)], | |
] | |
def setUp(self): | |
self.items_storage = items_storage = ItemsStorage() | |
for user in self.users: | |
for book_id in user.iterkeys(): | |
items_storage.add_item(book_id) | |
self.ratings = map(self._itemid2rating, self.users) | |
def test_slope_one(self): | |
self._test(slopeone.SlopeOne) | |
def test_slope_one2(self): | |
self._test(slopeone.SlopeOne2) | |
def test_slope_one_sym(self): | |
self._test(slopeone.SlopeOneSym) | |
def test_slope_one_sym2(self): | |
self._test(slopeone.SlopeOneSym2) | |
def test_slope_one_dense(self): | |
self._test(numpy_slopeone.SlopeOneDense) | |
def test_slope_one_sparse(self): | |
self._test(numpy_slopeone.SlopeOneSparse) | |
def test_orig_slope_one(self): | |
self._test(orig_slopeone.SlopeOne) | |
def test_orig_slope_one_numpy(self): | |
self._test(orig_slopeone.SlopeOneNumpy) | |
def _itemid2rating(self, user): | |
return dict((self.items_storage.id(book_id), rating) | |
for book_id, rating in user.iteritems()) | |
def _test(self, cls): | |
slopeone = cls(len(self.items_storage)) | |
slopeone.update(self.ratings) | |
for user, recommendations in zip(self.users, self.recommendations): | |
for top in None, 1, 2, 3: | |
predictions = [ | |
(self.items_storage.item(item_id), rating) | |
for rating, item_id in | |
slopeone.recomendations(self._itemid2rating(user), top=top) | |
] | |
self.assertEqual(predictions, recommendations[:top]) | |
class SmallSlopeOneTest(SlopeOneTest): | |
users = [ | |
dict(b1=5, b2=3, b3=2), | |
dict(b1=3, b2=4), | |
dict(b2=2, b3=5), | |
] | |
recommendations = [ | |
[], | |
[('b3', 10/3.)], | |
[('b1', 13/3.)], | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment