Created
September 26, 2022 04:23
-
-
Save reachsumit/b6d4ef3b2bc9f7ec6cb5ef9414d7a734 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from numpy import bincount, log, log1p
from scipy.sparse import coo_matrix, linalg
class ExplicitCF:
    """Item-to-item similarity from explicit ratings via BM25 weighting and SVD.

    Typical use: construct the object (loads the ratings file and builds a
    sparse item x user matrix), call ``factorize()`` to obtain latent factors,
    pass the item factors to ``init_predict()``, then query ``get_related()``.
    """

    def __init__(self, path="ml-100k/u.data"):
        """Load a tab-separated ratings file and build the rating matrix.

        Args:
            path: Location of a headerless TSV file whose first three columns
                are user, item and rating. Defaults to the path the original
                script used.
        """
        df = pd.read_csv(path, sep='\t', header=None,
                         names=['user', 'item', 'rating'], usecols=range(3))
        # Drop incomplete rows *before* building the categories; otherwise a
        # user/item that only appears in a dropped row leaves an unused
        # category and the matrix dimensions disagree with the category count.
        df = df.dropna()
        df['user'] = df['user'].astype("category")
        df['item'] = df['item'].astype("category")
        self.df = df

        # Rows are items and columns are users, addressed by category code.
        n_items = len(df['item'].cat.categories)
        n_users = len(df['user'].cat.categories)
        self.rating_matrix = coo_matrix(
            (df['rating'].astype(float).to_numpy(),
             (df['item'].cat.codes.to_numpy(), df['user'].cat.codes.to_numpy())),
            shape=(n_items, n_users),
        )

    def _bm25_weight(self, X, K1=100, B=0.8):
        """Return a COO copy of sparse matrix X with BM25-weighted entries.

        Rows play the role of documents (items) and columns the role of terms
        (users). The ``data`` array of the input matrix is not modified.

        Args:
            X: Sparse (or array-like) matrix accepted by ``coo_matrix``.
            K1: BM25 saturation parameter.
            B: BM25 length-normalisation strength.
        """
        X = coo_matrix(X)

        # idf per column (user): log(#rows) - log(1 + #stored entries in column).
        # minlength guarantees one idf value for every column of X.
        N = float(X.shape[0])
        idf = log(N) - log1p(bincount(X.col, minlength=X.shape[1]))

        # length normalisation per row (item), relative to the mean row sum
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length

        # Rebinding X.data (rather than updating in place) leaves the caller's
        # data array untouched.
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X

    def factorize(self, k=50):
        """Factorize the BM25-weighted rating matrix with a truncated SVD.

        Args:
            k: Number of singular vectors to compute (passed to
                ``scipy.sparse.linalg.svds``, which rejects values that are
                not smaller than both matrix dimensions).

        Returns:
            Tuple ``(item_factor, user_factor)``: the left singular vectors
            (one row per item) and the right singular vectors (one column per
            user). The singular values are discarded.
        """
        weighted = self._bm25_weight(self.rating_matrix)
        item_factor, _, user_factor = linalg.svds(weighted, k)
        return item_factor, user_factor

    def init_predict(self, x_factors):
        """Store L2-normalised factor rows in ``self.factors``.

        After normalisation a plain dot product between two rows equals their
        cosine similarity. Rows whose norm is zero are kept as zero vectors
        instead of being turned into NaN by a division by zero.

        Args:
            x_factors: 2-D array with one factor vector per row.
        """
        x_factors = np.asarray(x_factors)
        norms = np.linalg.norm(x_factors, axis=-1)
        # Dividing by 1 leaves an all-zero row unchanged and avoids 0/0 = NaN,
        # which would otherwise sort to the top in get_related().
        norms[norms == 0] = 1.0
        self.factors = x_factors / norms[:, np.newaxis]

    def get_related(self, x_id, N=5):
        """Print the N rows of ``self.factors`` most similar to row ``x_id``.

        Requires ``init_predict()`` to have been called first. The queried row
        itself is part of the candidates, so it normally appears first.

        Args:
            x_id: Row index into ``self.factors``.
            N: Maximum number of results to print. Values larger than the
                number of rows are clamped; values <= 0 print only the header.
        """
        scores = self.factors.dot(self.factors[x_id])
        top_n = min(N, scores.shape[0])

        print("Recommendations:")
        if top_n <= 0:
            # argpartition(scores, -0)[-0:] would select *every* row.
            return

        # argpartition finds the top_n largest scores without a full sort;
        # only those few candidates are then ordered by descending score.
        best = np.argpartition(scores, -top_n)[-top_n:]
        for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
            print(f"item id: {_id}, score: {score}")
# --- Demo: load the ratings, factorize, and query similar items -------------
# The expected outputs quoted in the comments below are the ones recorded by
# the original author for the MovieLens 100k dataset.

cf_object = ExplicitCF()

# First five rows of the loaded ratings table.
print(cf_object.df.head(5))
#    user  item  rating
# 0   196   242       3
# 1   186   302       3
# 2    22   377       1
# 3   244    51       2
# 4   166   346       1

# Number of distinct users and items.
print(cf_object.df["user"].nunique())  # 943
print(cf_object.df["item"].nunique())  # 1682

# Summary statistics of the rating column.
print(cf_object.df["rating"].describe())
# count    100000.000000
# mean          3.529860
# std           1.125674
# min           1.000000
# 25%           3.000000
# 50%           4.000000
# 75%           4.000000
# max           5.000000
# Name: rating, dtype: float64

# The sparse rating matrix is laid out as items x users.
print(cf_object.rating_matrix.shape)  # (1682, 943)

# Latent factors from the truncated SVD.
item_factor, user_factor = cf_object.factorize()
print(item_factor.shape)  # (1682, 50)
print(user_factor.shape)  # (50, 943)

# Normalise the item factors so that dot products act as similarity scores.
cf_object.init_predict(item_factor)
print(cf_object.factors.shape)  # (1682, 50)

# Five items most similar to item index 314 (the item itself ranks first).
cf_object.get_related(314, N=5)
# Recommendations:
# item id: 314, score: 1.0
# item id: 315, score: 0.8940031189407059
# item id: 346, score: 0.8509562164687848
# item id: 271, score: 0.8441764974934266
# item id: 312, score: 0.7475076699852435
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment