import numpy as np
import pandas as pd
from numpy import bincount, log, log1p
from scipy.sparse import coo_matrix, linalg
class ImplicitCF:
    def __init__(self):
        # Last.fm 360K dataset: one (user, artist, play count) triple per row
        self.df = pd.read_csv("lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv",
                              sep='\t', header=None,
                              names=['user', 'artist', 'plays'], usecols=[0, 2, 3])
        self.df['user'] = self.df['user'].astype("category")
        self.df['artist'] = self.df['artist'].astype("category")
        self.df.dropna(inplace=True)
        # sparse artist x user matrix of raw play counts
        self.plays = coo_matrix((self.df['plays'].astype(float),
                                 (self.df['artist'].cat.codes,
                                  self.df['user'].cat.codes)))
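    # _bm25_weight below rescales each raw play count r to the Okapi BM25
    # term weight  idf(user) * r * (K1 + 1) / (K1 * length_norm(artist) + r),
    # which damps very large play counts and down-weights users who listen
    # to a very large number of artists before the matrix is handed to ALS.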
    def _bm25_weight(self, X, K1=100, B=0.8):
        """Weighs each row of a sparse matrix X by BM25 weighting"""
        # calculate idf per term (user)
        X = coo_matrix(X)
        N = float(X.shape[0])
        idf = log(N) - log1p(bincount(X.col))
        # calculate length_norm per document (artist)
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length
        # weight matrix rows by bm25
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X
    def _alternating_least_squares(self, Cui, factors, regularization, iterations=20):
        artists, users = Cui.shape
        X = np.random.rand(artists, factors) * 0.01
        Y = np.random.rand(users, factors) * 0.01
        Ciu = Cui.T.tocsr()
        for iteration in range(iterations):
            # alternate: solve for X with Y held fixed, then for Y with X held fixed
            self._least_squares(Cui, X, Y, regularization)
            self._least_squares(Ciu, Y, X, regularization)
        return X, Y
    def _least_squares(self, Cui, X, Y, regularization):
        artists, factors = X.shape
        YtY = Y.T.dot(Y)
        for u in range(artists):
            # accumulate YtCuY + regularization * I in A
            A = YtY + regularization * np.eye(factors)
            # accumulate YtCuPu in b
            b = np.zeros(factors)
            for i in Cui[u, :].indices:
                confidence = Cui[u, i]
                factor = Y[i]
                A += (confidence - 1) * np.outer(factor, factor)
                b += confidence * factor
            # Xu = (YtCuY + regularization * I)^-1 (YtCuPu)
            X[u] = np.linalg.solve(A, b)
    def factorize(self):
        artist_factor, user_factor = self._alternating_least_squares(
            self._bm25_weight(self.plays).tocsr(), factors=50, regularization=1, iterations=10)
        return artist_factor, user_factor

    def init_predict(self, x_factors):
        # fully normalize factors, so can compare with only the dot product
        norms = np.linalg.norm(x_factors, axis=-1)
        self.factors = x_factors / norms[:, np.newaxis]

    def get_related(self, x_id, N=5):
        scores = self.factors.dot(self.factors[x_id])
        best = np.argpartition(scores, -N)[-N:]
        # factor rows are indexed by artist category code, so map codes back to names
        artist_names = self.df['artist'].cat.categories
        print("Recommendations:")
        for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
            print(f"artist id: {_id}, artist_name: {artist_names[_id]} score: {score:.5f}")
cf_object = ImplicitCF()
print(cf_object.df.head())
#                                        user                artist  plays
# 0  00000c289a1829a808ac09c00daf10bc3c4e223b       betty blowtorch   2137
# 1  00000c289a1829a808ac09c00daf10bc3c4e223b             die Ärzte   1099
# 2  00000c289a1829a808ac09c00daf10bc3c4e223b     melissa etheridge    897
# 3  00000c289a1829a808ac09c00daf10bc3c4e223b             elvenking    717
# 4  00000c289a1829a808ac09c00daf10bc3c4e223b  juliette & the licks    706

print(cf_object.df.user.nunique())    # 358868
print(cf_object.df.artist.nunique())  # 292364

print(cf_object.df.plays.describe())
# count    1.753565e+07
# mean     2.151932e+02
# std      6.144815e+02
# min      0.000000e+00
# 25%      3.500000e+01
# 50%      9.400000e+01
# 75%      2.240000e+02
# max      4.191570e+05
# Name: plays, dtype: float64

print(cf_object.plays.shape)  # (292364, 358868)

artist_factor, user_factor = cf_object.factorize()
print(artist_factor.shape)  # (292364, 50)
print(user_factor.shape)    # (358868, 50)

cf_object.init_predict(artist_factor)
print(cf_object.factors.shape)  # (292364, 50)

cf_object.get_related(2170)
# Recommendations:
# artist id: 2170, artist_name: maroon 5 score: 1.00000
# artist id: 170436, artist_name: vilma palma e vampiros score: 1.00000
# artist id: 257, artist_name: the beatles score: 1.00000
# artist id: 24297, artist_name: dizzee rascal score: 0.77604
# artist id: 257575, artist_name: the velvet underground score: 0.71697
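
# A minimal sketch (not part of the original gist): the same factors can also
# score artists for a given user by taking dot products between that user's
# row of user_factor and every row of artist_factor. recommend_for_user is a
# hypothetical helper name; it assumes the artist_factor and user_factor
# returned by factorize() above.
def recommend_for_user(user_id, artist_factor, user_factor, N=5):
    scores = artist_factor.dot(user_factor[user_id])  # one score per artist
    best = np.argpartition(scores, -N)[-N:]           # indices of the top-N scores
    artist_names = cf_object.df['artist'].cat.categories
    print("Recommendations:")
    for _id, score in sorted(zip(best, scores[best]), key=lambda x: -x[1]):
        print(f"artist id: {_id}, artist_name: {artist_names[_id]} score: {score:.5f}")

recommend_for_user(0, artist_factor, user_factor)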
Hello... I went through your article. Have you done this part?
Next, I went over one of the most popular research papers on a factor model specially tailored for implicit-feedback recommenders. We also implemented factorization-based recommender systems in Python for both explicit and implicit datasets.
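For reference, the factorization in the gist above follows the standard implicit-feedback ALS formulation (Hu, Koren and Volinsky, "Collaborative Filtering for Implicit Feedback Datasets"). Assuming $p_{ui} = 1$ for every observed (artist, user) pair and confidence $c_{ui}$ equal to the BM25-weighted play count for observed pairs (and 1 otherwise), the code minimizes

$$\min_{x_*, y_*} \sum_{u,i} c_{ui}\,(p_{ui} - x_u^\top y_i)^2 + \lambda \Big(\sum_u \|x_u\|^2 + \sum_i \|y_i\|^2\Big)$$

and `_least_squares` solves the resulting closed-form update $x_u = (Y^\top C_u Y + \lambda I)^{-1} Y^\top C_u p(u)$ for one side of the factorization while the other side is held fixed.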