Last active
April 22, 2019 07:45
-
-
Save aclisp/5b30eb4442c5e9f3099c61290419b9be to your computer and use it in GitHub Desktop.
Recommendation using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def calc_precision_recall(recommend_method, training_data, testing_data, user_colname, item_colname, sim_matrix, top_k):
    """Compute overall precision and recall of a recommender on a test set.

    Only users that appear in both ``training_data`` and ``testing_data``
    are evaluated.  For each of them ``recommend_method`` is called as
    ``recommend_method(training_data, user_colname, item_colname, user,
    sim_matrix, top_k)`` and must return an iterable of entries whose first
    element is the recommended item.

    Args:
        recommend_method: Callable producing the recommendations.
        training_data: DataFrame passed through to ``recommend_method``.
        testing_data: DataFrame holding the held-out (user, item) rows.
        user_colname: Name of the user column in both frames.
        item_colname: Name of the item column in both frames.
        sim_matrix: Passed through unchanged to ``recommend_method``.
        top_k: Passed to ``recommend_method``; also the per-user
            denominator used for precision.

    Returns:
        ``[precision, recall]`` as floats.  A ratio whose denominator is
        zero (e.g. no user is shared between the two frames) is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    train_users = set(training_data[user_colname].unique())
    test_users = set(testing_data[user_colname].unique())

    hit = 0
    n_recall = 0
    n_precision = 0
    for user in test_users & train_users:
        rank = recommend_method(training_data, user_colname, item_colname, user, sim_matrix, top_k)
        recommended = {entry[0] for entry in rank}
        relevant = set(
            testing_data.loc[testing_data[user_colname] == user, item_colname].unique()
        )
        hit += len(recommended & relevant)
        n_recall += len(relevant)
        # Precision is measured against the requested list length, even if
        # the recommender returned fewer entries.
        n_precision += top_k

    # Guard both ratios: with no evaluable users the denominators are zero.
    precision = float(hit) / n_precision if n_precision else 0.0
    recall = float(hit) / n_recall if n_recall else 0.0
    return [precision, recall]
def calc_coverage(recommend_method, training_data, testing_data, user_colname, item_colname, sim_matrix, top_k):
    """Return the fraction of training items that get recommended to anyone.

    ``recommend_method`` is called once per distinct user in
    ``training_data`` as ``recommend_method(training_data, user_colname,
    item_colname, user, sim_matrix, top_k)`` and must return an iterable of
    ``(item, score)`` pairs.

    Args:
        recommend_method: Callable producing the recommendations.
        training_data: DataFrame of (user, item) rows.
        testing_data: Unused; kept so all metric functions share a signature.
        user_colname: Name of the user column.
        item_colname: Name of the item column.
        sim_matrix: Passed through unchanged to ``recommend_method``.
        top_k: Passed through unchanged to ``recommend_method``.

    Returns:
        ``len(recommended items) / len(catalogue items)`` as a float, or
        ``0.0`` when the catalogue is empty.
    """
    # Collect the catalogue in one pass instead of re-filtering the frame
    # for every user.  Rows with a missing user are left out, which is what
    # an equality filter on the user column would do as well.
    has_user = training_data[user_colname].notna()
    all_items = set(training_data.loc[has_user, item_colname])

    recommend_items = set()
    for user in training_data[user_colname].unique():
        rank = recommend_method(training_data, user_colname, item_colname, user, sim_matrix, top_k)
        recommend_items.update(item for item, _score in rank)

    if not all_items:
        # Nothing to cover; avoid dividing by zero.
        return 0.0
    return len(recommend_items) / float(len(all_items))
def calc_popularity(recommend_method, training_data, testing_data, user_colname, item_colname, sim_matrix, top_k):
    """Return the mean log-popularity of the items recommended to training users.

    An item's popularity is the number of non-null user entries it has in
    ``training_data``.  ``recommend_method`` is called once per distinct
    training user as ``recommend_method(training_data, user_colname,
    item_colname, user, sim_matrix, top_k)`` and must return an iterable of
    ``(item, score)`` pairs; each recommended item contributes
    ``log(1 + popularity)`` to the average.

    Args:
        recommend_method: Callable producing the recommendations.
        training_data: DataFrame of (user, item) rows.
        testing_data: Unused; kept so all metric functions share a signature.
        user_colname: Name of the user column.
        item_colname: Name of the item column.
        sim_matrix: Passed through unchanged to ``recommend_method``.
        top_k: Passed through unchanged to ``recommend_method``.

    Returns:
        The average as a float, or ``0.0`` when nothing was recommended.

    Raises:
        KeyError: If a recommended item does not occur in ``training_data``.
    """
    # Build {item: count} directly from the grouped Series.  Reading the
    # counts back through ``itertuples()`` + ``getattr`` fails for column
    # names that are not valid identifiers, because such namedtuple fields
    # are renamed positionally.
    item_popularity = training_data.groupby(item_colname)[user_colname].count().to_dict()

    total = 0.0
    n = 0
    for user in training_data[user_colname].unique():
        rank = recommend_method(training_data, user_colname, item_colname, user, sim_matrix, top_k)
        for item, _score in rank:
            total += math.log(1 + item_popularity[item])
            n += 1

    if n == 0:
        # No recommendations at all; avoid dividing by zero.
        return 0.0
    return total / n
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def recommend_item_cf(training_data, user_colname, item_colname, target_user, sim_matrix, top_k):
    """Recommend items to ``target_user`` with item-based collaborative filtering.

    For every item the user already has in ``training_data``, the ``top_k``
    most similar *other* items are read from ``sim_matrix``; neighbours the
    user does not have yet accumulate the similarity as their score.

    Args:
        training_data: DataFrame of (user, item) rows.
        user_colname: Name of the user column.
        item_colname: Name of the item column.
        target_user: Value in ``user_colname`` to recommend for.
        sim_matrix: DataFrame indexed by item on both axes; row ``i`` holds
            the similarity of item ``i`` to every item.
        top_k: Neighbourhood size per item and maximum length of the result.

    Returns:
        Up to ``top_k`` ``(item, score)`` tuples, highest score first.

    Raises:
        KeyError: If one of the user's items has no row in ``sim_matrix``.
    """
    user_items = training_data.loc[training_data[user_colname] == target_user, item_colname].unique()
    owned = set(user_items)  # constant-time membership checks in the inner loop

    rank = {}
    for item_i in user_items:
        # Drop the item's own entry before picking neighbours; otherwise the
        # self-similarity on the diagonal takes one of the top_k slots and is
        # then thrown away as "already owned".
        neighbours = (
            sim_matrix.loc[item_i]
            .drop(item_i, errors='ignore')
            .sort_values(ascending=False)
            .iloc[:top_k]
        )
        for item_j, similarity in neighbours.items():
            # print(item_i, item_j, similarity)
            if item_j in owned:
                continue
            rank[item_j] = rank.get(item_j, 0) + similarity

    return sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:top_k]
# Demo: build the item-item similarity table for the sample data, then ask the
# item-based recommender for up to five songs for user '001'.  The result of
# the second call is not stored.
my_matrix = similarity_matrix(my_data, user_colname='user_id', item_colname='song')
recommend_item_cf(
    training_data=my_data,
    user_colname='user_id',
    item_colname='song',
    target_user='001',
    sim_matrix=my_matrix,
    top_k=5,
)
def recommend_most_popular(training_data, user_colname, item_colname, target_user, sim_matrix, top_k):
    """Recommend the globally most frequent items.

    Items are scored by the number of non-null user entries they have in
    ``training_data`` and ordered by score (descending), ties broken by the
    item value (ascending).  ``target_user`` and ``sim_matrix`` are not
    used; they exist so every recommender shares one call signature.

    Returns:
        Up to ``top_k`` ``[item, score]`` lists, best first.
    """
    counts = training_data.groupby([item_colname]).agg({user_colname: 'count'})
    scored = counts.reset_index()
    scored.rename(columns={user_colname: 'score'}, inplace=True)
    ordered = scored.sort_values(by=['score', item_colname], ascending=[0, 1])
    best = ordered.head(top_k)
    return best.values.tolist()
def recommend_random(training_data, user_colname, item_colname, target_user, sim_matrix, top_k):
    """Recommend distinct items drawn uniformly at random from the training set.

    ``user_colname``, ``target_user`` and ``sim_matrix`` are not used; they
    exist so every recommender shares one call signature.

    Returns:
        A list of ``(item, 0)`` tuples.  It holds ``top_k`` entries, or one
        per distinct item when the catalogue has fewer than ``top_k`` items.
    """
    items = list(training_data[item_colname].unique())
    # random.sample raises ValueError when asked for more elements than the
    # population holds, so cap the request at the catalogue size.
    sample_size = min(top_k, len(items))
    return [(item, 0) for item in random.sample(items, sample_size)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample listening data: one row per (user, song) pair.
my_data = pandas.DataFrame(
    data=[
        ['001', 'A'], ['001', 'B'], ['001', 'D'],
        ['002', 'B'], ['002', 'C'], ['002', 'E'],
        ['003', 'C'], ['003', 'D'],
        ['004', 'B'], ['004', 'C'], ['004', 'D'],
        ['005', 'A'], ['005', 'D'],
    ],
    columns=['user_id', 'song'],
)
def similarity_matrix(training_data, user_colname, item_colname):
    """Build an item-item similarity table using the Jaccard index.

    The similarity of two items is ``|U_i & U_j| / |U_i | U_j|`` where
    ``U_x`` is the set of users that have item ``x`` in ``training_data``.
    Pairs with no user in common get ``0``.

    Args:
        training_data: DataFrame of (user, item) rows.
        user_colname: Name of the user column.
        item_colname: Name of the item column.

    Returns:
        A square DataFrame of floats whose index and columns are the
        distinct items, in order of first appearance.
    """
    all_items = training_data[item_colname].unique()
    # print("all_items", all_items)

    # users_by_item[k] is the set of users that have all_items[k].
    users_by_item = [
        set(training_data.loc[training_data[item_colname] == item, user_colname].unique())
        for item in all_items
    ]
    # print("users_by_item", users_by_item)

    n_items = len(all_items)
    # Plain ndarray instead of np.matrix, which NumPy no longer recommends.
    similarity = np.zeros((n_items, n_items), dtype=float)
    for i, users_i in enumerate(users_by_item):
        # The Jaccard index is symmetric, so compute each pair once and
        # mirror it across the diagonal.
        for j in range(i, n_items):
            users_j = users_by_item[j]
            shared = len(users_i & users_j)
            if shared:
                score = shared / float(len(users_i | users_j))
                similarity[i, j] = score
                similarity[j, i] = score

    return pandas.DataFrame(similarity, index=all_items, columns=all_items)
# Demo: compute the item-item similarity table for the sample data.  The
# result is not stored.
similarity_matrix(my_data, user_colname='user_id', item_colname='song')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment