Skip to content

Instantly share code, notes, and snippets.

@accessnash
Created June 14, 2020 09:44
Show Gist options
  • Select an option

  • Save accessnash/c320ad9345e838ff9440541ffc5ac7e1 to your computer and use it in GitHub Desktop.

Select an option

Save accessnash/c320ad9345e838ff9440541ffc5ac7e1 to your computer and use it in GitHub Desktop.
A basic movie recommendation system using 100,000 data points and 1600+ movies
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 13 09:39:13 2020
@author: User
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Title lookup table; it must provide the 'item_id' column used as the merge
# key below.
movie_titles_df = pd.read_csv('Movie_Id_Titles')

# Ratings file: tab-separated and read without a header row, so the column
# names are supplied explicitly.
movie_rating_df = pd.read_csv(
    'u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
# The timestamp column is discarded right after loading.
movie_rating_df = movie_rating_df.drop(columns=['timestamp'])

# Attach the title columns to every rating row (default inner join on item_id).
movies_rating_df = movie_rating_df.merge(movie_titles_df, on='item_id')
# Quick look at the merged ratings table. The bare expressions in this section
# only display a result in an interactive session; they assign nothing.
movies_rating_df.describe()
movies_rating_df.groupby('title').describe()

# Per-title rating statistics. describe() is evaluated once and both columns
# are taken from that single result instead of recomputing it per column.
_rating_stats = movies_rating_df.groupby('title')['rating'].describe()
ratings_df_mean = _rating_stats['mean']
ratings_df_count = _rating_stats['count']
ratings_all_df = pd.concat([ratings_df_mean, ratings_df_count], axis=1)

# Display only: the result is not assigned, so ratings_all_df stays indexed
# by title.
ratings_all_df.reset_index()

# Each histogram gets its own figure. Without a new figure, Series.plot draws
# onto the current axes and the second histogram would overlay the first.
plt.figure()
ratings_all_df['mean'].plot(bins=100, kind='hist', color='b')
plt.figure()
ratings_all_df['count'].plot(bins=100, kind='hist', color='b')

# Titles whose mean rating is exactly 5.
ratings_all_df[ratings_all_df['mean'] == 5]

# The 100 most-rated and the 100 least-rated titles.
ratings_all_df.sort_values('count', ascending=False).head(100)
ratings_all_df.sort_values('count', ascending=True).head(100)
# User x movie matrix: one row per user_id, one column per title, each cell
# holding that user's rating for the title (NaN where there is none).
userid_movie_matrix = pd.pivot_table(
    movies_rating_df,
    index='user_id',
    columns='title',
    values='rating',
)

# Correlate every movie's rating column with the ratings of one chosen title.
fargo = userid_movie_matrix['Fargo (1996)']
fargo_corr = pd.DataFrame(
    userid_movie_matrix.corrwith(fargo),
    columns=['Correlations'],
)

# Add the per-title rating count, then drop rows with any missing value.
fargo_corr = fargo_corr.join(ratings_all_df['count']).dropna()

# Display only: all titles ranked by correlation, then the same ranking
# restricted to titles with more than 100 ratings.
fargo_corr.sort_values(by='Correlations', ascending=False)
fargo_corr.loc[fargo_corr['count'] > 100].sort_values(
    by='Correlations', ascending=False
)
# Pairwise Pearson correlation between all movie columns. A pair needs at
# least 100 overlapping observations (min_periods); otherwise its entry is NaN.
movie_correlations = userid_movie_matrix.corr(method='pearson', min_periods=100)

# Personal ratings file; it must provide the 'Movie Name' and 'Ratings' columns.
myRatings = pd.read_csv('My_Ratings.csv')

# For every movie rated in My_Ratings.csv, take its correlation with all other
# movies and weight it by the rating given. All rows of the file are used,
# rather than a hard-coded count of two.
_weighted_scores = []
for movie_name, my_rating in zip(myRatings['Movie Name'], myRatings['Ratings']):
    similar_movie = movie_correlations[movie_name].dropna()
    _weighted_scores.append(similar_movie * my_rating)

# Series.append was removed in pandas 2.0; pd.concat is the replacement.
# An explicit dtype keeps the empty case well-defined when no ratings exist.
if _weighted_scores:
    similar_movies_list = pd.concat(_weighted_scores)
else:
    similar_movies_list = pd.Series(dtype='float64')

# Highest weighted correlation first; a title can appear once per rated movie.
similar_movies_list.sort_values(inplace=True, ascending=False)
print(similar_movies_list.head(10))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment