Created
June 14, 2020 09:44
-
-
Save accessnash/c320ad9345e838ff9440541ffc5ac7e1 to your computer and use it in GitHub Desktop.
A basic movie recommendation system using 100,000 data points and 1600+ movies
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat Jun 13 09:39:13 2020 | |
| @author: User | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
# Movie title lookup table; it is merged on 'item_id' and grouped by 'title' below.
movie_titles_df = pd.read_csv('Movie_Id_Titles')

# Tab-separated ratings file. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
movie_rating_df = pd.read_csv(
    'u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
# The timestamp column is not used by any later step.
movie_rating_df = movie_rating_df.drop(columns=['timestamp'])

# Attach the title to every individual rating row.
movies_rating_df = pd.merge(movie_rating_df, movie_titles_df, on='item_id')

# Exploratory summaries. Bare expressions like these only display a result
# when the script is run interactively (e.g. cell by cell in an IPython console).
movies_rating_df.describe()
movies_rating_df.groupby('title').describe()

# Per-title rating statistics, computed once and reused for both columns.
rating_stats = movies_rating_df.groupby('title')['rating'].describe()
ratings_df_mean = rating_stats['mean']
ratings_df_count = rating_stats['count']
ratings_all_df = pd.concat([ratings_df_mean, ratings_df_count], axis=1)
# NOTE: the index is intentionally left as 'title'. The original called
# ratings_all_df.reset_index() here without keeping the result (a no-op);
# actually resetting the index would break the later title-indexed join
# against ratings_all_df['count'].

# Series.plot draws on the current axes when a figure already exists, so each
# histogram gets its own figure instead of being overlaid on the previous one.
plt.figure()
ratings_all_df['mean'].plot(bins=100, kind='hist', color='b')
plt.figure()
ratings_all_df['count'].plot(bins=100, kind='hist', color='b')

# Interactive inspection: titles with a perfect mean rating, then the 100
# most-rated and the 100 least-rated titles.
ratings_all_df[ratings_all_df['mean'] == 5]
ratings_all_df.sort_values('count', ascending=False).head(100)
ratings_all_df.sort_values('count', ascending=True).head(100)
# Reshape the long ratings table into a grid with one row per user_id and one
# column per movie title, holding the rating values.
userid_movie_matrix = movies_rating_df.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
)

# Correlate a single movie's rating column against every other movie's column.
fargo = userid_movie_matrix['Fargo (1996)']
fargo_corr = pd.DataFrame(
    userid_movie_matrix.corrwith(fargo),
    columns=['Correlations'],
)
# Add each title's rating count (joined on the title index) and discard rows
# that have a missing value.
fargo_corr = fargo_corr.join(ratings_all_df['count']).dropna()

# Interactive inspection: all correlations, then only titles with more than
# 100 ratings, both ordered from most to least correlated.
fargo_corr.sort_values(by='Correlations', ascending=False)
fargo_corr.loc[fargo_corr['count'] > 100].sort_values(by='Correlations', ascending=False)
# Pearson correlation between every pair of movie columns. min_periods=100
# leaves a pair as NaN unless at least 100 users rated both movies.
movie_correlations = userid_movie_matrix.corr(method='pearson', min_periods=100)

# Personal ratings; the file must provide 'Movie Name' and 'Ratings' columns.
myRatings = pd.read_csv('My_Ratings.csv')

# For every movie the user rated, take its correlations with all other movies
# and weight them by the user's rating. Every row of the ratings file is used
# (the original hard-coded the first two rows only).
weighted_correlations = [
    movie_correlations[movie_name].dropna() * rating
    for movie_name, rating in zip(myRatings['Movie Name'], myRatings['Ratings'])
]

# Series.append was removed in pandas 2.0, so concatenate all pieces once.
# pd.concat raises on an empty list, hence the explicit empty fallback.
if weighted_correlations:
    similar_movies_list = pd.concat(weighted_correlations)
else:
    similar_movies_list = pd.Series(dtype='float64')

# NOTE(review): a title correlated with several rated movies appears once per
# rated movie, and the rated movies themselves are included (self-correlation).
# Confirm whether scores should be summed per title and rated titles dropped.
similar_movies_list = similar_movies_list.sort_values(ascending=False)
print(similar_movies_list.head(10))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment