Skip to content

Instantly share code, notes, and snippets.

# Load the ratings table. The file has no header row, so column names are
# supplied explicitly.
ratings_df = pd.read_csv(
    f'{BASE_FOLDER}/MovieLens-1M/ratings.dat',
    sep='::',  # multi-character separator, hence the python parsing engine
    engine='python',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'time'],
)
# Preview the first rows.
ratings_df.head()
# Count users for every (occupation, gender) pair.
users_df_occupation_by_gender = (
    users_df
    .groupby(['occupation', 'gender'])
    .size()
    .reset_index(name='occupation_by_gender_count')
)

# Total number of users per gender, used as the normalising denominator.
gender_count = users_df['gender'].value_counts()

# Divide each pair count by the total for that row's gender. After this step
# the 'occupation_by_gender_count' column holds a fraction of the gender's
# users rather than a raw count.
gender_total_per_row = users_df_occupation_by_gender['gender'].map(gender_count)
users_df_occupation_by_gender['occupation_by_gender_count'] = (
    users_df_occupation_by_gender['occupation_by_gender_count'].div(gender_total_per_row)
)
users_df_occupation_by_gender

# Grouped bar chart: one bar per gender for every occupation.
occupation_by_gender_plot = px.bar(
    users_df_occupation_by_gender,
    x='occupation',
    y='occupation_by_gender_count',
    color='gender',
    barmode='group',
)
occupation_by_gender_plot
user_id gender age occupation zip_code
0 1 F 1 K-12 student 48067
1 2 M 56 self-employed 70072
2 3 M 25 scientist 55117
3 4 M 45 executive/managerial 02460
4 5 M 25 writer 55455
# Build the occupation-code -> occupation-label lookup by parsing the dataset
# README. The file is read inside a `with` block so the handle is closed
# promptly instead of being left to the garbage collector.
with open(f'{BASE_FOLDER}/MovieLens-1M/README') as readme_file:
    readme_text = np.array(readme_file.read().splitlines())

# Locate the first line mentioning each marker; the occupation listing sits
# between them. `np.char.find` is the public spelling of
# `np.core.defchararray.find` (the `np.core` path is private/deprecated).
start_index = np.flatnonzero(np.char.find(readme_text, 'Occupation is chosen') != -1)[0]
end_index = np.flatnonzero(np.char.find(readme_text, 'MOVIES FILE DESCRIPTION') != -1)[0]

# Skip the two leading lines and the last line of the section, then keep the
# text between the first pair of double quotes on each remaining line.
occupation_list = [x.split('"')[1] for x in readme_text[start_index:end_index][2:-1].tolist()]

# Keys are the 0-based positions of the labels in the listing.
occupation_dict = dict(enumerate(occupation_list))
# Load the user table. The file has no header row, so column names are
# supplied explicitly.
users_df = pd.read_csv(
    f'{BASE_FOLDER}/MovieLens-1M/users.dat',
    sep='::',  # multi-character separator, hence the python parsing engine
    engine='python',
    header=None,
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
)

# Swap occupation codes for their labels; values that are not keys of
# occupation_dict are left unchanged by `replace`.
users_df['occupation'] = users_df['occupation'].replace(occupation_dict)
# Extract the first parenthesised run of digits from each title as the year.
# The pattern is a raw string so `\(` and `\d` reach the regex engine verbatim
# (in a plain string they are invalid escapes and trigger a warning on recent
# Python versions). `.group(1)` returns the captured text as a string; a title
# with no match still raises AttributeError, as before.
movies_df['year'] = movies_df['movie_name'].apply(
    lambda movie_name: re.search(r'\((\d*)\)', movie_name).group(1)
)

# Histogram of movies per year, with bars ordered from most to fewest movies.
movie_count_by_year = px.histogram(
    movies_df, x='year', height=400, title='Movie count by year'
).update_xaxes(categoryorder="total descending")
movie_count_by_year
# Turn each pipe-delimited genre string into a list of genre names.
movies_df['genre'] = [genres.split('|') for genres in movies_df['genre']]

# One row per (movie, genre) pair so each genre is counted separately.
movies_df_exploded = movies_df.explode('genre')

# Histogram of movies per genre, with bars ordered from most to fewest movies.
(
    px.histogram(
        movies_df_exploded,
        x='genre',
        height=400,
        title='Movie count by genre',
    )
    .update_xaxes(categoryorder="total descending")
)
movie_name genre
1 Toy Story (1995) Animation|Children's|Comedy
2 Jumanji (1995) Adventure|Children's|Fantasy
3 Grumpier Old Men (1995) Comedy|Romance
4 Waiting to Exhale (1995) Comedy|Drama
5 Father of the Bride Part II (1995) Comedy
# Load the movie table. The file has no header row. Only two column names are
# given; NOTE(review): the pasted output shows a numeric index starting at 1,
# which suggests a leading id field is being used as the index. Confirm
# against the data file.
movies_df = pd.read_csv(
    f'{BASE_FOLDER}/MovieLens-1M/movies.dat',
    sep='::',  # multi-character separator, hence the python parsing engine
    engine='python',
    header=None,
    names=['movie_name', 'genre'],
)
# Preview the first rows.
movies_df.head()
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
import wikipedia
import numpy as np
from tqdm.notebook import tqdm
import re
import os
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset
import sys
from PyQt6.QtWidgets import QApplication, QHBoxLayout, QMainWindow, QPushButton, QSpinBox, QVBoxLayout, QWidget, QFileDialog, QLabel, QErrorMessage
from PyQt6.QtGui import QImage, QPixmap
import cv2
import numpy as np