Skip to content

Instantly share code, notes, and snippets.

@vatsalsaglani
Created October 8, 2022 13:18
Show Gist options
  • Save vatsalsaglani/3c1e3490b0871528743f6ac31bec7176 to your computer and use it in GitHub Desktop.
Save vatsalsaglani/3c1e3490b0871528743f6ac31bec7176 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import random\n",
"import sys\n",
"sys.path.append(\"../\")\n",
"from constants import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the ml-25m movie catalogue (columns: movieId, title, genres).\n",
"movies_df = pd.read_csv(filepath_or_buffer=\"../data/ml-25m/ml-25m/movies.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"62423"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows (titles) in the movie catalogue.\n",
"movies_df.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>62413</th>\n",
" <td>209145</td>\n",
" <td>Liberté (2019)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62414</th>\n",
" <td>209147</td>\n",
" <td>The Carpet of Horror (1962)</td>\n",
" <td>Crime|Horror</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62415</th>\n",
" <td>209151</td>\n",
" <td>Mao Zedong 1949 (2019)</td>\n",
" <td>(no genres listed)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62416</th>\n",
" <td>209153</td>\n",
" <td>Happy Flight (2008)</td>\n",
" <td>Comedy|Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62417</th>\n",
" <td>209155</td>\n",
" <td>Santosh Subramaniam (2008)</td>\n",
" <td>Action|Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62418</th>\n",
" <td>209157</td>\n",
" <td>We (2018)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62419</th>\n",
" <td>209159</td>\n",
" <td>Window of the Soul (2001)</td>\n",
" <td>Documentary</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62420</th>\n",
" <td>209163</td>\n",
" <td>Bad Poems (2018)</td>\n",
" <td>Comedy|Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62421</th>\n",
" <td>209169</td>\n",
" <td>A Girl Thing (2001)</td>\n",
" <td>(no genres listed)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62422</th>\n",
" <td>209171</td>\n",
" <td>Women of Devil's Island (1962)</td>\n",
" <td>Action|Adventure|Drama</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title genres\n",
"62413 209145 Liberté (2019) Drama\n",
"62414 209147 The Carpet of Horror (1962) Crime|Horror\n",
"62415 209151 Mao Zedong 1949 (2019) (no genres listed)\n",
"62416 209153 Happy Flight (2008) Comedy|Drama\n",
"62417 209155 Santosh Subramaniam (2008) Action|Comedy|Romance\n",
"62418 209157 We (2018) Drama\n",
"62419 209159 Window of the Soul (2001) Documentary\n",
"62420 209163 Bad Poems (2018) Comedy|Drama\n",
"62421 209169 A Girl Thing (2001) (no genres listed)\n",
"62422 209171 Women of Devil's Island (1962) Action|Adventure|Drama"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the last ten catalogue rows.\n",
"movies_df.iloc[-10:]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Load the ml-25m ratings (columns: userId, movieId, rating, timestamp).\n",
"ratings_df = pd.read_csv(filepath_or_buffer=\"../data/ml-25m/ml-25m/ratings.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"25000095"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of individual ratings loaded.\n",
"ratings_df.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>296</td>\n",
" <td>5.0</td>\n",
" <td>1147880044</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>306</td>\n",
" <td>3.5</td>\n",
" <td>1147868817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>307</td>\n",
" <td>5.0</td>\n",
" <td>1147868828</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>665</td>\n",
" <td>5.0</td>\n",
" <td>1147878820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>899</td>\n",
" <td>3.5</td>\n",
" <td>1147868510</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 296 5.0 1147880044\n",
"1 1 306 3.5 1147868817\n",
"2 1 307 5.0 1147868828\n",
"3 1 665 5.0 1147878820\n",
"4 1 899 3.5 1147868510"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First five ratings, still in the order they were read from disk.\n",
"ratings_df.iloc[:5]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Order every rating chronologically. Many ratings share a timestamp, so use a\n",
"# stable sort (mergesort): ties keep their existing row order instead of the\n",
"# arbitrary order that the default, non-stable quicksort may produce.\n",
"# Rebinding (rather than inplace=True) keeps the cell safe to re-run.\n",
"ratings_df = ratings_df.sort_values(by=\"timestamp\", kind=\"mergesort\")\n",
"# Collapse to one row per user; every other column becomes a list whose\n",
"# elements follow the chronological order established above.\n",
"grouped_ratings = ratings_df.groupby(by=\"userId\").agg(list)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" <tr>\n",
" <th>userId</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[5952, 2012, 2011, 1653, 1250, 6539, 6377, 344...</td>\n",
" <td>[4.0, 2.5, 2.5, 4.0, 4.0, 3.5, 4.0, 4.0, 4.0, ...</td>\n",
" <td>[1147868053, 1147868068, 1147868079, 114786809...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[2797, 5952, 1080, 553, 653, 497, 1374, 1653, ...</td>\n",
" <td>[1.0, 5.0, 1.0, 2.0, 3.0, 4.0, 4.5, 4.5, 3.0, ...</td>\n",
" <td>[1141415509, 1141415528, 1141415532, 114141553...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[356, 593, 1270, 1, 480, 2571, 260, 318, 1196,...</td>\n",
" <td>[4.0, 4.0, 3.5, 4.0, 2.0, 4.0, 4.0, 4.0, 4.0, ...</td>\n",
" <td>[1439472199, 1439472203, 1439472211, 143947221...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[97913, 93510, 91630, 93840, 195159, 122914, 1...</td>\n",
" <td>[3.5, 4.0, 3.5, 4.5, 5.0, 3.0, 2.0, 3.5, 2.5, ...</td>\n",
" <td>[1573937091, 1573937096, 1573937103, 157393711...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[592, 590, 296, 150, 344, 153, 588, 595, 231, ...</td>\n",
" <td>[3.0, 3.0, 4.0, 5.0, 4.0, 3.0, 4.0, 3.0, 4.0, ...</td>\n",
" <td>[830786155, 830786155, 830786155, 830786155, 8...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>[2421, 1466, 161, 902, 858, 2815, 1183, 1704, ...</td>\n",
" <td>[3.0, 3.0, 2.0, 4.0, 5.0, 3.0, 2.0, 5.0, 5.0, ...</td>\n",
" <td>[945141530, 945141530, 945141530, 945141564, 9...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[590, 296, 592, 150, 153, 165, 344, 588, 595, ...</td>\n",
" <td>[3.0, 4.0, 3.0, 4.0, 3.0, 3.0, 2.0, 4.0, 4.0, ...</td>\n",
" <td>[835444730, 835444730, 835444730, 835444730, 8...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>[1672, 1617, 1777, 1721, 1704, 551, 903, 110, ...</td>\n",
" <td>[4.0, 5.0, 3.0, 4.0, 4.0, 2.0, 4.0, 5.0, 3.0, ...</td>\n",
" <td>[890489203, 890489203, 890489236, 890489263, 8...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>[1073, 260, 1356, 805, 1210, 667, 1367, 61, 85...</td>\n",
" <td>[5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 3.0, ...</td>\n",
" <td>[859381992, 859382015, 859382042, 859382042, 8...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>[1962, 2915, 2605, 4361, 193, 3361, 3863, 1347...</td>\n",
" <td>[3.0, 3.0, 3.5, 3.0, 1.0, 3.0, 4.0, 3.0, 2.0, ...</td>\n",
" <td>[1227570828, 1227570836, 1227570841, 122757085...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId \\\n",
"userId \n",
"1 [5952, 2012, 2011, 1653, 1250, 6539, 6377, 344... \n",
"2 [2797, 5952, 1080, 553, 653, 497, 1374, 1653, ... \n",
"3 [356, 593, 1270, 1, 480, 2571, 260, 318, 1196,... \n",
"4 [97913, 93510, 91630, 93840, 195159, 122914, 1... \n",
"5 [592, 590, 296, 150, 344, 153, 588, 595, 231, ... \n",
"6 [2421, 1466, 161, 902, 858, 2815, 1183, 1704, ... \n",
"7 [590, 296, 592, 150, 153, 165, 344, 588, 595, ... \n",
"8 [1672, 1617, 1777, 1721, 1704, 551, 903, 110, ... \n",
"9 [1073, 260, 1356, 805, 1210, 667, 1367, 61, 85... \n",
"10 [1962, 2915, 2605, 4361, 193, 3361, 3863, 1347... \n",
"\n",
" rating \\\n",
"userId \n",
"1 [4.0, 2.5, 2.5, 4.0, 4.0, 3.5, 4.0, 4.0, 4.0, ... \n",
"2 [1.0, 5.0, 1.0, 2.0, 3.0, 4.0, 4.5, 4.5, 3.0, ... \n",
"3 [4.0, 4.0, 3.5, 4.0, 2.0, 4.0, 4.0, 4.0, 4.0, ... \n",
"4 [3.5, 4.0, 3.5, 4.5, 5.0, 3.0, 2.0, 3.5, 2.5, ... \n",
"5 [3.0, 3.0, 4.0, 5.0, 4.0, 3.0, 4.0, 3.0, 4.0, ... \n",
"6 [3.0, 3.0, 2.0, 4.0, 5.0, 3.0, 2.0, 5.0, 5.0, ... \n",
"7 [3.0, 4.0, 3.0, 4.0, 3.0, 3.0, 2.0, 4.0, 4.0, ... \n",
"8 [4.0, 5.0, 3.0, 4.0, 4.0, 2.0, 4.0, 5.0, 3.0, ... \n",
"9 [5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 3.0, ... \n",
"10 [3.0, 3.0, 3.5, 3.0, 1.0, 3.0, 4.0, 3.0, 2.0, ... \n",
"\n",
" timestamp \n",
"userId \n",
"1 [1147868053, 1147868068, 1147868079, 114786809... \n",
"2 [1141415509, 1141415528, 1141415532, 114141553... \n",
"3 [1439472199, 1439472203, 1439472211, 143947221... \n",
"4 [1573937091, 1573937096, 1573937103, 157393711... \n",
"5 [830786155, 830786155, 830786155, 830786155, 8... \n",
"6 [945141530, 945141530, 945141530, 945141564, 9... \n",
"7 [835444730, 835444730, 835444730, 835444730, 8... \n",
"8 [890489203, 890489203, 890489236, 890489263, 8... \n",
"9 [859381992, 859382015, 859382042, 859382042, 8... \n",
"10 [1227570828, 1227570836, 1227570841, 122757085... "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the per-user rating histories for the first ten users.\n",
"grouped_ratings.iloc[:10]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Re-index the sparse raw movieIds (ascending) onto consecutive integers that\n",
"# start at 2. NOTE(review): ids 0 and 1 are left unused, presumably reserved\n",
"# for special tokens - confirm against the constants module.\n",
"movieIdMapping = {\n",
"    raw_id: dense_id\n",
"    for dense_id, raw_id in enumerate(sorted(ratings_df.movieId.unique()), start=2)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59047"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct movies that received at least one rating.\n",
"len(movieIdMapping.keys())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"62423"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Catalogue row count, to compare against len(movieIdMapping): the mapping is\n",
"# built from ratings_df only, so titles without ratings have no mapped id.\n",
"movies_df.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"209171"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Largest raw movieId present in the mapping (iterating a dict yields its keys).\n",
"max(movieIdMapping)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Smallest raw movieId present in the mapping (iterating a dict yields its keys).\n",
"min(movieIdMapping)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59048"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Largest dense id handed out by the mapping.\n",
"max(dense_id for dense_id in movieIdMapping.values())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Smallest dense id handed out by the mapping (the numbering starts at 2).\n",
"min(dense_id for dense_id in movieIdMapping.values())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Attach the dense id to every rating. Every movieId in this frame is a key of\n",
"# movieIdMapping, because the mapping was built from this same column.\n",
"ratings_df[\"movieId_mapped\"] = ratings_df[\"movieId\"].map(movieIdMapping)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>movieId_mapped</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>326761</th>\n",
" <td>2262</td>\n",
" <td>21</td>\n",
" <td>3.0</td>\n",
" <td>789652009</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>326810</th>\n",
" <td>2262</td>\n",
" <td>1079</td>\n",
" <td>3.0</td>\n",
" <td>789652009</td>\n",
" <td>1054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>326767</th>\n",
" <td>2262</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>789652009</td>\n",
" <td>48</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15845015</th>\n",
" <td>102689</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>822873600</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15845023</th>\n",
" <td>102689</td>\n",
" <td>39</td>\n",
" <td>5.0</td>\n",
" <td>822873600</td>\n",
" <td>40</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId movieId rating timestamp movieId_mapped\n",
"326761 2262 21 3.0 789652009 22\n",
"326810 2262 1079 3.0 789652009 1054\n",
"326767 2262 47 5.0 789652009 48\n",
"15845015 102689 1 4.0 822873600 2\n",
"15845023 102689 39 5.0 822873600 40"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First five ratings after sorting, now carrying the movieId_mapped column.\n",
"ratings_df.iloc[:5]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59048"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Upper bound of the dense ids actually used in the ratings.\n",
"ratings_df[\"movieId_mapped\"].max()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lower bound of the dense ids actually used in the ratings.\n",
"ratings_df[\"movieId_mapped\"].min()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# movies.csv lists more titles than ever appear in ratings.csv, so some movieIds\n",
"# have no entry in movieIdMapping and map to NaN. Cast to the nullable Int64\n",
"# dtype so mapped ids stay integers (2 rather than 2.0) and unrated titles are\n",
"# explicit <NA> values instead of silently turning the column into float64.\n",
"movies_df[\"movieId_mapped\"] = movies_df[\"movieId\"].map(movieIdMapping).astype(\"Int64\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Persist the catalogue together with its movieId_mapped column (row index omitted).\n",
"movies_df.to_csv(path_or_buf=\"../data/ml-25m/ml-25m/movies_mapped.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Persist the time-sorted ratings together with movieId_mapped (row index omitted).\n",
"ratings_df.to_csv(path_or_buf=\"../data/ml-25m/ml-25m/ratings_mapped.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "ff9c16f4f11009bb918bd4cbef0c02902e53456483176d7e27b50617b808988a"
},
"kernelspec": {
"display_name": "Python 3.7.10 ('clustering')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment