Created
October 17, 2016 13:17
-
-
Save pilipolio/5b6a373f8be14ed9035a22272bfb766c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"--2016-10-17 14:09:06-- http://files.grouplens.org/datasets/movielens/ml-1m.zip\n", | |
"Resolving files.grouplens.org... 128.101.34.146\n", | |
"Connecting to files.grouplens.org|128.101.34.146|:80... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 5917549 (5.6M) [application/zip]\n", | |
"Saving to: 'ml-1m.zip.1'\n", | |
"\n", | |
"ml-1m.zip.1 100%[=====================>] 5.64M 550KB/s in 21s \n", | |
"\n", | |
"2016-10-17 14:09:27 (279 KB/s) - 'ml-1m.zip.1' saved [5917549/5917549]\n", | |
"\n", | |
"Archive: ml-1m.zip\n", | |
" inflating: ./ml-1m/movies.dat \n", | |
" inflating: ./ml-1m/ratings.dat \n", | |
" inflating: ./ml-1m/README \n", | |
" inflating: ./ml-1m/users.dat \n" | |
] | |
} | |
], | |
"source": [ | |
"# Fetch and unpack the MovieLens 1M archive; both commands are safe to re-run.\n", | |
"# -nc: skip the download when ml-1m.zip already exists (otherwise wget saves a duplicate such as ml-1m.zip.1\n", | |
"#      while unzip below keeps reading the older ml-1m.zip).\n", | |
"! wget -nc http://files.grouplens.org/datasets/movielens/ml-1m.zip\n", | |
"# -o: overwrite previously extracted files without prompting, since a prompt would stall a notebook cell.\n", | |
"! unzip -o ml-1m.zip -d ." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/gui/.virtualenvs/gui3/lib/python3.5/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", | |
" app.launch_new_instance()\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>user</th>\n", | |
" <th>item</th>\n", | |
" <th>rating</th>\n", | |
" <th>timestamp</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>1193</td>\n", | |
" <td>5</td>\n", | |
" <td>978300760</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>661</td>\n", | |
" <td>3</td>\n", | |
" <td>978302109</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>914</td>\n", | |
" <td>3</td>\n", | |
" <td>978301968</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>3408</td>\n", | |
" <td>4</td>\n", | |
" <td>978300275</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>2355</td>\n", | |
" <td>5</td>\n", | |
" <td>978824291</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" user item rating timestamp\n", | |
"0 1 1193 5 978300760\n", | |
"1 1 661 3 978302109\n", | |
"2 1 914 3 978301968\n", | |
"3 1 3408 4 978300275\n", | |
"4 1 2355 5 978824291" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"# Load the ratings file; it has no header row, so column names are supplied here.\n", | |
"# '::' is a multi-character separator, which only the python parser engine handles;\n", | |
"# naming the engine explicitly avoids the ParserWarning about the implicit fallback.\n", | |
"ratings = pd.read_csv(\n", | |
"    './ml-1m/ratings.dat',\n", | |
"    sep='::',\n", | |
"    engine='python',\n", | |
"    names=['user', 'item', 'rating', 'timestamp'],\n", | |
")\n", | |
"ratings.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>user_support</th>\n", | |
" <th>item_support</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>6040.000000</td>\n", | |
" <td>3706.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>165.597517</td>\n", | |
" <td>269.889099</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>192.747029</td>\n", | |
" <td>384.047838</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>20.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>44.000000</td>\n", | |
" <td>33.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>96.000000</td>\n", | |
" <td>123.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>208.000000</td>\n", | |
" <td>350.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>2314.000000</td>\n", | |
" <td>3428.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" user_support item_support\n", | |
"count 6040.000000 3706.000000\n", | |
"mean 165.597517 269.889099\n", | |
"std 192.747029 384.047838\n", | |
"min 20.000000 1.000000\n", | |
"25% 44.000000 33.000000\n", | |
"50% 96.000000 123.500000\n", | |
"75% 208.000000 350.000000\n", | |
"max 2314.000000 3428.000000" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Support = number of distinct users that rated each item / distinct items rated by each user.\n", | |
"item_supports = ratings.groupby('item')['user'].nunique().to_frame('item_support')\n", | |
"user_supports = ratings.groupby('user')['item'].nunique().to_frame('user_support')\n", | |
"\n", | |
"# Only the last expression of a cell is displayed, so summarise both frames once, side by side.\n", | |
"pd.concat([user_supports.describe(), item_supports.describe()], axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/gui/.virtualenvs/gui3/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", | |
" if __name__ == '__main__':\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>title</th>\n", | |
" <th>genres</th>\n", | |
" <th>first_genre</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>item</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Toy Story (1995)</td>\n", | |
" <td>Animation|Children's|Comedy</td>\n", | |
" <td>Animation</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Jumanji (1995)</td>\n", | |
" <td>Adventure|Children's|Fantasy</td>\n", | |
" <td>Adventure</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Grumpier Old Men (1995)</td>\n", | |
" <td>Comedy|Romance</td>\n", | |
" <td>Comedy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Waiting to Exhale (1995)</td>\n", | |
" <td>Comedy|Drama</td>\n", | |
" <td>Comedy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Father of the Bride Part II (1995)</td>\n", | |
" <td>Comedy</td>\n", | |
" <td>Comedy</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" title genres \\\n", | |
"item \n", | |
"1 Toy Story (1995) Animation|Children's|Comedy \n", | |
"2 Jumanji (1995) Adventure|Children's|Fantasy \n", | |
"3 Grumpier Old Men (1995) Comedy|Romance \n", | |
"4 Waiting to Exhale (1995) Comedy|Drama \n", | |
"5 Father of the Bride Part II (1995) Comedy \n", | |
"\n", | |
" first_genre \n", | |
"item \n", | |
"1 Animation \n", | |
"2 Adventure \n", | |
"3 Comedy \n", | |
"4 Comedy \n", | |
"5 Comedy " | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Load the movie catalogue, indexed by item id; the file has no header row.\n", | |
"# engine='python' is needed for the two-character '::' separator and avoids the ParserWarning.\n", | |
"# encoding='latin-1': assumes movies.dat is not valid UTF-8 (accented titles) -- TODO confirm against the file.\n", | |
"movies = pd.read_csv(\n", | |
"    './ml-1m/movies.dat',\n", | |
"    sep='::',\n", | |
"    engine='python',\n", | |
"    encoding='latin-1',\n", | |
"    names=['item', 'title', 'genres'],\n", | |
"    index_col='item',\n", | |
")\n", | |
"# genres is a '|'-separated list; keep the first entry as a single-label genre.\n", | |
"movies['first_genre'] = movies.genres.str.split('|').str.get(0)\n", | |
"\n", | |
"movies.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/gui/.virtualenvs/gui3/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", | |
" if __name__ == '__main__':\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>gender</th>\n", | |
" <th>age</th>\n", | |
" <th>occupation</th>\n", | |
" <th>zipcode</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>user</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>F</td>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>48067</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>M</td>\n", | |
" <td>56</td>\n", | |
" <td>16</td>\n", | |
" <td>70072</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>M</td>\n", | |
" <td>25</td>\n", | |
" <td>15</td>\n", | |
" <td>55117</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>M</td>\n", | |
" <td>45</td>\n", | |
" <td>7</td>\n", | |
" <td>02460</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>M</td>\n", | |
" <td>25</td>\n", | |
" <td>20</td>\n", | |
" <td>55455</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" gender age occupation zipcode\n", | |
"user \n", | |
"1 F 1 10 48067\n", | |
"2 M 56 16 70072\n", | |
"3 M 25 15 55117\n", | |
"4 M 45 7 02460\n", | |
"5 M 25 20 55455" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Load the user table, indexed by user id; the file has no header row.\n", | |
"# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details\n", | |
"# engine='python' is needed for the two-character '::' separator and avoids the ParserWarning.\n", | |
"users = pd.read_csv(\n", | |
"    './ml-1m/users.dat',\n", | |
"    sep='::',\n", | |
"    engine='python',\n", | |
"    names=['user', 'gender', 'age', 'occupation', 'zipcode'],\n", | |
"    index_col='user',\n", | |
")\n", | |
"users.head()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment