EthanRosenthal · May 15, 2019 07:58
diff --git a/insight-nb1.ipynb b/insight-nb1.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "np.random.seed(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip\n",
    "!unzip ml-100k.zip\n",
    "!cd ml-100k/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>196</td>\n",
       "      <td>242</td>\n",
       "      <td>3</td>\n",
       "      <td>881250949</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>186</td>\n",
       "      <td>302</td>\n",
       "      <td>3</td>\n",
       "      <td>891717742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>22</td>\n",
       "      <td>377</td>\n",
       "      <td>1</td>\n",
       "      <td>878887116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>244</td>\n",
       "      <td>51</td>\n",
       "      <td>2</td>\n",
       "      <td>880606923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>166</td>\n",
       "      <td>346</td>\n",
       "      <td>1</td>\n",
       "      <td>886397596</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  item_id  rating  timestamp\n",
       "0      196      242       3  881250949\n",
       "1      186      302       3  891717742\n",
       "2       22      377       1  878887116\n",
       "3      244       51       2  880606923\n",
       "4      166      346       1  886397596"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names = ['user_id', 'item_id', 'rating', 'timestamp']\n",
    "df = pd.read_csv('u.data', sep='\\t', names=names)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],\n",
       "       [ 4.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       ..., \n",
       "       [ 5.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n_users = df.user_id.unique().shape[0]\n",
    "n_items = df.item_id.unique().shape[0]\n",
    "ratings = np.zeros((n_users, n_items))\n",
    "for row in df.itertuples():\n",
    "    ratings[row[1]-1, row[2]-1] = row[3]\n",
    "ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "943 users\n",
      "1682 items\n",
      "Sparsity: 6.30%\n"
     ]
    }
   ],
   "source": [
    "print str(n_users) + ' users'\n",
    "print str(n_items) + ' items'\n",
    "sparsity = float(len(ratings.nonzero()[0]))\n",
    "sparsity /= (ratings.shape[0] * ratings.shape[1])\n",
    "sparsity *= 100\n",
    "print 'Sparsity: {:4.2f}%'.format(sparsity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train_test_split(ratings):\n",
    "    test = np.zeros(ratings.shape)\n",
    "    train = ratings.copy()\n",
    "    for user in xrange(ratings.shape[0]):\n",
    "        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], \n",
    "                                        size=10, \n",
    "                                        replace=False)\n",
    "        train[user, test_ratings] = 0.\n",
    "        test[user, test_ratings] = ratings[user, test_ratings]\n",
    "        \n",
    "    # Test and training are truly disjoint\n",
    "    assert(np.all((train * test) == 0)) \n",
    "    return train, test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train, test = train_test_split(ratings)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"np.random.seed(0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"!curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip\n",
	"!unzip ml-100k.zip\n",
	"!cd ml-100k/"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>user_id</th>\n",
	" <th>item_id</th>\n",
	" <th>rating</th>\n",
	" <th>timestamp</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>196</td>\n",
	" <td>242</td>\n",
	" <td>3</td>\n",
	" <td>881250949</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>186</td>\n",
	" <td>302</td>\n",
	" <td>3</td>\n",
	" <td>891717742</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>22</td>\n",
	" <td>377</td>\n",
	" <td>1</td>\n",
	" <td>878887116</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>244</td>\n",
	" <td>51</td>\n",
	" <td>2</td>\n",
	" <td>880606923</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>166</td>\n",
	" <td>346</td>\n",
	" <td>1</td>\n",
	" <td>886397596</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" user_id item_id rating timestamp\n",
	"0 196 242 3 881250949\n",
	"1 186 302 3 891717742\n",
	"2 22 377 1 878887116\n",
	"3 244 51 2 880606923\n",
	"4 166 346 1 886397596"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"names = ['user_id', 'item_id', 'rating', 'timestamp']\n",
	"df = pd.read_csv('u.data', sep='\\t', names=names)\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[ 5., 3., 4., ..., 0., 0., 0.],\n",
	" [ 4., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" ..., \n",
	" [ 5., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 0., 0., ..., 0., 0., 0.],\n",
	" [ 0., 5., 0., ..., 0., 0., 0.]])"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"n_users = df.user_id.unique().shape[0]\n",
	"n_items = df.item_id.unique().shape[0]\n",
	"ratings = np.zeros((n_users, n_items))\n",
	"for row in df.itertuples():\n",
	" ratings[row[1]-1, row[2]-1] = row[3]\n",
	"ratings"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"943 users\n",
	"1682 items\n",
	"Sparsity: 6.30%\n"
	]
	}
	],
	"source": [
	"print str(n_users) + ' users'\n",
	"print str(n_items) + ' items'\n",
	"sparsity = float(len(ratings.nonzero()[0]))\n",
	"sparsity /= (ratings.shape[0] * ratings.shape[1])\n",
	"sparsity *= 100\n",
	"print 'Sparsity: {:4.2f}%'.format(sparsity)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def train_test_split(ratings):\n",
	" test = np.zeros(ratings.shape)\n",
	" train = ratings.copy()\n",
	" for user in xrange(ratings.shape[0]):\n",
	" test_ratings = np.random.choice(ratings[user, :].nonzero()[0], \n",
	" size=10, \n",
	" replace=False)\n",
	" train[user, test_ratings] = 0.\n",
	" test[user, test_ratings] = ratings[user, test_ratings]\n",
	" \n",
	" # Test and training are truly disjoint\n",
	" assert(np.all((train * test) == 0)) \n",
	" return train, test"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"train, test = train_test_split(ratings)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}