febriy · January 6, 2020 07:55
diff --git a/evaluation.ipynb b/evaluation.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.sparse import coo_matrix\n",
    "from implicit.als import AlternatingLeastSquares\n",
    "from implicit.utils import nonzeros"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "project_id = \"CHANGEME\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requesting query... ok.\n",
      "Query running...\n",
      "  Elapsed 11.25 s. Waiting...\n",
      "Query done.\n",
      "Processed: 9.4 GB\n",
      "\n",
      "Retrieving results...\n",
      "  Got page: 1; 46.0% done. Elapsed 24.05 s.\n",
      "  Got page: 2; 92.0% done. Elapsed 41.02 s.\n",
      "  Got page: 3; 100.0% done. Elapsed 49.03 s.\n",
      "Got 217819 rows.\n",
      "\n",
      "Total time taken 50.26 s.\n",
      "Finished at 2017-07-08 10:51:18.\n"
     ]
    }
   ],
   "source": [
    "query = \"\"\"\n",
    "WITH stars AS (\n",
    "     SELECT DISTINCT actor.login AS user, repo.name AS repo\n",
    "     FROM `githubarchive.month.2017*`\n",
    "     WHERE type=\"WatchEvent\"\n",
    "),\n",
    "repositories_stars AS (\n",
    "     SELECT repo, COUNT(*) as c FROM stars GROUP BY repo\n",
    "     ORDER BY c DESC\n",
    "     LIMIT 1000\n",
    "),\n",
    "users_stars AS (\n",
    "    SELECT user, COUNT(*) as c FROM  stars\n",
    "    WHERE repo IN (SELECT repo FROM repositories_stars)\n",
    "    GROUP BY user\n",
    "    HAVING c >= 10 AND C <= 150\n",
    "    LIMIT 10000\n",
    ")\n",
    "SELECT user, repo FROM stars\n",
    "WHERE repo IN (SELECT repo FROM repositories_stars)\n",
    "AND user IN (SELECT user FROM users_stars)\n",
    "\"\"\"\n",
    "\n",
    "data = pd.io.gbq.read_gbq(query, dialect=\"standard\", project_id=project_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data['user'] = data['user'].astype(\"category\")\n",
    "data['repo'] = data['repo'].astype(\"category\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class UtilityMatrixTransformer(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self, confidence=40):\n",
    "        self.confidence = confidence\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        return self\n",
    "\n",
    "    def transform(self, X, y=None):\n",
    "        return coo_matrix((np.ones(X.shape[0]),\n",
    "                           (X['repo'].cat.codes.copy(),\n",
    "                            X['user'].cat.codes.copy()))) * self.confidence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class ALSEstimator(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self, factors=50,\n",
    "                       regularization=0.01,\n",
    "                       iterations=10,\n",
    "                       filter_seen=True):\n",
    "        self.factors = factors\n",
    "        self.regularization = regularization\n",
    "        self.iterations = iterations\n",
    "        self.filter_seen = filter_seen\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        self.model = AlternatingLeastSquares(factors=self.factors,\n",
    "                                             regularization=self.regularization,\n",
    "                                             iterations=self.iterations,\n",
    "                                             dtype=np.float32,\n",
    "                                             use_native=True)\n",
    "        self.model.fit(X)\n",
    "        if self.filter_seen:\n",
    "            self.fit_X = X\n",
    "        return self\n",
    "    \n",
    "    def predict(self, X, y=None):\n",
    "        predictions = np.dot(self.model.item_factors, self.model.user_factors.T)\n",
    "        if self.filter_seen:\n",
    "            predictions[self.fit_X.nonzero()] = -99\n",
    "        return predictions\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#https://gist.github.com/mblondel/7337391\n",
    "\n",
    "def dcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
    "    \"\"\"Discounted cumulative gain (DCG) at rank k\n",
    "    Parameters\n",
    "    ----------\n",
    "    y_true : array-like, shape = [n_samples]\n",
    "        Ground truth (true relevance labels).\n",
    "    y_score : array-like, shape = [n_samples]\n",
    "        Predicted scores.\n",
    "    k : int\n",
    "        Rank.\n",
    "    gains : str\n",
    "        Whether gains should be \"exponential\" (default) or \"linear\".\n",
    "    Returns\n",
    "    -------\n",
    "    DCG @k : float\n",
    "    \"\"\"\n",
    "    order = np.argsort(y_score)[::-1]\n",
    "    y_true = np.take(y_true, order[:k])\n",
    "\n",
    "    if gains == \"exponential\":\n",
    "        gains = 2 ** y_true - 1\n",
    "    elif gains == \"linear\":\n",
    "        gains = y_true\n",
    "    else:\n",
    "        raise ValueError(\"Invalid gains option.\")\n",
    "\n",
    "    # highest rank is 1 so +2 instead of +1\n",
    "    discounts = np.log2(np.arange(len(y_true)) + 2)\n",
    "    return np.sum(gains / discounts)\n",
    "\n",
    "\n",
    "def ndcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
    "    \"\"\"Normalized discounted cumulative gain (NDCG) at rank k\n",
    "    Parameters\n",
    "    ----------\n",
    "    y_true : array-like, shape = [n_samples]\n",
    "        Ground truth (true relevance labels).\n",
    "    y_score : array-like, shape = [n_samples]\n",
    "        Predicted scores.\n",
    "    k : int\n",
    "        Rank.\n",
    "    gains : str\n",
    "        Whether gains should be \"exponential\" (default) or \"linear\".\n",
    "    Returns\n",
    "    -------\n",
    "    NDCG @k : float\n",
    "    \"\"\"\n",
    "    best = dcg_score(y_true, y_true, k, gains)\n",
    "    if best == 0:\n",
    "        return 0    \n",
    "    actual = dcg_score(y_true, y_score, k, gains)\n",
    "    return actual / best"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_col(Y, col):\n",
    "    return np.squeeze(np.asarray(Y[:,col]))\n",
    "\n",
    "def ndcg_score_matrix(Y_true, Y_score, k=10, gains=\"exponential\"):\n",
    "    score = 0.0\n",
    "    n_users = Y_true.shape[1]\n",
    "    for u in range(n_users):\n",
    "        s = ndcg_score(get_col(Y_true, u), get_col(Y_score, u))\n",
    "        score += s\n",
    "    return score / n_users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import PredefinedSplit\n",
    "\n",
    "class LeavePOutByGroup():\n",
    "    def __init__(self, X, p=5, n_splits=2):\n",
    "        self.X = X\n",
    "        self.p = p\n",
    "        self.n_splits = n_splits\n",
    "        test_fold = self.X.groupby(\"user\").cumcount().apply(lambda x: int(x / p) if x < (n_splits * p) else -1)\n",
    "        self.s = PredefinedSplit(test_fold)\n",
    "\n",
    "    def get_n_splits(self, X=None, y=None, groups=None):\n",
    "        return self.n_splits\n",
    "\n",
    "    def split(self, X=None, y=None, groups=None):\n",
    "        return self.s.split()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def ndcg_scorer(estimator, X_test):\n",
    "    truth = UtilityMatrixTransformer(confidence=1).fit_transform(X_test).todense()\n",
    "    predictions = estimator.predict(X_test)\n",
    "    return ndcg_score_matrix(truth, predictions, k=10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "rec_pipeline = Pipeline([\n",
    "        ('matrix', UtilityMatrixTransformer()),\n",
    "        ('als', ALSEstimator()),\n",
    "])\n",
    "\n",
    "param_grid = [\n",
    "    {\n",
    "        'matrix__confidence': [1, 3, 10, 40, 100],\n",
    "        'als__factors': [20, 50, 100],\n",
    "        'als__regularization': [1e-2, 1e-3, 1e-4],\n",
    "    }\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 3 folds for each of 45 candidates, totalling 135 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 13.3min finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=<__main__.LeavePOutByGroup instance at 0x1211de440>,\n",
       "       error_score='raise',\n",
       "       estimator=Pipeline(memory=None,\n",
       "     steps=[('matrix', UtilityMatrixTransformer(confidence=40)), ('als', ALSEstimator(factors=50, filter_seen=True, iterations=10, regularization=0.01))]),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid=[{'matrix__confidence': [1, 3, 10, 40, 100], 'als__factors': [20, 50, 100], 'als__regularization': [0.01, 0.001, 0.0001]}],\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring=<function ndcg_scorer at 0x120f70500>, verbose=1)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shuffled_train_set = data.reindex(np.random.permutation(data.index)).sort_values(\"user\")\n",
    "grid_search = GridSearchCV(rec_pipeline, param_grid,\n",
    "                           cv=LeavePOutByGroup(shuffled_train_set, p=5, n_splits=3),\n",
    "                           scoring=ndcg_scorer, verbose=1)\n",
    "grid_search.fit(shuffled_train_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 3}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0.12182770483269723, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.01})\n",
      "(0.13166824473805691, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.01})\n",
      "(0.13282873335943512, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.01})\n",
      "(0.11039252745416286, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.01})\n",
      "(0.089016691660527295, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.01})\n",
      "(0.12093506965903911, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.001})\n",
      "(0.13349651687119168, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.001})\n",
      "(0.13296881851844261, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.001})\n",
      "(0.11066298985479994, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.001})\n",
      "(0.090079804327817939, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.001})\n",
      "(0.12115380609913633, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.0001})\n",
      "(0.13149519337094234, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.0001})\n",
      "(0.1324632559745306, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.0001})\n",
      "(0.11146953791233, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.0001})\n",
      "(0.087150593455471825, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.0001})\n",
      "(0.11191469606402717, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.01})\n",
      "(0.13091750356245202, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.01})\n",
      "(0.12765971686755495, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.01})\n",
      "(0.10417408070456254, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.01})\n",
      "(0.08646553233734848, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.01})\n",
      "(0.11191591881133119, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.001})\n",
      "(0.13101055863881378, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.001})\n",
      "(0.12704689426232149, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.001})\n",
      "(0.10546083019755575, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.001})\n",
      "(0.088077539716670858, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.001})\n",
      "(0.11183762637497301, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.0001})\n",
      "(0.13123515836596611, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.0001})\n",
      "(0.12755216556143176, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.0001})\n",
      "(0.10545558137554324, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.0001})\n",
      "(0.085301802039959324, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.0001})\n",
      "(0.10611469337423451, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.01})\n",
      "(0.12196884692068968, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.01})\n",
      "(0.1175596768388132, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.01})\n",
      "(0.097768165103126289, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.01})\n",
      "(0.083236186967734632, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.01})\n",
      "(0.10592410193883216, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.001})\n",
      "(0.12342558447935369, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.001})\n",
      "(0.11774383257111316, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.001})\n",
      "(0.10114449211940411, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.001})\n",
      "(0.085345035644795259, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.001})\n",
      "(0.10588644986046256, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.0001})\n",
      "(0.12292610723641109, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.0001})\n",
      "(0.11897882932070754, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.0001})\n",
      "(0.10299449602711824, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.0001})\n",
      "(0.086555398366424466, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.0001})\n"
     ]
    }
   ],
   "source": [
    "cvres = grid_search.cv_results_\n",
    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
    "    print(mean_score, params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
diff --git a/recommendations.ipynb b/recommendations.ipynb
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from scipy.sparse import coo_matrix\n",
	"from implicit.als import AlternatingLeastSquares\n",
	"from implicit.utils import nonzeros"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"project_id = \"CHANGEME\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Requesting query... ok.\n",
	"Query running...\n",
	" Elapsed 11.25 s. Waiting...\n",
	"Query done.\n",
	"Processed: 9.4 GB\n",
	"\n",
	"Retrieving results...\n",
	" Got page: 1; 46.0% done. Elapsed 24.05 s.\n",
	" Got page: 2; 92.0% done. Elapsed 41.02 s.\n",
	" Got page: 3; 100.0% done. Elapsed 49.03 s.\n",
	"Got 217819 rows.\n",
	"\n",
	"Total time taken 50.26 s.\n",
	"Finished at 2017-07-08 10:51:18.\n"
	]
	}
	],
	"source": [
	"query = \"\"\"\n",
	"WITH stars AS (\n",
	" SELECT DISTINCT actor.login AS user, repo.name AS repo\n",
	" FROM `githubarchive.month.2017*`\n",
	" WHERE type=\"WatchEvent\"\n",
	"),\n",
	"repositories_stars AS (\n",
	" SELECT repo, COUNT(*) as c FROM stars GROUP BY repo\n",
	" ORDER BY c DESC\n",
	" LIMIT 1000\n",
	"),\n",
	"users_stars AS (\n",
	" SELECT user, COUNT(*) as c FROM stars\n",
	" WHERE repo IN (SELECT repo FROM repositories_stars)\n",
	" GROUP BY user\n",
	" HAVING c >= 10 AND C <= 150\n",
	" LIMIT 10000\n",
	")\n",
	"SELECT user, repo FROM stars\n",
	"WHERE repo IN (SELECT repo FROM repositories_stars)\n",
	"AND user IN (SELECT user FROM users_stars)\n",
	"\"\"\"\n",
	"\n",
	"data = pd.io.gbq.read_gbq(query, dialect=\"standard\", project_id=project_id)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"data['user'] = data['user'].astype(\"category\")\n",
	"data['repo'] = data['repo'].astype(\"category\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.base import BaseEstimator, TransformerMixin\n",
	"from sklearn.pipeline import Pipeline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"class UtilityMatrixTransformer(BaseEstimator, TransformerMixin):\n",
	" def __init__(self, confidence=40):\n",
	" self.confidence = confidence\n",
	"\n",
	" def fit(self, X, y=None):\n",
	" return self\n",
	"\n",
	" def transform(self, X, y=None):\n",
	" return coo_matrix((np.ones(X.shape[0]),\n",
	" (X['repo'].cat.codes.copy(),\n",
	" X['user'].cat.codes.copy()))) * self.confidence"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"class ALSEstimator(BaseEstimator, TransformerMixin):\n",
	" def __init__(self, factors=50,\n",
	" regularization=0.01,\n",
	" iterations=10,\n",
	" filter_seen=True):\n",
	" self.factors = factors\n",
	" self.regularization = regularization\n",
	" self.iterations = iterations\n",
	" self.filter_seen = filter_seen\n",
	"\n",
	" def fit(self, X, y=None):\n",
	" self.model = AlternatingLeastSquares(factors=self.factors,\n",
	" regularization=self.regularization,\n",
	" iterations=self.iterations,\n",
	" dtype=np.float32,\n",
	" use_native=True)\n",
	" self.model.fit(X)\n",
	" if self.filter_seen:\n",
	" self.fit_X = X\n",
	" return self\n",
	" \n",
	" def predict(self, X, y=None):\n",
	" predictions = np.dot(self.model.item_factors, self.model.user_factors.T)\n",
	" if self.filter_seen:\n",
	" predictions[self.fit_X.nonzero()] = -99\n",
	" return predictions\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#https://gist.github.com/mblondel/7337391\n",
	"\n",
	"def dcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
	" \"\"\"Discounted cumulative gain (DCG) at rank k\n",
	" Parameters\n",
	" ----------\n",
	" y_true : array-like, shape = [n_samples]\n",
	" Ground truth (true relevance labels).\n",
	" y_score : array-like, shape = [n_samples]\n",
	" Predicted scores.\n",
	" k : int\n",
	" Rank.\n",
	" gains : str\n",
	" Whether gains should be \"exponential\" (default) or \"linear\".\n",
	" Returns\n",
	" -------\n",
	" DCG @k : float\n",
	" \"\"\"\n",
	" order = np.argsort(y_score)[::-1]\n",
	" y_true = np.take(y_true, order[:k])\n",
	"\n",
	" if gains == \"exponential\":\n",
	" gains = 2 ** y_true - 1\n",
	" elif gains == \"linear\":\n",
	" gains = y_true\n",
	" else:\n",
	" raise ValueError(\"Invalid gains option.\")\n",
	"\n",
	" # highest rank is 1 so +2 instead of +1\n",
	" discounts = np.log2(np.arange(len(y_true)) + 2)\n",
	" return np.sum(gains / discounts)\n",
	"\n",
	"\n",
	"def ndcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
	" \"\"\"Normalized discounted cumulative gain (NDCG) at rank k\n",
	" Parameters\n",
	" ----------\n",
	" y_true : array-like, shape = [n_samples]\n",
	" Ground truth (true relevance labels).\n",
	" y_score : array-like, shape = [n_samples]\n",
	" Predicted scores.\n",
	" k : int\n",
	" Rank.\n",
	" gains : str\n",
	" Whether gains should be \"exponential\" (default) or \"linear\".\n",
	" Returns\n",
	" -------\n",
	" NDCG @k : float\n",
	" \"\"\"\n",
	" best = dcg_score(y_true, y_true, k, gains)\n",
	" if best == 0:\n",
	" return 0 \n",
	" actual = dcg_score(y_true, y_score, k, gains)\n",
	" return actual / best"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def get_col(Y, col):\n",
	" return np.squeeze(np.asarray(Y[:,col]))\n",
	"\n",
	"def ndcg_score_matrix(Y_true, Y_score, k=10, gains=\"exponential\"):\n",
	" score = 0.0\n",
	" n_users = Y_true.shape[1]\n",
	" for u in range(n_users):\n",
	" s = ndcg_score(get_col(Y_true, u), get_col(Y_score, u))\n",
	" score += s\n",
	" return score / n_users"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.model_selection import PredefinedSplit\n",
	"\n",
	"class LeavePOutByGroup():\n",
	" def __init__(self, X, p=5, n_splits=2):\n",
	" self.X = X\n",
	" self.p = p\n",
	" self.n_splits = n_splits\n",
	" test_fold = self.X.groupby(\"user\").cumcount().apply(lambda x: int(x / p) if x < (n_splits * p) else -1)\n",
	" self.s = PredefinedSplit(test_fold)\n",
	"\n",
	" def get_n_splits(self, X=None, y=None, groups=None):\n",
	" return self.n_splits\n",
	"\n",
	" def split(self, X=None, y=None, groups=None):\n",
	" return self.s.split()\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def ndcg_scorer(estimator, X_test):\n",
	" truth = UtilityMatrixTransformer(confidence=1).fit_transform(X_test).todense()\n",
	" predictions = estimator.predict(X_test)\n",
	" return ndcg_score_matrix(truth, predictions, k=10)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.model_selection import cross_val_score"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.model_selection import GridSearchCV"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [],
	"source": [
	"rec_pipeline = Pipeline([\n",
	" ('matrix', UtilityMatrixTransformer()),\n",
	" ('als', ALSEstimator()),\n",
	"])\n",
	"\n",
	"param_grid = [\n",
	" {\n",
	" 'matrix__confidence': [1, 3, 10, 40, 100],\n",
	" 'als__factors': [20, 50, 100],\n",
	" 'als__regularization': [1e-2, 1e-3, 1e-4],\n",
	" }\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Fitting 3 folds for each of 45 candidates, totalling 135 fits\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[Parallel(n_jobs=1)]: Done 135 out of 135 \| elapsed: 13.3min finished\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"GridSearchCV(cv=<__main__.LeavePOutByGroup instance at 0x1211de440>,\n",
	" error_score='raise',\n",
	" estimator=Pipeline(memory=None,\n",
	" steps=[('matrix', UtilityMatrixTransformer(confidence=40)), ('als', ALSEstimator(factors=50, filter_seen=True, iterations=10, regularization=0.01))]),\n",
	" fit_params=None, iid=True, n_jobs=1,\n",
	" param_grid=[{'matrix__confidence': [1, 3, 10, 40, 100], 'als__factors': [20, 50, 100], 'als__regularization': [0.01, 0.001, 0.0001]}],\n",
	" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
	" scoring=<function ndcg_scorer at 0x120f70500>, verbose=1)"
	]
	},
	"execution_count": 31,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"shuffled_train_set = data.reindex(np.random.permutation(data.index)).sort_values(\"user\")\n",
	"grid_search = GridSearchCV(rec_pipeline, param_grid,\n",
	" cv=LeavePOutByGroup(shuffled_train_set, p=5, n_splits=3),\n",
	" scoring=ndcg_scorer, verbose=1)\n",
	"grid_search.fit(shuffled_train_set)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 3}"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"grid_search.best_params_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"(0.12182770483269723, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.01})\n",
	"(0.13166824473805691, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.01})\n",
	"(0.13282873335943512, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.01})\n",
	"(0.11039252745416286, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.01})\n",
	"(0.089016691660527295, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.01})\n",
	"(0.12093506965903911, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.001})\n",
	"(0.13349651687119168, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.001})\n",
	"(0.13296881851844261, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.001})\n",
	"(0.11066298985479994, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.001})\n",
	"(0.090079804327817939, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.001})\n",
	"(0.12115380609913633, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.0001})\n",
	"(0.13149519337094234, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.0001})\n",
	"(0.1324632559745306, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.0001})\n",
	"(0.11146953791233, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.0001})\n",
	"(0.087150593455471825, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.0001})\n",
	"(0.11191469606402717, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.01})\n",
	"(0.13091750356245202, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.01})\n",
	"(0.12765971686755495, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.01})\n",
	"(0.10417408070456254, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.01})\n",
	"(0.08646553233734848, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.01})\n",
	"(0.11191591881133119, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.001})\n",
	"(0.13101055863881378, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.001})\n",
	"(0.12704689426232149, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.001})\n",
	"(0.10546083019755575, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.001})\n",
	"(0.088077539716670858, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.001})\n",
	"(0.11183762637497301, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.0001})\n",
	"(0.13123515836596611, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.0001})\n",
	"(0.12755216556143176, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.0001})\n",
	"(0.10545558137554324, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.0001})\n",
	"(0.085301802039959324, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.0001})\n",
	"(0.10611469337423451, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.01})\n",
	"(0.12196884692068968, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.01})\n",
	"(0.1175596768388132, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.01})\n",
	"(0.097768165103126289, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.01})\n",
	"(0.083236186967734632, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.01})\n",
	"(0.10592410193883216, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.001})\n",
	"(0.12342558447935369, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.001})\n",
	"(0.11774383257111316, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.001})\n",
	"(0.10114449211940411, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.001})\n",
	"(0.085345035644795259, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.001})\n",
	"(0.10588644986046256, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.0001})\n",
	"(0.12292610723641109, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.0001})\n",
	"(0.11897882932070754, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.0001})\n",
	"(0.10299449602711824, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.0001})\n",
	"(0.086555398366424466, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.0001})\n"
	]
	}
	],
	"source": [
	"cvres = grid_search.cv_results_\n",
	"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
	" print(mean_score, params)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}