-
-
Save febriy/eed35fb73750154f24708f47e1ce359f to your computer and use it in GitHub Desktop.
Recommending GitHub repositories with Google Big Query and implicit library: https://medium.com/@jbochi/recommending-github-repositories-with-google-bigquery-and-the-implicit-library-e6cce666c77
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from scipy.sparse import coo_matrix\n", | |
"from implicit.als import AlternatingLeastSquares\n", | |
"import requests" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"project_id = \"CHANGEME\"\n", | |
"github_user = \"CHANGEME\"\n", | |
"github_token = \"CHANGEME\" # from https://github.com/settings/tokens" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"github_auth = requests.auth.HTTPBasicAuth(github_user, github_token)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 135, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requesting query... ok.\n", | |
"Query running...\n", | |
"Query done.\n", | |
"Cache hit.\n", | |
"\n", | |
"Retrieving results...\n", | |
"Got 78238 rows.\n", | |
"\n", | |
"Total time taken 6.51 s.\n", | |
"Finished at 2017-06-24 09:08:00.\n" | |
] | |
} | |
], | |
"source": [ | |
"query = \"\"\"\n", | |
"WITH stars AS (\n", | |
" SELECT actor.login AS user, repo.name AS repo, created_at AS timestamp\n", | |
" FROM githubarchive.month.201706\n", | |
" WHERE type=\"WatchEvent\"\n", | |
"),\n", | |
"repositories_stars AS (\n", | |
" SELECT repo, COUNT(*) as c\n", | |
" FROM stars\n", | |
" GROUP BY repo\n", | |
" ORDER BY c DESC\n", | |
" LIMIT 1000\n", | |
"),\n", | |
"users_stars AS (\n", | |
" SELECT user, COUNT(*) as c\n", | |
" FROM stars\n", | |
" WHERE repo IN (SELECT repo FROM repositories_stars)\n", | |
" GROUP BY user\n", | |
" HAVING c > 10 AND C < 100\n", | |
" LIMIT 10000\n", | |
")\n", | |
"SELECT\n", | |
"user, repo, timestamp\n", | |
"FROM stars\n", | |
"WHERE repo IN (SELECT repo FROM repositories_stars)\n", | |
"AND user IN (SELECT user FROM users_stars)\n", | |
"ORDER BY timestamp DESC\n", | |
"\"\"\"\n", | |
"\n", | |
"data = pd.io.gbq.read_gbq(query, index_col=\"timestamp\", dialect=\"standard\", project_id=project_id)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 136, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>user</th>\n", | |
" <th>repo</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>timestamp</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2017-06-23 23:57:04</th>\n", | |
" <td>n3tn0de</td>\n", | |
" <td>webkul/coolhue</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2017-06-23 23:55:08</th>\n", | |
" <td>psw0714</td>\n", | |
" <td>justjavac/free-programming-books-zh_CN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2017-06-23 23:54:36</th>\n", | |
" <td>psw0714</td>\n", | |
" <td>ecomfe/echarts</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2017-06-23 23:54:21</th>\n", | |
" <td>psw0714</td>\n", | |
" <td>tastejs/todomvc</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2017-06-23 23:54:14</th>\n", | |
" <td>psw0714</td>\n", | |
" <td>babel/babel</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" user repo\n", | |
"timestamp \n", | |
"2017-06-23 23:57:04 n3tn0de webkul/coolhue\n", | |
"2017-06-23 23:55:08 psw0714 justjavac/free-programming-books-zh_CN\n", | |
"2017-06-23 23:54:36 psw0714 ecomfe/echarts\n", | |
"2017-06-23 23:54:21 psw0714 tastejs/todomvc\n", | |
"2017-06-23 23:54:14 psw0714 babel/babel" | |
] | |
}, | |
"execution_count": 136, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 140, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# map each repo and user to a unique numeric value\n", | |
"data['user'] = data['user'].astype(\"category\")\n", | |
"data['repo'] = data['repo'].astype(\"category\")\n", | |
"\n", | |
"# create a sparse matrix of all the users/repos\n", | |
"stars = coo_matrix((np.ones(data.shape[0]),\n", | |
" (data['repo'].cat.codes.copy(),\n", | |
" data['user'].cat.codes.copy())))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 141, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<999x4348 sparse matrix of type '<type 'numpy.float64'>'\n", | |
"\twith 78238 stored elements in COOrdinate format>" | |
] | |
}, | |
"execution_count": 141, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"stars" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 142, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"model = AlternatingLeastSquares(factors=50,\n", | |
" regularization=0.01,\n", | |
" dtype=np.float64,\n", | |
" iterations=50)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 144, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"confidence = 40\n", | |
"model.fit(confidence * stars)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"repos = dict(enumerate(data['repo'].cat.categories))\n", | |
"repo_ids = {r: i for i, r in repos.iteritems()}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 127, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(u'tensorflow/tensorflow', 1.0000000000000004),\n", | |
" (u'jikexueyuanwiki/tensorflow-zh', 0.52015405760492706),\n", | |
" (u'BVLC/caffe', 0.4161581732982037),\n", | |
" (u'scikit-learn/scikit-learn', 0.40543551306117309),\n", | |
" (u'google/protobuf', 0.40160716582156247),\n", | |
" (u'fchollet/keras', 0.39897590674119598),\n", | |
" (u'shadowsocksr/shadowsocksr-csharp', 0.3798671235574328),\n", | |
" (u'ethereum/mist', 0.37205191726130321),\n", | |
" (u'pandas-dev/pandas', 0.34311692603549021),\n", | |
" (u'karpathy/char-rnn', 0.33868380215281335)]" | |
] | |
}, | |
"execution_count": 127, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"[(repos[r], s) for r, s in model.similar_items(repo_ids['tensorflow/tensorflow'])]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 129, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def user_stars(user):\n", | |
" repos = []\n", | |
" url = \"https://api.github.com/users/{}/starred\".format(user)\n", | |
" while url:\n", | |
" resp = requests.get(url, auth=github_auth)\n", | |
" repos += [r[\"full_name\"] for r in resp.json()]\n", | |
" url = resp.links[\"next\"][\"url\"] if \"next\" in resp.links else None\n", | |
" return repos\n", | |
"\n", | |
"def user_items(u_stars):\n", | |
" star_ids = [repo_ids[s] for s in u_stars if s in repo_ids]\n", | |
" data = [confidence for _ in star_ids]\n", | |
" rows = [0 for _ in star_ids]\n", | |
" shape = (1, model.item_factors.shape[0])\n", | |
" return coo_matrix((data, (rows, star_ids)), shape=shape).tocsr()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 130, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"jbochi = user_items(user_stars(\"jbochi\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 131, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def recommend(user_items):\n", | |
" recs = model.recommend(userid=0, user_items=user_items, recalculate_user=True)\n", | |
" return [(repos[r], s) for r, s in recs]\n", | |
"\n", | |
"def explain(user_items, repo):\n", | |
" _, recs, _ = model.explain(userid=0, user_items=user_items, itemid=repo_ids[repo])\n", | |
" return [(repos[r], s) for r, s in recs]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 132, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(u'ansible/ansible', 1.3480146093553365),\n", | |
" (u'airbnb/superset', 1.337698670756992),\n", | |
" (u'scrapy/scrapy', 1.2682612609169515),\n", | |
" (u'grpc/grpc', 1.1558718295721062),\n", | |
" (u'scikit-learn/scikit-learn', 1.1539551159232055),\n", | |
" (u'grafana/grafana', 1.1265144087278358),\n", | |
" (u'google/protobuf', 1.078458167396922),\n", | |
" (u'lodash/lodash', 1.0690341693223879),\n", | |
" (u'josephmisiti/awesome-machine-learning', 1.0553796439629786),\n", | |
" (u'd3/d3', 1.0546232373207065)]" | |
] | |
}, | |
"execution_count": 132, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"recommend(jbochi)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(u'pandas-dev/pandas', 0.18368079727509334),\n", | |
" (u'BVLC/caffe', 0.15726607611115795),\n", | |
" (u'requests/requests', 0.15263841163355341),\n", | |
" (u'pallets/flask', 0.15259412774463132),\n", | |
" (u'robbyrussell/oh-my-zsh', 0.1503775470984523),\n", | |
" (u'apache/spark', 0.12771260655405856),\n", | |
" (u'tensorflow/tensorflow', 0.12343847633950071),\n", | |
" (u'kripken/emscripten', 0.12294875917036562),\n", | |
" (u'videojs/video.js', 0.12279727716802587),\n", | |
" (u'rust-lang/rust', 0.10859551238691327)]" | |
] | |
}, | |
"execution_count": 133, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"explain(jbochi, 'fchollet/keras')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment