Created
March 31, 2017 18:05
-
-
Save brendansudol/7eadcd92df0431eb893ff0c78046b579 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import glob\n", | |
"import json\n", | |
"import os\n", | |
"from datetime import datetime\n", | |
"\n", | |
"from github import Github, RateLimitExceededException\n", | |
"\n", | |
"GH_KEYS = [os.environ['GH_KEY{}'.format(i)] for i in range(1, 10)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"839\n", | |
"['14c-prototype', '18f-bot', '18f-cli', '18f-education-discovery', '18f-reveal.js-theme']\n" | |
] | |
} | |
], | |
"source": [ | |
"# get 18f repo names (& sort them)\n", | |
"\n", | |
"gh = Github(GH_KEYS[0])\n", | |
"org = gh.get_organization('18F')\n", | |
"repo_names = sorted([r.name for r in org.get_repos()])\n", | |
"\n", | |
"print(len(repo_names))\n", | |
"print(repo_names[:5])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# get and save all commits (since 2016) per repo\n", | |
"\n", | |
"# Passed as the `since` filter when requesting each repo's commits.\n", | |
"SINCE = datetime(2016, 1, 1)\n", | |
"# Bounds of the repo_names slice handled by this run: repo_names[IDX_START:IDX_END].\n", | |
"# NOTE(review): presumably these are bumped by hand between runs to work\n", | |
"# through the full list in batches -- confirm.\n", | |
"IDX_START = 0\n", | |
"IDX_END = 100\n", | |
"\n", | |
"\n", | |
"class Extract:\n", | |
" def __init__(self):\n", | |
" self.keys = GH_KEYS[:]\n", | |
" self.gh = Github(self.keys[0])\n", | |
" \n", | |
" def fetch(self, name):\n", | |
" try:\n", | |
" repo = self.gh.get_organization('18F').get_repo(name)\n", | |
" commits = repo.get_commits(since=SINCE)\n", | |
" return { 'repo': name, 'commits': [c.raw_data for c in commits] }\n", | |
" except RateLimitExceededException:\n", | |
" self.update_key()\n", | |
" return self.fetch(name) \n", | |
"\n", | |
" def save(self, data, name):\n", | |
" with open('github-data/repos/{}.json'.format(name), 'w') as f:\n", | |
" json.dump(data, f)\n", | |
"\n", | |
" def update_key(self):\n", | |
" if not len(self.keys):\n", | |
" raise Exception('no more API keys to use :(')\n", | |
" key = self.keys.pop(0)\n", | |
" print('using new key \"{}\"...'.format(key))\n", | |
" self.gh = Github(key)\n", | |
"\n", | |
" \n", | |
"e = Extract() \n", | |
"for i, repo in enumerate(repo_names[IDX_START:IDX_END]):\n", | |
" print('{}. fetching commits for {}...'.format(i + IDX_START, repo))\n", | |
" e.save(e.fetch(repo), repo)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# combine & save into one big dataset\n", | |
"\n", | |
"data_files = glob.glob('github-data/repos/*')\n", | |
"results = {}\n", | |
"\n", | |
"for fname in data_files:\n", | |
" with open(fname) as f:\n", | |
" d = json.load(f)\n", | |
" results[d['repo']] = d['commits']\n", | |
"\n", | |
"with open('github-data/all-commits.json', 'w') as f:\n", | |
" json.dump(results, f, ensure_ascii=False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I had to look up the behavior of this line:
self.keys = GH_KEYS[:]
I was pretty sure it was to make a copy, but I had never seen the "slice as copy" idiom before. Apparently I'm not alone: http://stackoverflow.com/questions/2612802/how-to-clone-or-copy-a-list
it is a weird syntax and it does not make sense to use it ever 😂
Love seeing your code, thanks for teaching me!