Skip to content

Instantly share code, notes, and snippets.

@adewes
Created July 26, 2013 09:47
Show Gist options
  • Save adewes/6087664 to your computer and use it in GitHub Desktop.
Save adewes/6087664 to your computer and use it in GitHub Desktop.
Scripts for getting the full list of Github users (get_all_github_users.py) and for retrieving the details of these users (get_github_user_details.py) using the Github API.
import requests
import json
import datetime
import time
import sys
import math
ACCESS_TOKEN = '[put your API key here]'
usage = """Retrieves a list of all Github users using the Github API.
Usage: get_all_github_users.py [output JSON filename] [since ID|optional]
"""
if __name__ == '__main__':
if len(sys.argv) < 2:
print usage
exit(-1)
output_filename = sys.argv[1]
params = {'access_token' : ACCESS_TOKEN}
if len(sys.argv) >= 3:
params['since'] = sys.argv[2]
try:
with open(output_filename,"ab") as output_file:
while True:
r = requests.get('https://api.github.com/users',params=params)
remaining_requests = int(r.headers['x-ratelimit-remaining'])
reset_time = datetime.datetime.fromtimestamp(int(r.headers['x-ratelimit-reset']))
waiting_time = (reset_time-datetime.datetime.now()).total_seconds()
print "%d requests remaining, reset in %d minutes..." % (remaining_requests,math.ceil(waiting_time/60.0))
if remaining_requests == 0:
print "Allowed requests depleted, waiting %d minutes and %d seconds before continuing..." % (math.floor(waiting_time/60.0),waiting_time % 60)
time.sleep(waiting_time)
continue
if r.status_code != 200:
print "Error, waiting 10 seconds before retrying..."
time.sleep(10)
continue
users = json.loads(r.content)
if not len(users):
print "No more users returned, got em all :)"
break
params['since'] = str(users[-1]['id'])
print "Added users: "+", ".join([str(user['id']) for user in users])
for user in users:
output_file.write(json.dumps(user).strip()+"\n")
except KeyboardInterrupt:
print "Quitting..."
exit(0)
finally:
print "When relaunching this script, use the following minimum ID: ",params['since']
import multiprocessing as mp
import requests
import json
import datetime
import time
import random
import os
import sys
import httplib
ACCESS_TOKEN = '[put your API key here]'
usage = """
Usage: get_github_user_details.py [user JSON filename] [output JSON filename] [last ID|optional]
By default, the script appends to the output file.
"""
con = None
def establish_connection():
global con
print "Establishing connection in process %d..." % os.getpid()
con = httplib.HTTPSConnection('api.github.com',443)
def get_user_details(user_login,user_id):
global con
try:
print "Getting details for %s (%d)" % (user_login,user_id)
if not con:
establish_connection()
try:
con.request('GET','/users/%s?access_token=%s' % (user_login,ACCESS_TOKEN))
response = con.getresponse()
except:
print "Connection error, recreating and retrying in 5 seconds..."
time.sleep(5)
con = None
raise Exception("Connection failed!")
if response.status == 404:
print "User %s does not exist..." % user_login
return ""
elif response.status != 200 and response.status != 403:
print response.status,response.read()
print "Error, waiting 10 seconds before retrying..."
time.sleep(10)
raise Exception("connection failed!")
remaining_requests = int(response.getheader('x-ratelimit-remaining'))
reset_time = datetime.datetime.fromtimestamp(int(response.getheader('x-ratelimit-reset')))
print "%d requests remaining..." % (remaining_requests)
if remaining_requests == 0:
print "Allowed requests depleted, waiting..."
while True:
if reset_time < datetime.datetime.now():
print "Continuing!"
break
waiting_time_seconds = (reset_time-datetime.datetime.now()).total_seconds()
waiting_time_minutes = int(waiting_time_seconds/60)
waiting_time_seconds_remainder = int(waiting_time_seconds) % 60
print "%d minutes and %d seconds to go" % (waiting_time_minutes,waiting_time_seconds_remainder)
time.sleep(60)
raise Exception("Request limit exceeded!")
content = response.read()
user_details = json.loads(content)
return content
except KeyboardInterrupt as e:
return ""
except requests.exceptions.RequestException as e:
print "Exception occured:",str(e)
raise e
if __name__ == '__main__':
if len(sys.argv) < 3:
print usage
exit(-1)
users_filename = sys.argv[1]
output_filename = sys.argv[2]
manager = mp.Manager()
pool_size = 5
pool = mp.Pool(pool_size)
if len(sys.argv) >= 4:
since_id = int(sys.argv[3])
else:
since_id = 0
running_tasks = 0
task_list = []
with open(users_filename,"rb") as users_file, \
open(output_filename,"ab") as output_file:
try:
while True:
try:
user = json.loads(users_file.readline())
except ValueError:
print "Done"
break
if user['id'] <= since_id:
continue
while True:
for task in task_list:
if task.ready():
del task_list[task_list.index(task)]
if not task.successful():
print "Failed to get user details for %s, retrying..." % task.user['login']
new_task = pool.apply_async(get_user_details,[task.user['login'],task.user['id']])
new_task.user = task.user
task_list.append(new_task)
break
result = task.get().strip()
if result:
content = task.get().strip()
output_file.write(content+"\n")
output_file.flush()
if len(task_list) < pool_size:
task = pool.apply_async(get_user_details,[user['login'],user['id']])
task.user = user
task_list.append(task)
break
except KeyboardInterrupt:
print "Quitting..."
exit(0)
finally:
print "When relaunching this script, use the following minimum ID: %d" % (min([task.user['id'] for task in task_list])-1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment