Created
July 26, 2013 09:47
-
-
Save adewes/6087664 to your computer and use it in GitHub Desktop.
Scripts for getting the full list of GitHub users (get_all_github_users.py) and for retrieving the details of these users (get_github_user_details.py) using the GitHub API.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import datetime | |
import time | |
import sys | |
import math | |
# GitHub API token used to authenticate the requests made by this script.
# Replace the placeholder with a real token before running.
ACCESS_TOKEN = '[put your API key here]'

# Help text printed when the script is started without an output filename.
usage = """Retrieves a list of all Github users using the Github API.
Usage: get_all_github_users.py [output JSON filename] [since ID|optional]
"""
if __name__ == '__main__':
    # Entry point: page through GET /users and append every returned user
    # object to the output file, one JSON document per line.
    #
    # argv[1] -- output filename (opened in append mode)
    # argv[2] -- optional user ID to resume after (passed as `since`)
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(-1)

    output_filename = sys.argv[1]

    # The token is sent in the Authorization header instead of an
    # `access_token` query parameter: GitHub no longer accepts query-string
    # authentication, and a header keeps the token out of URLs and logs.
    headers = {'Authorization': 'token ' + ACCESS_TOKEN}

    # Query parameters; `since` holds the ID of the last user already stored.
    params = {}
    if len(sys.argv) >= 3:
        params['since'] = sys.argv[2]

    try:
        with open(output_filename, "ab") as output_file:
            while True:
                r = requests.get('https://api.github.com/users',
                                 params=params, headers=headers)

                # Rate-limit headers can be absent (e.g. on an error page
                # produced by a proxy), so never index them blindly.
                remaining_header = r.headers.get('x-ratelimit-remaining')
                reset_header = r.headers.get('x-ratelimit-reset')
                rate_limited = False
                waiting_time = 0.0
                if remaining_header is not None and reset_header is not None:
                    remaining_requests = int(remaining_header)
                    reset_time = datetime.datetime.fromtimestamp(int(reset_header))
                    # Clamp at zero: the reset time may already have passed,
                    # and time.sleep() rejects negative durations.
                    waiting_time = max(
                        0.0,
                        (reset_time - datetime.datetime.now()).total_seconds())
                    print("%d requests remaining, reset in %d minutes..." % (remaining_requests, math.ceil(waiting_time / 60.0)))
                    rate_limited = remaining_requests == 0

                if r.status_code == 200:
                    # A successful page is stored even when it consumed the
                    # last allowed request of the current window.
                    users = json.loads(r.content)
                    if not users:
                        print("No more users returned, got em all :)")
                        break
                    print("Added users: " + ", ".join([str(user['id']) for user in users]))
                    for user in users:
                        line = json.dumps(user).strip() + "\n"
                        # The file is opened in binary mode; json.dumps
                        # emits ASCII by default, so this encode is lossless.
                        output_file.write(line.encode("utf-8"))
                    # Advance the cursor only after the whole page has been
                    # written, so an interruption never skips users.
                    params['since'] = str(users[-1]['id'])
                elif not rate_limited:
                    print("Error, waiting 10 seconds before retrying...")
                    time.sleep(10)
                    continue

                if rate_limited:
                    print("Allowed requests depleted, waiting %d minutes and %d seconds before continuing..." % (math.floor(waiting_time / 60.0), waiting_time % 60))
                    # One extra second of margin against clock skew.
                    time.sleep(waiting_time + 1)
    except KeyboardInterrupt:
        print("Quitting...")
        sys.exit(0)
    finally:
        # `since` is missing when no start ID was given and no page was
        # stored yet; 0 then means "start from the beginning".
        print("When relaunching this script, use the following minimum ID: " + str(params.get('since', 0)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import multiprocessing as mp | |
import requests | |
import json | |
import datetime | |
import time | |
import random | |
import os | |
import sys | |
import httplib | |
# GitHub API token used to authenticate the requests made by this script.
# Replace the placeholder with a real token before running.
ACCESS_TOKEN = '[put your API key here]'

# Help text printed when the script is started with too few arguments.
usage = """
Usage: get_github_user_details.py [user JSON filename] [output JSON filename] [last ID|optional]
By default, the script appends to the output file.
"""

# Module-level HTTPS connection to api.github.com. It starts out unset and is
# created on first use by establish_connection().
con = None
def establish_connection():
    """Create a fresh HTTPS connection to api.github.com for this process.

    The connection is stored in the module-level ``con`` and also returned
    for convenience. No network traffic happens here; the socket is opened
    by the first request sent over the connection.

    Returns:
        The newly created ``HTTPSConnection``.
    """
    global con
    print("Establishing connection in process %d..." % os.getpid())
    # Without a timeout a stalled socket would block the worker forever;
    # with one, the stall surfaces as an error the caller can retry.
    con = httplib.HTTPSConnection('api.github.com', 443, timeout=30)
    return con
def get_user_details(user_login, user_id):
    """Fetch the details of one user from ``GET /users/<login>``.

    The request goes over the module-level connection ``con``, which is
    created on demand and discarded after a connection error.

    Args:
        user_login: Login name of the user; used to build the request path.
        user_id: Numeric ID of the user; only used for the progress message.

    Returns:
        The raw JSON response body on success, or ``""`` when the user does
        not exist (HTTP 404) or the call was interrupted from the keyboard.

    Raises:
        Exception: On a connection error, on an unexpected HTTP status, or
            after waiting out an exhausted rate limit. In every case the
            caller is expected to retry the request.
        ValueError: If a 200 response does not contain valid JSON.
    """
    global con
    try:
        print("Getting details for %s (%d)" % (user_login, user_id))
        if not con:
            establish_connection()
        try:
            # The token is sent in the Authorization header rather than in
            # the URL, and a User-Agent is supplied explicitly because this
            # HTTP client does not add one on its own.
            con.request('GET', '/users/%s' % user_login, headers={
                'Authorization': 'token ' + ACCESS_TOKEN,
                'User-Agent': 'get_github_user_details.py',
            })
            response = con.getresponse()
            # Always drain the body right away: the connection refuses to
            # hand out the next response while this one is still unread.
            content = response.read()
        except (httplib.HTTPException, IOError):
            # Socket and SSL errors are IOError subclasses. A bare except
            # would also swallow KeyboardInterrupt, which is handled below.
            print("Connection error, recreating and retrying in 5 seconds...")
            time.sleep(5)
            con.close()
            con = None
            raise Exception("Connection failed!")

        if response.status == 404:
            print("User %s does not exist..." % user_login)
            return ""

        remaining_header = response.getheader('x-ratelimit-remaining')
        reset_header = response.getheader('x-ratelimit-reset')
        remaining_requests = None
        if remaining_header is not None:
            remaining_requests = int(remaining_header)
            print("%d requests remaining..." % (remaining_requests))

        if response.status == 403 and remaining_requests == 0:
            print("Allowed requests depleted, waiting...")
            if reset_header is not None:
                reset_time = datetime.datetime.fromtimestamp(int(reset_header))
            else:
                # No reset time announced: wait one minute before retrying.
                reset_time = datetime.datetime.now() + datetime.timedelta(seconds=60)
            while True:
                waiting_time_seconds = (reset_time - datetime.datetime.now()).total_seconds()
                if waiting_time_seconds <= 0:
                    print("Continuing!")
                    break
                print("%d minutes and %d seconds to go" % (int(waiting_time_seconds / 60), int(waiting_time_seconds) % 60))
                # Sleep at most a minute per round, and never far past the
                # announced reset time.
                time.sleep(min(60, waiting_time_seconds + 1))
            # Raising makes the caller resubmit this user.
            raise Exception("Request limit exceeded!")

        if response.status != 200:
            # Includes a 403 that is not a plain rate limit: its error body
            # must not be mistaken for user details.
            print("%s %s" % (response.status, content))
            print("Error, waiting 10 seconds before retrying...")
            time.sleep(10)
            raise Exception("connection failed!")

        # Parsed only to validate the body; a ValueError propagates so the
        # caller retries instead of storing a corrupt record.
        json.loads(content)
        return content
    except KeyboardInterrupt:
        return ""
if __name__ == '__main__':
    # Entry point: read users (one JSON document per line) from the input
    # file, fetch the details of each through a pool of worker processes and
    # append every result to the output file.
    #
    # argv[1] -- users filename
    # argv[2] -- output filename (opened in append mode)
    # argv[3] -- optional ID; users with an ID up to and including it are
    #            skipped
    if len(sys.argv) < 3:
        print(usage)
        sys.exit(-1)
    users_filename = sys.argv[1]
    output_filename = sys.argv[2]
    since_id = int(sys.argv[3]) if len(sys.argv) >= 4 else 0

    pool_size = 5
    pool = mp.Pool(pool_size)
    # Tasks that were submitted and whose result has not been stored yet.
    task_list = []

    def _submit(user):
        # Queue one user and remember which user the task belongs to.
        task = pool.apply_async(get_user_details, [user['login'], user['id']])
        task.user = user
        task_list.append(task)

    def _collect_finished(output_file):
        # Store the results of finished tasks and resubmit failed ones.
        # Iterates over a snapshot because task_list is modified on the way.
        for task in list(task_list):
            if not task.ready():
                continue
            task_list.remove(task)
            if not task.successful():
                print("Failed to get user details for %s, retrying..." % task.user['login'])
                _submit(task.user)
                continue
            content = task.get().strip()
            if content:
                output_file.write(content + b"\n")
                output_file.flush()

    with open(users_filename, "rb") as users_file, \
            open(output_filename, "ab") as output_file:
        try:
            for line in users_file:
                if not line.strip():
                    continue
                try:
                    user = json.loads(line)
                except ValueError:
                    # E.g. a line truncated by an interrupted earlier run;
                    # skip it instead of silently ending the whole run.
                    print("Skipping malformed line in %s" % users_filename)
                    continue
                if user['id'] <= since_id:
                    continue
                # Wait for a free slot, pausing briefly between polls
                # instead of spinning at full CPU.
                while True:
                    _collect_finished(output_file)
                    if len(task_list) < pool_size:
                        break
                    time.sleep(0.05)
                _submit(user)
            # Input exhausted: wait for the tasks still in flight so that
            # the results of the last users are not lost.
            while task_list:
                _collect_finished(output_file)
                if task_list:
                    time.sleep(0.05)
            print("Done")
        except KeyboardInterrupt:
            print("Quitting...")
            sys.exit(0)
        finally:
            # Nothing is pending after a complete run; min() of an empty
            # sequence would raise, so only print the hint when it applies.
            if task_list:
                print("When relaunching this script, use the following minimum ID: %d" % (min([task.user['id'] for task in task_list]) - 1))
            pool.terminate()
            pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment