Last active
November 16, 2021 15:36
-
-
Save antoine-lizee/20ce98b912a2503d23aa4ad008278d6f to your computer and use it in GitHub Desktop.
Fetch issues & comments from Topics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import time | |
import requests | |
# GitHub REST API endpoints queried by this script.
issues_url = 'https://api.github.com/repos/alan-eu/Topics/issues'
comments_url = 'https://api.github.com/repos/alan-eu/Topics/issues/comments'
rate_limit_url = 'https://api.github.com/rate_limit'

# HTTP headers sent with every request; replace the <your_token> placeholder
# with a real GitHub token before running.
headers = {
    'Authorization': 'token <your_token>',
    'Accept': 'application/vnd.github.v3+json',
}

# Default query-string parameters: all states, 100 objects per page,
# sorted by update time, most recently updated first.
params = dict(
    direction='desc',
    per_page=100,
    state='all',
    sort='updated',
)
def get_resource(url, params, max_page=200, recover=False, n_max_retries=3, since=None):
    """Fetch the pages of a paginated GitHub REST listing and return all objects.

    Pages are requested one after the other with the module-level ``headers``
    until an empty page is returned, the page counter reaches ``max_page``, or
    the API refuses to paginate any further (HTTP 422).

    Args:
        url: Listing endpoint to query.
        params: Base query parameters; must contain ``per_page``. The dict is
            copied, so the caller's object is never modified.
        max_page: The loop stops once the page counter reaches this value, so
            at most ``max_page - 1`` pages are fetched.
        recover: When true, results are requested in ascending order and, on
            HTTP 422, pagination restarts from page 1 with ``since`` set to
            the ``updated_at`` of the last object received.
        n_max_retries: Number of retries of a page after an HTTP 500/502
            before that page is skipped.
        since: Optional value for the ``since`` query parameter.

    Returns:
        List of decoded JSON objects, without duplicates (by ``id``). The list
        may be partial if pagination was cut short or a page was skipped.

    Raises:
        requests.HTTPError: On an error status other than 500, 502 and 422.
    """
    # Work on a private copy: callers share one `params` dict between calls,
    # and `page` / `since` / `direction` must not leak from one call to the next.
    query = dict(params)
    if since:
        query['since'] = since
    if recover:
        query['direction'] = 'asc'

    page = 1
    delta_page = 0
    n_retries = 0
    all_objects = []
    seen_ids = set()
    last_timestamp = '<not_set>'

    while page < max_page:
        t0 = time.time()
        query['page'] = page - delta_page
        resp = requests.get(url, headers=headers, params=query)
        # resp.headers['Link'] # get the next page and the last.
        print("Request %i, fetching %d objects in %.3f sec: " % (page, query['per_page'], time.time() - t0), end='')
        if 'since' in query:
            print('[since: %s page: %i] ' % (query['since'], query['page']), end='')

        if resp.status_code in (500, 502):  # Server error, retry
            print('ERROR: %s' % resp.text)
            if n_retries >= n_max_retries:
                # Give up on this page and move on to the next one.
                n_retries = 0
                page += 1
            else:
                n_retries += 1
            continue
        # The server answered: the retry budget applies to each page separately.
        n_retries = 0

        if resp.status_code == 422:  # Maxing out the pagination
            if not recover:
                print('Over-paged, bailing-out with PARTIAL data')
                break
            # Restarting only helps if it moves the `since` window forward;
            # otherwise the same 422 would come back forever.
            if last_timestamp == '<not_set>' or query.get('since') == last_timestamp:
                print('Cannot recover, bailing-out with PARTIAL data')
                break
            print('Attempting to recover')
            # Pages are 1-based: offset the counter so the next request is page 1.
            delta_page = page - 1
            query['since'] = last_timestamp
            continue

        # Any other error status (401, 403, 404, ...) would otherwise be
        # decoded as if it were a page of results.
        resp.raise_for_status()

        batch = resp.json()
        if not batch:  # End of the loop
            print("No more results, stopping")
            break

        page += 1
        for obj in batch:
            # A restart re-serves objects at the `since` boundary: keep only
            # the first copy of each id.
            obj_id = obj.get('id')
            if obj_id is not None:
                if obj_id in seen_ids:
                    continue
                seen_ids.add(obj_id)
            all_objects.append(obj)
        last_timestamp = batch[-1]['updated_at']
        print("last timestamp %s" % last_timestamp)

    return all_objects
def write_json(objects, object_name):
    """Dump ``objects`` to a JSON file named after their update-time range.

    The file is created in the current working directory as
    ``{object_name}_from_rest_{tmin}_{tmax}_{count}.json``, where ``tmin`` and
    ``tmax`` are the smallest and largest ``updated_at`` values found.

    Args:
        objects: List of dicts, each with an ``updated_at`` key.
        object_name: Label used as filename prefix and in the printed message.

    Returns:
        None. Nothing is written when ``objects`` is empty.
    """
    if not objects:
        # min()/max() would raise ValueError on an empty sequence, and there
        # is no time range to put in the filename anyway.
        print(f"No elements to write for {object_name}, skipping json file.")
        return

    n_objects = len(objects)
    timestamps = [obj['updated_at'] for obj in objects]
    tmin = min(timestamps)
    tmax = max(timestamps)
    filename = f"{object_name}_from_rest_{tmin}_{tmax}_{n_objects}.json"
    with open(filename, 'w') as f:
        json.dump(objects, f)
    print(f"{n_objects} elements written in json file for {object_name}.")
## Run the download ----
# Lower bound used by the optional "since specific time" variants below.
since_time = '2021-09-15T00:00:00Z'

# Issues: fetch in the default order, then dump to a JSON file.
all_issues = get_resource(issues_url, params, max_page=200)
# Since specific time: all_issues = get_resource(issues_url, params, 20, since=since_time)
write_json(all_issues, 'issues')

# Comments: same flow, with recovery enabled for when pagination is refused.
all_comments = get_resource(comments_url, params, max_page=2000, recover=True)
# Since specific time: all_comments = get_resource(comments_url, params, 100, since=since_time)
write_json(all_comments, 'comments')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I don't know if it's useful at this point, but I think you forgot to reset `n_retries = 0` after a successful request (around line 58), if I understand the intended behaviour correctly.