Skip to content

Instantly share code, notes, and snippets.

@monkut
Created August 14, 2017 01:31
Show Gist options
  • Save monkut/0a51d1580e262b374d7f64289576907d to your computer and use it in GitHub Desktop.
Save monkut/0a51d1580e262b374d7f64289576907d to your computer and use it in GitHub Desktop.
Parallel retrieval of GitHub Archive data
"""
Retrieve GitHub Archive data from:
https://www.githubarchive.org/
"""
import datetime
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os
import requests
def get_github_hourly_data(desired_hour_datetime, output_root_directory=None):
    """
    Obtain the GitHub Archive json.gz for the given datetime (hour).

    > If output_root_directory not given, 'github-archive-data' directory will be created under '~'.

    The file is stored as ``<output_root_directory>/<YYYYMMDD>/<YYYY-MM-DD-H>.json.gz``.
    If that file already exists, nothing is requested and the existing path is returned.

    :param desired_hour_datetime: (datetime) year, month, day and hour select the archive file
    :param output_root_directory: (str) directory under which the per-day directories are created
    :return: (tuple) ``(gh_data_url, output_filepath)``
    :raises requests.HTTPError: if the server answers with an error status code
    """
    if not output_root_directory:
        output_root_directory = os.path.expanduser('~/github-archive-data')

    # create date directory if it does not exist
    date_directory = desired_hour_datetime.strftime('%Y%m%d')
    date_directory_fullpath = os.path.join(output_root_directory, date_directory)
    if not os.path.exists(date_directory_fullpath):
        # exist_ok: when run from a thread pool, several workers handling the same
        # day can reach this point together; a plain mkdir would raise for the losers.
        os.makedirs(date_directory_fullpath, exist_ok=True)
        print(f'> Created: {date_directory_fullpath}')

    # Uses Github archive fileformat (hour is NOT zero padded)
    # wget http://data.githubarchive.org/2015-01-{01..30}-{0..23}.json.gz
    filename = '{year}-{month:02}-{day:02}-{hour}.json.gz'.format(
        year=desired_hour_datetime.year,
        month=desired_hour_datetime.month,
        day=desired_hour_datetime.day,
        hour=desired_hour_datetime.hour,
    )
    gh_data_url = f'http://data.githubarchive.org/{filename}'
    output_filepath = os.path.join(date_directory_fullpath, filename)

    # Check for an existing file BEFORE touching the network, so skipped hours cost nothing.
    if os.path.exists(output_filepath):
        print('FILE EXISTS, SKIPPING: {}'.format(output_filepath))
        return gh_data_url, output_filepath

    # Download to a temporary name and rename when complete; otherwise an interrupted
    # download would leave a truncated file that later runs skip as "existing".
    partial_filepath = output_filepath + '.part'
    # timeout=(connect, read): without it a stalled server blocks the worker forever.
    response = requests.get(gh_data_url, stream=True, timeout=(10, 60))
    try:
        # Do not save an HTML/JSON error body under a .json.gz name.
        response.raise_for_status()
        with open(partial_filepath, 'wb') as out_json_gz:
            # iter_content() defaults to 1-byte chunks; use a real buffer size.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                out_json_gz.write(chunk)
    finally:
        response.close()
    os.replace(partial_filepath, output_filepath)
    return gh_data_url, output_filepath
def get_day_datetimes(start_datetime, total_days):
    """
    Datetime hour generator.

    Generates one datetime per hour, starting at 00:00 of the day of 'start_datetime'
    and covering 'total_days' full days (24 values per day).
    (only full days supported)

    :param start_datetime: (datetime) datetime (hour, minute, second and microsecond are ignored)
    :param total_days: (int) Number of days to generate, must be >= 1
    :return: generator of datetime objects, in chronological order
    """
    assert total_days >= 1
    # Snap to midnight so the time-of-day of the argument really is ignored;
    # otherwise a start of e.g. 05:00 would yield hours spilling into the next day.
    first_hour = start_datetime.replace(hour=0, minute=0, second=0, microsecond=0)
    for days in range(total_days):
        for hours in range(24):
            yield first_hour + datetime.timedelta(days=days, hours=hours)
def collect_github_archive(initial_datetime, days):
    """
    Fetch the hourly GitHub Archive files for a span of days using a thread pool.

    One get_github_hourly_data() call is submitted per datetime produced by
    get_day_datetimes(initial_datetime, days), with at most 10 worker threads.

    :param initial_datetime: (datetime) passed to get_day_datetimes as the start
    :param days: (int) passed to get_day_datetimes as the number of days
    :return: the iterator returned by Executor.map over the per-hour results
    """
    pool = ThreadPoolExecutor(max_workers=10)
    with pool:
        # Leaving the with-block waits for every submitted download to finish.
        results = pool.map(
            get_github_hourly_data,
            get_day_datetimes(initial_datetime, days),
        )
    return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment