Created
August 14, 2017 01:31
-
-
Save monkut/0a51d1580e262b374d7f64289576907d to your computer and use it in GitHub Desktop.
Parallel retrieval of GitHub Archive data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Retrieve github archieve data from: | |
https://www.githubarchive.org/ | |
""" | |
import datetime | |
from concurrent.futures import ThreadPoolExecutor | |
from tqdm import tqdm | |
import os | |
import requests | |
def get_github_hourly_data(desired_hour_datetime, output_root_directory=None):
    """
    Obtain the GitHub Archive json.gz for the given datetime (hour).

    > If output_root_directory not given, 'github-archive-data' directory will be created under '~'.

    If the target file is already present locally, nothing is downloaded.

    :param desired_hour_datetime: (datetime) year, month, day and hour select the archive file
    :param output_root_directory: (str) directory under which one 'YYYYMMDD' directory per day is created
    :return: (tuple) (source url, local output filepath)
    :raises requests.HTTPError: if the server answers with an error status
    """
    if not output_root_directory:
        output_root_directory = os.path.expanduser('~/github-archive-data')

    # create date directory (and the root directory) if it does not exist
    date_directory = desired_hour_datetime.strftime('%Y%m%d')
    date_directory_fullpath = os.path.join(output_root_directory, date_directory)
    if not os.path.exists(date_directory_fullpath):
        # exist_ok: this function is run from several threads at once, so
        # another worker may create the directory between the check and here.
        os.makedirs(date_directory_fullpath, exist_ok=True)
        print(f'> Created: {date_directory_fullpath}')

    # Uses Github archive fileformat (hour is NOT zero-padded)
    # wget http://data.githubarchive.org/2015-01-{01..30}-{0..23}.json.gz
    filename = '{year}-{month:02}-{day:02}-{hour}.json.gz'.format(
        year=desired_hour_datetime.year,
        month=desired_hour_datetime.month,
        day=desired_hour_datetime.day,
        hour=desired_hour_datetime.hour,
    )
    gh_data_url = f'http://data.githubarchive.org/{filename}'
    output_filepath = os.path.join(date_directory_fullpath, filename)

    # Check for an existing file BEFORE touching the network.
    if os.path.exists(output_filepath):
        print('FILE EXISTS, SKIPPING: {}'.format(output_filepath))
        return gh_data_url, output_filepath

    # Download into a side file and rename on success, so an interrupted
    # download is never mistaken for a complete archive on the next run.
    partial_filepath = output_filepath + '.part'
    with requests.get(gh_data_url, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an HTTP error page as '.json.gz'.
        response.raise_for_status()
        with open(partial_filepath, 'wb') as out_json_gz:
            # An explicit chunk size avoids iter_content()'s 1-byte default.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                out_json_gz.write(chunk)
    os.replace(partial_filepath, output_filepath)
    return gh_data_url, output_filepath
def get_day_datetimes(start_datetime, total_days):
    """
    Datetime hour generator.

    Will generate the hourly datetime objects starting at midnight of the given 'start_datetime',
    up and until the number of days given (24 values per day).
    (only full days supported)

    :param start_datetime: (datetime) datetime (hour, minute, second and microsecond are ignored)
    :param total_days: (int) Number of days to generate, must be >= 1
    :raises ValueError: if total_days is less than 1 (raised when iteration starts)
    """
    if total_days < 1:
        raise ValueError(f'total_days must be >= 1, got: {total_days}')

    # Snap to the start of the day so the time-of-day of 'start_datetime' is
    # really ignored and every generated day is a full 00..23 hour range.
    day_start = start_datetime.replace(hour=0, minute=0, second=0, microsecond=0)
    for days in range(total_days):
        for hours in range(24):
            yield day_start + datetime.timedelta(days=days, hours=hours)
def collect_github_archive(initial_datetime, days):
    """
    Fetch 'days' worth of hourly GitHub Archive files using a pool of 10 worker threads.

    :param initial_datetime: (datetime) passed to get_day_datetimes() as the start
    :param days: (int) number of days passed to get_day_datetimes()
    :return: the iterator produced by executor.map(); each item is the return value of
        get_github_hourly_data() for one hour. The pool is shut down before this
        function returns.
    """
    hourly_slots = get_day_datetimes(initial_datetime, days)
    with ThreadPoolExecutor(max_workers=10) as pool:
        downloads = pool.map(get_github_hourly_data, hourly_slots)
    return downloads
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment