monkut · August 14, 2017 01:31
diff --git a/github_archive_retriever.py b/github_archive_retriever.py
 """
 Retrieve GitHub Archive data from:
 https://www.githubarchive.org/


 """

 import datetime
 from concurrent.futures import ThreadPoolExecutor        
 from tqdm import tqdm
 import os
 import requests


 def get_github_hourly_data(desired_hour_datetime, output_root_directory=None):
     """
     Download the GitHub Archive json.gz dump for the given datetime (hour).

     The file is stored as '<output_root_directory>/<YYYYMMDD>/<YYYY-MM-DD-H>.json.gz'.
     If that file already exists, no request is made and the existing file is kept.

     > If output_root_directory not given, 'github-archive-data' directory will be created under '~'.

     :param desired_hour_datetime: (datetime) only year, month, day and hour are used
     :param output_root_directory: (str) directory in which the per-day directories are created
     :return: (tuple) (download url, local file path)
     :raises requests.HTTPError: if the server answers with an error status
     """
     if not output_root_directory:
         output_root_directory = os.path.expanduser('~/github-archive-data')

     # create date directory if it does not exist
     date_directory = desired_hour_datetime.strftime('%Y%m%d')
     date_directory_fullpath = os.path.join(output_root_directory, date_directory)
     if not os.path.isdir(date_directory_fullpath):
         # exist_ok: several worker threads may reach this point for the same day at once
         os.makedirs(date_directory_fullpath, exist_ok=True)
         print(f'> Created: {date_directory_fullpath}')

     # Uses Github archive fileformat (hour is NOT zero padded)
     # wget http://data.githubarchive.org/2015-01-{01..30}-{0..23}.json.gz
     filename = (f'{desired_hour_datetime.year}-{desired_hour_datetime.month:02}-'
                 f'{desired_hour_datetime.day:02}-{desired_hour_datetime.hour}.json.gz')
     gh_data_url = f'http://data.githubarchive.org/{filename}'
     output_filepath = os.path.join(date_directory_fullpath, filename)

     # Check for an earlier download BEFORE touching the network.
     if os.path.exists(output_filepath):
         print('FILE EXISTS, SKIPPING: {}'.format(output_filepath))
         return gh_data_url, output_filepath

     # Download under a temporary name so an interrupted transfer never leaves a
     # truncated file that a later run would mistake for a finished download.
     partial_filepath = output_filepath + '.part'
     response = requests.get(gh_data_url, stream=True)
     try:
         # Do not store an error page under a .json.gz name.
         response.raise_for_status()
         with open(partial_filepath, 'wb') as out_json_gz:
             # iter_content() defaults to 1-byte chunks; ask for sensible blocks instead
             for chunk in response.iter_content(chunk_size=64 * 1024):
                 out_json_gz.write(chunk)
         os.replace(partial_filepath, output_filepath)
     finally:
         response.close()
         # Only still present when the download did not complete.
         if os.path.exists(partial_filepath):
             os.remove(partial_filepath)

     return gh_data_url, output_filepath


 def get_day_datetimes(start_datetime, total_days):
     """
     Datetime hour generator.
     Yields one datetime per hour, beginning at 00:00 on the day of 'start_datetime' and
     covering 'total_days' full days (24 values per day).

     :param start_datetime: (datetime) first day to generate (time of day is ignored)
     :param total_days: (int) Number of days to generate, must be >= 1
     :raises AssertionError: on first iteration, if total_days is less than 1
     """
     assert total_days >= 1, 'total_days must be at least 1'
     # Snap to midnight so the "hour will be ignored / only full days" contract actually holds.
     first_hour = start_datetime.replace(hour=0, minute=0, second=0, microsecond=0)
     for hour_offset in range(total_days * 24):
         yield first_hour + datetime.timedelta(hours=hour_offset)

 def collect_github_archive(initial_datetime, days, output_root_directory=None, max_workers=10):
     """
     Download github archive data in parallel with a thread pool.

     :param initial_datetime: (datetime) first day to download, handed to get_day_datetimes
     :param days: (int) number of days to download
     :param output_root_directory: (str) handed to get_github_hourly_data for every hour
     :param max_workers: (int) number of download threads
     :return: iterator over the get_github_hourly_data results, in the order the hours were generated.
         Every download has finished by the time this function returns; an exception raised by a
         download is re-raised when its result is reached while iterating.
     """
     def download_hour(hour_datetime):
         # Bind the output directory so executor.map only has to supply the hour.
         return get_github_hourly_data(hour_datetime, output_root_directory)

     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         datetime_hours = get_day_datetimes(initial_datetime, days)
         # map() submits every hour right away; leaving the with-block waits for all of them.
         results = executor.map(download_hour, datetime_hours)
     return results
	"""
	Retrieve GitHub Archive data from:
	https://www.githubarchive.org/


	"""

	import datetime
	from concurrent.futures import ThreadPoolExecutor
	from tqdm import tqdm
	import os
	import requests


	def get_github_hourly_data(desired_hour_datetime, output_root_directory=None):
	    """
	    Download the GitHub Archive json.gz dump for the given datetime (hour).

	    The file is stored as '<output_root_directory>/<YYYYMMDD>/<YYYY-MM-DD-H>.json.gz'.
	    If that file already exists, no request is made and the existing file is kept.

	    > If output_root_directory not given, 'github-archive-data' directory will be created under '~'.

	    :param desired_hour_datetime: (datetime) only year, month, day and hour are used
	    :param output_root_directory: (str) directory in which the per-day directories are created
	    :return: (tuple) (download url, local file path)
	    :raises requests.HTTPError: if the server answers with an error status
	    """
	    if not output_root_directory:
	        output_root_directory = os.path.expanduser('~/github-archive-data')

	    # create date directory if it does not exist
	    date_directory = desired_hour_datetime.strftime('%Y%m%d')
	    date_directory_fullpath = os.path.join(output_root_directory, date_directory)
	    if not os.path.isdir(date_directory_fullpath):
	        # exist_ok: several worker threads may reach this point for the same day at once
	        os.makedirs(date_directory_fullpath, exist_ok=True)
	        print(f'> Created: {date_directory_fullpath}')

	    # Uses Github archive fileformat (hour is NOT zero padded)
	    # wget http://data.githubarchive.org/2015-01-{01..30}-{0..23}.json.gz
	    filename = (f'{desired_hour_datetime.year}-{desired_hour_datetime.month:02}-'
	                f'{desired_hour_datetime.day:02}-{desired_hour_datetime.hour}.json.gz')
	    gh_data_url = f'http://data.githubarchive.org/{filename}'
	    output_filepath = os.path.join(date_directory_fullpath, filename)

	    # Check for an earlier download BEFORE touching the network.
	    if os.path.exists(output_filepath):
	        print('FILE EXISTS, SKIPPING: {}'.format(output_filepath))
	        return gh_data_url, output_filepath

	    # Download under a temporary name so an interrupted transfer never leaves a
	    # truncated file that a later run would mistake for a finished download.
	    partial_filepath = output_filepath + '.part'
	    response = requests.get(gh_data_url, stream=True)
	    try:
	        # Do not store an error page under a .json.gz name.
	        response.raise_for_status()
	        with open(partial_filepath, 'wb') as out_json_gz:
	            # iter_content() defaults to 1-byte chunks; ask for sensible blocks instead
	            for chunk in response.iter_content(chunk_size=64 * 1024):
	                out_json_gz.write(chunk)
	        os.replace(partial_filepath, output_filepath)
	    finally:
	        response.close()
	        # Only still present when the download did not complete.
	        if os.path.exists(partial_filepath):
	            os.remove(partial_filepath)

	    return gh_data_url, output_filepath


	def get_day_datetimes(start_datetime, total_days):
	    """
	    Datetime hour generator.
	    Yields one datetime per hour, beginning at 00:00 on the day of 'start_datetime' and
	    covering 'total_days' full days (24 values per day).

	    :param start_datetime: (datetime) first day to generate (time of day is ignored)
	    :param total_days: (int) Number of days to generate, must be >= 1
	    :raises AssertionError: on first iteration, if total_days is less than 1
	    """
	    assert total_days >= 1, 'total_days must be at least 1'
	    # Snap to midnight so the "hour will be ignored / only full days" contract actually holds.
	    first_hour = start_datetime.replace(hour=0, minute=0, second=0, microsecond=0)
	    for hour_offset in range(total_days * 24):
	        yield first_hour + datetime.timedelta(hours=hour_offset)

	def collect_github_archive(initial_datetime, days, output_root_directory=None, max_workers=10):
	    """
	    Download github archive data in parallel with a thread pool.

	    :param initial_datetime: (datetime) first day to download, handed to get_day_datetimes
	    :param days: (int) number of days to download
	    :param output_root_directory: (str) handed to get_github_hourly_data for every hour
	    :param max_workers: (int) number of download threads
	    :return: iterator over the get_github_hourly_data results, in the order the hours were generated.
	        Every download has finished by the time this function returns; an exception raised by a
	        download is re-raised when its result is reached while iterating.
	    """
	    def download_hour(hour_datetime):
	        # Bind the output directory so executor.map only has to supply the hour.
	        return get_github_hourly_data(hour_datetime, output_root_directory)

	    with ThreadPoolExecutor(max_workers=max_workers) as executor:
	        datetime_hours = get_day_datetimes(initial_datetime, days)
	        # map() submits every hour right away; leaving the with-block waits for all of them.
	        results = executor.map(download_hour, datetime_hours)
	    return results