Created
October 20, 2015 14:38
-
-
Save johngian/c3ddb784a35e7bf377c2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import click | |
import os | |
import requests | |
import tempfile | |
from itertools import izip, repeat | |
from multiprocessing import Pool | |
def get_data(url, api_key, data_dir):
    """Download every user profile listed on one page of the API.

    Requests the listing at ``url``, then requests the ``_url`` of each item
    under the listing's ``results`` key and writes the raw response body to a
    file inside ``data_dir``. Each file is named after the second-to-last
    ``/``-separated segment of the profile URL.

    A listing or profile response whose status code is not 200 is skipped
    without any report.

    :param url: URL of a single listing page.
    :param api_key: value sent as the ``X-API-KEY`` header on every request.
    :param data_dir: directory that receives one file per profile; it must
        already exist.
    :raises Exception: raised in place of ``KeyboardInterrupt`` (see the
        handler below).
    """
    try:
        headers = {'X-API-KEY': api_key}
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return

        for entry in response.json()['results']:
            user_url = entry['_url']
            user_response = requests.get(user_url, headers=headers)
            if user_response.status_code != 200:
                continue

            # For a URL ending in ".../42/" this yields "42".
            filename = user_url.rsplit('/', 2)[-2]
            filepath = os.path.join(data_dir, filename)
            # ``content`` is the undecoded body (bytes), so the file has to
            # be opened in binary mode to be written verbatim.
            with open(filepath, 'wb') as f:
                f.write(user_response.content)
    except KeyboardInterrupt:
        # NOTE(review): converted to a plain Exception, presumably so that a
        # worker process reports Ctrl-C to its parent as an ordinary failure
        # -- confirm before changing.
        raise Exception('Interrupted while fetching {0}'.format(url))
def get_data_star(args):
    """Unpack ``args`` and forward them to ``get_data`` as positional arguments.

    ``Pool.map`` passes exactly one item to its callable, so the arguments
    for ``get_data`` travel as a single sequence and are expanded here.

    :param args: sequence of positional arguments for ``get_data``.
    :returns: whatever ``get_data`` returns.
    """
    outcome = get_data(*args)
    return outcome
@click.command()
@click.option('--pool_size', default=10, help='Number of processes.')
@click.option('--api_key', required=True, help='Mozillians.org API key.')
@click.option('--api_url', default='https://mozillians.org/api/v2/users/',
              help='Mozillians.org API endpoint.')
@click.option('--data_dir', default=None,
              help='Output directory (default: a new temporary directory).')
@click.option('--pages', default=1241,
              help='Number of listing pages to fetch, starting at page 1.')
def scrap(pool_size, api_key, api_url, data_dir, pages=1241):
    """Download user profiles page by page with a pool of worker processes."""
    # The docstring above doubles as the command's --help text in click, so
    # it is kept to one line; the parameters are described by the option
    # help strings in the decorators.

    # The temporary directory is created here rather than in the option
    # default, so that importing the module, asking for --help, or passing
    # an explicit --data_dir does not leave a stray directory behind.
    if data_dir is None:
        data_dir = tempfile.mkdtemp(prefix='mozillians')
    elif not os.path.exists(data_dir):
        os.makedirs(data_dir)

    pool = Pool(pool_size)

    # One task per listing page: (page URL, API key, output directory).
    base_url = '{0}?page='.format(api_url)
    arguments = [
        ('{0}{1}'.format(base_url, page), api_key, data_dir)
        for page in range(1, pages + 1)
    ]

    click.echo('POOL_SIZE: {}'.format(pool_size))
    click.echo('DATA_DIR: {}'.format(data_dir))

    try:
        click.echo('Launching GET processes...')
        pool.map(get_data_star, arguments)
        pool.close()
        click.echo('Scraping complete!')
    except KeyboardInterrupt:
        click.echo('Terminating process pool. ')
        pool.terminate()
        click.echo('Pool terminated.')
    except Exception as e:
        # Any failure raised by a worker surfaces here through pool.map.
        click.echo('Got exception: %r, terminating the pool' % (e,))
        pool.terminate()
        click.echo('Pool is terminated')
    finally:
        # join() is only valid after close() or terminate(); every path
        # above has called one of them by the time control reaches here.
        click.echo('Joining pool processes.')
        pool.join()
        click.echo('Join complete!')
        click.echo('Bye bye!')
# Start the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    scrap()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment