Created
December 5, 2016 04:11
-
-
Save darkarnium/3103dfb0b90f545d3e54a374d5752394 to your computer and use it in GitHub Desktop.
Populate Sesshu with Alexa Top 1,000,000 sites.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import tqdm | |
import json | |
import click | |
import boto3 | |
import multiprocessing | |
def request(target, plugin='http_robots'):
    ''' Serialise a single scan request as a JSON string.

    target -- value appended to the 'http://' scheme to form the URL.
    plugin -- plugin name placed in the request (default 'http_robots').

    Returns the JSON text of {'target': <url>, 'plugin': <plugin>}.
    '''
    # The scheme is prepended unconditionally; no validation is performed.
    url = 'http://{0}'.format(target)
    payload = {'target': url, 'plugin': plugin}
    return json.dumps(payload)
def dispatch(work, destination, results, id, block):
    ''' Publish one contiguous block of `work` to an SNS topic.

    work        -- indexable sequence of targets.
    destination -- ARN of the SNS topic to publish to.
    results     -- queue-like object; one True is put per published item,
                   used by the caller for counting / status only.
    id          -- zero-based index of the block this call handles. (The
                   name shadows the builtin but is kept for compatibility.)
    block       -- number of items per block.

    Items work[block * id] up to (not including) work[block * (id + 1)]
    are published; the upper bound is clamped to len(work) so a short
    final block does not raise IndexError.
    '''
    c = boto3.client('sns')

    first = block * id
    last = min(block * (id + 1), len(work))

    # Iterate over domains in this block, and submit to SNS. On failure, don't
    # attempt to catch the exception, let it terminate this worker. On success
    # submit a True into the results queue - for counting / status only.
    # `range` (rather than `xrange`) keeps this runnable on Python 2 and 3.
    for i in range(first, last):
        c.publish(
            TopicArn=destination,
            Message=request(work[i])
        )
        results.put(True)
@click.command()
@click.option('--arn', help='Destination ARN (Amazon SNS)')
@click.option('--source', help='File containing list of domains')
@click.option('--workers', default=4, help='Workers to spawn (default: 4)')
def main(arn, source, workers):
    ''' Populate SQS with input domains. '''
    # NOTE: the docstring above is left byte-for-byte unchanged because click
    # uses it as the command's --help text.
    #
    # arn     -- SNS topic ARN handed to every worker.
    # source  -- path of a text file with one domain per line.
    # workers -- requested number of worker processes (values < 1 act as 1).
    #
    # Raises click.ClickException if the workers exit before every domain
    # has been reported as submitted.

    # The "queue empty" exception lives in a differently named module on
    # Python 2 and Python 3.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    with open(source) as in_file:
        domains = in_file.read().splitlines()

    total = len(domains)
    if total == 0:
        # Nothing to submit; waiting on the results queue would block forever.
        return

    results = multiprocessing.Queue()

    # Ceiling division, so the remainder is covered instead of being silently
    # dropped (and so the block size stays an integer on Python 3).
    block = -(-total // max(1, workers))

    # Spawn workers, and submit the domains into the topic. Each worker gets
    # its own slice and is told to process block 0 of that slice, so every
    # index it touches is valid even when the final slice is short.
    procs = []
    for start in range(0, total, block):
        chunk = domains[start:start + block]
        p = multiprocessing.Process(
            target=dispatch, args=(chunk, arn, results, 0, len(chunk)))
        p.start()
        procs.append(p)

    # Monitor the results queue until all work has been completed.
    received = 0
    with tqdm.tqdm(total=total, unit='R', unit_scale=False) as pbar:
        while received < total:
            # Sample liveness *before* waiting: if every worker had already
            # exited and the queue still yields nothing, no more results can
            # arrive and we must stop rather than spin forever.
            alive = any(p.is_alive() for p in procs)
            try:
                results.get(timeout=1)
            except Empty:
                if not alive:
                    break
                continue
            received += 1
            pbar.update(1)

    for p in procs:
        p.join()

    if received != total:
        raise click.ClickException(
            'only {0} of {1} domains were submitted'.format(received, total))
# Run the click command only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment