Created
February 26, 2013 21:35
-
-
Save jgmize/5042441 to your computer and use it in GitHub Desktop.
Get URL content lengths using requests, a gevent greenlet pool, and Redis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import logging | |
import re | |
from gevent import monkey, spawn | |
from gevent.pool import Pool | |
monkey.patch_all() | |
from redis import Redis | |
import requests | |
# Keyword arguments passed straight through to the Redis() constructor below;
# an empty dict means the client is built with the library's own defaults.
REDIS_OPTIONS = {}

# Name of the Redis list used as the queue of URLs waiting to be processed.
URLQ = 'urlq'

# Name of the Redis hash that maps each URL to its stored content length.
URL_CONTENT_LENGTH_HASH_KEY = 'url_content_length'

# Module-wide Redis client shared by every helper in this file.
redis = Redis(**REDIS_OPTIONS)

# Matches `<a rel="next"` followed, on the same line, by the nearest
# `href="..."`, and captures the href value as the named group "url".
rel_next_regex = re.compile(
    r'<a rel="next".+?href="(?P<url>.+?)"'
)
def add_url_to_queue(url):
    """Enqueue ``url`` by pushing it onto the left end of the ``URLQ`` list."""
    redis.lpush(URLQ, url)
def parse_next_link(content):
    """Return the URL of the first rel="next" link found in ``content``.

    Args:
        content: Page markup, either as text or as raw bytes (for example
            the ``content`` attribute of a requests response).

    Returns:
        The href captured by the first match of ``rel_next_regex``, or
        ``None`` when ``content`` is empty or contains no such link.
    """
    if not content:
        return None
    if isinstance(content, bytes):
        # rel_next_regex is a text pattern; on Python 3 matching it against
        # bytes raises TypeError, so decode first. Undecodable bytes are
        # replaced rather than raised on: only the href matters here.
        content = content.decode('utf-8', 'replace')
    # search() stops at the first match instead of collecting all of them.
    match = rel_next_regex.search(content)
    return match.group('url') if match else None
def get_content_length(url):
    """Return the total body size, in bytes, of ``url`` and its "next" pages.

    Fetches ``url`` with ``requests.get``. On a 200 response the length of
    the body is added to the total and the link found by
    ``parse_next_link`` is fetched in turn. The walk stops when a page has
    no next link, a request raises, a non-200 status is returned, or a link
    points back at a page that was already visited.

    Args:
        url: URL of the first page to fetch.

    Returns:
        int: Sum of ``len(response.content)`` over every page fetched with
        status 200; ``0`` if the first request fails or is not a 200.
    """
    total = 0
    visited = set()
    page_url = url
    # Iterate instead of recursing so that a long or circular chain of
    # rel="next" links cannot exhaust the recursion limit.
    while page_url and page_url not in visited:
        visited.add(page_url)
        # Lazy %-formatting also copes with bytes URLs popped from Redis.
        logging.info('getting content length for %s', page_url)
        try:
            response = requests.get(page_url)
        except Exception:
            # Best effort: a failed fetch ends the chain without killing the
            # worker, but the traceback is logged instead of being dropped.
            # BaseException (KeyboardInterrupt, GreenletExit) still propagates.
            logging.exception('request failed for %s', page_url)
            break
        if response.status_code != 200:
            logging.warning(
                '%s returned status code %s', page_url, response.status_code)
            break
        total += len(response.content)
        # Pass decoded text: the next-link regex is a text pattern.
        page_url = parse_next_link(response.text)
    logging.info('%s content length: %s', url, total)
    return total
def get_and_store_content_length(url):
    """Measure ``url`` and record the result in Redis.

    Stores the value returned by ``get_content_length(url)`` in the
    ``URL_CONTENT_LENGTH_HASH_KEY`` hash under the field ``url``.

    Returns:
        Whatever ``redis.hset`` returns for the write.
    """
    length = get_content_length(url)
    return redis.hset(URL_CONTENT_LENGTH_HASH_KEY, url, length)
def hget_content_length(url):
    """Look up the content length previously stored for ``url``.

    Returns:
        The raw result of ``redis.hget`` for the field ``url`` in the
        ``URL_CONTENT_LENGTH_HASH_KEY`` hash.
    """
    stored = redis.hget(URL_CONTENT_LENGTH_HASH_KEY, url)
    return stored
def scheduler(pool_size=10):
    """Drain the URL queue, processing each URL on a bounded greenlet pool.

    Pops entries from the right end of the ``URLQ`` Redis list and hands
    each one to ``get_and_store_content_length`` via a ``Pool``. Once
    ``rpop`` returns ``None``, waits for the spawned greenlets to finish
    and returns.

    Args:
        pool_size: Size passed to ``Pool``; defaults to 10, the value that
            used to be hard-coded.
    """
    pool = Pool(pool_size)
    # None -- not merely any falsy value -- is the "queue is empty" sentinel,
    # so an empty-string entry no longer stops the drain early.
    for url in iter(lambda: redis.rpop(URLQ), None):
        if not url:
            # An empty entry is not a usable URL; skip it and keep draining.
            logging.warning('skipping empty entry popped from %s', URLQ)
            continue
        pool.spawn(get_and_store_content_length, url)
    pool.join()
def main():
    """Configure INFO-level logging, then run ``scheduler`` to completion."""
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    # Run the scheduler in its own greenlet and block until it has finished.
    spawn(scheduler).join()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment