Skip to content

Instantly share code, notes, and snippets.

@jgmize
Created February 26, 2013 21:35
Show Gist options
  • Save jgmize/5042441 to your computer and use it in GitHub Desktop.
Save jgmize/5042441 to your computer and use it in GitHub Desktop.
get URL content lengths using requests, a gevent process pool, and redis
#!/usr/bin/env python
# Crawl URLs from a Redis queue and record each URL's content length,
# using requests for HTTP, a gevent pool for concurrency, and Redis for storage.
import logging
import re
from gevent import monkey, spawn
from gevent.pool import Pool
# Patch stdlib for cooperative I/O *before* importing redis/requests so their
# sockets become gevent-aware.
monkey.patch_all()
from redis import Redis
import requests
# Connection kwargs for the Redis client (empty = localhost defaults).
REDIS_OPTIONS = {}
# Redis list used as the work queue of URLs to fetch.
URLQ = 'urlq'
# Redis hash mapping url -> computed content length.
URL_CONTENT_LENGTH_HASH_KEY = 'url_content_length'
# Shared module-level client; NOTE(review): shadows the `redis` package name.
redis = Redis(**REDIS_OPTIONS)
# Captures the href of an anchor tag written exactly as <a rel="next" ... href="...">.
rel_next_regex = re.compile(r'<a rel="next".+?href="(?P<url>.+?)"')
def add_url_to_queue(url):
    """Enqueue *url* for processing by pushing it onto the Redis work list."""
    redis.lpush(URLQ, url)
def parse_next_link(content):
    """Scan *content* for a rel="next" anchor and return its URL (None if absent)."""
    candidates = rel_next_regex.findall(content)
    return candidates[0] if candidates else None
def get_content_length(url):
    """Fetch *url* and return its body size in bytes.

    Follows rel="next" pagination links recursively, summing the lengths of
    all pages. Returns 0 when the request fails or the status is not 200.
    """
    logging.info('getting content length for %s', url)
    content_length = 0
    try:
        response = requests.get(url)
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP-layer failures are
        # expected here, and the traceback is now logged instead of dropped.
        logging.exception('request failed for %s', url)
    else:
        if response.status_code == 200:
            content_length = len(response.content)
            # Parse the decoded text, not response.content: the rel="next"
            # regex is a str pattern and would raise on bytes (Python 3).
            next_link = parse_next_link(response.text)
            if next_link:
                content_length += get_content_length(next_link)
        else:
            logging.info('%s returned status %s', url, response.status_code)
    logging.info('%s content length: %s', url, content_length)
    return content_length
def get_and_store_content_length(url):
    """Compute *url*'s content length and persist it in the Redis hash.

    Returns the redis-py hset result.
    """
    length = get_content_length(url)
    return redis.hset(URL_CONTENT_LENGTH_HASH_KEY, url, length)
def hget_content_length(url):
    """Return the previously stored content length for *url* from Redis."""
    return redis.hget(URL_CONTENT_LENGTH_HASH_KEY, url)
def scheduler():
    """Drain the URL queue, fetching each URL on a pool of 10 greenlets.

    Pops URLs until the queue is empty, then waits for all in-flight
    fetches to finish before returning.
    """
    pool = Pool(10)
    while True:
        url = redis.rpop(URLQ)
        if url:
            # redis-py returns bytes unless decode_responses=True; decode so
            # downstream string handling (logging, regexes) gets str.
            if isinstance(url, bytes):
                url = url.decode()
            pool.spawn(get_and_store_content_length, url)
        else:
            # Queue is empty: let the outstanding greenlets finish, then exit.
            pool.join()
            break
def main():
    """Configure INFO-level logging and run the scheduler to completion."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s : %(levelname)s : %(message)s')
    spawn(scheduler).join()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment