Skip to content

Instantly share code, notes, and snippets.

@LaoLiulaoliu
Last active August 29, 2015 14:06
Show Gist options
  • Save LaoLiulaoliu/51986c3a28d244a3567a to your computer and use it in GitHub Desktop.
Save LaoLiulaoliu/51986c3a28d244a3567a to your computer and use it in GitHub Desktop.
Abstract the behavior of downloading an HTML page
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Yuande Liu <miracle (at) gmail.com>
import socket
import time
import requests
import lxml.html
def wrapper(url, timeout=20, retries=5, sleep=10, encoding='gbk', text=False, errurl=None, errfunc=None):
    """Fetch ``url`` through ``get_html_tree`` using this module's preferred defaults.

    Every argument is forwarded unchanged, by keyword, to ``get_html_tree``;
    only the default values differ (20 s timeout, 5 attempts, 10 s pause
    between attempts, ``gbk`` decoding, parsed tree instead of text).

    Returns whatever ``get_html_tree`` returns for the same arguments.
    """
    return get_html_tree(
        url,
        timeout=timeout,
        retries=retries,
        sleep=sleep,
        encoding=encoding,
        text=text,
        errurl=errurl,
        errfunc=errfunc,
    )
def get_html_tree(url, timeout=None, retries=1, sleep=1, encoding=None, text=None, errurl=None, errfunc=None):
    """Download ``url`` with retries and return its parsed HTML tree or its body.

    The page is requested up to ``retries`` times.  An attempt is repeated
    when it raises a timeout or socket error, answers with HTTP 404, or ends
    up on a URL that starts with ``errurl``.

    Args:
        url: Address passed to ``requests.get``.
        timeout: Timeout forwarded to ``requests.get``.
        retries: Maximum number of attempts; must be at least 1.
        sleep: Seconds to wait after a 404 or an ``errurl`` hit before the
            next attempt.  Attempts that raised are retried without waiting.
        encoding: Codec used to decode the body.  When falsy the body is
            left exactly as ``Response.content`` delivered it.
        text: When truthy, return the body itself instead of a parsed tree.
        errurl: URL prefix marking an error page; a response whose final URL
            starts with it counts as a failed attempt.
        errfunc: Optional zero-argument callable invoked on each ``errurl``
            hit, before sleeping.

    Returns:
        The body (decoded when ``encoding`` is given) if ``text`` is truthy,
        otherwise ``lxml.html.fromstring`` applied to that body.  When the
        attempts run out on 404 / ``errurl`` responses, the most recent
        response received is still processed, as it always was.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        socket.error, requests.exceptions.Timeout: The last network error,
            when no attempt produced any response at all.
    """
    # Without at least one attempt there is nothing to return; fail clearly
    # instead of tripping over an unbound local further down.
    if retries < 1:
        raise ValueError('retries must be at least 1, got %r' % (retries,))

    response = None
    last_error = None
    for _ in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
        except (socket.timeout, requests.exceptions.Timeout, socket.error) as exc:
            # socket.error covers e.g. "[Errno 104] Connection reset by peer".
            # Keep the exception under a second name: Python 3 unbinds ``exc``
            # when the except clause ends.
            last_error = exc
            continue

        if response.status_code == 404:
            time.sleep(sleep)
            continue
        if errurl and response.url.startswith(errurl):
            if errfunc:
                errfunc()
            time.sleep(sleep)
            continue
        break  # seems everything is fine

    if response is None:
        # Every attempt raised, so surface the real cause to the caller.
        raise last_error

    content = response.content
    # ``bytes`` is an alias of ``str`` on Python 2, so this check works on both
    # major versions (the former ``unicode`` test was a NameError on Python 3).
    if encoding and isinstance(content, bytes):
        try:
            content = content.decode(encoding)
        except UnicodeDecodeError:
            # Fall back to dropping the undecodable bytes rather than failing.
            content = content.decode(encoding, 'ignore')

    if text:
        return content
    return lxml.html.fromstring(content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment