Created
August 5, 2014 05:47
-
-
Save masroore/36da7c02ce8c1d5b79b1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycurl | |
import urllib | |
import copy | |
import re | |
import signal | |
import os | |
import random | |
from urlparse import urlsplit | |
from libpy.html import detect_encoding | |
# TODO: | |
# fetching the binary content even with unicode=True fails - PIL couldn't load png file | |
# fetched with Grab | |
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see | |
# the libcurl tutorial for more info. | |
# Comments: http://curl.haxx.se/mail/curlpython-2005-06/0004.html | |
# Ignore SIGPIPE so that libcurl (used with pycurl.NOSIGNAL below) does
# not kill the whole process when a peer closes a connection mid-transfer.
try:
    import signal
    # The from-import also acts as a platform guard: on systems where
    # SIGPIPE does not exist it presumably raises ImportError and the
    # whole setup is skipped — TODO confirm on Windows.
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass
except ValueError:
    # signal.signal() raises ValueError when called outside the main
    # thread (observed on python 2.5); best-effort, so just ignore it.
    pass
class Error(pycurl.error):
    """Used to indicate a network error. The same as pycurl.error."""
class SiteError(Error):
    """
    Used to indicate an error of the remote resource itself.

    Useful, for example, when we query a server whose name cannot
    be resolved.
    """
def get(url, config=None, soup=False):
    """Fetch *url* with a throw-away Grab instance.

    `config`, when given, is a dict of extra Grab options.  Returns the
    response body, or a BeautifulSoup of it when `soup` is true.
    """
    grab = Grab()
    grab.setup('url', url)
    if config:
        grab.setup(config)
    grab.run()
    return grab.soup if soup else grab.body
class Grab: | |
"""Fancy wrapper for pycurl library""" | |
def __init__(self): | |
self.timeout = 20 | |
self.logFile = None | |
self.config = {} | |
self._bodyCallbacks = [] | |
self.debug = False | |
self.lastError = None | |
self.freshPostData = False | |
self.cookies_map = {} | |
self.oldUrl = None | |
self.debug = False | |
self.auto_cookies = False | |
self.generate_client_profile() | |
self.head = '' | |
self.body = '' | |
self.headers = {} | |
self.cookies = {} | |
self.unicode = True | |
self.encoding = None | |
self.use_tidy = False | |
self.out_headers = None | |
self.max_redirects = 5 | |
def generate_client_profile(self): | |
self.default_headers = { | |
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', | |
'Accept-Language': 'ru,en-us;q=0.%(x)d,en;q=0.3;%(lang)s' % {'x': random.randint(5, 9), | |
'lang': random.choice(['ua', 'gb', 'uk'])}, | |
#'Accept-Encoding': 'gzip,compress;q=0.%(x)d,deflate;q=0' % {'x': random.randint(5, 9)}, | |
'Accept-Charset': 'utf-8,windows-1251;q=0.%(x)d,*;q=0.%(x)d' % {'x': random.randint(5, 9)} | |
} | |
#print self.default_headers | |
self.default_user_agent = random.choice(useragents) | |
#print self.default_user_agent | |
def _bodyCallback(self, data): | |
"""Used to process anser body""" | |
if self.nobody: | |
return 0 | |
else: | |
self.body = self.body + data | |
if self.maxsize: | |
if len(self.body) > self.maxsize: | |
return 0 | |
if self._bodyCallbacks: | |
for callback in self._bodyCallbacks: | |
if not callback(data): | |
return 0 | |
return len(data) | |
def _headCallback(self, data): | |
"""Used to process answer headers""" | |
if self.nohead: | |
return 0 | |
else: | |
self.head = self.head + data | |
return len(data) | |
def _debug_callback(self, type, data): | |
if type == 2: # pycurl.CURLINFO_HEADER_OUT: WTF?? pycurl.HEADER_OUT is invalid | |
self.out_headers = data | |
def request(self): | |
"""Run prepared curl request""" | |
self.curl.perform() | |
self.curl.close() | |
def setup(self, name, value = None): | |
""" | |
Configure curl request. Arguments variants: | |
1. name - option name, value - option value | |
2. name is dictionary, value is None | |
""" | |
if isinstance(name, dict): | |
for key, value in name.items(): | |
self.setup(key, value) | |
else: | |
if 'post' == name: | |
self.freshPostData = True | |
self.config[name] = value | |
def _changeState(self, name, value): | |
""" | |
Configure internal pycurl instance before request | |
""" | |
if isinstance(name, int): | |
self.curl.setopt(name, value) | |
# TODO: is it possible that dict passed to changeState? | |
elif isinstance(name, dict): | |
for key in name: | |
self.setup(key, name[key]) | |
if 'post' == name: | |
if value: | |
self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(value)) | |
else: | |
self.curl.setopt(pycurl.HTTPGET,1) | |
elif 'logfile' == name: | |
self.logFile = value | |
elif 'url' == name: | |
self.curl.setopt(pycurl.URL, str(value)) | |
elif 'proxy' == name: | |
if value: | |
proxy = value | |
else: | |
proxy = '' | |
self.curl.setopt(pycurl.PROXY, proxy) | |
elif 'timeout' == name: | |
self.curl.setopt(pycurl.TIMEOUT, value) | |
elif 'connect_timeout' == name: | |
self.curl.setopt(pycurl.CONNECTTIMEOUT, value) | |
elif 'referer' == name: | |
self.curl.setopt(pycurl.REFERER, str(value)) | |
elif 'cookies' == name: | |
for name, value in value.items(): | |
self.register_cookie(name, value) | |
elif 'autocookies' == name: | |
pass | |
elif 'nobody' == name: | |
if True == value: | |
self.nobody = True | |
elif 'nohead' == name: | |
if True == value: | |
self.nohead = True | |
elif 'maxsize' == name: | |
self.maxsize = value | |
elif 'redirect' == name: | |
self.curl.setopt(pycurl.FOLLOWLOCATION, value) | |
elif 'max_redirects' == name: | |
self.curl.setopt(pycurl.MAXREDIRS, value) | |
elif 'userpwd' == name: | |
self.curl.setopt(pycurl.USERPWD, value) | |
elif 'bodyCallback' == name: | |
if isinstance(name, (list, tuple)): | |
self._bodyCallbacks = value | |
else: | |
self._bodyCallbacks.append(value) | |
elif 'user_agent' == name: | |
self.curl.setopt(pycurl.USERAGENT, value) | |
elif 'headers' == name: | |
self.curl.setopt(pycurl.HTTPHEADER, ['%s: %s' % (a, b) for a, b in value.iteritems()]) | |
elif 'autoreferer' == name: | |
if not 'referer' in self.config: | |
if not self.oldUrl is None: | |
self.curl.setopt(pycurl.REFERER, str(self.oldUrl)) | |
elif 'unicode' == name: | |
self.unicode = bool(value) | |
elif 'use_tidy' == name: | |
self.use_tidy = bool(value) | |
elif 'gzip' == name: | |
self.gzip = value | |
elif 'debug' == name: | |
self.curl.setopt(pycurl.VERBOSE, value) | |
else: | |
raise Exception, "unknown option: %s" % name | |
def _prepare(self): | |
"""Prepare for request""" | |
self.curl = pycurl.Curl() | |
self.curl.setopt(pycurl.SSL_VERIFYPEER, 0) | |
self.curl.setopt(pycurl.SSL_VERIFYHOST, 0) | |
self.curl.setopt(pycurl.FOLLOWLOCATION, 1) | |
self.curl.setopt(pycurl.TIMEOUT, self.timeout) | |
self.curl.setopt(pycurl.CONNECTTIMEOUT, self.timeout) | |
self.curl.setopt(pycurl.MAXREDIRS, self.max_redirects) | |
self.curl.setopt(pycurl.NOSIGNAL, 1) | |
self.curl.setopt(pycurl.WRITEFUNCTION, self._bodyCallback) | |
self.curl.setopt(pycurl.HEADERFUNCTION, self._headCallback) | |
self.curl.setopt(pycurl.DEBUGFUNCTION, self._debug_callback) | |
#self.curl.setopt(pycurl.VERBOSE, True) | |
self.head = '' | |
self.body = '' | |
self.headers = {} | |
self.cookies = {} | |
self.maxsize = 0 | |
self.nobody = False | |
self.nohead = False | |
self.lastError = ''#pycurl.CURLE_OK | |
self.encoding = None | |
if not 'user_agent' in self.config: | |
self.config['user_agent'] = self.default_user_agent | |
# Set up default headers if they do not exist | |
headers = self.config.setdefault('headers', {}) | |
for header, value in self.default_headers.iteritems(): | |
if not header in headers: | |
headers[header] = value | |
if self.config.get('gzip'): | |
if not header in headers: | |
headers['Accept-Encoding'] = 'gzip' | |
for name, value in self.config.items(): | |
self._changeState(name, value) | |
# If autocookies mode is enabled then use all registered cookies for this domain | |
# else use cookies given in setup calls (if any) | |
cookies = () | |
if self.config.get('autocookies'): | |
cookies = self.get_registered_cookies() | |
elif self.config.get('cookies'): | |
cookies = self.config['cookies'] | |
if cookies: | |
parts = [] | |
for name, value in cookies.iteritems(): | |
parts.append('%s=%s;' % (urllib.quote_plus(name), | |
urllib.quote_plus(value))) | |
self.curl.setopt(pycurl.COOKIE, ''.join(parts)) | |
# If we query new url we must reset old post and cookes information | |
# if they was not defined for new url becouse their values | |
# are still stored in the self.config | |
if self.oldUrl != self.config['url']: | |
if not self.freshPostData: | |
self.curl.setopt(pycurl.HTTPGET, 1) | |
self.freshPostData = False | |
def run(self): | |
"""Do request""" | |
self._prepare() | |
try: | |
self.curl.perform() | |
except pycurl.error, err: | |
# CURLE_WRITE_ERROR | |
# An error occurred when writing received data to a local file, or | |
# an error was returned to libcurl from a write callback. | |
# This is expected error and we should ignore it | |
if 23 == err[0]: | |
pass | |
else: | |
self._finish() | |
self.lastError = err | |
# 6 - could not resolve host | |
# 47 - too many redirects | |
# 52 - nothing was returned from the server | |
# 58 - problem with the local client certificate | |
# 59 - couldn't use specified cipher | |
# 60 - problem with the CA cert (path? access rights?) | |
if err[0] in (6, 47, 52, 58, 59, 60): | |
raise SiteError, err | |
raise Error, err | |
self._finish() | |
def _finish(self): | |
"""Process query result""" | |
self.oldUrl = self.config['url'] | |
if self.maxsize: | |
self.body = self.body[0:self.maxsize] | |
if self.logFile: | |
open(self.logFile, 'w').write( | |
self.config['url'] + '\n' + \ | |
self.curl.errstr() + '\n' + \ | |
self.head + '\n' + self.body) | |
for line in re.split('\r?\n', self.head): | |
try: | |
name, value = line.split(': ', 1) | |
if 'Set-Cookie' == name: | |
match = re.search('^([^=]+)=([^;]+)*', value) | |
if match: | |
self.cookies[match.group(1)] = match.group(2) | |
else: | |
self.headers[name] = value | |
except ValueError: | |
pass | |
for name, value in self.cookies.iteritems(): | |
self.register_cookie(name, value) | |
if self.headers.get('Content-Encoding') == 'gzip': | |
import StringIO | |
import gzip | |
gzipper = gzip.GzipFile(fileobj=StringIO.StringIO(self.body)) | |
self.body = gzipper.read() | |
if self.unicode: | |
self.decode_body() | |
if self.use_tidy: | |
if not self.unicode: | |
raise Exception('`use_tidy` options requires `unicode` option but it is off now') | |
else: | |
self.apply_tidy() | |
#self.curl.close() | |
def decode_body(self): | |
encoding = detect_encoding(self.body, headers=self.headers) | |
self.encoding = encoding | |
if encoding: | |
self.body = self.body.decode(encoding) | |
else: | |
# TODO: choose the proper way for handling case of unknown encoding | |
raise Exception('Could not determine encoding') | |
#self.body = self.body.decode('utf-8', 'ignore') | |
def apply_tidy(self): | |
print 'fuck' | |
import tidy | |
self.original_body = self.body | |
data = self.body.encode('utf-8') | |
options = dict( | |
output_xhtml=1, | |
show_body_only=0, | |
force_output=1, | |
char_encoding='utf8') | |
data = str(tidy.parseString(data, **options)) | |
self.body = data.decode('utf-8') | |
def getinfo(self,key): | |
return self.curl.getinfo(getattr(pycurl, key)) | |
def errstr(self): | |
"""get request error text""" | |
self.curl.errstr() | |
def getConfig(self, name): | |
try: | |
return self.config[name] | |
except KeyError: | |
return '' | |
def code(self): | |
return self.getinfo('RESPONSE_CODE') | |
def get_current_host(self): | |
domain = urlsplit(self.config['url'])[1] | |
host = domain.rsplit('.', 1)[-1] | |
return host | |
def register_cookie(self, name, value): | |
self.cookies_map.setdefault(self.get_current_host(), {})[name] = value | |
def get_registered_cookies(self): | |
return self.cookies_map.get(self.get_current_host(), {}) | |
@property | |
def soup(self): | |
from BeautifulSoup import BeautifulSoup | |
return BeautifulSoup(self.body) | |
# Pool of real-world browser User-Agent strings; generate_client_profile()
# picks one at random per Grab instance to look like an ordinary browser.
useragents = (
    'Mozilla/4.0 (compatible; MSIE 6.0; MSN 2.5; Windows 98)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Win32)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; Arcor 5.005; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511',
    'Mozilla/5.0 (X11; U; Linux i686; cs-CZ; rv:1.7.12) Gecko/20050929',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
    'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.7.8) Gecko/20050609 Firefox/1.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.9) Gecko/20050711 Firefox/1.0.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8) Gecko/20051107 Firefox/1.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.6) Gecko/20060808 Fedora/1.5.0.6-2.fc5 Firefox/1.5.0.6 pango-text',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7',
    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1) Gecko/20060601 Firefox/2.0 (Ubuntu-edgy)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070220 Firefox/2.0.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070221 SUSE/2.0.0.2-6.1 Firefox/2.0.0.2',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9a1) Gecko/20061204 GranParadiso/3.0a1',
    'Opera/8.0 (X11; Linux i686; U; cs)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50',
    'Mozilla/5.0 (Windows NT 5.1; U; en) Opera 8.50',
    'Opera/8.51 (Windows NT 5.1; U; en)',
    'Opera/9.0 (Windows NT 5.1; U; en)',
    'Opera/9.01 (X11; Linux i686; U; en)',
    'Opera/9.02 (Windows NT 5.1; U; en)',
    'Opera/9.10 (Windows NT 5.1; U; en)',
    'Opera/9.23 (Windows NT 5.1; U; ru)',
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment