-
-
Save mehmetkose/46f28b4e0ed05db9c08b to your computer and use it in GitHub Desktop.
Gist by paste.py @ 2015-07-04 09:11:16.505897
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- encoding: utf-8 -*- | |
from tornado.ioloop import IOLoop, PeriodicCallback | |
from tornado.web import RequestHandler, Application, url | |
from tornado.httpserver import HTTPServer | |
from tornado.httpclient import AsyncHTTPClient, HTTPError | |
from tornado.gen import coroutine, Return, WaitIterator | |
from tornado.netutil import bind_sockets, bind_unix_socket | |
from tornado.process import fork_processes | |
from json import loads, dumps | |
from urllib import urlencode | |
from sys import exc_info | |
from datetime import datetime | |
from rfc822 import parsedate_tz, mktime_tz | |
from random import choice | |
# from xml.dom import minidom | |
from lxml import etree | |
from pytz import utc | |
import json | |
import geoip | |
import logging | |
import time | |
import re | |
import sys | |
# --- Interpreter / library setup -------------------------------------------
# Switch the interpreter-wide default string encoding to UTF-8, then remove
# the setter again so nothing else can change it afterwards.
reload(sys)
sys.setdefaultencoding('utf-8')
del sys.setdefaultencoding

# Use the curl-based implementation for every AsyncHTTPClient created below.
AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")

# --- Configuration ----------------------------------------------------------
# Where to listen: 'unix:<path>', '<host>:<port>' or a bare port number
# (parsed by the start-up code at the bottom of this file).
listen_addr = '9000'
# Base URL of the upstream proxy listing that gets scraped.
proxyru_base = 'http://proxy.com.ru'

# --- Shared module-level objects ---------------------------------------------
app_log = logging.getLogger('tornado.application')
# GeoIP database used to attach country/location data to every proxy.
geolite2 = geoip.open_database('geolite2.mmdb')

# Links to the individual listing pages found on the index page.
pages_re = re.compile(r'href="(/list_\d+\.html)"')
# One table row of a listing page: serial number, IP, port and proxy type.
proxies_re = re.compile(r'<tr><b><td>(?P<serial>\d+)</td><td>(?P<ip>.+?)</td><td>(?P<port>\d+)</td><td>(?P<ptype>.+?)</td><td>.+?</td></b></tr>')
# Some utilities for this project. | |
def benchmark(func):
    """Decorate *func* so that each call logs its wall-clock runtime.

    The returned wrapper forwards all positional and keyword arguments to
    *func*, returns its result unchanged and writes one INFO record with the
    elapsed time to ``app_log``.  Nothing is logged when *func* raises.

    The wrapper keeps the metadata of *func* (``__name__``, ``__doc__``, ...)
    so decorated callables remain recognisable in logs and tracebacks.
    """
    # Imported locally so this decorator does not depend on the file's
    # top-level import block.
    from functools import wraps

    @wraps(func)
    def _wrapper(*args, **kwargs):
        started = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - started
        app_log.info('benchmark result for %s: runtime = %.3f second(s)', func.__name__, elapsed)
        return result
    return _wrapper
def jsonify(plist, _filter=None, single=False):
    """Render the proxies selected by *_filter* as a JSON document.

    :param plist: proxy list object; must offer ``filter(_filter=...)`` plus
        the ``last_updated`` / ``last_tried`` timestamps.
    :param _filter: optional filter object handed through to ``plist.filter``.
    :param single: when true, report one randomly picked proxy at the top
        level instead of the full ``proxies`` array.
    :returns: the JSON text; ``error`` is 1 (with a ``reason``) when nothing
        matched and 0 otherwise.
    """
    def describe(entry):
        # Public JSON representation of one proxy.
        return {
            'id': entry.serial,
            'ip': entry.ip,
            'port': entry.port,
            'type': entry.ptype,
            'country': entry.country,
            'latitude': entry.lat,
            'longitude': entry.lon,
            'location': entry.full_name,
        }

    matched = plist.filter(_filter=_filter)
    if not matched:
        return json.dumps({
            'error': 1,
            'reason': 'No suitable proxy found.'
        })

    if single:
        payload = {'error': 0}
        payload.update(describe(choice(matched)))
    else:
        payload = {
            'proxies': [describe(entry) for entry in matched],
            'error': 0,
        }
    payload['last_modified'] = plist.last_updated.isoformat()
    payload['last_check'] = plist.last_tried.isoformat()
    return json.dumps(payload)
def add_sub_tag(parent, child_name, child_text=None):
    """Append a ``<child_name>`` element to *parent* and return it.

    When *child_text* is given (anything but ``None``) it is converted with
    ``unicode()`` and stored as the text of the new element; otherwise the
    element is left without text.
    """
    node = etree.SubElement(parent, child_name)
    if child_text is None:
        return node
    node.text = unicode(child_text)
    return node
def xmlify(plist, _filter=None, single=False):
    """Render the proxies selected by *_filter* as an XML document.

    :param plist: proxy list object; must offer ``filter(_filter=...)`` plus
        the ``last_updated`` / ``last_tried`` timestamps.
    :param _filter: optional filter object handed through to ``plist.filter``.
    :param single: when true, emit one randomly picked ``<Proxy>`` directly
        under ``<Result>``; otherwise emit all matches inside ``<Proxies>``.
    :returns: the serialized document.  ``<Error>`` is 1 (with a
        ``<Reason>``) when nothing matched and 0 otherwise.
    """
    def append_proxy(parent, proxy):
        # Add one <Proxy> element describing *proxy* below *parent*.
        node = add_sub_tag(parent, 'Proxy')
        for tag, value in (
            ('ID', proxy.serial),
            ('IP', proxy.ip),
            ('Port', proxy.port),
            ('Type', proxy.ptype),
            ('Country', proxy.country),
            ('Latitude', proxy.lat),
            ('Longitude', proxy.lon),
            ('Location', proxy.full_name),
        ):
            add_sub_tag(node, tag, value)
        return node

    proxies = plist.filter(_filter=_filter)
    if not proxies:
        return '''\
<?xml version='1.0' encoding='utf-8'?>
<Result>
<Error>1</Error>
<Reason>No suitable proxy found.</Reason>
</Result>\n'''

    root_tag = etree.Element('Result')
    add_sub_tag(root_tag, 'Error', 0)
    add_sub_tag(root_tag, 'LastModified', plist.last_updated.isoformat())
    # LastCheck reports the time of the last refresh attempt (last_tried),
    # matching the 'last_check' field of the JSON output; it used to repeat
    # the LastModified value.
    add_sub_tag(root_tag, 'LastCheck', plist.last_tried.isoformat())

    if single:
        append_proxy(root_tag, choice(proxies))
    else:
        proxies_node = add_sub_tag(root_tag, 'Proxies')
        for proxy in proxies:
            append_proxy(proxies_node, proxy)
    return etree.tostring(root_tag, pretty_print=True, encoding='utf-8', xml_declaration=True)
class Proxy(object):
    """One proxy server from the listing, enriched with GeoIP information.

    Attributes set by the constructor: ``serial``, ``ip``, ``port``,
    ``ptype`` (one of the ``PROXY_*`` constants) and the geo fields
    ``country``, ``lat``, ``lon``, ``tz``, ``city``, ``subdiv`` and
    ``country_name``.  Every geo field is ``None`` when the GeoIP database
    has no answer for it.
    """

    PROXY_FULL_ANONYMOUS = 0
    PROXY_ANONYMOUS = 1
    PROXY_TRANSPARENT = 2

    # Type label as it appears in the listing -> PROXY_* constant.
    proxy_type = {
        u'透明代理': PROXY_TRANSPARENT,
        u'匿名代理': PROXY_ANONYMOUS,
        u'高度匿名': PROXY_FULL_ANONYMOUS,
    }

    # Readable name for each PROXY_* constant, indexed BY that constant.
    # The order therefore has to follow the constant values 0, 1, 2.
    proxy_name = [
        'full_anonymous',
        'anonymous',
        'transparent',
    ]

    def __init__(self, serial, ip, port, ptype):
        """Build a proxy entry from the raw fields of one listing row.

        :param serial: serial number of the row (converted with ``int``).
        :param ip: IP address, also used for the GeoIP lookup.
        :param port: port number (converted with ``int``).
        :param ptype: type label as a gb2312-encoded byte string.
        :raises KeyError: if the decoded *ptype* is not in ``proxy_type``.
        """
        self.serial = int(serial)
        self.ip = ip
        self.port = int(port)
        self.ptype = self.proxy_type[ptype.decode('gb2312')]  # listing pages are gb2312-encoded
        app_log.info('new proxy: #%d @ %s:%d type: %s', self.serial, ip, self.port, self.proxy_name[self.ptype])

        # Start with "unknown" for every geo field so that all attributes
        # exist on every instance, whatever the lookup returns.
        self.country = None
        self.lat = None
        self.lon = None
        self.tz = None
        self.city = None
        self.subdiv = None
        self.country_name = None

        geoinfo = geolite2.lookup(ip)
        if geoinfo is None:
            return

        self.country = geoinfo.country
        # The location can be missing even when the lookup itself succeeds.
        if geoinfo.location is not None:
            self.lat, self.lon = geoinfo.location
        self.tz = geoinfo.timezone

        info_dict = geoinfo.get_info_dict()
        if 'city' in info_dict:
            self.city = self._english_name(info_dict['city'])
        if 'subdivisions' in info_dict:
            names = [self._english_name(sub) for sub in info_dict['subdivisions']]
            names = [name for name in names if name]
            if names:
                self.subdiv = ' '.join(names)
        if 'country' in info_dict:
            self.country_name = self._english_name(info_dict['country'])

    @staticmethod
    def _english_name(record):
        """Return the English name stored in a GeoIP *record*, or ``None``."""
        return record.get('names', {}).get('en')

    @property
    def full_name(self):
        """Location as 'Country [Subdivision] [City]'; '' if the country is unknown."""
        if self.country_name is None:
            return ''
        ret = self.country_name
        if self.subdiv is not None:
            ret += ' ' + self.subdiv
        if self.city is not None:
            ret += ' ' + self.city
        return ret
class ProxyFilter(object):
    """Selection criteria for proxies; a criterion left at ``None`` is ignored."""

    def __init__(self, country=None, serial=None, ptype=None):
        """Store the criteria.

        :param country: country value that ``proxy.country`` must equal.
        :param serial: serial number that ``proxy.serial`` must equal.  It may
            be given as an int or as a numeric string (e.g. straight from a
            query argument).
        :param ptype: proxy type that ``proxy.ptype`` must equal.
        """
        self.country = country
        self.serial = self._normalize_serial(serial)
        self.ptype = ptype

    @staticmethod
    def _normalize_serial(serial):
        """Convert *serial* to int when possible, otherwise return it unchanged."""
        if serial is None:
            return None
        try:
            # Proxy serials are ints, so a numeric string has to be converted
            # or the comparison in filter() can never succeed.
            return int(serial)
        except (TypeError, ValueError):
            # Not a number: keep the value; it simply matches no proxy.
            return serial

    def filter(self, proxy):
        """Return True when *proxy* satisfies every criterion that is set."""
        if self.country is not None and self.country != proxy.country:
            return False
        if self.serial is not None and self.serial != proxy.serial:
            return False
        if self.ptype is not None and self.ptype != proxy.ptype:
            return False
        return True

    @property
    def always_true(self):
        """True when no criterion is set, i.e. every proxy passes."""
        return self.country is None and self.serial is None and self.ptype is None
# Background task: call ProxyList.fetch periodically to keep the data up to date.
class ProxyList(object):
    """In-memory copy of the upstream proxy listing, refreshed by ``fetch``.

    ``proxies`` holds the current entries sorted by serial number,
    ``last_updated`` the modification time of the listing they came from and
    ``last_tried`` the time of the most recent refresh attempt.  Both
    timestamps are timezone-aware (UTC) and start at the Unix epoch.
    """

    def __init__(self):
        self.proxies = []
        self.client = AsyncHTTPClient(max_clients=32)
        self.last_updated = utc.localize(datetime.utcfromtimestamp(0))
        self.last_tried = utc.localize(datetime.utcfromtimestamp(0))
        # Guards against overlapping refresh runs.
        self.running = False

    @coroutine
    def fetch(self):
        """Refresh ``proxies`` from the upstream site.

        Two steps: the index page is requested conditionally
        (If-Modified-Since ``last_updated``) to find out whether anything
        changed and which listing pages exist; then all listing pages are
        downloaded concurrently and parsed.  ``proxies`` and ``last_updated``
        are only replaced after every page was fetched successfully; on any
        HTTP error the previous data stays in place.
        """
        if self.running:
            app_log.warning('okay, another network lag. i\'ll quit right now.')
            return
        self.running = True
        try:
            app_log.info('update triggered at %s', utc.localize(datetime.utcnow()).isoformat())
            try:
                result = yield self.client.fetch(proxyru_base + '/', validate_cert=False, connect_timeout=5, request_timeout=20, if_modified_since=self.last_updated, follow_redirects=False)
            except HTTPError as e:
                if e.code == 304:
                    app_log.info('proxy list not updated.')
                    return
                if e.code == 599:
                    app_log.warning('just another timeout, hang tight!')
                    return
                app_log.critical('server responsed with unknown code %d, check your code please!', e.code)
                return
            finally:
                # Record the attempt whether or not it succeeded.
                self.last_tried = utc.localize(datetime.utcnow())

            # parsedate_tz() returns None for a malformed header; treat that
            # like a missing header instead of crashing in mktime_tz().
            header = result.headers.get('Last-Modified')
            parsed = parsedate_tz(header) if header is not None else None
            if parsed is not None:
                last_updated = utc.localize(datetime.utcfromtimestamp(mktime_tz(parsed)))
            else:
                # No usable Last-Modified header, assume just modified.
                app_log.warning('server responsed with no last-modified header, be aware of any other changes.')
                last_updated = utc.localize(datetime.utcnow())

            # The same page link can occur more than once on the index page;
            # fetch every page only once, keeping the order of appearance.
            pages = []
            for page in pages_re.findall(result.body):
                if page not in pages:
                    pages.append(page)

            # Page paths already start with '/', so they are appended to the
            # base URL directly.  These requests are unconditional: the index
            # has changed, and a 304 for a single unchanged page would
            # otherwise abort the whole refresh.
            futures = [
                self.client.fetch(proxyru_base + page, validate_cert=False, connect_timeout=5, request_timeout=20, follow_redirects=False)
                for page in pages
            ]
            result_iterator = WaitIterator(*futures)
            new_proxies = []
            while not result_iterator.done():
                try:
                    result = yield result_iterator.next()
                except HTTPError as e:
                    if e.code == 599:
                        app_log.error('damn! proxy.com.ru breaks down again during my fetch.')
                        for it in futures:
                            it.cancel()
                        return
                    app_log.critical('hey! check your code! error when fetching page %d, error code = %d', result_iterator.current_index, e.code)
                    return
                else:
                    for proxy in proxies_re.finditer(result.body):
                        new_proxies.append(Proxy(proxy.group('serial'), proxy.group('ip'), proxy.group('port'), proxy.group('ptype')))

            self.proxies = sorted(new_proxies, key=lambda proxy: proxy.serial)
            self.last_updated = last_updated
        finally:
            app_log.info('bye!')
            self.running = False

    def filter(self, _filter=None):
        """Return the list of proxies accepted by *_filter*.

        With no filter, or one whose ``always_true`` is set, the internal
        list itself is returned (not a copy).
        """
        if _filter is None or _filter.always_true:
            return self.proxies
        return [proxy for proxy in self.proxies if _filter.filter(proxy)]
# HTTP handlers go here.
class SingleHandler(RequestHandler):
    """Answer with one randomly chosen proxy that matches the request."""

    def get(self, ptype=None, ext='json'):
        """Handle GET.

        :param ptype: proxy type taken from the URL path ('transparent',
            'anonymous' or 'full-anonymous'); anything else means "any type".
        :param ext: requested format; 'xml' yields XML, everything else JSON.

        The optional query arguments ``country`` and ``serial`` narrow the
        selection further.
        """
        type_by_slug = {
            'transparent': Proxy.PROXY_TRANSPARENT,
            'anonymous': Proxy.PROXY_ANONYMOUS,
            'full-anonymous': Proxy.PROXY_FULL_ANONYMOUS,
        }
        wanted_country = self.get_query_argument('country', None)
        wanted_serial = self.get_query_argument('serial', None)
        # Unknown or absent type slugs map to None, i.e. no type restriction.
        criteria = ProxyFilter(country=wanted_country, serial=wanted_serial, ptype=type_by_slug.get(ptype))

        # JSON is the fallback for every extension other than 'xml'.
        if ext == 'xml':
            content_type, render = 'text/xml', xmlify
        else:
            content_type, render = 'application/json', jsonify
        self.set_header('Content-Type', content_type)
        self.write(render(plist, criteria, True))
class ListHandler(RequestHandler):
    """Answer with the full list of proxies that match the request."""

    def get(self, ptype=None, ext='json'):
        """Handle GET.

        :param ptype: proxy type taken from the URL path ('transparent',
            'anonymous' or 'full-anonymous'); anything else means "any type".
        :param ext: requested format; 'xml' yields XML, everything else JSON.

        The optional query arguments ``country`` and ``serial`` narrow the
        selection further.
        """
        type_by_slug = {
            'transparent': Proxy.PROXY_TRANSPARENT,
            'anonymous': Proxy.PROXY_ANONYMOUS,
            'full-anonymous': Proxy.PROXY_FULL_ANONYMOUS,
        }
        wanted_country = self.get_query_argument('country', None)
        wanted_serial = self.get_query_argument('serial', None)
        # Unknown or absent type slugs map to None, i.e. no type restriction.
        criteria = ProxyFilter(country=wanted_country, serial=wanted_serial, ptype=type_by_slug.get(ptype))

        # JSON is the fallback for every extension other than 'xml'.
        if ext == 'xml':
            content_type, render = 'text/xml', xmlify
        else:
            content_type, render = 'application/json', jsonify
        self.set_header('Content-Type', content_type)
        self.write(render(plist, criteria))
if __name__ == '__main__':
    # Start-up: configure logging, schedule the periodic refresh, bind the
    # listening socket described by listen_addr and run the IO loop.
    logging.basicConfig(level=logging.INFO)

    plist = ProxyList()
    # Refresh the list every 10100 ms.
    PeriodicCallback(plist.fetch, 10100).start()

    app = Application([
        url(r'/single(?:/(?P<ptype>.*?))?(?:\.(?P<ext>.*))?', SingleHandler),
        url(r'/(?P<ptype>.*?)?(?:\.(?P<ext>.*))?', ListHandler),
    ])

    # listen_addr is 'unix:<path>', '<host>:<port>' or a bare port number.
    if listen_addr.startswith('unix:'):
        socket = [bind_unix_socket(listen_addr[5:], mode=0o660)]
    elif ':' in listen_addr:
        addr, port = listen_addr.rsplit(':', 1)
        port = int(port)
        socket = bind_sockets(port, addr)
    else:
        addr = None
        port = int(listen_addr)
        socket = bind_sockets(port, addr)

    server = HTTPServer(app)
    server.add_sockets(socket)

    # Trigger a first refresh as soon as the loop runs instead of waiting
    # for the first periodic tick.
    io_loop = IOLoop.instance()
    io_loop.add_callback(plist.fetch)
    io_loop.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment