Created
September 29, 2014 16:51
-
-
Save Hackforid/7b8f2910ceb041ff8202 to your computer and use it in GitHub Desktop.
fetch proxies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import time | |
import requests | |
import gevent | |
from gevent.pool import Pool | |
from pyquery import PyQuery as pq | |
from gevent import monkey | |
# Replace the standard socket module with gevent's cooperative version.
# Presumably this is what lets the HTTP requests issued from greenlets below
# overlap instead of running one after another.
monkey.patch_socket()

# Public IP of this machine as reported by get_ip_by_baidu(); set by main().
local_ip = ""

# "ip:port" strings that have already been queued for validation.
proxy_list = list()
def get_ip_by_baidu(proxies=None):
    """Ask Baidu's mobile search page which IP address it sees for us.

    The page for the query "ip" is fetched (optionally through ``proxies``)
    and the text of its first ``<span>`` element is returned.  Presumably
    that span holds the caller's public IP address -- the code relies on the
    page layout and does not validate the text.

    :param proxies: optional ``requests``-style proxies mapping, e.g.
        ``{'http': 'http://1.2.3.4:8080'}``; ``None`` means a direct request.
    :returns: the text of the first ``<span>``, or ``None`` when the request
        or the parsing fails for any reason (timeout is 5 seconds).
    """
    try:
        response = requests.get(
            'http://m.baidu.com/s?word=ip&pu=sz%401321_480&wpo=fast',
            proxies=proxies, timeout=5, headers={'user-agent': 'chrome', })
        document = pq(response.text)
        return document('span').eq(0).text()
    except Exception:
        # Best-effort lookup: dead or misbehaving proxies are expected, so
        # any ordinary error simply means "no answer".  ``Exception`` is used
        # instead of a bare ``except`` so that KeyboardInterrupt, SystemExit
        # and gevent's GreenletExit/Timeout still propagate.
        return None
def get_local_ip():
    """Look up this machine's public IP without a proxy and print it.

    :returns: whatever ``get_ip_by_baidu()`` returns for a direct request
        (``None`` if the lookup failed).
    """
    ip = get_ip_by_baidu()
    # A single-argument print behaves the same on Python 2 and Python 3.
    # The two spaces reproduce the output of the original
    # ``print 'local ip = ', ip`` statement.
    print('local ip =  %s' % (ip,))
    return ip
def valid_proxy(ip, port):
    """Check whether the proxy at ``ip:port`` works and hides our address.

    The IP lookup is repeated through the proxy and timed.  The proxy counts
    as valid when the lookup yields a non-empty answer that differs from the
    module-level ``local_ip``.

    :param ip: proxy host.
    :param port: proxy port.
    :returns: ``(ip, port, seconds)`` for a valid proxy, where ``seconds`` is
        the wall-clock duration of the lookup, or ``(ip, port, None)``
        otherwise.  ``ip`` and ``port`` are always the proxy's own address as
        passed in, never the address observed by the remote site.
    """
    # NOTE(review): the 'https' entry uses an https:// proxy URL, whereas
    # plain HTTP proxies are usually addressed with http:// for both schemes.
    # Only an http:// page is requested by the lookup, so this entry is not
    # exercised here -- confirm before relying on it.
    proxies = {
        'http': 'http://%s:%s' % (ip, port),
        'https': 'https://%s:%s' % (ip, port),
    }

    time_start = time.time()
    # Keep the observed address in its own name so the proxy address given by
    # the caller is what gets returned.
    seen_ip = get_ip_by_baidu(proxies)
    time_cost = time.time() - time_start

    # A missing or empty answer means the lookup failed; an answer equal to
    # our own address means the proxy does not hide it.
    if seen_ip and seen_ip != local_ip:
        return ip, port, time_cost
    return ip, port, None
def valid_proxies(ip_list):
    """Validate candidate proxies concurrently and print the working ones.

    Each candidate not already recorded in the module-level ``proxy_list`` is
    recorded there and checked with ``valid_proxy`` inside a gevent pool of
    up to 1024 greenlets.  Every proxy that passes is printed as
    ``ip:port seconds``.

    :param ip_list: iterable of candidates, each either a mapping with
        ``'ip'`` and ``'port'`` keys or an ``(ip, port)`` pair.
    :returns: list of ``{'ip': ..., 'port': ..., 'time': ...}`` dicts for the
        proxies that passed, in the order they were submitted.
    """
    pool = Pool(1024)
    jobs = []
    for entry in ip_list:
        # The fetchers in this module produce both dicts and plain tuples.
        if isinstance(entry, dict):
            host, port = entry.get('ip'), entry.get('port')
        else:
            host, port = entry
        address = '%s:%s' % (host, port)
        if address in proxy_list:
            continue  # already queued earlier
        proxy_list.append(address)
        jobs.append(pool.spawn(valid_proxy, host, port))

    valid_proxy_list = []
    for job in jobs:
        # get() blocks until the greenlet has finished.
        ip, port, time_cost = job.get()
        # valid_proxy signals failure with None; compare explicitly so a
        # measured duration is never mistaken for a failure.
        if time_cost is not None:
            valid_proxy_list.append(
                {'ip': ip, 'port': port, 'time': time_cost})

    for proxy_ip in valid_proxy_list:
        print('%s:%s %s' % (
            proxy_ip.get('ip'), proxy_ip.get('port'), proxy_ip.get('time')))
    return valid_proxy_list
def fetch_kuaidaili():
    """Scrape proxy candidates from pages 1-10 of kuaidaili.com's proxy list.

    For every table row found under a ``<tbody>``, the text of the first two
    cells is taken; presumably these are the proxy's IP and port.

    :returns: list of ``(first_cell_text, second_cell_text)`` tuples.
    """
    collected = []
    for page in range(1, 11):
        document = pq(url="http://www.kuaidaili.com/proxylist/%s" % page)
        for row in document('tbody').find('tr').items():
            cells = row('td')
            collected.append((cells.eq(0).text(), cells.eq(1).text()))
    return collected
def fetch_cn_proxy_com():
    """Scrape proxy candidates from the cn-proxy.com front page.

    For every table row found under a ``<tbody>``, the text of the first two
    cells is taken; presumably these are the proxy's IP and port.

    :returns: list of ``{'ip': ..., 'port': ...}`` dicts, one per row.
    """
    document = pq(url="http://cn-proxy.com/")
    candidates = []
    for row in document('tbody').find('tr').items():
        cells = row('td')
        candidates.append(
            {'ip': cells.eq(0).text(), 'port': cells.eq(1).text()})
    return candidates
def start():
    """Run every fetcher in its own greenlet and validate the results."""
    # NOTE(review): the same fetcher is listed twice.  Its second result only
    # adds duplicates, which valid_proxies skips via proxy_list.
    fetch_methods = [fetch_cn_proxy_com, fetch_cn_proxy_com]

    greenlets = []
    for method in fetch_methods:
        greenlets.append(gevent.spawn(method))

    # get() blocks until each fetcher is done; results are concatenated in
    # the order the fetchers were listed.
    ip_list = [entry for job in greenlets for entry in job.get()]
    valid_proxies(ip_list)
def main():
    """Record this machine's public IP, then fetch and validate proxies."""
    # local_ip is read by valid_proxy to tell whether a proxy hides our
    # address, so it has to be set before start() runs.
    global local_ip
    local_ip = get_local_ip()
    start()


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment