Skip to content

Instantly share code, notes, and snippets.

@Hackforid
Created September 29, 2014 16:51
Show Gist options
  • Save Hackforid/7b8f2910ceb041ff8202 to your computer and use it in GitHub Desktop.
Save Hackforid/7b8f2910ceb041ff8202 to your computer and use it in GitHub Desktop.
fetch proxies
# -*- coding:utf-8 -*-
import time
import requests
import gevent
from gevent.pool import Pool
from pyquery import PyQuery as pq
from gevent import monkey
# Swap the standard socket module for gevent's cooperative version so that
# blocking network calls yield to other greenlets instead of stalling them.
# NOTE(review): `requests` is imported before this patch runs; gevent's docs
# recommend patching as early as possible - confirm the ordering is fine.
monkey.patch_socket()
# Public IP of this machine as reported by the lookup page; assigned in main().
local_ip = ""
# "ip:port" strings of every proxy already queued for validation; used by
# valid_proxies() to skip duplicates.
proxy_list = []
def get_ip_by_baidu(proxies=None):
    """Return the client IP that Baidu's mobile search page reports.

    The result page for the query "ip" is requested, optionally through
    *proxies*, and the text of its first ``<span>`` element is returned
    (presumably that element holds the caller's IP - verify against the
    live page markup).

    Args:
        proxies: Optional ``requests``-style proxies mapping
            (scheme -> proxy URL). ``None`` means a direct connection.

    Returns:
        The non-empty text of the first ``<span>``, or ``None`` when the
        request fails, the page cannot be parsed, or no text is found.
    """
    try:
        response = requests.get(
            'http://m.baidu.com/s?word=ip&pu=sz%401321_480&wpo=fast',
            proxies=proxies, timeout=5, headers={'user-agent': 'chrome'})
        document = pq(response.text)
        ip = document('span').eq(0).text()
    except Exception:
        # Best-effort probe: dead or misbehaving proxies are expected, so any
        # ordinary failure simply means "no answer".  Catching ``Exception``
        # instead of using a bare ``except`` lets KeyboardInterrupt,
        # SystemExit and gevent's GreenletExit/Timeout propagate, so the run
        # can still be interrupted or cancelled.
        return None
    # A page without the expected element yields empty text; report that as a
    # failed lookup rather than as an "IP" that differs from the local one.
    return ip or None
def get_local_ip():
    """Look up, print and return this machine's IP without using a proxy.

    Returns:
        Whatever ``get_ip_by_baidu()`` returns for a direct connection
        (``None`` if the lookup failed).
    """
    ip = get_ip_by_baidu()
    # One pre-formatted argument in parentheses prints the same text under
    # Python 2's print statement and Python 3's print function (the original
    # ``print 'local ip = ', ip`` is a SyntaxError on Python 3).  The two
    # spaces reproduce the literal's trailing space plus print's separator.
    print('local ip =  %s' % (ip,))
    return ip
def valid_proxy(ip, port):
    """Probe a single proxy and time the lookup made through it.

    The proxy counts as working when the IP lookup performed through it
    returns a value and that value differs from the module-level
    ``local_ip`` (i.e. the remote page did not see this machine's address).

    Args:
        ip: Host/address of the proxy.
        port: Port of the proxy.

    Returns:
        A ``(ip, port, time_cost)`` tuple identifying the probed proxy.
        ``time_cost`` is the elapsed lookup time in seconds when the proxy
        works and ``None`` otherwise.
    """
    # NOTE(review): the 'https' entry uses an https:// proxy URL, which asks
    # requests to speak TLS to the proxy itself; only the 'http' entry is
    # exercised by the plain-http lookup below - confirm this is intended.
    proxies = {
        'http': 'http://%s:%s' % (ip, port),
        'https': 'https://%s:%s' % (ip, port),
    }
    time_start = time.time()
    # Kept in its own name: the original reused ``ip`` here, so the returned
    # tuple carried the reported exit IP (or None) instead of the proxy's
    # address, making the printed "ip:port" unusable whenever they differ.
    reported_ip = get_ip_by_baidu(proxies)
    time_cost = time.time() - time_start
    if reported_ip is not None and reported_ip != local_ip:
        return ip, port, time_cost
    return ip, port, None
def valid_proxies(ip_list):
    """Validate proxy candidates concurrently and print the working ones.

    Each candidate not yet recorded in the module-level ``proxy_list`` is
    recorded there and checked with ``valid_proxy`` in a gevent pool.  Every
    proxy that comes back with a truthy elapsed time is printed as
    ``"ip:port time"``.

    Args:
        ip_list: Iterable of candidates.  Each one is either a mapping with
            ``'ip'`` and ``'port'`` keys (as produced by
            ``fetch_cn_proxy_com``) or an ``(ip, port)`` pair (as produced by
            ``fetch_kuaidaili``).

    Returns:
        None.
    """
    pool = Pool(1024)
    jobs = []
    for entry in ip_list:
        # The fetchers in this file disagree on the entry shape; the original
        # called .get() unconditionally and raised AttributeError on tuples.
        if hasattr(entry, 'get'):
            host, port = entry.get('ip'), entry.get('port')
        else:
            host, port = entry
        address = '%s:%s' % (host, port)
        if address in proxy_list:
            # Already queued, in this call or an earlier one.
            continue
        # proxy_list is only mutated in place, so no ``global`` is needed.
        proxy_list.append(address)
        jobs.append(pool.spawn(valid_proxy, host, port))

    valid_proxy_list = []
    for job in jobs:
        # get() blocks until the greenlet finishes and returns its result.
        host, port, time_cost = job.get()
        if time_cost:
            valid_proxy_list.append(
                {'ip': host, 'port': port, 'time': time_cost})

    for proxy in valid_proxy_list:
        # Parenthesised single argument: identical output on Python 2 and 3.
        print('%s:%s %s' % (
            proxy.get('ip'), proxy.get('port'), proxy.get('time')))
def fetch_kuaidaili():
    """Scrape proxy candidates from pages 1-10 of kuaidaili.com's proxy list.

    Returns:
        A list of ``(ip, port)`` tuples built from the text of the first two
        cells of every table-body row.  Note these are tuples, not the
        ``{'ip': ..., 'port': ...}`` dicts returned by ``fetch_cn_proxy_com``.
    """
    candidates = []
    for page_no in range(1, 11):
        document = pq(url="http://www.kuaidaili.com/proxylist/%s" % page_no)
        for row in document('tbody').find('tr').items():
            cells = row('td')
            candidates.append((cells.eq(0).text(), cells.eq(1).text()))
    return candidates
def fetch_cn_proxy_com():
    """Scrape proxy candidates from the cn-proxy.com front page.

    Returns:
        A list of ``{'ip': ..., 'port': ...}`` dicts, one per table-body row,
        taken from the text of the row's first and second cells.
    """
    document = pq(url="http://cn-proxy.com/")
    candidates = []
    for row in document('tbody').find('tr').items():
        cells = row('td')
        candidates.append(
            {'ip': cells.eq(0).text(), 'port': cells.eq(1).text()})
    return candidates
def start():
    """Fetch proxy candidates from every source concurrently, then validate them."""
    # NOTE(review): the same fetcher is listed twice, so that site is
    # downloaded twice (the repeated entries are skipped later by
    # valid_proxies); presumably fetch_kuaidaili was meant to be one of the
    # two - confirm.
    sources = [fetch_cn_proxy_com, fetch_cn_proxy_com]
    jobs = [gevent.spawn(source) for source in sources]
    # get() waits for each greenlet in spawn order and re-raises a fetch error.
    candidates = [entry for job in jobs for entry in job.get()]
    valid_proxies(candidates)
def main():
    """Record this machine's IP in the module-level ``local_ip`` and run the scan."""
    global local_ip
    # Must be set before start(): valid_proxy compares every proxied lookup
    # against this value.
    detected_ip = get_local_ip()
    local_ip = detected_ip
    start()
# Run the proxy scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment