Skip to content

Instantly share code, notes, and snippets.

@chen206
Created March 18, 2014 06:41
Show Gist options
  • Save chen206/9614715 to your computer and use it in GitHub Desktop.
Save chen206/9614715 to your computer and use it in GitHub Desktop.
proxy spider & check
# -*- coding: utf-8 -*-
# Created on 2014-03-17
# @author: wd
import requests
import re
import logging
import socket
from datetime import datetime
def check_proxy(ip, port):
    """Return whether a TCP connection to ``ip:port`` can be opened.

    A fresh IPv4 stream socket with a 3 second timeout is used for the
    attempt and is always closed afterwards.

    Args:
        ip: Host address of the candidate proxy, as a string.
        port: TCP port of the candidate proxy, as an integer.

    Returns:
        True if the connect succeeds, False if it fails.
    """
    sk = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sk.settimeout(3.0)
    try:
        sk.connect((ip, port))
        return True
    except (socket.error, OverflowError, ValueError):
        # socket.error covers refused / timed-out / unresolvable targets,
        # OverflowError is raised for a port outside 0-65535, and ValueError
        # for host strings the socket layer cannot encode.  Anything else
        # (notably KeyboardInterrupt) is deliberately allowed to propagate.
        return False
    finally:
        sk.close()
def check_search(ip, port):
    """Return whether a Weibo search page can be fetched through the proxy.

    Requests a dated ``s.weibo.com`` URL via ``ip:port`` with a 5 second
    timeout and looks for two marker strings in the response body.

    Args:
        ip: Host address of the proxy.
        port: Port of the proxy.

    Returns:
        True if the response text contains both ``s.weibo.com`` and
        ``pl_wb_feedlist``; False otherwise, including when the request
        raises (the error is printed).
    """
    url = 'http://s.weibo.com/wb/s%s' % datetime.now().strftime('%Y-%m-%d')
    # Requests expects proxy URLs to carry an explicit scheme.
    proxy_map = {'http': 'http://%s:%s' % (ip, port)}
    try:
        body = requests.get(url, proxies=proxy_map, timeout=5.0).text
    except Exception as e:
        # Best-effort probe: a misbehaving proxy can fail in many ways, so
        # any error simply marks the proxy as unusable.
        print(e)
        return False
    return 's.weibo.com' in body and 'pl_wb_feedlist' in body
def taobao_myip(ip, port):
    """Return whether Taobao's IP service sees requests as coming from ``ip``.

    Queries ``ip.taobao.com`` for the caller's own address through the proxy
    ``ip:port`` (5 second timeout), prints the raw and decoded response, and
    compares the ``data.ip`` field of the JSON reply with ``ip``.

    Args:
        ip: Host address of the proxy.
        port: Port of the proxy.

    Returns:
        True if the reported address equals ``ip``; False if it differs, if
        the reply has no dict-valued ``data`` field, or if the request or
        JSON decoding fails.
    """
    url = "http://ip.taobao.com/service/getIpInfo.php?ip=myip"
    # Requests expects proxy URLs to carry an explicit scheme.
    proxy_map = {"http": 'http://%s:%s' % (ip, port)}
    try:
        r = requests.get(url, proxies=proxy_map, timeout=5.0)
        print(r.text)
        resp = r.json()
        print(resp)
    except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
        # Expected failure modes for a dead proxy: stay quiet.
        return False
    except Exception as e:
        print(e)
        return False

    data = resp.get("data") if isinstance(resp, dict) else None
    if not isinstance(data, dict):
        # The reply did not carry a "data" object to read the address from.
        return False
    return str(ip) == str(data.get("ip"))
def f_youdaili():
    """Scrape the latest HTTP proxy article on youdaili.cn.

    Phase 1 fetches the index page, extracts the first article id linked
    from it and downloads that article's first page; it gives up (with a
    warning) once the retry counter exceeds 5.  Phase 2 passes the first
    page to ``match_proxy`` and then does the same for every
    ``<id>_<n>.html`` continuation page referenced from it, stopping early
    once the page-fetch failure counter exceeds 5.

    Returns:
        None.  Proxies are collected by ``match_proxy``.
    """
    youdaili_http = 'http://www.youdaili.cn/Daili/http/'
    youdaili_http_today = youdaili_http + '%s.html'
    youdaili_http_ps = youdaili_http + '%s_%s.html'
    max_retries = 5
    timeout = 3.0

    # Phase 1: locate and download the first page of the newest article.
    pid = None
    first_url = None
    ftext = None
    retry = 0
    while not ftext:
        if retry > max_retries:
            logging.warning('f_youdaili failed. retry1=%s', retry)
            return
        try:
            r = requests.get(youdaili_http, timeout=timeout)
            m = re.search(r'%s(\d+)\.html' % re.escape(youdaili_http), r.text)
            if m is not None:
                pid = m.group(1)
                first_url = youdaili_http_today % pid
                print(first_url)
                ftext = requests.get(first_url, timeout=timeout).text
        except Exception as e:
            # Best-effort scrape: report and retry.
            print(e)
        if not ftext:
            retry += 1

    # Phase 2: harvest the first page, then each continuation page once.
    seen = set([first_url])
    match_proxy(ftext)
    retry = 0
    for page_no in re.findall(r'%s_(\d+)\.html' % pid, ftext):
        page_url = youdaili_http_ps % (pid, page_no)
        if page_url in seen:
            continue
        seen.add(page_url)
        print(page_url)
        try:
            r = requests.get(page_url, timeout=timeout)
        except Exception as e:
            retry += 1
            print(e)
            if retry > max_retries:
                logging.warning('retry2=%s', retry)
                break
            # Skip this page rather than re-parsing the previous response.
            continue
        match_proxy(r.text)
def f_cnproxy():
    """Scrape the cnproxy.com listing pages and feed them to ``match_proxy``.

    Visits ``proxy1.html`` .. ``proxy10.html`` followed by
    ``proxyedu1.html`` and ``proxyedu2.html``.  A page whose download fails
    is skipped and counted; once the failure count exceeds 5 a warning is
    logged and the remaining pages are abandoned.

    Returns:
        None.  Proxies are collected by ``match_proxy``.
    """
    urls = ['http://www.cnproxy.com/proxy%s.html' % n for n in range(1, 11)]
    urls += ['http://www.cnproxy.com/proxyedu%s.html' % n for n in range(1, 3)]

    retry = 0
    for url in urls:
        # Check the budget first so we never announce a URL we won't fetch.
        if retry > 5:
            logging.warning('f_cnproxy failed. retry1=%s', retry)
            break
        print(url)
        try:
            r = requests.get(url, timeout=3.0)
        except Exception as e:
            # Best-effort scrape: report the failure and move on.
            print(e)
            retry += 1
            continue
        match_proxy(r.text)
def match_proxy(text, pattern=r'(\d+)\.(\d+)\.(\d+)\.(\d+):(\d+)', found=None):
    """Extract ``ip:port`` candidates from ``text`` and keep the working ones.

    Each regex match is validated (octets at most 255, port in 1-65535),
    skipped if it is already present in ``found``, and otherwise verified
    with ``check_proxy`` followed by ``check_search``.  Candidates that pass
    both checks are appended to ``found`` as ``[ip, port]``.

    Args:
        text: Text to scan for proxy addresses.
        pattern: Regex with five groups: four address octets and the port.
        found: List that accepted proxies are appended to.  When omitted,
            the module-level ``proxies`` list is used and is created on
            first use if it does not exist yet.

    Returns:
        The list of ``[ip, port]`` entries added by this call.
    """
    if found is None:
        # The script entry point defines ``proxies``; create it on demand so
        # the function also works when this module is imported.
        found = globals().setdefault('proxies', [])

    added = []
    for groups in re.findall(pattern, text):
        try:
            octets = [int(part) for part in groups[0:4]]
            port = int(groups[4])
        except (ValueError, IndexError):
            # A custom pattern produced something that is not octets + port.
            continue
        if not octets or max(octets) > 255 or not 0 < port <= 65535:
            continue

        ip = '.'.join(groups[0:4])
        entry = [ip, port]
        if entry in found:
            # Already accepted earlier; skip the slow network checks.
            continue

        try:
            usable = check_proxy(ip, port) and check_search(ip, port)
        except Exception:
            # Keep scanning on an unexpected checker failure, but leave a
            # trace instead of discarding the error silently.
            logging.exception('proxy check failed for %s:%s', ip, port)
            continue
        if usable:
            found.append(entry)
            added.append(entry)
    return added
if __name__ == "__main__":
    # Script entry point: run every scraper, then dump what was collected.
    # ``proxies`` is the module-level list that match_proxy appends to.
    proxies = []
    for scrape in (f_youdaili, f_cnproxy):
        scrape()
    for proxy in proxies:
        print(proxy)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment