Skip to content

Instantly share code, notes, and snippets.

@clarksun
Last active January 17, 2018 05:26
Show Gist options
  • Save clarksun/b9d4d770964d7b390b8f93a1197d5b0e to your computer and use it in GitHub Desktop.
Save clarksun/b9d4d770964d7b390b8f93a1197d5b0e to your computer and use it in GitHub Desktop.
临时代理获取临时cookie
# https://github.com/GuozhuHe/webspider/blob/master/webspider/utils/http_tools.py
def get_proxys(pages=4):
    """Scrape candidate proxy addresses from the xicidaili.com listing.

    Args:
        pages: Number of listing pages to fetch, starting at page 1.

    Returns:
        A list of proxy strings, one per scraped (ip, port) pair, each built
        with ``constants.HTTP_PROXY_FORMATTER``. Pages are processed in order.

    Raises:
        AssertionError: If a page yields a different number of IP cells than
            port cells, which means the table layout is not what the XPath
            expressions expect.
    """
    proxy_list = []
    # The ``{page_no}`` placeholder is required: without it ``str.format`` is
    # a no-op and every iteration would download the same first page.
    # NOTE(review): assumes the site paginates as /wn/<page number> -- confirm.
    url = 'http://www.xicidaili.com/wn/{page_no}'
    headers = generate_http_header()
    headers.update(
        {
            'Referer': 'http://www.xicidaili.com/wn/',
            'Host': 'www.xicidaili.com',
        }
    )
    for page_no in range(1, pages + 1):
        response = requests.get(url=url.format(page_no=page_no), headers=headers)
        html = etree.HTML(response.text)
        # Columns 2 and 3 of the ``ip_list`` table hold the IP and the port.
        ips = html.xpath("//table[@id='ip_list']/tr/td[2]/text()")
        ports = html.xpath("//table[@id='ip_list']/tr/td[3]/text()")
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if len(ips) != len(ports):
            raise AssertionError(
                'ip/port column mismatch on page {}: {} ips, {} ports'.format(
                    page_no, len(ips), len(ports)
                )
            )
        proxy_list.extend(
            constants.HTTP_PROXY_FORMATTER.format(ip=ip, port=port)
            for ip, port in zip(ips, ports)
        )
    return proxy_list
def filter_unavailable_proxy(proxy_list, proxy_type='HTTPS'):
    """Return the proxies from ``proxy_list`` that can reach lagou.com.

    Each proxy is probed with a short GET request; it is kept when the
    response status equals ``constants.HTTP_SUCCESS`` and the decoded JSON
    body contains ``'totalCount'``. Any failure while probing simply drops
    that proxy.

    Args:
        proxy_list: Iterable of proxy address strings to probe.
        proxy_type: ``'HTTPS'`` (any letter case) maps the proxy under the
            ``https`` scheme; any other value maps it under ``http``.

    Returns:
        A list of the proxies that passed the probe, in input order.
    """
    # Loop-invariant, so computed once. The comparison is case-insensitive
    # because other code in this file passes the lowercase form 'https'.
    # NOTE(review): the probe URL is https, so a proxy registered under
    # 'http' is presumably not used for this request -- confirm whether
    # non-HTTPS proxies should be probed against an http:// URL instead.
    protocol = 'https' if proxy_type.upper() == 'HTTPS' else 'http'
    available_proxy_list = []
    for proxy in proxy_list:
        try:
            response = requests.get('https://www.lagou.com/gongsi/0-0-0.json',
                                    proxies={protocol: proxy},
                                    timeout=1)
            if response.status_code == constants.HTTP_SUCCESS and 'totalCount' in response.json():
                available_proxy_list.append(proxy)
                logging.info('可用代理数量 {}'.format(len(available_proxy_list)))
        except Exception:
            # Best effort: a dead or misbehaving proxy is expected here and
            # must not abort the scan, but leave a trace for debugging.
            logging.debug('proxy %s rejected', proxy, exc_info=True)
    return available_proxy_list
# https://github.com/GuozhuHe/webspider/blob/dev/webspider/utils/cookies.py
class Cookies(object):
    """Class-level pool of lagou.com cookies collected through proxies.

    All state lives on the class itself and every method is a classmethod,
    so the pool is shared by every caller in the process.
    """

    # Cookie jars gathered by the most recent refresh.
    _cookies = []
    # ``time.time()`` value of the most recent refresh; None before the first.
    _last_update_time = None

    @classmethod
    def refresh_cookies(cls):
        """Rebuild the cookie pool from a freshly scraped proxy list."""
        proxys = get_proxys()
        cls._cookies = cls.get_lagou_cookies_from_proxys(proxys)
        cls._last_update_time = time.time()

    @classmethod
    def get_random_cookies(cls):
        """Return one randomly chosen cookie jar from the pool.

        The pool is refreshed first when it is empty or when at least
        ``constants.SECONDS_OF_DELAY * 3`` seconds have passed since the
        last refresh.

        Raises:
            IndexError: If the pool is still empty after refreshing
                (``random.choice`` on an empty sequence).
        """
        now = time.time()
        # Cookie expiry check. The emptiness test comes first so that
        # ``_last_update_time`` (None before the first refresh) is never
        # used in arithmetic while the pool is still empty.
        if not cls._cookies or (now - cls._last_update_time) >= constants.SECONDS_OF_DELAY * 3:
            cls.refresh_cookies()
        return random.choice(cls._cookies)

    @classmethod
    def remove_cookies(cls, cookies):
        """Drop ``cookies`` from the pool; a no-op if it is not present."""
        try:
            cls._cookies.remove(cookies)
        except ValueError:
            # Already removed (e.g. reported twice, or the pool was refreshed
            # in between); nothing left to do.
            logging.debug('cookies not in pool, nothing removed')

    @classmethod
    def get_lagou_cookies_from_proxys(cls, proxys, proxy_type='https'):
        """Request the lagou.com home page through each proxy and keep the cookies.

        Args:
            proxys: Sequence of proxy address strings (its length is logged).
            proxy_type: Scheme key under which each proxy is passed to
                ``requests`` in the ``proxies`` mapping.

        Returns:
            A list with the ``response.cookies`` object of every request that
            succeeded and returned at least one cookie.
        """
        logging.info('重新获取 cookies!')
        logging.info('代理 IP 的数量: {}'.format(len(proxys)))
        cookies = []
        for proxy in proxys:
            try:
                response = requests.get('https://www.lagou.com/',
                                        proxies={proxy_type: proxy},
                                        allow_redirects=False,
                                        timeout=2)
            except Exception:
                # Best effort: unusable proxies are expected; skip them but
                # keep a debug trace instead of failing silently.
                logging.debug('no cookies via proxy %s', proxy, exc_info=True)
                continue
            if len(response.cookies):
                cookies.append(response.cookies)
        logging.info('可用cookies 数量: {}'.format(len(cookies)))
        return cookies
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment