Last active
January 17, 2018 05:26
-
-
Save clarksun/b9d4d770964d7b390b8f93a1197d5b0e to your computer and use it in GitHub Desktop.
临时代理获取临时cookie
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://github.com/GuozhuHe/webspider/blob/master/webspider/utils/http_tools.py | |
def get_proxys(pages=4):
    """Scrape candidate proxies from the xicidaili.com ``/wn/`` listing.

    Args:
        pages: Number of listing pages to fetch, starting at page 1.

    Returns:
        A list of proxy strings built with
        ``constants.HTTP_PROXY_FORMATTER``, in the order they were scraped.
        The proxies are not checked for availability here.

    Raises:
        AssertionError: If a page yields a different number of IP cells
            than port cells.
    """
    proxy_list = []
    # The page number must be part of the URL template; without the
    # placeholder, ``format`` was a no-op and every iteration fetched the
    # same page.
    url = 'http://www.xicidaili.com/wn/{page_no}'
    headers = generate_http_header()
    headers.update(
        {
            'Referer': 'http://www.xicidaili.com/wn/',
            'Host': 'www.xicidaili.com',
        }
    )
    for page_no in range(1, pages + 1):
        response = requests.get(url=url.format(page_no=page_no), headers=headers)
        html = etree.HTML(response.text)
        ips = html.xpath("//table[@id='ip_list']/tr/td[2]/text()")
        ports = html.xpath("//table[@id='ip_list']/tr/td[3]/text()")
        # Explicit check instead of ``assert`` so it still runs under ``-O``;
        # the exception type is kept for existing callers.
        if len(ips) != len(ports):
            raise AssertionError(
                'page {}: {} IPs but {} ports'.format(page_no, len(ips), len(ports))
            )
        proxy_list.extend(
            constants.HTTP_PROXY_FORMATTER.format(ip=ip, port=port)
            for ip, port in zip(ips, ports)
        )
    return proxy_list
def filter_unavailable_proxy(proxy_list, proxy_type='HTTPS'):
    """Return the proxies that successfully serve a lagou.com probe request.

    Each proxy is tried with a 1-second timeout against a lagou.com JSON
    endpoint. It is kept when the response status equals
    ``constants.HTTP_SUCCESS`` and the decoded JSON contains ``totalCount``.
    Any failure while probing just drops that proxy.

    Args:
        proxy_list: Iterable of proxy URL strings to probe.
        proxy_type: ``'HTTPS'`` (matched case-insensitively) selects the
            ``'https'`` proxy mapping; any other value selects ``'http'``.

    Returns:
        A list of the proxies that passed the probe, in input order.
    """
    # Loop-invariant, so resolved once. Case-insensitive so that 'https'
    # (the spelling used elsewhere in this file) is not treated as 'http'.
    protocol = 'https' if str(proxy_type).upper() == 'HTTPS' else 'http'
    available_proxy_list = []
    for proxy in proxy_list:
        try:
            response = requests.get('https://www.lagou.com/gongsi/0-0-0.json',
                                    proxies={protocol: proxy},
                                    timeout=1)
            if response.status_code == constants.HTTP_SUCCESS and 'totalCount' in response.json():
                available_proxy_list.append(proxy)
                logging.info('可用代理数量 {}'.format(len(available_proxy_list)))
        except Exception as exc:
            # Best-effort probe: a dead or misbehaving proxy is expected, so
            # record why it was dropped and carry on.
            logging.debug('proxy %s rejected: %r', proxy, exc)
    return available_proxy_list
# https://github.com/GuozhuHe/webspider/blob/dev/webspider/utils/cookies.py | |
class Cookies(object):
    """Class-level pool of lagou.com cookie jars collected through proxies.

    All state lives on the class and every method is a classmethod, so the
    pool is shared by all users of ``Cookies``.
    """

    # Cookie jars currently in the pool.
    _cookies = []
    # ``time.time()`` of the last refresh; ``None`` until the first refresh.
    _last_update_time = None

    @classmethod
    def refresh_cookies(cls):
        """Rebuild the pool from a freshly scraped proxy list."""
        proxys = get_proxys()
        cls._cookies = cls.get_lagou_cookies_from_proxys(proxys)
        cls._last_update_time = time.time()

    @classmethod
    def get_random_cookies(cls):
        """Return a random cookie jar, refreshing the pool first if needed.

        The pool is refreshed when it is empty, has never been refreshed, or
        is older than ``constants.SECONDS_OF_DELAY * 3`` seconds.

        Raises:
            IndexError: If the pool is still empty after a refresh.
        """
        now = time.time()
        stale = (
            not cls._cookies
            # Guard against a pool that was populated without a timestamp.
            or cls._last_update_time is None
            or (now - cls._last_update_time) >= constants.SECONDS_OF_DELAY * 3
        )
        if stale:
            cls.refresh_cookies()
        return random.choice(cls._cookies)

    @classmethod
    def remove_cookies(cls, cookies):
        """Drop ``cookies`` from the pool; do nothing if it is not there.

        A caller may hold a jar from before a refresh replaced the pool, so
        a missing entry is not treated as an error.
        """
        try:
            cls._cookies.remove(cookies)
        except ValueError:
            logging.debug('cookies already absent from the pool')

    @classmethod
    def get_lagou_cookies_from_proxys(cls, proxys, proxy_type='https'):
        """Request the lagou.com home page through each proxy and keep the cookies.

        Args:
            proxys: Sequence of proxy URL strings.
            proxy_type: Key used in the ``proxies`` mapping passed to
                ``requests.get``.

        Returns:
            A list of the non-empty ``response.cookies`` objects obtained,
            one per proxy that answered within the 2-second timeout.
        """
        logging.info('重新获取 cookies!')
        logging.info('代理 IP 的数量: {}'.format(len(proxys)))
        cookies = []
        for proxy in proxys:
            try:
                response = requests.get('https://www.lagou.com/',
                                        proxies={proxy_type: proxy},
                                        allow_redirects=False,
                                        timeout=2)
                if len(response.cookies):
                    cookies.append(response.cookies)
            except Exception as exc:
                # Best-effort: unusable proxies are expected; note and move on.
                logging.debug('no cookies via proxy %s: %r', proxy, exc)
        logging.info('可用cookies 数量: {}'.format(len(cookies)))
        return cookies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment