Created
July 6, 2016 07:32
-
-
Save 582033/22bf2693b1125e754f4cd84663cda178 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import re | |
import json | |
import requests | |
from bs4 import BeautifulSoup | |
class crawler(object):
    """Scrape the L2TP server table from a server listing page.

    The page is requested with a form POST that ticks the ``C_L2TP``
    checkbox; the HTML response is parsed with BeautifulSoup and one record
    per distinct server IP is collected.
    """

    # Patterns are compiled once here instead of once per table cell.
    _IP_RE = re.compile(r'\d+\.\d+\.\d+\.\d+')
    _RANK_RE = re.compile(r'\d+,\d+')
    _SESSION_RE = re.compile(r'\d+ .*')
    _THROUGHPUT_RE = re.compile(r'\d+ Mbps')

    def __init__(self, host):
        """Store the target URL and build the form payload sent with the POST.

        Args:
            host: URL of the listing page that receives the form POST.
        """
        self.host = host
        # NOTE(review): the __VIEWSTATE / __EVENTVALIDATION values are fixed
        # strings, presumably captured from an earlier page load; confirm
        # the server still accepts them.
        self.form_data = {
            '__VIEWSTATEGENERATOR': '69F4E07F',
            '__VIEWSTATE': '/wEPDwULLTE1ODQzMDg1NDMPZBYCAgEPZBYKZg8PFgIeBFRleHQFyQHkvaDnmoQgSVA6IDEwNy4xOTEuMTE2LjI1PEJSPjxpbWcgc3JjPScuLi9pbWFnZXMvZmxhZ3MvVVMucG5nJyB3aWR0aD0nMzInIGhlaWdodD0nMzInIC8+PEJSPuS9oOeahOWbveWutuaIluWcsOWMujogVW5pdGVkIFN0YXRlczxCUj48YSBocmVmPScjTElTVCc+6YCa6L+H5L2/55SoIFZQTiBHYXRlIOadpeabtOaUueS9oOeahCBJUCDlnLDlnYAgITwvYT5kZAIBDw8WAh8ABXI8Yj7lnKggMjQg5bCP5pe25LmL5YaF55qEOiAxLDUzMSwzNjYg55So5oi377yM57Sv6K6h55So5oi35pWwOiAxLDIwNSwxNzEsMzExIOeUqOaIt++8jOmAmuS/oemHjzogMjAsOTg4Ljc3IFRCLjwvYj5kZAIDDw8WAh8ABQUzLDI0OWRkAgQPDxYCHwAFPDxiPuacieadpeiHqiAyMjcg5Liq5Zu95a6255qEIDEsMjA1LDE3MSwzMTEg5Liq55So5oi344CCPC9iPmRkAgYPDxYCHwAFBDY5NzFkZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBAULQ19Tb2Z0RXRoZXIFBkNfTDJUUAUJQ19PcGVuVlBOBQZDX1NTVFBNhR6QKODAughVK7og96/pnNOg+MsSRdBoTwK8F1yUzw==',
            '__EVENTVALIDATION': '/wEdAAeOnCU/WmcukVQ1Rszt4PpuZSmLidaMQ3gg2jFmkkuEoSCbR2H52ATFMg5mk6aQHX3LISMg9/mywZPt3Ki4BVA7RhcLWIOHmHJ6h2VtXvwLieWw6g9beu/2J/0raZOGI2E/WMskeKo19Gyidl+m11dTlWT5u5EoXokaDMPJeszCVBDwGRQM2BJm5pkQt2UxSGc=',
            'C_L2TP': 'on',
            # Other checkbox fields that were left disabled by the author:
            #   'C_SoftEther': 'on'
            #   'C_OpenVPN': 'on'
            # Filler entry kept from the original payload; it only served to
            # terminate the literal and carries no meaning.
            'foo': 'bar',
        }

    @staticmethod
    def _parse_rank(text):
        """Convert a comma-grouped score such as ``'1,234,567'`` to an int."""
        return int(text.replace(',', ''))

    @staticmethod
    def _parse_session(text):
        """Keep the digits of the first number and drop whatever follows."""
        return re.sub(r'(\d+).*', r'\1', text)

    @staticmethod
    def _parse_throughput(text):
        """Strip the `` Mbps`` unit from a decimal figure like ``'12.34 Mbps'``.

        Text that does not contain a decimal figure followed by `` Mbps`` is
        returned unchanged.
        """
        # The original pattern spelled the unit as "\Mbps"; "\M" is an
        # invalid escape for the re module on Python 3.7+.
        return re.sub(r'(\d+\.\d+) Mbps', r'\1', text)

    @staticmethod
    def _sort_by_rank(servers):
        """Return a new list of records ordered by ``'rank'``, highest first."""
        return sorted(servers, key=lambda item: item['rank'], reverse=True)

    def get_l2tp(self, timeout=30):
        """Fetch the listing, print the servers as JSON and return them.

        Args:
            timeout: Seconds passed to ``requests.post`` so that a stalled
                connection cannot block forever.

        Returns:
            A list of dicts with the keys ``'ip'``, ``'rank'`` (int),
            ``'session'`` and ``'throughput'`` (both strings), sorted by
            rank in descending order. The same list is printed to stdout as
            a JSON document.
        """
        response = requests.post(self.host, data=self.form_data,
                                 timeout=timeout)
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; pin one if reproducible parsing matters.
        soup = BeautifulSoup(response.content)

        servers = []
        seen_ips = set()
        for cell in soup.find_all('span', attrs={'style': 'font-size: 12pt;'}):
            ip = cell.string
            if not ip or not self._IP_RE.search(ip) or ip in seen_ips:
                continue

            # The remaining fields are looked up among the spans below the
            # cell's third ancestor.
            row = cell.parent.parent.parent
            rank_spans = row.find_all('span', text=self._RANK_RE)
            session_spans = row.find_all('span', text=self._SESSION_RE)
            throughput_spans = row.find_all('span', text=self._THROUGHPUT_RE)
            if not (rank_spans and session_spans and throughput_spans):
                # Skip an incomplete row instead of aborting the whole run
                # with an IndexError.
                continue

            servers.append({
                'ip': ip,
                'rank': self._parse_rank(rank_spans[0].string),
                'session': self._parse_session(session_spans[0].string),
                'throughput': self._parse_throughput(
                    throughput_spans[0].string),
            })
            seen_ips.add(ip)

        # sorted() returns a new list; the original discarded that result and
        # therefore printed the servers unsorted.
        servers = self._sort_by_rank(servers)
        print(json.dumps(servers))
        return servers
if __name__ == '__main__':
    # Script entry point: build a crawler for the listing URL and run the
    # L2TP scrape, which prints its result to stdout.
    crawler('http://www.vpngate.net/cn/').get_l2tp()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment