Created
May 29, 2016 00:51
-
-
Save fffonion/27ef275845b1d06f44a51d7f7e6354df to your computer and use it in GitHub Desktop.
Anti-anti-crawler scripts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import time | |
def mkcookie(coo):
    """Serialize a cookie mapping into a ``Cookie`` request-header value.

    Args:
        coo: Mapping of cookie name to cookie value.

    Returns:
        Every pair rendered as ``name=value`` and joined with ``;``, in the
        mapping's iteration order. An empty mapping yields ``""``.
    """
    # ``items()`` exists on both Python 2 and Python 3 mappings, whereas
    # ``iteritems()`` is Python 2 only.
    return ";".join("%s=%s" % (name, value) for name, value in coo.items())
def autodetect(resp, ct, ht, cookie, headers, url):
    """Detect a known anti-crawler challenge in a response and try to solve it.

    Two protections are recognised, and only when the response carries a
    ``set-cookie`` header:

    * Yunsuo -- the header contains ``yunsuo_session_verify=<token>``.
    * Cloudflare -- the body contains ``Just a moment`` and the header
      contains ``__cfduid=<token>``.

    Args:
        resp: Response headers as a mapping with a ``set-cookie`` key
            (presumably the httplib2 response object -- confirm with caller).
        ct: Response body, text or bytes.
        ht: HTTP client, handed through to the solver untouched.
        cookie: Mutable cookie dict, handed through to the solver.
        headers: Base request headers, handed through to the solver.
        url: URL that produced ``resp``.

    Returns:
        The solver's return value when a challenge was recognised, otherwise
        ``None`` (also when a Cloudflare page is seen but no ``__cfduid``
        token can be extracted, since the solver needs that token).
    """
    if 'set-cookie' not in resp:
        return None
    coostr = resp['set-cookie']

    # On Python 3 the body may arrive as bytes; the str patterns used here
    # and in the solvers need text. On Python 2 ``bytes is str`` so nothing
    # is converted.
    body = ct
    if isinstance(body, bytes) and not isinstance(body, str):
        body = body.decode('utf-8', 'replace')

    tokens = re.findall(r'yunsuo_session_verify=(\w+)', coostr)
    if tokens:
        return check_yunsuo(ht, cookie, tokens[0], body, headers, url)

    if 'Just a moment' not in body:
        return None
    # No trailing ';' is required, so a cookie that ends the header matches.
    tokens = re.findall(r'__cfduid=([^;]+)', coostr)
    if not tokens:
        return None
    return check_cf(ht, cookie, tokens[0], body, headers, url)
def check_yunsuo(ht, cookie, session, content, headers, url):
    """Answer a Yunsuo session-verification challenge.

    Stores the session token and the hex-encoded source URL in ``cookie``,
    requests ``url`` again with a ``security_verify_data`` query parameter,
    and records the ``security_session_mid_verify`` token from the reply.

    Args:
        ht: HTTP client exposing ``request(url, headers=...)`` that returns
            ``(response_headers, body)``.
        cookie: Mutable cookie dict; updated in place.
        session: Value for the ``yunsuo_session_verify`` cookie.
        content: Body of the challenge response (unused here; kept so all
            solvers share one signature).
        headers: Base request headers; copied, never mutated.
        url: URL that triggered the challenge.

    Returns:
        ``True`` when the follow-up response supplied a
        ``security_session_mid_verify`` token, ``False`` otherwise.
    """
    import binascii

    def to_hex(text):
        """Hex-encode ``text`` and return a native ``str`` on Python 2 and 3."""
        raw = text if isinstance(text, bytes) else text.encode('utf-8')
        encoded = binascii.hexlify(raw)
        # Python 3 returns bytes, which would format as "b'...'" below.
        return encoded if isinstance(encoded, str) else encoded.decode('ascii')

    cookie['yunsuo_session_verify'] = session
    cookie['srcurl'] = to_hex(url)

    hd = dict(headers)
    hd['Cookie'] = mkcookie(cookie)

    time.sleep(0.05)
    # "1366,768" is presumably a "width,height" screen size the challenge
    # expects -- TODO confirm against the challenge page script.
    verify_url = '%s?security_verify_data=%s' % (url, to_hex("1366,768"))
    resp, _ = ht.request(verify_url, headers=hd)

    if 'set-cookie' not in resp:
        return False
    tokens = re.findall(r'security_session_mid_verify=(\w+)', resp['set-cookie'])
    if not tokens:
        return False
    cookie['security_session_mid_verify'] = tokens[0]
    return True
def check_cf(ht, cookie, session, content, headers, url):
    """Solve a Cloudflare "Just a moment" JavaScript challenge page.

    Parses the challenge form and the obfuscated arithmetic from ``content``,
    waits for the delay announced by the page plus two seconds, submits the
    computed ``jschl_answer`` and stores the resulting ``cf_clearance``
    cookie.

    Args:
        ht: HTTP client exposing ``request(url, headers=...)`` that returns
            ``(response_headers, body)``.
        cookie: Mutable cookie dict; updated in place.
        session: Value for the ``__cfduid`` cookie.
        content: Body of the challenge page, text or bytes.
        headers: Base request headers; copied, never mutated.
        url: URL that triggered the challenge; must start with a scheme.

    Returns:
        ``True`` when the answer response set ``cf_clearance``, ``False``
        otherwise.

    Raises:
        IndexError: if the page does not have the expected form/script shape.
        KeyError: if the obfuscated expression contains an unknown token.
    """
    def calc_symbol(s):
        """Evaluate one obfuscated number expression to an int."""
        pair = re.findall(r'\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s)
        if pair:
            # Type 1: +((...)+(...)) -- two digits concatenated as strings.
            high, low = pair[0]
            return int(str(calc_symbol(high)) + str(calc_symbol(low)))
        # Type 2: plain sum of tokens that are each worth 0 or 1.
        vmap = {'!': 1, '[]': 0, '!![]': 1, '': 0}
        return sum(vmap[token] for token in s.split('+'))

    def get_cv(ct, host_name):
        """Return (hidden-field query string, answer, form action, wait ms)."""
        # All hidden form fields, re-encoded as "name=value&name=value".
        fields = re.findall(
            r'<input type="hidden" name="([^"]+)" value="([^"]+)"', ct)
        query = '&'.join('='.join(pair) for pair in fields)
        action = re.findall(
            r'<form id="[^"]+" action="([^"]+)" method="get">', ct)[0]
        # Locate the object/key holding the running value, e.g.
        # var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]))};
        obj, key, expr = re.findall(
            r'var (?:[^,]+,){4} ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0]
        value = calc_symbol(expr)
        step_re = r'%s\.%s(.)=([^;]+);' % (re.escape(obj), re.escape(key))
        for op, arg in re.findall(step_re, ct):
            # NOTE(review): eval() on data derived from a remote page. Both
            # operands are forced to ints by %d and ``op`` is one character,
            # which keeps the surface small, but this should become an
            # explicit operator table once the operator set is confirmed.
            value = eval('%d %s %d' % (value, op, calc_symbol(arg)))
        value += len(host_name)
        wait = re.findall(r'}, (\d+)\);', ct, re.DOTALL)[0]
        return query, value, action, wait

    # On Python 3 the body may be bytes; the str patterns above need text.
    # On Python 2 ``bytes is str`` so nothing is converted.
    if isinstance(content, bytes) and not isinstance(content, str):
        content = content.decode('utf-8', 'replace')

    hd = dict(headers)
    cookie['__cfduid'] = session
    hd['Cookie'] = mkcookie(cookie)

    origin, host = re.findall(r"(https*://([^/]+))", url)[0]
    query, answer, action, wait = get_cv(content, host)

    # Wait out the delay parsed from the page (milliseconds) plus a two
    # second margin; float division keeps the fractional part on Python 2.
    time.sleep(int(wait) / 1000.0 + 2)

    # An absolute-path form action is relative to the site root, not to the
    # page URL, so it must not be appended after the page's own path.
    base = origin if action.startswith('/') else url
    resp, _ = ht.request(
        '%s%s?%s&jschl_answer=%s' % (base, action, query, answer), headers=hd)

    if 'set-cookie' not in resp:
        return False
    # No trailing ';' is required, so a cookie that ends the header matches.
    clearance = re.findall(r'cf_clearance=([^;]+)', resp['set-cookie'])
    if not clearance:
        return False
    cookie['cf_clearance'] = clearance[0]
    return True
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import httplib2 | |
import aacs | |
# Example driver: fetch a page, let aacs solve any challenge, fetch again.
ht = httplib2.Http()
url = "http://someurl.com/rss"
cookie = {}  # filled in by the aacs solver when a challenge is handled

# Browser-like request headers sent with every request.
hd = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.0 (KHTML, like Gecko) Chrome/24.6.5128.7 Safari/536.0',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate',
}

resp, ct = ht.request(url, headers=hd)
if aacs.autodetect(resp, ct, ht, cookie, hd, url):
    # A challenge was handled: repeat the request with the new cookies.
    hd["Cookie"] = aacs.mkcookie(cookie)
    resp, ct = ht.request(url, headers=hd)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment