-
-
Save Wh1terat/309ca982aaa89ff9804d3e548ff6d5fc to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
""" | |
InSCRAPEsula v0.1 | |
Inspired by and sections borrowed from https://github.com/ziplokk1/incapsula-cracker-py3" | |
""" | |
import logging | |
import re | |
from ast import literal_eval | |
from base64 import b64encode, b64decode | |
from random import random | |
from datetime import datetime, timedelta | |
from bs4 import BeautifulSoup | |
from requests import Session | |
from requests.utils import quote, urlparse | |
# Package metadata.
__title__ = 'inscrapesula'
__version__ = '0.1'
__author__ = 'Gareth Bryan'
import http.client
# NOTE(review): the two statements below run at import time. They set the
# debug level on the HTTPConnection class itself (so it applies to every
# http.client connection in the process) and configure the root logger at
# DEBUG level. Presumably left over from development - confirm before
# using this module as a library.
http.client.HTTPConnection.debuglevel = 1
logging.basicConfig(level=logging.DEBUG)
class ISession(Session):
    """
    Subclass of requests.Session that tries to pass the Incapsula JS challenge.

    ``get()`` inspects each response. When the page references an
    ``/_Incapsula_Resource?`` script, the script is downloaded, its cipher
    token is extracted, a ``___utmvc`` cookie is generated from the token and
    the original URL is requested again (up to ``attempts`` times).
    """

    # (JS expression, expected result) pairs. _generate_cookie() joins each
    # pair as "expression=result", URL-quotes it and comma-joins the lot to
    # build the browser fingerprint that goes into the ___utmvc cookie.
    BROWSER_CHECKS = [
        ("navigator", "true"),
        ("navigator.vendor", "Google Inc."),
        ("navigator.appName", "Netscape"),
        ("navigator.plugins.length==0", "false"),
        ("navigator.platform", "MacIntel"),
        ("navigator.webdriver", "undefined"),
        ("plugin_ext", "no extention"),
        ("ActiveXObject", "false"),
        ("webkitURL", "true"),
        ("_phantom", "false"),
        ("callPhantom", "false"),
        ("chrome", "true"),
        ("yandex", "false"),
        ("opera", "false"),
        ("opr", "false"),
        ("safari", "false"),
        ("awesomium", "false"),
        ("puffinDevice", "false"),
        ("__nightmare", "false"),
        ("domAutomation", "false"),
        ("domAutomationController", "false"),
        ("_Selenium_IDE_Recorder", "false"),
        ("document.__webdriver_script_fn", "false"),
        ("document.$cdc_asdjflasutopfhvcZLmcfl_", "false"),
        ("process.version", "false"),
        ("global", "false"),
        ("global.require", "false"),
        ("global.process", "false"),
        ("WebAssembly", "true"),
        ("window.toString()", "[object Window]"),
        ("navigator.cpuClass", "false"),
        ("navigator.oscpu", "false"),
        ("navigator.connection", "true"),
        ("navigator.language=='C'", "false"),
        ("window.outerWidth==0", "false"),
        ("window.outerHeight==0", "false"),
        ("window.WebGLRenderingContext", "true"),
        ("document.documentMode", "undefined"),
        ("eval.toString().length", "33"),
        (
            "'v2b147170f281b1d69c9de8f2e4c27a45252e1a6c83ec1dd95f7c3f5f5d987e9b'.toString()",
            "v2b147170f281b1d69c9de8f2e4c27a45252e1a6c83ec1dd95f7c3f5f5d987e9b",
        ),
    ]

    class IncapError(Exception):
        """
        Base exception for all errors
        """

    class MaxRetriesExceeded(IncapError):
        """
        Raised when the number of solve attempts reaches "max_attempts"
        """

    class IncapBlocked(IncapError):
        """
        Raised when Incapsula blocks with reCAPTCHA
        """

    class IncapJSError(IncapError):
        """
        Raised when something goes wrong in the JS deobfuscation
        """

    def __init__(self, **kwargs):
        """
        Create the session.
        :param attempts: int maximum number of solve attempts (default 3),
                         None disables the limit
        :param **kwargs: dict remaining kwargs are passed to Session.__init__
        """
        self.logger = logging.getLogger('inscrapesula')
        self.max_attempts = kwargs.pop('attempts', 3)
        super().__init__(**kwargs)

    def get(self, url, **kwargs):
        """
        Override requests.session.get()
        :param url: string URL for the new request object
        :param bypass: bool when true, return the response without trying
                       to solve a challenge (default False)
        :param **kwargs: dict pass kwargs to the underlying request
        :return: Response object
        """
        kwargs.setdefault('allow_redirects', True)
        # if bypass kwarg is set, don't try to solve
        if kwargs.pop('bypass', False):
            return self.request('GET', url, **kwargs)
        return self._solve(self.request('GET', url, **kwargs))

    def _solve(self, resp, orig=None, attempts=0):
        """
        Attempt to bypass incapsula
        :param resp: Response to check.
        :param orig: Original response. Used only when called recursively.
        :param attempts: int Number of attempts so far. Used when called
                         recursively.
        :return: Response that no longer contains a challenge script
        :raises MaxRetriesExceeded: when "max_attempts" is reached
        :raises IncapBlocked: when the page contains the reCAPTCHA iframe
        :raises IncapJSError: when the challenge script cannot be processed
        """
        # requests.Response is falsy for 4xx/5xx statuses, so "orig or resp"
        # would silently drop a failed original response; test for None.
        if orig is None:
            orig = resp
        if self.max_attempts is not None and attempts >= self.max_attempts:
            raise self.MaxRetriesExceeded(
                'Challenge still present after {} attempts'.format(attempts)
            )
        url = urlparse(orig.url)
        script = self._get_script(url, resp.text)
        if not script:
            # no challenge on the page, nothing left to solve
            return resp
        self._generate_cookie(url.hostname, script)
        if not self._send_ack(url):
            self.logger.debug('Ack was not confirmed by the server')
        # re-request the original URL and check the new response again
        return self._solve(
            self.get(orig.url, bypass=True),
            orig=orig,
            attempts=attempts + 1,
        )

    def _get_script(self, url, content):
        """
        Check page response for recaptcha or js challenge
        :param url: parsed original URL (scheme and netloc are used)
        :param content: string page
        :return: string deobfuscated js, or None when the page has no
                 challenge script
        :raises IncapBlocked: when the reCAPTCHA iframe is present
        :raises IncapJSError: when the script body cannot be extracted
        """
        # the "?" must be escaped, otherwise it only makes the final "e"
        # optional instead of matching the start of the query string
        resource_rxp = re.compile(r"/_Incapsula_Resource\?")
        parser = BeautifulSoup(content, 'html.parser')
        iframe = parser.find("iframe", id="main-iframe", src=resource_rxp)
        if iframe:
            if iframe.contents:
                raise self.IncapBlocked(iframe.contents[0])
            raise self.IncapBlocked('Blocked by Incapsula (empty iframe)')
        script = parser.find("script", src=resource_rxp)
        if not script:
            return None
        self.logger.debug('Script Found: %s', script['src'])
        # Download the incapsula js
        req = '{}://{}{}'.format(url.scheme, url.netloc, script['src'])
        res = self.get(req, bypass=True)
        # Extract the true body from the first level of obfuscation
        # e.g
        #
        # (function() {
        #     var z = "";
        #     var b = "766172205f3078326630633d5b275...*SNIP*";
        #     eval((function() {
        #         for (var i = 0; i < b.length; i += 2) {
        #             z += String.fromCharCode(parseInt(b.substring(i, i + 2), 16));
        #         }
        #         return z;
        #     })());
        # })();
        #
        js_code = re.search(r'"";var [a-z]="([^"]*)', res.text)
        if js_code is None:
            raise self.IncapJSError('Failed to extract js')
        try:
            return bytes.fromhex(js_code.group(1)).decode('utf-8')
        except ValueError as err:
            # covers both invalid hex and invalid UTF-8
            raise self.IncapJSError('Failed to extract js') from err

    def _send_ack(self, url):
        """
        Send Ack (don't know what else to call it?)
        :param url: parsed original url (scheme and netloc are used)
        :return: bool True when the response body is exactly '1'
        """
        req = '{}://{}/_Incapsula_Resource?SWKMTFSR=1&e={}'.format(
            url.scheme,
            url.netloc,
            random()
        )
        self.logger.debug('Sending Ack: %s', req)
        res = self.get(req, bypass=True)
        return res.text == '1'

    def _generate_cookie(self, domain, data):
        """
        Generate ___utmvc cookie and add it to requests.session.cookies
        :param domain: string domain name used for cookie creation
        :param data: string content of _Incapsula_Resource JS
        :raises IncapError: when the session holds no incap_ses_* cookie
        :raises IncapJSError: when the token cannot be extracted from data
        """
        # attempt to decipher string arrays and get token
        token = self._get_token(data)
        # grab incapsula session cookies
        cookies = [
            cookie.value
            for cookie in self.cookies
            if cookie.name.startswith("incap_ses_")
        ]
        if not cookies:
            raise self.IncapError('No Incapsula session cookies found!')
        # urlencoded list of tuples containing 'good' browser responses
        browser_checks = ','.join(
            quote("=".join(check), safe="()'")
            for check in self.BROWSER_CHECKS
        )
        # simple digest: one checksum per session cookie, concatenated
        digest = ''.join(
            self._checksum(browser_checks + cookie) for cookie in cookies
        )
        # signature: each token character code plus the digest character
        # code at the same (wrapping) position, written as hex
        signature = ''.join(
            "{:x}".format(ord(char) + ord(digest[pos % len(digest)]))
            for pos, char in enumerate(token)
        )
        # create final cookie value
        payload = b"%b,digest=%s,s=%s" % (
            self._rc4(browser_checks, token[:5]),
            digest.encode(),
            signature.encode(),
        )
        value = b64encode(payload).decode("utf-8")
        self.logger.debug(
            "Setting ___utmvc cookie for '%s': %s", domain, value
        )
        # original cookie expiry is 20 seconds
        expires = round(
            datetime.timestamp(datetime.now() + timedelta(seconds=20))
        )
        self.cookies.set(
            "___utmvc",
            value,
            domain=domain,
            path="/",
            expires=expires,
        )

    def _get_token(self, js_code):
        """
        Get token
        :param js_code: string output of _get_script
        :return: string token
        :raises IncapJSError: when any part of the script cannot be parsed
        """
        arrays = self._get_string_arrays(js_code)
        if not arrays:
            raise self.IncapJSError('Failed to process string array')
        # Find the decoder call that yields the token
        # (decoder name, hex index into the string array, key expression)
        token_call = re.search(
            r"(_0x\w+)\('(\w+)', ([0-9a-fx\s\+_]+)\)", js_code
        )
        if token_call is None:
            raise self.IncapJSError('Failed to find call to cipher token')
        key = self._get_token_key(js_code, token_call.group(3))
        try:
            index = int(token_call.group(2), 16)
            enc = b64decode(arrays[token_call.group(1)][index]).decode('utf-8')
            token = self._rc4(enc, key).decode('utf-8')
        except (KeyError, IndexError, ValueError) as err:
            # unknown decoder name, index out of range, or undecodable data
            raise self.IncapJSError('Failed to decipher token') from err
        self.logger.debug('Token found: %s', token)
        return token

    def _get_string_arrays(self, js_code):
        """
        Collect the rotated string arrays, keyed by decoder function name
        :param js_code: string deobfuscated js
        :return: dict decoder function name -> list of strings
        :raises IncapJSError: when an array literal cannot be evaluated
        """
        arrays = {}
        # Find all string array definitions
        # e.g
        #
        # var _0x2f0c = ['wpMbIsOc', 'w57CmAU=', 'JMKPw4c=',...*SNIP*];
        #
        for array in re.finditer(r"var (_0x\w+)=(\['.*?\])", js_code):
            # Find string array init arguments and decoder function name
            # e.g
            #
            # }(_0x2f0c, 0xb5));var _0xc2f0 = function(_0x4296cc, _0x2adc0c) {
            #
            init_rxp = r"{},(0x\w+)\)\);var (_0x\w+)".format(
                re.escape(array.group(1))
            )
            for init in re.finditer(init_rxp, js_code):
                try:
                    strings = literal_eval(array.group(2))
                    # the number of times to "rotate" (shift) the array
                    shift = int(init.group(1), 16) % len(strings)
                except (ValueError, SyntaxError, ZeroDivisionError) as err:
                    raise self.IncapJSError(
                        'Failed to process string array'
                    ) from err
                # shift the array by n
                arrays[init.group(2)] = strings[shift:] + strings[:shift]
        return arrays

    def _get_token_key(self, js_code, key_expr):
        """
        Resolve the rc4 key used by the token decoder call
        :param js_code: string deobfuscated js
        :param key_expr: string key argument of the decoder call, either a
                         single variable ("method 1") or "var1+var2"
                         ("method 2")
        :return: string rc4 key
        :raises IncapJSError: when the key cannot be resolved
        """
        parts = [part.strip() for part in key_expr.split('+')]
        if len(parts) == 1:
            self.logger.debug('Token method 1 in use')
            alias = re.search(
                r"var {}=(_0x\w+);".format(re.escape(parts[0])), js_code
            )
            if alias is None:
                raise self.IncapJSError(
                    'Failed to find token variable 1 using method 1'
                )
            return self._get_js_string(
                js_code,
                alias.group(1),
                'Failed to find token key using method 1',
            )
        if len(parts) != 2:
            raise self.IncapJSError('Unsupported token key expression')

        self.logger.debug('Token method 2 in use')
        # '\x73\x75\x62\x73\x74\x72' is how the script spells 'substr'
        first_rxp = (
            r"var {}=(_0x\w+)\['\\x73\\x75\\x62\\x73\\x74\\x72'\]"
            r"\((0x[0-9a-f]+),(0x[0-9a-f]+)\);"
        ).format(re.escape(parts[0]))
        first = re.search(first_rxp, js_code)
        if first is None:
            raise self.IncapJSError(
                'Failed to find token variable 1 using method 2'
            )
        head = self._get_js_string(
            js_code,
            first.group(1),
            'Failed to find token key 1 using method 2',
        )
        # NOTE(review): JS substr(start, length) corresponds to
        # [start:start + length]; the existing [start:length - 1] slice is
        # kept unchanged here - confirm against a live sample.
        head = head[int(first.group(2), 16):int(first.group(3), 16) - 1]

        second_rxp = (
            r"var {}=(_0x\w+)\['\\x73\\x75\\x62\\x73\\x74\\x72'\]"
            r"\((0x[0-9a-f]+)\);"
        ).format(re.escape(parts[1]))
        second = re.search(second_rxp, js_code)
        if second is None:
            self.logger.debug('Unparsed challenge script: %s', js_code)
            raise self.IncapJSError(
                'Failed to find token variable 2 using method 2'
            )
        tail = self._get_js_string(
            js_code,
            second.group(1),
            'Failed to find token key 2 using method 2',
        )
        # the offset belongs to the second substr call, not the first one
        tail = tail[int(second.group(2), 16):]
        return head + tail

    def _get_js_string(self, js_code, name, error):
        """
        Read a "var <name>='\\x..';" string literal from the script
        :param js_code: string deobfuscated js
        :param name: string variable name to look up
        :param error: string message for the IncapJSError raised on failure
        :return: string literal value with its escapes decoded
        :raises IncapJSError: when the literal is missing or malformed
        """
        literal = re.search(
            r"var {}='([a-fx0-9\\]+)';".format(re.escape(name)), js_code
        )
        if literal is None:
            raise self.IncapJSError(error)
        try:
            return literal.group(1).encode('utf-8').decode('unicode_escape')
        except UnicodeDecodeError as err:
            raise self.IncapJSError(error) from err

    @staticmethod
    def _checksum(data):
        """
        Checksum
        :param data: string data to checksum
        :return: string decimal sum of the code points in data
        """
        return str(sum(ord(c) for c in data))

    @staticmethod
    def _rc4(message, key):
        """
        RC4 Implementation.
        :param message: string message to cipher (one code point per byte)
        :param key: string rc4 key
        :return: bytearray ciphertext
        :raises ValueError: when key is empty
        """
        if not key:
            raise ValueError('rc4 key must not be empty')
        cipher_text = bytearray()
        key = [ord(c) for c in key]
        # key scheduling
        sbox = list(range(256))
        j = 0
        for i in range(256):
            j = (j + sbox[i] + key[i % len(key)]) % 256
            sbox[i], sbox[j] = sbox[j], sbox[i]
        # keystream generation, XORed with the message
        i = j = 0
        for char in message:
            i = (i + 1) % 256
            j = (j + sbox[i]) % 256
            sbox[i], sbox[j] = sbox[j], sbox[i]
            key_stream = sbox[(sbox[i] + sbox[j]) % 256]
            cipher_text.append(key_stream ^ ord(char))
        return cipher_text
More than likely they are; for me this was just a PoC for a personal project to scrape some data - and an excuse to get hands-on knowledge of AST.
Unfortunately all the gists I published did was garner attention from people trying to break incapsula for nefarious reasons (spam/bots/etc) which I don't agree with.
As to your problem, it's been a couple of years since I last looked - hard to say as it is not fresh in my memory.
But if you have been able to deobfuscate the incap js sufficiently then you should be able to see and mimic this functionality fairly easily?
@alexnrj https://gist.github.com/Wh1terat/f78416e4c681becb5bdf0a646aa37566 seems to still work for the most part.
I deobfuscated almost all of the samples, but in the loop that calculates the digest some samples use simple_digest while others use a different standard JS function (redacted intentionally). In the second case it makes no sense to obtain only digits in the digest.
To validate my Node.js code I captured real HTTP traffic to collect the session cookies and the JS script; using that information I can get the token. With the obtained token it is possible to view the Incapsula cookie in clear text using CyberChef.
Ah yes, that was one thing I did see - it is totally inconsistent as to which template Incapsula uses between requests.
I remember there were 2-3 a few years ago, maybe more now.
A public repo doesn't keep working for long. Like you, I'm completely sure that the Incapsula guys are watching us. Anyway, here is my problem: is simple_digest used all the time for the three methods, or is it used randomly?