Skip to content

Instantly share code, notes, and snippets.

@Wh1terat
Last active June 11, 2023 12:21
Show Gist options
  • Save Wh1terat/309ca982aaa89ff9804d3e548ff6d5fc to your computer and use it in GitHub Desktop.
Save Wh1terat/309ca982aaa89ff9804d3e548ff6d5fc to your computer and use it in GitHub Desktop.
Inscrapesula
#!/usr/bin/env python3
"""
InSCRAPEsula v0.1
Inspired by, and with sections borrowed from, https://github.com/ziplokk1/incapsula-cracker-py3
"""
import logging
import re
from ast import literal_eval
from base64 import b64encode, b64decode
from random import random
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from requests import Session
from requests.utils import quote, urlparse
# Package metadata.
__title__ = 'inscrapesula'
__version__ = '0.1'
__author__ = 'Gareth Bryan'
import http.client
# NOTE(review): the two statements below execute at import time. The first
# sets the class-wide http.client debug level to 1 and the second configures
# the root logger at DEBUG level, so merely importing this module changes
# logging for the whole process. Consider removing or guarding them before
# using this file as a library.
http.client.HTTPConnection.debuglevel = 1
logging.basicConfig(level=logging.DEBUG)
class ISession(Session):
    """
    Subclass of requests.Session that tries to answer the Incapsula
    JavaScript challenge transparently on GET requests.

    :param attempts: int maximum number of solve attempts per request
        (default 3); None disables the limit
    :param **kwargs: dict remaining kwargs are passed to requests.Session
    """

    # (expression, expected result) pairs submitted as the "browser check"
    # part of the ___utmvc cookie.
    # NOTE(review): the values (including the spelling "no extention") are
    # runtime data that presumably mirror what the challenge script reports;
    # do not "correct" them without confirming against a real browser.
    BROWSER_CHECKS = [
        ("navigator", "true"),
        ("navigator.vendor", "Google Inc."),
        ("navigator.appName", "Netscape"),
        ("navigator.plugins.length==0", "false"),
        ("navigator.platform", "MacIntel"),
        ("navigator.webdriver", "undefined"),
        ("plugin_ext", "no extention"),
        ("ActiveXObject", "false"),
        ("webkitURL", "true"),
        ("_phantom", "false"),
        ("callPhantom", "false"),
        ("chrome", "true"),
        ("yandex", "false"),
        ("opera", "false"),
        ("opr", "false"),
        ("safari", "false"),
        ("awesomium", "false"),
        ("puffinDevice", "false"),
        ("__nightmare", "false"),
        ("domAutomation", "false"),
        ("domAutomationController", "false"),
        ("_Selenium_IDE_Recorder", "false"),
        ("document.__webdriver_script_fn", "false"),
        ("document.$cdc_asdjflasutopfhvcZLmcfl_", "false"),
        ("process.version", "false"),
        ("global", "false"),
        ("global.require", "false"),
        ("global.process", "false"),
        ("WebAssembly", "true"),
        ("window.toString()", "[object Window]"),
        ("navigator.cpuClass", "false"),
        ("navigator.oscpu", "false"),
        ("navigator.connection", "true"),
        ("navigator.language=='C'", "false"),
        ("window.outerWidth==0", "false"),
        ("window.outerHeight==0", "false"),
        ("window.WebGLRenderingContext", "true"),
        ("document.documentMode", "undefined"),
        ("eval.toString().length", "33"),
        ("'v2b147170f281b1d69c9de8f2e4c27a45252e1a6c83ec1dd95f7c3f5f5d987e9b'.toString()", "v2b147170f281b1d69c9de8f2e4c27a45252e1a6c83ec1dd95f7c3f5f5d987e9b"),
    ]

    class IncapError(Exception):
        """
        Base exception for all errors
        """

    class MaxRetriesExceeded(IncapError):
        """
        Raised when retries exceeds "max_attempts"
        """

    class IncapBlocked(IncapError):
        """
        Raised when Incapsula blocks with reCAPTCHA
        """

    class IncapJSError(IncapError):
        """
        Raised when something goes wrong in the JS deobfuscation
        """

    def __init__(self, **kwargs):
        self.logger = logging.getLogger('inscrapesula')
        # 'attempts' is our own option; remove it before delegating
        self.max_attempts = kwargs.pop('attempts', 3)
        super().__init__(**kwargs)

    def get(self, url, **kwargs):
        """
        Override requests.session.get()
        :param url: string URL for the new request object
        :param **kwargs: dict pass kwargs to original get function; the
            extra boolean kwarg 'bypass' skips the challenge solver
        :return: Response object
        """
        kwargs.setdefault('allow_redirects', True)
        # if bypass kwarg is set, don't try to solve
        bypass = kwargs.pop('bypass', False)
        resp = self.request('GET', url, **kwargs)
        if bypass:
            return resp
        return self._solve(resp)

    def _solve(self, resp, orig=None, attempts=0):
        """
        Attempt to bypass incapsula
        :param resp: Response to check.
        :param orig: Original response. Used only when called recursively.
        :param attempts: int Number of attempts. Used when called recursively.
        :return: Response object, either resp itself when no challenge
            script is found or the response of a later retry
        :raises MaxRetriesExceeded: when attempts reaches max_attempts
        :raises IncapBlocked: when the page contains the block iframe
        """
        # A requests.Response is falsy for 4xx/5xx statuses, so compare
        # against None explicitly rather than using "orig or resp".
        if orig is None:
            orig = resp
        if self.max_attempts is not None and attempts >= self.max_attempts:
            raise self.MaxRetriesExceeded()
        url = urlparse(orig.url)
        script = self._get_script(url, resp.text)
        if not script:
            return resp
        self._generate_cookie(url.hostname, script)
        self._send_ack(url)
        return self._solve(
            self.get(orig.url, bypass=True),
            orig=orig,
            attempts=attempts + 1
        )

    def _get_script(self, url, content):
        """
        Check page response for reCAPTCHA or js challenge
        :param url: parsed URL (urlparse result) of the original request
        :param content: string page
        :return: string decoded js source, or None when the page contains
            no challenge script
        :raises IncapBlocked: when the block iframe is present
        :raises IncapJSError: when the js body cannot be extracted
        """
        parser = BeautifulSoup(content, 'html.parser')
        # the '?' must be escaped, otherwise it only makes the 'e' optional
        resource_rxp = re.compile(r"/_Incapsula_Resource\?")
        iframe = parser.find("iframe", id="main-iframe", src=resource_rxp)
        if iframe is not None:
            # an empty iframe has no contents; fall back to its src
            detail = iframe.contents[0] if iframe.contents else iframe.get('src')
            raise self.IncapBlocked(detail)
        script = parser.find("script", src=resource_rxp)
        if script is None:
            return None
        self.logger.debug('Script Found: %s', script['src'])
        # Download the incapsula js
        req = '{}://{}{}'.format(url.scheme, url.netloc, script['src'])
        res = self.get(req, bypass=True)
        # Extract the true body from the first level of obfuscation
        # e.g
        #
        # (function() {
        # var z = "";
        # var b = "766172205f3078326630633d5b275...*SNIP*";
        # eval((function() {
        # for (var i = 0; i < b.length; i += 2) {
        # z += String.fromCharCode(parseInt(b.substring(i, i + 2), 16));
        # }
        # return z;
        # })());
        # })();
        #
        js_code = re.search(r'"";var [a-z]="([^"]*)', res.text)
        if js_code is None:
            raise self.IncapJSError('Failed to extract js')
        try:
            return bytes.fromhex(js_code.group(1)).decode('utf-8')
        except ValueError as err:
            # covers both non-hex payloads and invalid UTF-8
            raise self.IncapJSError('Failed to decode js') from err

    def _send_ack(self, url):
        """
        Send Ack (don't know what else to call it?)
        :param url: parsed URL (urlparse result) of the original request
        :return: bool True when the response body is exactly '1'
        """
        req = '{}://{}/_Incapsula_Resource?SWKMTFSR=1&e={}'.format(
            url.scheme,
            url.netloc,
            random()
        )
        self.logger.debug('Sending Ack: %s', req)
        res = self.get(req, bypass=True)
        return res.text == '1'

    def _generate_cookie(self, domain, data):
        """
        Generate ___utmvc cookie and add it to requests.session.cookies
        :param domain: string domain name used for cookie creation
        :param data: string content of _Incapsula_Resource JS
        :raises IncapError: when no incap_ses_* cookie is in the jar
        :raises IncapJSError: when the token cannot be recovered from data
        """
        # attempt to get decipher string arrays and get token
        token = self._get_token(data)
        # grab incapsula session cookies
        cookies = [
            cookie.value
            for cookie in self.cookies
            if cookie.name.startswith("incap_ses_")
        ]
        if not cookies:
            raise self.IncapError('No Incapsula session cookies found!')
        # urlencoded list of tuples containing 'good' browser responses
        browser_checks = ','.join(
            quote("=".join(check), safe="()'")
            for check in self.BROWSER_CHECKS
        )
        # simple digest: one checksum per session cookie, concatenated
        digest = ''.join(
            self._checksum(browser_checks + cookie) for cookie in cookies
        )
        # signature: hex of each token char plus the matching digest char
        signature = ''.join(
            "{:x}".format(ord(char) + ord(digest[i % len(digest)]))
            for i, char in enumerate(token)
        )
        # create final cookie value
        value = b64encode(
            b"%b,digest=%s,s=%s"
            % (
                self._rc4(browser_checks, token[:5]),
                digest.encode(),
                signature.encode(),
            )
        ).decode("utf-8")
        self.logger.debug(
            "Setting ___utmvc cookie for '%s': %s", domain, value
        )
        # original cookie expiry is 20 seconds
        expires = round(
            datetime.timestamp(datetime.now() + timedelta(seconds=20))
        )
        self.cookies.set(
            "___utmvc",
            value,
            domain=domain,
            path="/",
            expires=expires,
        )

    def _find_key_literal(self, js_code, var_name, error):
        """
        Find "var <var_name>='...';" and return its unescaped value
        :param js_code: string deobfuscated js
        :param var_name: string js variable holding the escaped literal
        :param error: string message used when the literal is missing
        :return: string literal with \\x escapes resolved
        :raises IncapJSError: when the literal is not found
        """
        literal = re.search(
            r"var {}='([a-fx0-9\\]+)';".format(re.escape(var_name)), js_code
        )
        if literal is None:
            raise self.IncapJSError(error)
        return literal.group(1).encode('utf-8').decode('unicode_escape')

    def _key_method_1(self, js_code, var_name):
        """
        Resolve the rc4 key when the call passes a single variable
        :param js_code: string deobfuscated js
        :param var_name: string variable passed to the decoder call
        :return: string rc4 key
        :raises IncapJSError: when a lookup fails
        """
        self.logger.debug('Token method 1 in use')
        alias = re.search(
            r"var {}=(_0x\w+);".format(re.escape(var_name)), js_code
        )
        if alias is None:
            raise self.IncapJSError(
                'Failed to find token variable 1 using method 1'
            )
        return self._find_key_literal(
            js_code, alias.group(1), 'Failed to find token key using method 1'
        )

    def _key_method_2(self, js_code, var_name1, var_name2):
        """
        Resolve the rc4 key when the call passes "var1 + var2", each being
        a substr() of an escaped string literal
        :param js_code: string deobfuscated js
        :param var_name1: string variable defined as X['substr'](start, len)
        :param var_name2: string variable defined as Y['substr'](start)
        :return: string rc4 key
        :raises IncapJSError: when a lookup fails
        """
        self.logger.debug('Token method 2 in use')
        part1 = re.search(
            r"var {}=(_0x\w+)\['\\x73\\x75\\x62\\x73\\x74\\x72'\]\((0x[0-9a-f]+),(0x[0-9a-f]+)\);".format(re.escape(var_name1)),
            js_code
        )
        if part1 is None:
            raise self.IncapJSError(
                'Failed to find token variable 1 using method 2'
            )
        token_1 = self._find_key_literal(
            js_code, part1.group(1),
            'Failed to find token key 1 using method 2'
        )
        # JS substr(start, length) == Python [start:start + length]
        start = int(part1.group(2), 16)
        length = int(part1.group(3), 16)
        token_1 = token_1[start:start + length]
        part2 = re.search(
            r"var {}=(_0x\w+)\['\\x73\\x75\\x62\\x73\\x74\\x72'\]\((0x[0-9a-f]+)\);".format(re.escape(var_name2)),
            js_code
        )
        if part2 is None:
            raise self.IncapJSError(
                'Failed to find token variable 2 using method 2'
            )
        token_2 = self._find_key_literal(
            js_code, part2.group(1),
            'Failed to find token key 2 using method 2'
        )
        # JS substr(start) == Python [start:]; the offset belongs to the
        # second variable's own match
        token_2 = token_2[int(part2.group(2), 16):]
        return token_1 + token_2

    def _get_token(self, js_code):
        """
        Get token
        :param js_code: string output of _get_script
        :return: string token
        :raises IncapJSError: when any step of the extraction fails
        """
        arrays = {}
        # Find all string array definitions
        # e.g
        #
        # var _0x2f0c = ['wpMbIsOc', 'w57CmAU=', 'JMKPw4c=',...*SNIP*];
        #
        for array in re.finditer(r"var (_0x\w+)=(\['.*?\])", js_code):
            # Find string array init arguments and decoder function name
            # e.g
            #
            # }(_0x2f0c, 0xb5));var _0xc2f0 = function(_0x4296cc, _0x2adc0c) {
            #
            init_rxp = r"{},(0x\w+)\)\);var (_0x\w+)".format(
                re.escape(array.group(1))
            )
            for init in re.finditer(init_rxp, js_code):
                values = literal_eval(array.group(2))
                # the number of times to "rotate" (shift) the array
                shift = int(init.group(1), 16) % len(values)
                # store the rotated array under the decoder function name
                arrays[init.group(2)] = values[shift:] + values[:shift]
        if not arrays:
            raise self.IncapJSError('Failed to process string array')
        token_call = re.search(r"(_0x\w+)\('(\w+)', ([0-9a-fx\s\+_]+)\)",js_code)
        if token_call is None:
            raise self.IncapJSError('Failed to find call to cipher token')
        # the key argument is either one variable or "var1 + var2"; the
        # regex above allows whitespace, so strip every operand
        operands = [part.strip() for part in token_call.group(3).split("+")]
        if len(operands) == 1:
            key = self._key_method_1(js_code, operands[0])
        elif len(operands) == 2:
            key = self._key_method_2(js_code, operands[0], operands[1])
        else:
            raise self.IncapJSError('Unsupported token key expression')
        arrname = token_call.group(1)
        try:
            index = int(token_call.group(2), 16)
            enc = b64decode(arrays[arrname][index]).decode('utf-8')
            token = self._rc4(enc, key).decode('utf-8')
        except (KeyError, IndexError, ValueError) as err:
            # unknown decoder name, index out of range, bad base64/UTF-8
            raise self.IncapJSError('Failed to decode cipher token') from err
        self.logger.debug('Token found: %s', token)
        return token

    @staticmethod
    def _checksum(data):
        """
        Checksum
        :param data: string data to checksum
        :return: string decimal sum of the code points in data
        """
        return str(sum(ord(c) for c in data))

    @staticmethod
    def _rc4(message, key):
        """
        RC4 Implementation.
        :param message: string message to cipher
        :param key: string rc4 key
        :return: bytearray ciphertext
        :raises ValueError: when key is empty
        """
        if not key:
            # would otherwise surface as ZeroDivisionError in the KSA
            raise ValueError('RC4 key must not be empty')
        cipher_text = bytearray()
        key = [ord(c) for c in key]
        sbox = list(range(256))
        # key-scheduling
        j = 0
        for i in range(256):
            j = (j + sbox[i] + key[i % len(key)]) % 256
            sbox[i], sbox[j] = sbox[j], sbox[i]
        # keystream generation and XOR
        i = j = 0
        for char in message:
            i = (i + 1) % 256
            j = (j + sbox[i]) % 256
            sbox[i], sbox[j] = sbox[j], sbox[i]
            key_stream = sbox[(sbox[i] + sbox[j]) % 256]
            cipher_text.append(key_stream ^ ord(char))
        return cipher_text
@alexnrj
Copy link

alexnrj commented Feb 17, 2022

@Wh1terat is this gist still working? I wrote a Node.js project that uses the AST to extract the token for all three methods, but when I use that token with your code the request is forbidden.

@Wh1terat
Copy link
Author

@alexnrj
Almost certainly not working anymore - but if you've managed to navigate the ast of incapsula and get the token then you've done 99% of the hard work already. 😉

@alexnrj
Copy link

alexnrj commented Feb 17, 2022

@Wh1terat Can we discuss it privately?

@Wh1terat
Copy link
Author

@alexnrj Unfortunately I have to decline.
If I had wanted to maintain it I would have created a repo with an issue tracker et al.; I merely uploaded it as a gist to help others as a starting point.

@alexnrj
Copy link

alexnrj commented Feb 17, 2022

A public repo doesn't survive for long. Like you, I'm completely sure that the Incapsula guys are watching us. Anyway, here is my problem: is simple_digest used every time for all three methods, or is it used randomly?

@Wh1terat
Copy link
Author

More than likely they are, for me this was just a PoC for a personal project to scrape some data - and an excuse to get hands on knowledge of AST.
Unfortunately all the gists I published did was garner attention from people trying to break incapsula for nefarious reasons (spam/bots/etc) which I don't agree with.

As to your problem, it's been a couple of years since I last looked - hard to say as it is not fresh in my memory.

But if you have been able to deobfuscate the incap js sufficiently then you should be able to see and mimic this functionality fairly easily?

@Wh1terat
Copy link
Author

@alexnrj
Copy link

alexnrj commented Feb 17, 2022

I deobfuscated almost all samples, but in the loop that calculates the digest some samples use simple_digest and others use another standard JS function (redacted intentionally). In the second case it makes no sense to obtain only digits in the digest.
To validate my Node.js code I captured real HTTP traffic to collect the session cookies and the JS script; using that information I can get the token. With the obtained token it is possible to view the Incapsula cookie in clear text using CyberChef.

@Wh1terat
Copy link
Author

ah yes, that was one thing I did see - totally inconsistent as to which template incap use between requests.
I remember there were 2-3 a few years ago, maybe more now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment