-
-
Save zerebral/c08810f23290d92fe2a749f7bea41e31 to your computer and use it in GitHub Desktop.
const puppeteer_extra = require('puppeteer-extra'); | |
puppeteer_extra.use(require('puppeteer-extra-plugin-anonymize-ua')()); | |
const StealthPlugin = require('puppeteer-extra-plugin-stealth'); | |
puppeteer_extra.use(StealthPlugin()); | |
module.exports = async ({ context, browser }) => { | |
const { url } = context; | |
const ws_endpoint_url = browser.wsEndpoint(); | |
const newBrowser = await puppeteer_extra.connect({ | |
browserWSEndpoint: ws_endpoint_url | |
}); | |
const page_1 = await newBrowser.newPage(); | |
await page_1.setRequestInterception(true); | |
await page_1.authenticate({ | |
username: 'your-proxy-username', | |
password: 'your-proxy-password', | |
}); | |
page_1.on('request', (req) => { | |
if(req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image'){ | |
req.abort(); | |
}else{ | |
req.continue(); | |
} | |
}); | |
await page_1.goto(url, {'timeout': 30000, 'waitUntil':'networkidle2'}); | |
const data = await page_1.content(); | |
await newBrowser.close(); | |
return { | |
data, | |
// Make sure to match the appropriate content here\n" + | |
// You'll likely want 'application/json'\n" + | |
type: 'application/html', | |
}; | |
}; |
@lonkle Yes you'd need to provide your own proxy auth
Your steps on creating the images are correct. The script will need to be posted to /function endpoint every time your program tries to load a web page
@zerebral Do you think I need to edit the script due to me getting:
Exactly one of browserWSEndpoint, browserURL or transport must be passed to puppeteer.connect
Every time I run it in /function
I get that error with the latest build of browserless (which uses a different base FROM
image). Have you tested this on the latest version (they changed the FROM
base image over the summer)?
So you think the DOCKERFILE
changes are enough then? Do you know of any site or way I can check that it’s working properly, like a headless browser test page or something?
If you’re telling me not to use the script - I do have to ask what the script is for then, since you just previously told me I should run the script every time I use the /function
endpoint (or any other way to invoke a script)? Will I be missing out on certain protections by not using it first?
Example, I’m trying to get all greens on this page from within Browserless: https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html
I had tested this URL previously and that gave me all green. Trying to find that version. My bad on not using the specific browserless version in the Dockerfile too as a base image.
This is the “base” DOCKERFILE now and this is the “primary” DOCKERFILE (where I added your parameters and they work perfectly).
Base: https://github.com/browserless/chrome/blob/master/base/Dockerfile
Primary: https://github.com/browserless/chrome/blob/master/Dockerfile
And since you were telling me to use your script earlier, and now you’re saying don’t use the script - I’m a little confused. In my situation, are the DOCKERFILE changes enough to not have to use the script? Because the script is what makes it look like it’s using the extra
puppeteer stealth stuff rather than just having it installed (which the DOCKERFILE edits do, along with setting sane environment variables).
Sorry about all of the questions. I just really want to get all greens on that page and want to use your script to do so. Thank you so much for writing this for Browserless!
@lonkle - Here's the working version of it. I dont have a pure JS version of it as I post to the /function endpoint from Java side
` @Test
/**
 * Posts the puppeteer-extra stealth script to a locally hosted browserless
 * /function endpoint, prints the response details, and saves the returned
 * body to /tmp/&lt;hash of url&gt;.html.
 */
public void case_stealth_check() {
    // JavaScript executed by the /function endpoint; the proxy credentials are
    // spliced into the script text before it is sent.
    String code =
            "const puppeteer_extra = require('puppeteer-extra');\n" +
            "puppeteer_extra.use(require('puppeteer-extra-plugin-anonymize-ua')());\n" +
            "const StealthPlugin = require('puppeteer-extra-plugin-stealth');\n" +
            "puppeteer_extra.use(StealthPlugin());\n" +
            "module.exports = async ({ context, browser }) => {\n" +
            " const { url } = context;\n" +
            " const ws_endpoint_url = browser.wsEndpoint();\n" +
            " const newBrowser = await puppeteer_extra.connect({\n" +
            " browserWSEndpoint: ws_endpoint_url\n" +
            "});\n" +
            "const page_1 = await newBrowser.newPage();\n" +
            "await page_1.setRequestInterception(true);\n" +
            "await page_1.authenticate({\n" +
            " username: '" + LUMINATI_USERNAME + "',\n" +
            " password: '" + LUMINATI_PASSWORD + "',\n" +
            "});\n" +
            "page_1.on('request', (req) => {\n" +
            "if(req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image'){\n" +
            "req.abort();" +
            "}\n" +
            "else {" +
            "req.continue();\n" +
            "}" +
            "});" +
            " await page_1.goto(url, {'timeout': 180000, 'waitUntil':'domcontentloaded'});\n" +
            "\n" +
            " const data = await page_1.content();\n" +
            " await newBrowser.close();\n" +
            "\n" +
            " return {\n" +
            " data,\n" +
            " // Make sure to match the appropriate content here\n" +
            " // You'll likely want 'application/json'\n" +
            " type: 'application/html',\n" +
            " };\n" +
            "};";

    // Only the active "url" entry is sent; the commented-out lines are
    // alternative detection pages and sites kept for manual switching.
    HashMap<String, String> context = new HashMap<>();
    context.put("url", "https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html");
    //context.put("url", "https://www.skymotorcars.com/vehicle-details/used-2018-alfa-romeo-stelvio-awd-ti-sport-west-chester-pa-id-35803263");
    //context.put("url", "http://www.httpbin.org/headers");
    //context.put("url", "https://bot.sannysoft.com");
    //context.put("url", "http://www.httpbin.org/ip");
    //context.put("url", "https://arh.antoinevastel.com/bots/areyouheadless");
    //context.put("url", "https://browserleaks.com/canvas");
    //context.put("url", "https://amiunique.org/fp");
    //blocked
    //context.put("url", "https://www.machinerytrader.com/listings/construction-equipment/for-sale/category/1025/dozers");
    //context.put("url", "https://www.carsforsale.com/Search?SearchTypeID=2&Radius=100&Conditions=Used&PageNumber=1&OrderBy=relevance&OrderDirection=desc");
    //context.put("url", "https://www.boattrader.com/boats/condition-used/");

    HashMap<String, Object> payload = new HashMap<>();
    payload.put("code", code);
    payload.put("context", context);

    long now = System.currentTimeMillis();
    String endpoint = "http://localhost:3000";
    try {
        // NOTE(review): the access token is hard-coded in the URL; consider
        // loading it from configuration instead of committing it.
        Response response = Utils.okHTTPPost(endpoint + "/function?token=fkeBbwmUxSpoFPRUP0UX&headless=false&--proxy-server=" + LUMINATI_PROXY_URL,
                payload);
        String body = response.body().string();
        System.out.println(body);
        System.out.println(body.length());
        System.out.println(body.hashCode());
        System.out.println(response.code());
        System.out.println(response.message());
        // try-with-resources closes the writer, which flushes the buffered
        // content to disk; without it the file could be left empty.
        try (FileWriter writer = new FileWriter(new File("/tmp/" + context.get("url").hashCode() + ".html"))) {
            writer.write(body);
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("Loading took - " + (System.currentTimeMillis() - now) + " msec");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
`
@lonkle - Here I'm posting to the locally hosted Browserless instance that's hosted with Puppeteer stealth. I tested and confirmed that it gives all green for all the Intoli tests -
HTML Received -
<title>Chrome Headless Detection (Round II)</title>
<style>
td:last-child {
background-color: #c8d86d;
max-width:300px;
word-wrap:break-word;
}
td.failed {
background-color: #f45159;
}
table, th, td {
border: 1px solid black;
}
span.age {
float: right;
margin-left: 40px;
margin-right: 10px;
}
</style>
Test Name | Result |
---|---|
User Agent (Old) | Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.0 Safari/537.36 |
WebDriver (New) | missing (passed) |
Chrome (New) | present (passed) |
Permissions (New) | default |
Plugins Length (Old) | 3 |
Languages (Old) | en-US,en |
@lonkle - Though this works with the sample bot detection pages, I haven't had good results when I tried this on actual websites that have advanced bot detection tech installed on them, like Distil's or Cloudflare's.
@lonkle FYI Exploring scraperapi.com - They claim to have a huge residential IP pool and can render pages in full browser mode. Also claim to solve captcha on their end. Initial tests suggest they struggle as well for some of the websites we have been trying to load that end up with captcha's. Working with their support to get those resolved.. Pricing wise it seems pretty cost effective. Since this always looks like a cat & mouse game, I'd rather have a 3rd party play it and resolve things as we see them rather than trying to solve it myself.
Woah, I missed the new work you put into this. This is brilliant. I also need to feed the code into the functions
endpoint in my app so I'm doing the same as you. Funny thing, I likewise use scraperapi.com for other things - I'm glad I found them.
I want to see if I can get this working. You're amazing @zerebral - you're the only person that's attempted getting puppeteer-extra / stealth working on Browserless.
@lonkle Havent tested this with the latest browserless yet. I'll give it a try and revert back in a couple of days.
@zerebral Yeah, no pressure. I just want to make browserless as "browserless" as it can be and I think you accomplished it. Tbh, my use case of Browserless doesn't require "extra" or "stealth" puppeteer features. But, I think it's best I build it with those add-ons to make it more future proof and you're the only person I've found in numerous Google searches whose tried to combine "stealth" with Browserless. Really impressed. Also happy that we both use the /function
endpoint in our applications so it requires next to no code changes for me to implement your solution which is, honestly, a bit over my head.
@lonkle - We have been able to get this working with the latest browserless version. Is the error you are getting from the Browserless debugger console? Sometimes the debugger console does not work with the JS we put there. Try posting the script to the /function endpoint from your program and that should do it.
I haven’t tried your latest changes in your comment above. But I was using CURL to post the code to /function
when doing so. I wrote about that in my first comment -
https://gist.github.com/zerebral/c08810f23290d92fe2a749f7bea41e31#gistcomment-3469642
But I might have done it wrong - so I’ll convert your JAVA into curl
code I can post. I’ll try again today, thanks so much for following up with me!
So function
and the live debugger run in different ways, you think?
And your original edits to the DOCKERFILE aside from the FROM
line, you didn’t need to change any of those right?
Yeah I have seen that.. Sometimes some valid code fragments give errors in the debugger but it works just fine when you post it to /functions. I dont understand much of as to why that happens, but it does.
Yeah the dockerfile works just fine too. You may want to take a look at the docs to see how the ENV vars need to be defined for your use.
I'm getting puppeteer_extra is not defined
when POST
ing to /function
even though I added these lines to the bottom of my Dockerfile:
RUN npm install puppeteer-extra
RUN npm install puppeteer-extra-plugin-stealth
RUN npm install puppeteer-extra-plugin-anonymize-ua
ENV FUNCTION_EXTERNALS '["puppeteer-extra-plugin-anonymize-ua", "puppeteer-extra", "puppeteer-extra-plugin-stealth"]'
Did I miss one?
Or do I need to put your extra puppeteer_extra
installation and environment additions on specific lines in the DOCKERFILE
(https://github.com/browserless/chrome/blob/master/Dockerfile)?
I added:
RUN npm install puppeteer-extra
RUN npm install puppeteer-extra-plugin-stealth
RUN npm install puppeteer-extra-plugin-anonymize-ua
ENV FUNCTION_EXTERNALS '["puppeteer-extra-plugin-anonymize-ua", "puppeteer-extra", "puppeteer-extra-plugin-stealth"]'
near the top and still had the puppeteer_extra is not defined
error when POST
ing with cURL. I'm gonna try different places to add those lines.
Oh, and this will only ever work in the /function
endpoint. In the browserless documentation, it specifically states that FUNCTION_EXTERNALS
can only be added to the /functions
endpoint. But that seems to be my issue. My FUNCTION_EXTERNALS
aren't including puppeteer-extra
and I'm guessing the line:
ENV FUNCTION_EXTERNALS '["puppeteer-extra-plugin-anonymize-ua", "puppeteer-extra", "puppeteer-extra-plugin-stealth"]'
is to blame. Gonna try playing around with the syntax there.
It ended up being the syntax of that line and I fixed it but now I've got this error:
The module 'puppeteer-extra' is not whitelisted in VM.
I thought that's what that environment variable did, was whitelist it to work with browserless. So close!
Okay, so if I escape the quotation marks it works, or even if I make FUNCTION_EXTERNALS
set to true (which I think allows for any external function), but now I get this:
VMError: Module 'puppeteer-extra' is not allowed to be required. The path is outside the border!
Fixed that whole debacle. So now it runs with no issues through the Node VM and everything (all whitelisted).
But I'm still getting:
Exactly one of browserWSEndpoint, browserURL or transport must be passed to puppeteer.connect
when running your custom code via the /function
endpoint. I'm assuming that's because browserless must already be connected to it and I need to disconnect it first or something? Or maybe I need to launch it if it isn't already running. 🤔
Found out that this is because browser
is undefined, so there's nothing to connect to. That's odd; the error message is a little misleading too. OOOOOooo, I got it.
const ws_endpoint_url = await browser.wsEndpoint();
It's likely my machine is slower than yours so it needs that await
there. BOOM! 💥 That took way too lonk, but thank you for your help!
Nope, that was wrong - browser is still undefined, and doesn't return an endpoint. The only reason my test worked was because I hardcoded the browserWSEndpoint
with localhost:3000
which made everything work. But I wonder why there is no browser variable to get that data from.
Either way, if I hardcode it, it works so I'll revisit this if hardcoding the WS URL stops working. Super weird that I had to hardcode that URL. I'm guessing your javascript code had a browser
variable somewhere I can't see that wasn't passed into /function
with the context
array.
Thank you all for sharing your experience. It has been extremely useful in getting this to work.
In case someone else comes across this...
Puppeteer Extra Stealth
This plugin is now part of browserless. You can make it work by adding the ENV var to docker or dockercompose:
environment:
  DEFAULT_STEALTH: 'true'
Getting other Puppeteer extra plugins to work
If you want to use another plugin, such as the recaptcha one, you can get it working in the following way:
In case someone else comes across this, here is how I got it working.
Docker
docker-compose.yml
version: '2'
services:
  browserless:
    # image: browserless/chrome:1.46-puppeteer-1.20.0
    build:
      context: .
      dockerfile: Dockerfile
    restart: always
    environment:
      MAX_CONCURRENT_SESSIONS: 2
      DEFAULT_STEALTH: 'true'
      CONNECTION_TIMEOUT: 180000
Dockerfile
FROM browserless/chrome:1.46-puppeteer-1.20.0
RUN npm install puppeteer-extra-plugin-recaptcha
ENV FUNCTION_EXTERNALS '["puppeteer-extra-plugin-recaptcha", "puppeteer-extra"]'
Build the image
docker-compose build
Browserless
JS function
module.exports = async ({ page, context }) => {
async function load_pe_recaptcha(page, two_captcha_key) {
const puppeteer_extra = require('puppeteer-extra');
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha');
// For some reason using browser.wsEndpoint() did not work
var browser = await page.browser();
var wsEndpoint = await browser._wsEndpoint;
puppeteer_extra.use(
RecaptchaPlugin({
provider: {
id: '2captcha',
token: two_captcha_key,
},
visualFeedback: true,
})
);
await puppeteer_extra.connect({
browserWSEndpoint: wsEndpoint,
});
return wsEndpoint;
};
const two_captcha_key = context['2captcha_key'];
const wsEndpoint = load_pe_recaptcha(page, two_captcha_key);
await page.goto('https://www.google.com/recaptcha/api2/demo')
// After calling load_pe_recaptcha() the additional
// puppeteer extra functions become available
const captcha_resp = await page.solveRecaptchas();
await Promise.all([
page.waitForNavigation(),
page.click(`#recaptcha-demo-submit`)
]);
const data = {
endpoint: wsEndpoint,
captcha_resp: captcha_resp,
};
return {
data,
type: 'application/json',
}
};
Running it
You have to run it through the /function
endpoint. It doesn't work in the browser console.
I did every thing, but seems i cant send the code to /function
here the error I get from the logs:
browserless:job LYESRDNJ5MER2GVJRN303EF4EMBBU0BU: /function: Inbound HTTP request. Context: undefined
curl -X POST \ 'http://xxxxx/function' \ -H 'Content-Type: application/json' \ -d '{"code":"module.exports = async ({ page, context }) => { async function load_pe_recaptcha(page, two_captcha_key) { const puppeteer_extra = require(\"puppeteer-extra\"); const RecaptchaPlugin = require(\"puppeteer-extra-plugin-recaptcha\"); var browser = await page.browser(); var wsEndpoint = await browser._wsEndpoint; puppeteer_extra.use( RecaptchaPlugin({ provider: { id: \"2captcha\", token: two_captcha_key, }, visualFeedback: true, }) ); await puppeteer_extra.connect({ browserWSEndpoint: wsEndpoint, }); return wsEndpoint; }; const two_captcha_key = context[\"2captcha_key\"]; const wsEndpoint = load_pe_recaptcha(page, two_captcha_key); await page.goto(\"https://www.google.com/recaptcha/api2/demo\") // After calling load_pe_recaptcha() the additional // puppeteer extra functions become available const captcha_resp = await page.solveRecaptchas(); await Promise.all([ page.waitForNavigation(), page.click(
#recaptcha-demo-submit) ]); const data = { endpoint: wsEndpoint, captcha_resp: captcha_resp, }; return { data, type: \"application/json\", } };"}'
Your
gist
is using FROM browserless/chrome:latest
while the latest browserless release uses FROM browserless/base:1.5.0
(https://github.com/browserless/chrome/blob/master/Dockerfile), that might be what is causing your script to fail. Because when changing that FROM
line, I get a bunch of build errors and it won't compile.