-
-
Save zerebral/c08810f23290d92fe2a749f7bea41e31 to your computer and use it in GitHub Desktop.
const puppeteer_extra = require('puppeteer-extra'); | |
puppeteer_extra.use(require('puppeteer-extra-plugin-anonymize-ua')()); | |
const StealthPlugin = require('puppeteer-extra-plugin-stealth'); | |
puppeteer_extra.use(StealthPlugin()); | |
module.exports = async ({ context, browser }) => { | |
const { url } = context; | |
const ws_endpoint_url = browser.wsEndpoint(); | |
const newBrowser = await puppeteer_extra.connect({ | |
browserWSEndpoint: ws_endpoint_url | |
}); | |
const page_1 = await newBrowser.newPage(); | |
await page_1.setRequestInterception(true); | |
await page_1.authenticate({ | |
username: 'your-proxy-username', | |
password: 'your-proxy-password', | |
}); | |
page_1.on('request', (req) => { | |
if(req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image'){ | |
req.abort(); | |
}else{ | |
req.continue(); | |
} | |
}); | |
await page_1.goto(url, {'timeout': 30000, 'waitUntil':'networkidle2'}); | |
const data = await page_1.content(); | |
await newBrowser.close(); | |
return { | |
data, | |
// Make sure to match the appropriate content here\n" + | |
// You'll likely want 'application/json'\n" + | |
type: 'application/html', | |
}; | |
}; |
Thank you all for sharing your experience. It has been extremely useful in getting this to work.
In case someone else comes across this...
Puppeteer Extra Stealth
This plugin is now part of browserless. You can enable it by adding the following environment variable to your Docker or docker-compose configuration:
environment:
  # Turns on the bundled stealth plugin.
  DEFAULT_STEALTH: 'true'
Getting other Puppeteer extra plugins to work
If you want to use another plugin, such as the recaptcha one, you can get it working in the following way:
In case someone else comes across this, here is how I got it working.
Docker
docker-compose.yml
version: '2'
services:
  browserless:
    # Build from the local Dockerfile instead of pulling the stock image directly.
    # image: browserless/chrome:1.46-puppeteer-1.20.0
    build:
      context: .
      dockerfile: Dockerfile
    restart: always
    environment:
      MAX_CONCURRENT_SESSIONS: 2
      # Turns on the bundled stealth plugin.
      DEFAULT_STEALTH: 'true'
      # NOTE(review): presumably milliseconds (3 minutes) - confirm against the browserless docs.
      CONNECTION_TIMEOUT: 180000
Dockerfile
# Extend the stock browserless image with an extra puppeteer-extra plugin.
FROM browserless/chrome:1.46-puppeteer-1.20.0
# Install the recaptcha plugin into the image.
RUN npm install puppeteer-extra-plugin-recaptcha
# JSON array of module names; presumably the modules that code sent to
# /function is allowed to require() - confirm against the browserless docs.
ENV FUNCTION_EXTERNALS '["puppeteer-extra-plugin-recaptcha", "puppeteer-extra"]'
Build the image
docker-compose build
Browserless
JS function
module.exports = async ({ page, context }) => {
async function load_pe_recaptcha(page, two_captcha_key) {
const puppeteer_extra = require('puppeteer-extra');
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha');
// For some reason using browser.wsEndpoint() did not work
var browser = await page.browser();
var wsEndpoint = await browser._wsEndpoint;
puppeteer_extra.use(
RecaptchaPlugin({
provider: {
id: '2captcha',
token: two_captcha_key,
},
visualFeedback: true,
})
);
await puppeteer_extra.connect({
browserWSEndpoint: wsEndpoint,
});
return wsEndpoint;
};
const two_captcha_key = context['2captcha_key'];
const wsEndpoint = load_pe_recaptcha(page, two_captcha_key);
await page.goto('https://www.google.com/recaptcha/api2/demo')
// After calling load_pe_recaptcha() the additional
// puppeteer extra functions become available
const captcha_resp = await page.solveRecaptchas();
await Promise.all([
page.waitForNavigation(),
page.click(`#recaptcha-demo-submit`)
]);
const data = {
endpoint: wsEndpoint,
captcha_resp: captcha_resp,
};
return {
data,
type: 'application/json',
}
};
Running it
You have to run it through the `/function` endpoint. It doesn't work in the browser console.
I did everything, but it seems I can't send the code to `/function`.
Here is the error I get from the logs:
browserless:job LYESRDNJ5MER2GVJRN303EF4EMBBU0BU: /function: Inbound HTTP request. Context: undefined
curl -X POST \ 'http://xxxxx/function' \ -H 'Content-Type: application/json' \ -d '{"code":"module.exports = async ({ page, context }) => { async function load_pe_recaptcha(page, two_captcha_key) { const puppeteer_extra = require(\"puppeteer-extra\"); const RecaptchaPlugin = require(\"puppeteer-extra-plugin-recaptcha\"); var browser = await page.browser(); var wsEndpoint = await browser._wsEndpoint; puppeteer_extra.use( RecaptchaPlugin({ provider: { id: \"2captcha\", token: two_captcha_key, }, visualFeedback: true, }) ); await puppeteer_extra.connect({ browserWSEndpoint: wsEndpoint, }); return wsEndpoint; }; const two_captcha_key = context[\"2captcha_key\"]; const wsEndpoint = load_pe_recaptcha(page, two_captcha_key); await page.goto(\"https://www.google.com/recaptcha/api2/demo\") // After calling load_pe_recaptcha() the additional // puppeteer extra functions become available const captcha_resp = await page.solveRecaptchas(); await Promise.all([ page.waitForNavigation(), page.click(
#recaptcha-demo-submit) ]); const data = { endpoint: wsEndpoint, captcha_resp: captcha_resp, }; return { data, type: \"application/json\", } };"}'
Nope, that was wrong - `browser` is still undefined, and doesn't return an endpoint. The only reason my test worked was because I hardcoded the `browserWSEndpoint` with `localhost:3000`, which made everything work. But I wonder why there is no `browser` variable to get that data from.
Either way, if I hardcode it, it works, so I'll revisit this if hardcoding the WS URL stops working. Super weird that I had to hardcode that URL. I'm guessing your JavaScript code had a `browser` variable somewhere that I don't see, which wasn't passed into `/function` with the `context` array.