Created
February 23, 2024 12:01
-
-
Save AhmedSamy/c11f502a60d162dbf4fadb4904065177 to your computer and use it in GitHub Desktop.
scrap.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import path from 'path';
import { dirname } from 'path';
import { setTimeout as sleep } from 'timers/promises';
import { fileURLToPath } from 'url';

import { launchPuppeteer } from 'crawlee';
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Search term typed into the register's keyword field.
const companyName = 'Paylane GmbH';

// Fixed pauses (ms) for steps where the script has no completion signal to await.
const UI_SETTLE_DELAY_MS = 2000;
const DOWNLOAD_GRACE_MS = 5000;

/**
 * Best-effort wait until some element matching `selector` has innerText
 * containing `needle`.
 *
 * A Puppeteer timeout is logged and swallowed on purpose so that the caller
 * can still take its diagnostic screenshot and then fail with a descriptive
 * "not found" error; every other error is rethrown unchanged.
 *
 * @param {object} page - Puppeteer page to poll.
 * @param {string} selector - CSS selector of the candidate elements.
 * @param {string} needle - Substring that must appear in an element's innerText.
 * @returns {Promise<void>}
 */
async function waitForText(page, selector, needle) {
  try {
    await page.waitForFunction(
      // Runs inside the browser page, hence the use of `document`.
      (sel, text) =>
        Array.from(document.querySelectorAll(sel)).some((el) =>
          (el.innerText ?? '').includes(text),
        ),
      {},
      selector,
      needle,
    );
  } catch (err) {
    if (err?.name !== 'TimeoutError') {
      throw err;
    }
    console.warn(`Timed out waiting for "${needle}" in <${selector}> elements`);
  }
}

/**
 * Scans every element matching `selector` and returns the LAST one whose
 * innerText contains `needle`, logging each match along the way.
 *
 * The last match is kept deliberately: `page.$$` yields elements in document
 * order, so when nested elements all contain the text, a descendant is
 * preferred over its ancestors.
 *
 * @param {object} page - Puppeteer page to search.
 * @param {string} selector - CSS selector of the candidate elements.
 * @param {string} needle - Substring to look for in each element's innerText.
 * @param {string} logLabel - Prefix printed in front of every matching text.
 * @returns {Promise<object|undefined>} The matching element handle, or
 *   `undefined` when nothing matched.
 */
async function findLastByText(page, selector, needle, logLabel) {
  const handles = await page.$$(selector);
  let match;
  for (const handle of handles) {
    const text = await (await handle.getProperty('innerText')).jsonValue();
    if (typeof text === 'string' && text.includes(needle)) {
      match = handle;
      console.log(logLabel, text);
    }
  }
  return match;
}

/**
 * Returns `handle` unchanged, or throws a descriptive error when the element
 * lookup produced nothing.
 *
 * @param {object|undefined} handle - Result of an element lookup.
 * @param {string} description - Human-readable name used in the error message.
 * @returns {object} The same handle, guaranteed to be defined.
 * @throws {Error} When `handle` is null or undefined.
 */
function requireHandle(handle, description) {
  if (handle == null) {
    throw new Error(`Could not find ${description} on the current page`);
  }
  return handle;
}

// Launch the web browser (visible window, maximized, no fixed viewport).
const browser = await launchPuppeteer({
  launchOptions: {
    headless: false,
    defaultViewport: null,
    args: ['--start-maximized'],
  },
});

try {
  // Create a new page and navigate to the search form.
  console.log('Open target page');
  const page = await browser.newPage();
  await page.goto('https://www.handelsregister.de/rp_web/normalesuche.xhtml');

  // Fill in the keyword field, with screenshots before and after for debugging.
  console.log('Fill in search form');
  await page.screenshot({ path: 'before_form.png' });
  await page.type('#form\\:schlagwoerter', companyName);
  await page.screenshot({ path: 'after_form.png' });

  // Submit the form. The navigation wait is registered together with the
  // click so that a fast navigation cannot finish before we start listening.
  console.log('Submit search form');
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle2' }),
    page.click('#form\\:btnSuche[type="submit"]'),
  ]);
  await page.screenshot({ path: 'after_submit.png' });

  // Among the result links with class "dokumentList", pick the last one whose
  // text contains "DK".
  console.log('Extract search results');
  const dkAnchor = requireHandle(
    await findLastByText(page, '.dokumentList', 'DK', 'Found DK link:'),
    `a "DK" link in the search results for "${companyName}"`,
  );

  // Follow the DK link and wait for the next page to settle.
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle2' }),
    dkAnchor.click(),
  ]);

  // Click the span labelled "Documents on register number".
  const rootUrlLink = requireHandle(
    await findLastByText(page, 'span', 'Documents on register number', 'Span Found:'),
    'the "Documents on register number" entry',
  );
  console.log('Click on rootUrlLink:', rootUrlLink);
  await rootUrlLink.click();

  // The click above triggers no navigation we could await, so poll until the
  // "List of shareholders" span shows up before searching for it.
  await waitForText(page, 'span', 'List of shareholders');
  const shareholderLink = await findLastByText(
    page,
    'span',
    'List of shareholders',
    'Span Found list of shareholders:',
  );
  // Screenshot first so a failed lookup still leaves evidence on disk.
  await page.screenshot({ path: 'after_span_search_shareholder.png', fullPage: true });
  requireHandle(shareholderLink, 'the "List of shareholders" entry');
  console.log('Click on shareholderLink:', shareholderLink);
  await shareholderLink.click();

  // Give the page a fixed 2 seconds to react to the selection, then make sure
  // a "Download" button is actually present.
  await sleep(UI_SETTLE_DELAY_MS);
  await waitForText(page, 'button', 'Download');
  const downloadButton = await findLastByText(page, 'button', 'Download', 'Button Found:');
  await page.screenshot({ path: 'after_button_search.png', fullPage: true });
  requireHandle(downloadButton, 'the "Download" button');

  // Allow downloads and direct them into ./downloads next to this script.
  const client = await page.target().createCDPSession();
  await client.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: path.join(__dirname, 'downloads'),
  });

  console.log('Click on downloadButton:', downloadButton);
  await downloadButton.click();

  // Wait a fixed 5 seconds for the download before shutting down; the script
  // does not verify that the file has finished downloading.
  await sleep(DOWNLOAD_GRACE_MS);
} finally {
  // Always close the browser, even when a step above failed.
  await browser.close();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment