Created
August 14, 2024 02:27
-
-
Save matsubo/ba008d3b8f69c0e928ace98618b8adc2 to your computer and use it in GitHub Desktop.
Tried to crawl https://openai.com/news/ but was blocked with a 403 error.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// For more information, see https://crawlee.dev/
import { PlaywrightCrawler } from 'crawlee';

// User-Agent string the launched browser should present to the target site.
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36';

// Upper bound, in milliseconds, on how long a page may take to reach
// the 'networkidle' load state before the handler carries on regardless.
const NETWORK_IDLE_TIMEOUT_MS = 3000;

// PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library.
const crawler = new PlaywrightCrawler({
  /**
   * Processes one crawled page: waits briefly for the network to settle,
   * performs a short mouse drag, stores the page title and URL, and
   * enqueues the links found on the page.
   *
   * @param {object} context - Crawling context supplied by Crawlee; only
   *   `request`, `page`, `enqueueLinks`, `log` and `pushData` are used.
   * @returns {Promise<void>}
   * @throws Any error other than a Playwright `TimeoutError` raised while
   *   waiting for the load state, and any error from the later steps.
   */
  async requestHandler({ request, page, enqueueLinks, log, pushData }) {
    // Waiting for network idle is best-effort: pages that keep background
    // requests open would otherwise fail the whole request on timeout.
    try {
      await page.waitForLoadState('networkidle', { timeout: NETWORK_IDLE_TIMEOUT_MS });
    } catch (err) {
      if (err?.name !== 'TimeoutError') {
        throw err;
      }
      log.debug(`Network not idle after ${NETWORK_IDLE_TIMEOUT_MS} ms on ${request.loadedUrl}; continuing.`);
    }

    // Press at (100, 100), drag to (200, 200), release.
    // NOTE(review): presumably meant to mimic human input - confirm.
    await page.mouse.move(100, 100);
    await page.mouse.down();
    await page.mouse.move(200, 200);
    await page.mouse.up();

    const title = await page.title();
    log.info(`Title of ${request.loadedUrl} is '${title}'`);

    // Save results as JSON to ./storage/datasets/default
    await pushData({ title, url: request.loadedUrl });

    // Extract links from the current page
    // and add them to the crawling queue.
    await enqueueLinks();
  },

  // Remove or comment out this option to scrape the full website.
  maxRequestsPerCrawl: 20,

  // Shows the browser window; set to true to hide it.
  headless: false,

  maxRequestRetries: 1,

  browserPoolOptions: {
    preLaunchHooks: [
      // Extends the options used to launch each browser, keeping whatever
      // was already configured and layering the overrides below on top.
      // NOTE(review): viewport, locale, permissions, userAgent and
      // extraHTTPHeaders are Playwright context-level options; this relies
      // on the browser being launched as a persistent context - confirm.
      async (_pageId, launchContext) => {
        launchContext.launchOptions = {
          ...launchContext.launchOptions,
          userAgent,
          viewport: { width: 800, height: 600 },
          locale: 'en-US',
          permissions: ['geolocation'],
          extraHTTPHeaders: {
            'Accept-Language': 'en-US,en;q=0.9',
          },
        };
      },
    ],
  },

  persistCookiesPerSession: true,
});

// Add first URL to the queue and start the crawl.
await crawler.run(['https://openai.com/news/']);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment