matsubo · August 14, 2024 02:27
diff --git a/main.js b/main.js
 // For more information, see https://crawlee.dev/
 import { PlaywrightCrawler } from 'crawlee';

 // User-Agent string describing desktop Chrome 58 on 64-bit Windows 10.
 // NOTE(review): nothing in the visible code references this constant —
 // confirm whether it was meant to be passed to the crawler.
 const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36';


 // PlaywrightCrawler crawls the web using a browser controlled by the
 // Playwright library. `headless: false` below makes the window visible.
 const crawler = new PlaywrightCrawler({
  /**
   * Processes one crawled page: waits briefly for the network to settle,
   * performs a short mouse drag, stores the page title and enqueues the
   * links found on the page.
   */
  async requestHandler({ request, page, enqueueLinks, log, pushData }) {
    const idleTimeoutMs = 3000;

    // The network-idle wait is only a settle step before reading the title.
    // Pages that keep connections open (polling, analytics) may never go
    // idle, so a timeout is logged and the handler continues instead of
    // failing the whole request. Any other error is rethrown unchanged.
    try {
      await page.waitForLoadState('networkidle', { timeout: idleTimeoutMs });
    } catch (err) {
      if (err?.name !== 'TimeoutError') {
        throw err;
      }
      log.warning(`Network not idle after ${idleTimeoutMs} ms on ${request.loadedUrl}; continuing anyway.`);
    }

    // Press at (100, 100), drag to (200, 200) and release.
    // NOTE(review): presumably meant to look like human interaction — confirm.
    await page.mouse.move(100, 100);
    await page.mouse.down();
    await page.mouse.move(200, 200);
    await page.mouse.up();

    const title = await page.title();
    log.info(`Title of ${request.loadedUrl} is '${title}'`);

    // Save results as JSON to ./storage/datasets/default
    await pushData({ title, url: request.loadedUrl });

    // Extract links from the current page
    // and add them to the crawling queue.
    await enqueueLinks();
  },
  // Caps the crawl at 20 requests; remove this option to scrape the full website.
  maxRequestsPerCrawl: 20,
  // Shows the browser window; set to true to run without one.
  headless: false,
  // A failed request is retried at most once.
  maxRequestRetries: 1,
  browserPoolOptions: {
    preLaunchHooks: [
      // Merges extra options into the launch options, keeping whatever was
      // already set on launchContext.launchOptions.
      async (_pageId, launchContext) => {
        launchContext.launchOptions = {
          ...launchContext.launchOptions,
          viewport: { width: 800, height: 600 },
          locale: 'en-US',
          permissions: ['geolocation'],
          extraHTTPHeaders: {
            'Accept-Language': 'en-US,en;q=0.9',
          },
        };
      },
    ],
  },
  persistCookiesPerSession: true,
 });

 // Seed the request queue with the first URL and start the crawl.
 await crawler.run([
  'https://openai.com/news/',
 ]);
	// For more information, see https://crawlee.dev/
	import { PlaywrightCrawler } from 'crawlee';

	// User-Agent string describing desktop Chrome 58 on 64-bit Windows 10.
	// NOTE(review): nothing in the visible code references this constant —
	// confirm whether it was meant to be passed to the crawler.
	const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36';


	// PlaywrightCrawler crawls the web using a browser controlled by the
	// Playwright library. `headless: false` below makes the window visible.
	const crawler = new PlaywrightCrawler({
	  /**
	   * Processes one crawled page: waits briefly for the network to settle,
	   * performs a short mouse drag, stores the page title and enqueues the
	   * links found on the page.
	   */
	  async requestHandler({ request, page, enqueueLinks, log, pushData }) {
	    const idleTimeoutMs = 3000;

	    // The network-idle wait is only a settle step before reading the title.
	    // Pages that keep connections open (polling, analytics) may never go
	    // idle, so a timeout is logged and the handler continues instead of
	    // failing the whole request. Any other error is rethrown unchanged.
	    try {
	      await page.waitForLoadState('networkidle', { timeout: idleTimeoutMs });
	    } catch (err) {
	      if (err?.name !== 'TimeoutError') {
	        throw err;
	      }
	      log.warning(`Network not idle after ${idleTimeoutMs} ms on ${request.loadedUrl}; continuing anyway.`);
	    }

	    // Press at (100, 100), drag to (200, 200) and release.
	    // NOTE(review): presumably meant to look like human interaction — confirm.
	    await page.mouse.move(100, 100);
	    await page.mouse.down();
	    await page.mouse.move(200, 200);
	    await page.mouse.up();

	    const title = await page.title();
	    log.info(`Title of ${request.loadedUrl} is '${title}'`);

	    // Save results as JSON to ./storage/datasets/default
	    await pushData({ title, url: request.loadedUrl });

	    // Extract links from the current page
	    // and add them to the crawling queue.
	    await enqueueLinks();
	  },
	  // Caps the crawl at 20 requests; remove this option to scrape the full website.
	  maxRequestsPerCrawl: 20,
	  // Shows the browser window; set to true to run without one.
	  headless: false,
	  // A failed request is retried at most once.
	  maxRequestRetries: 1,
	  browserPoolOptions: {
	    preLaunchHooks: [
	      // Merges extra options into the launch options, keeping whatever was
	      // already set on launchContext.launchOptions.
	      async (_pageId, launchContext) => {
	        launchContext.launchOptions = {
	          ...launchContext.launchOptions,
	          viewport: { width: 800, height: 600 },
	          locale: 'en-US',
	          permissions: ['geolocation'],
	          extraHTTPHeaders: {
	            'Accept-Language': 'en-US,en;q=0.9',
	          },
	        };
	      },
	    ],
	  },
	  persistCookiesPerSession: true,
	});

	// Seed the request queue with the first URL and start the crawl.
	await crawler.run([
	  'https://openai.com/news/',
	]);