eleco · May 13, 2022 08:09
diff --git a/google crawl with pyppeteer b/google crawl with pyppeteer
 import asyncio

 from pyppeteer import launch


 async def get_article_titles(keywords):
    """Print the title and link of each Google result for a fixed query.

    Launches a visible (non-headless) browser window through pyppeteer,
    loads a hard-coded google.fr search, clicks the button labelled
    "I agree" when one is found, then prints the text and ``href`` of
    every matching result anchor.

    Args:
        keywords: Currently unused; the search query is hard-coded in the
            URL below.  NOTE(review): presumably meant to drive the query
            -- confirm before wiring it in.

    Returns:
        None.  All output goes to stdout.
    """
    # headless=False: a real browser window is opened, maximised on start
    browser = await launch({"headless": False, "args": ["--start-maximized"]})
    try:
        page = await browser.newPage()
        await page.setViewport({"width": 1600, "height": 900})
        await page.goto("https://www.google.fr/search?q=strategy journey")

        # give the search results a moment to load
        await page.waitFor(1000)

        pages = await browser.pages()
        print(pages)

        # The "I agree" button is looked up in the second open page; fall
        # back to the page created above if the browser reports only one.
        popup = pages[1] if len(pages) > 1 else page

        for button in await popup.querySelectorAll("button"):
            label = await (await button.getProperty("textContent")).jsonValue()
            if label == 'I agree':
                await button.click()
                # Stop here: the click may reload the page, which would
                # leave the remaining button handles unusable.
                break

        await page.waitFor(100)

        url = await page.evaluate("() => window.location.href")
        print(url)

        # result anchors: <a> elements with an <h3> child, inside a <div>
        # whose class attribute is exactly "g"
        results = await page.Jx("//div[@class = 'g']//a[h3]")

        for topic in results:
            title = await topic.getProperty("textContent")
            print(await title.jsonValue())

            link = await topic.getProperty("href")
            prize_href = await page.evaluate('(g) => g.href', topic)

            # the href is printed twice: once read through evaluate(),
            # once read through the property handle
            print(prize_href)
            print(await link.jsonValue())
    finally:
        # always shut the browser down, even if scraping failed part-way
        await browser.close()


 # Script entry: drive the scraper to completion on the current event loop.
 print("Starting...")
 event_loop = asyncio.get_event_loop()
 scrape = get_article_titles(["python", "opensource", "opencv"])
 event_loop.run_until_complete(scrape)
 print("Finished extracting articles titles")
	import asyncio

	from pyppeteer import launch


	async def get_article_titles(keywords):
		"""Print the title and link of each Google result for a fixed query.

		Launches a visible (non-headless) browser window through pyppeteer,
		loads a hard-coded google.fr search, clicks the button labelled
		"I agree" when one is found, then prints the text and ``href`` of
		every matching result anchor.

		Args:
			keywords: Currently unused; the search query is hard-coded in the
				URL below.  NOTE(review): presumably meant to drive the query
				-- confirm before wiring it in.

		Returns:
			None.  All output goes to stdout.
		"""
		# headless=False: a real browser window is opened, maximised on start
		browser = await launch({"headless": False, "args": ["--start-maximized"]})
		try:
			page = await browser.newPage()
			await page.setViewport({"width": 1600, "height": 900})
			await page.goto("https://www.google.fr/search?q=strategy journey")

			# give the search results a moment to load
			await page.waitFor(1000)

			pages = await browser.pages()
			print(pages)

			# The "I agree" button is looked up in the second open page; fall
			# back to the page created above if the browser reports only one.
			popup = pages[1] if len(pages) > 1 else page

			for button in await popup.querySelectorAll("button"):
				label = await (await button.getProperty("textContent")).jsonValue()
				if label == 'I agree':
					await button.click()
					# Stop here: the click may reload the page, which would
					# leave the remaining button handles unusable.
					break

			await page.waitFor(100)

			url = await page.evaluate("() => window.location.href")
			print(url)

			# result anchors: <a> elements with an <h3> child, inside a <div>
			# whose class attribute is exactly "g"
			results = await page.Jx("//div[@class = 'g']//a[h3]")

			for topic in results:
				title = await topic.getProperty("textContent")
				print(await title.jsonValue())

				link = await topic.getProperty("href")
				prize_href = await page.evaluate('(g) => g.href', topic)

				# the href is printed twice: once read through evaluate(),
				# once read through the property handle
				print(prize_href)
				print(await link.jsonValue())
		finally:
			# always shut the browser down, even if scraping failed part-way
			await browser.close()


	# Script entry: drive the scraper to completion on the current event loop.
	print("Starting...")
	event_loop = asyncio.get_event_loop()
	scrape = get_article_titles(["python", "opensource", "opencv"])
	event_loop.run_until_complete(scrape)
	print("Finished extracting articles titles")
No results found