Skip to content

Instantly share code, notes, and snippets.

@ilyazub
Last active May 3, 2021 14:19
Show Gist options
  • Save ilyazub/72bd8166d964868aa52ca665cfcdfbc6 to your computer and use it in GitHub Desktop.
Scrape Google News with Pagination using Python Generators
from bs4 import BeautifulSoup
import requests, urllib.parse
def paginate(url, previous_url=None):
    """Yield a BeautifulSoup object for each Google search results page.

    Iterative rewrite of the original recursive generator: one ``yield``
    per page and no recursion-depth growth for long result sets. Follows
    the "next page" link (``a#pnnext``) until it disappears, or stops if
    a page links back to the URL just fetched (infinite-loop guard).

    Args:
        url: Absolute URL of the first results page to fetch.
        previous_url: URL already fetched (kept for backward compatibility
            with the recursive signature); fetching stops when ``url``
            equals it.

    Yields:
        BeautifulSoup: parsed HTML of each results page, in order.
    """
    headers = {
        # Desktop browser UA so Google serves the full HTML results page.
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36 Edge/18.19582"
    }

    # Loop instead of recursing; the guard breaks any self-referencing
    # pagination cycle.
    while url != previous_url:
        html = requests.get(url, headers=headers).text
        soup = BeautifulSoup(html, 'lxml')
        yield soup

        next_page_node = soup.select_one('a#pnnext')
        # No "next" link means this was the last page.
        if next_page_node is None:
            return

        previous_url = url
        # The href is relative; resolve it against the Google origin.
        url = urllib.parse.urljoin('https://www.google.com/',
                                   next_page_node['href'])
def scrape():
    """Print title and link for every Google News result, page by page.

    Iterates the pages produced by :func:`paginate` and extracts each
    news card. Selectors are guarded with ``None`` checks because Google
    changes its markup frequently; a missing node is skipped instead of
    raising ``AttributeError``.
    """
    pages = paginate(
        "https://www.google.com/search?hl=en-US&q=something+important&tbm=nws")

    for soup in pages:
        # .YyVfkd holds the current page number in the pagination bar;
        # guard in case the markup changed and the node is absent.
        page_label = soup.select_one(".YyVfkd")
        if page_label is not None:
            print(f'Current page: {int(page_label.text)}')
        print()

        # Each news result card lives in a div.dbsr container.
        for data in soup.findAll('div', class_='dbsr'):
            title_node = data.find('div', class_='JheGif nDgy9d')
            if title_node is None or data.a is None:
                # Markup drifted for this card — skip it rather than crash.
                continue

            title = title_node.text
            link = data.a['href']

            print(f'Title: {title}')
            print(f'Link: {link}')
            print()


if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    scrape()
@ilyazub
Copy link
Author

ilyazub commented May 3, 2021

The description is in the blog post.

The next step would be to refactor the paginate function to leave only one yield but we'll do this next time. Or do it yourself and share your results.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment