Created
March 26, 2023 01:12
-
-
Save reefwing/63b2e77acb22e9346a4971ecfdc8ca08 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Scrape every quote (text and author) from quotes.toscrape.com by walking
# the paginated listing, then print the collected list.

# Initialize variables
base_url = "http://quotes.toscrape.com"
current_url = base_url
quotes = []

# Seconds to wait for each response. Without a timeout, requests.get can
# block indefinitely on a stalled connection.
request_timeout = 10

# Loop through all pages of quotes
while True:
    # Send a request to the current page and get its content
    response = requests.get(current_url, timeout=request_timeout)
    # Fail loudly on an HTTP error rather than parsing an error page as if
    # it were a page of quotes.
    response.raise_for_status()
    content = response.content

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(content, "html.parser")

    # Find all the quote and author elements on the page
    for quote in soup.find_all("div", class_="quote"):
        text = quote.find("span", class_="text").text
        author = quote.find("small", class_="author").text
        quotes.append({"quote": text, "author": author})

    # Check if there is a "Next" button on the page
    next_button = soup.find("li", class_="next")
    next_link = next_button.find("a") if next_button else None
    next_url = next_link.get("href") if next_link else None
    if not next_url:
        # No "Next" button (or it carries no link): this was the last page
        break

    # Resolve the link against the page it came from, so relative,
    # root-relative, and absolute hrefs are all handled correctly.
    current_url = urljoin(current_url, next_url)

# Print the list of quotes
print(quotes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment