Created
January 5, 2023 19:21
-
-
Save realhackcraft/8b9a301e6beb5491e8a63ba9bcdd53d8 to your computer and use it in GitHub Desktop.
Python webscraper using ChatGPT
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import csv | |
import inspect | |
import random | |
import requests | |
from colorama import Fore | |
from bs4 import BeautifulSoup | |
# Root URL of the site being scraped; catalogue page paths are appended to it.
base_url = "http://books.toscrape.com"

# Accumulates one (title, price, link) tuple per scraped book.
data = []
def scrap_books():
    """Scrape every catalogue page under ``base_url`` and save the books to CSV.

    Pages ``/catalogue/page-1.html``, ``page-2.html``, ... are fetched in
    order until a page contains no ``<li class="next">`` pagination element.
    One ``(title, price, link)`` tuple per book is appended to the
    module-level ``data`` list; the whole list is then written to
    ``books.csv`` in the current working directory (with a header row) and a
    coloured summary line is printed.

    Raises:
        requests.RequestException: If a page cannot be fetched (connection
            error, timeout, or an HTTP error status).
    """
    # Imported locally so the function brings its own dependency with it.
    from urllib.parse import urljoin

    page_number = 1
    while True:
        page_url = "{}/catalogue/page-{}.html".format(base_url, page_number)

        # A timeout keeps a stalled connection from hanging the script, and
        # raise_for_status() stops an error page from being parsed as an
        # empty (and seemingly final) catalogue page.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()

        # Hand BeautifulSoup the raw bytes so it can detect the document's
        # own encoding. Using response.text relies on the HTTP header guess,
        # which is what produced the stray 'Â' in front of the pound sign.
        soup = BeautifulSoup(response.content, 'html.parser')

        for book in soup.find_all('article', class_='product_pod'):
            anchor = book.h3.a
            title = anchor['title']
            price = book.find('div', class_='product_price').p.text
            # hrefs are relative to the current page, so resolve them against
            # the page URL instead of gluing them onto base_url.
            link = urljoin(page_url, anchor['href'])
            data.append((title, price, link))

        # The last catalogue page has no "next" pagination item.
        if not soup.find('li', class_='next'):
            break
        page_number += 1

    # Explicit UTF-8 so non-ASCII titles and the currency sign are written
    # the same way on every platform.
    with open('books.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('title', 'price', 'link'))  # Header row
        writer.writerows(data)

    num_books = len(data)
    message = f"Finished scraping {base_url} and found {num_books} books. "
    # Point the link at line 1 of the CSV (otherwise print_link would fall
    # back to the line number of this call), and reset the foreground colour
    # so it does not leak into later terminal output.
    csv_link = print_link(os.path.abspath('books.csv'), 1)
    print(rainbow_text(message) + csv_link + Fore.RESET)
def rainbow_text(text):
    """Return *text* prefixed with one randomly chosen foreground colour code.

    The colour is picked from red, yellow, green, cyan, blue and magenta.
    No reset code is appended, so the colour stays active for anything
    printed after the returned string.
    """
    palette = (
        Fore.RED,
        Fore.YELLOW,
        Fore.GREEN,
        Fore.CYAN,
        Fore.BLUE,
        Fore.MAGENTA,
    )
    return random.choice(palette) + text
def print_link(file=None, line=None):
    """Return a ``File "<path>", line <n>`` reference string.

    The original author intended this format to show up as a clickable link
    in PyCharm's console. Despite the name, nothing is printed here; the
    caller is responsible for printing the returned string.

    Args:
        file: Path to reference. Defaults to the file of the code that
            called this function.
        line: Line number to reference. Defaults to the line this function
            was called from. Values below 1 are clamped to 1.

    Returns:
        The reference string, with every backslash replaced by a forward
        slash.
    """
    if file is None or line is None:
        # Walk the stack once for both defaults; context=0 skips loading
        # source lines, which are not needed here.
        caller = inspect.stack(0)[1]
        if file is None:
            file = caller.filename
        if line is None:
            line = caller.lineno
    return f'File "{file}", line {max(line, 1)}'.replace("\\", "/")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    scrap_books()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment