Created
November 12, 2024 20:25
-
-
Save JeremyTheModernist/c35a6212622872ba76cf508c7e359bd7 to your computer and use it in GitHub Desktop.
Beautiful Soup Website Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A class to represent a scraped webpage
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Constructing an instance performs an HTTP GET request for ``url`` and
    parses the response with BeautifulSoup's ``html.parser``. Every attribute
    declared below is always assigned by ``__init__``, including for pages
    that have no ``<title>`` or no ``<body>``.
    """

    # Type hints for class attributes
    url: str  # The URL that was requested
    title: str  # Text of the <title> tag, or "No title found"
    body: bytes  # Raw response content exactly as returned by the server
    links: List[str]  # href values of every <a> tag that has one
    text: str  # Visible body text, one fragment per line
    images: List[str]  # src values of every <img> tag in the body that has one

    def __init__(self, url: str, timeout: float = 10.0) -> None:
        """
        Fetch ``url`` and extract its title, text, links and image sources.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before ``requests``
                gives up. Without a timeout the request can block forever.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        self.url = url

        # Make the HTTP GET request and keep the raw payload
        response = requests.get(url, timeout=timeout)
        self.body = response.content

        # Create BeautifulSoup object for parsing HTML
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``.string`` is None when <title> is empty or contains nested tags,
        # so fall back in that case as well as when the tag is missing.
        title_text = soup.title.string if soup.title else None
        self.title = str(title_text) if title_text else "No title found"

        # Defaults used when the document has no <body>
        self.text = ""
        self.images = []

        if soup.body:
            # Remove elements that contribute no readable text
            for irrelevant in soup.body(["script", "style", "input"]):
                irrelevant.decompose()

            # Collect image sources as strings (matching the declared type),
            # skipping <img> tags without a src attribute.
            sources = [img.get('src') for img in soup.body.find_all('img')]
            self.images = [src for src in sources if src]

            # Extract text with newlines as separator and strip whitespace
            self.text = soup.body.get_text(separator="\n", strip=True)

        # Extract all hyperlinks, dropping anchors without an href attribute
        hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
        self.links = [href for href in hrefs if href]

    def get_contents(self) -> str:
        """Return the page title and cleaned text as one formatted string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

    def display_markdown(self) -> None:
        """Placeholder with no behavior yet; always returns ``None``."""
        return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment