Skip to content

Instantly share code, notes, and snippets.

@JeremyTheModernist
Created November 12, 2024 20:25
Show Gist options
  • Save JeremyTheModernist/c35a6212622872ba76cf508c7e359bd7 to your computer and use it in GitHub Desktop.
Save JeremyTheModernist/c35a6212622872ba76cf508c7e359bd7 to your computer and use it in GitHub Desktop.
Beautiful Soup Website Scraper
# A class to represent a scraped website
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Constructing an instance performs one HTTP GET on ``url`` and parses the
    response with BeautifulSoup's ``html.parser``. The parsed pieces are
    stored on the attributes annotated below.
    """

    # Type hints for class attributes
    url: str  # The URL that was requested
    title: str  # Page title, or "No title found" when none is usable
    body: bytes  # Raw response payload (``response.content``)
    links: List[str]  # Non-empty href values of every <a> tag
    text: str  # Body text, newline-separated; "" when there is no <body>
    images: List  # <img> elements found in the body (bs4 tags, not URLs)

    def __init__(self, url: str, *, timeout: float = 30.0) -> None:
        """
        Fetch ``url`` and populate the instance from the parsed HTML.

        Args:
            url: Address of the page to download.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.

        Raises:
            Whatever ``requests.get`` raises (connection errors, timeouts);
            nothing is caught here. HTTP error statuses are not checked.
        """
        self.url = url

        # Without an explicit timeout a stalled server would block forever.
        response = requests.get(url, timeout=timeout)
        self.body = response.content

        soup = BeautifulSoup(self.body, 'html.parser')

        # ``.string`` is None when <title> is empty or wraps nested tags, so
        # check it as well as the tag itself before using it.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        if soup.body:
            # Drop elements that contribute no readable text
            # (scripts, styles and input fields). Images are kept.
            for irrelevant in soup.body(["script", "style", "input"]):
                irrelevant.decompose()
            self.images = soup.body(["img"])
            # Newline-separated text with surrounding whitespace stripped
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            # No <body>: keep every attribute defined so later access is safe
            self.text = ""
            self.images = []

        # Collect href values, skipping anchors with a missing or empty href
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self) -> str:
        """Return the page title and text as one formatted string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

    def display_markdown(self) -> None:
        """Placeholder: currently does nothing and returns None."""
        return
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment