JeremyTheModernist · November 12, 2024 20:25
diff --git a/website-scraper.py b/website-scraper.py
 # A class to represent a Webpage

 class Website:
     """
     A utility class to represent a Website that we have scraped, now with links.

     Constructing an instance downloads ``url`` with ``requests.get`` and parses
     the response with BeautifulSoup's ``html.parser``. Every attribute listed
     below is assigned by ``__init__``, including when the page has no
     ``<title>`` or no ``<body>``.
     """
     # Type hints for class attributes
     url: str           # The URL that was requested
     title: str         # Text of the <title> tag, or "No title found"
     body: bytes        # Raw response payload (response.content)
     links: List[str]   # Non-empty href values of all <a> tags in the document
     text: str          # Body text after script/style/input elements are removed
     images: List[str]  # Non-empty src values of all <img> tags in the body

     def __init__(self, url: str, timeout: float = 10.0) -> None:
         """
         Fetch ``url`` and populate the instance from the parsed HTML.

         Args:
             url: Address of the page to download.
             timeout: Seconds handed to ``requests.get`` as ``timeout`` so an
                 unresponsive server cannot block the caller indefinitely.

         Raises:
             Nothing is caught here: any exception raised by ``requests.get``
             (for example on timeout or connection failure) propagates.
         """
         self.url = url

         # Make the HTTP GET request; without a timeout requests waits forever.
         response = requests.get(url, timeout=timeout)
         # Keep the raw bytes so BeautifulSoup can do its own encoding detection.
         self.body = response.content

         soup = BeautifulSoup(self.body, 'html.parser')

         # soup.title.string is None for an empty <title> or one that contains
         # nested markup, so fall back in that case as well as when the tag is
         # missing. str() detaches the value from the parse tree.
         title_text = soup.title.string if soup.title else None
         self.title = str(title_text) if title_text else "No title found"

         # Defaults used when the document has no <body>, so both attributes
         # always exist on the instance.
         self.text = ""
         self.images = []

         if soup.body:
             # Drop elements whose content is not readable page text.
             for irrelevant in soup.body(["script", "style", "input"]):
                 irrelevant.decompose()

             # Store image sources as plain strings, mirroring how links are kept.
             sources = [img.get('src') for img in soup.body.find_all('img')]
             self.images = [src for src in sources if src]

             # Extract text with newlines as separator and surrounding whitespace stripped.
             self.text = soup.body.get_text(separator="\n", strip=True)

         # Collect href values from anchor tags, skipping anchors without one.
         hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
         self.links = [href for href in hrefs if href]

     def get_contents(self) -> str:
         """Return the page title and cleaned text as one formatted string."""
         return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

     def display_markdown(self) -> None:
         """Placeholder with no behavior yet; always returns None."""
         return None
	# A class to represent a Webpage

	class Website:
	    """
	    A utility class to represent a Website that we have scraped, now with links.

	    Constructing an instance downloads ``url`` with ``requests.get`` and parses
	    the response with BeautifulSoup's ``html.parser``. Every attribute listed
	    below is assigned by ``__init__``, including when the page has no
	    ``<title>`` or no ``<body>``.
	    """
	    # Type hints for class attributes
	    url: str           # The URL that was requested
	    title: str         # Text of the <title> tag, or "No title found"
	    body: bytes        # Raw response payload (response.content)
	    links: List[str]   # Non-empty href values of all <a> tags in the document
	    text: str          # Body text after script/style/input elements are removed
	    images: List[str]  # Non-empty src values of all <img> tags in the body

	    def __init__(self, url: str, timeout: float = 10.0) -> None:
	        """
	        Fetch ``url`` and populate the instance from the parsed HTML.

	        Args:
	            url: Address of the page to download.
	            timeout: Seconds handed to ``requests.get`` as ``timeout`` so an
	                unresponsive server cannot block the caller indefinitely.

	        Raises:
	            Nothing is caught here: any exception raised by ``requests.get``
	            (for example on timeout or connection failure) propagates.
	        """
	        self.url = url

	        # Make the HTTP GET request; without a timeout requests waits forever.
	        response = requests.get(url, timeout=timeout)
	        # Keep the raw bytes so BeautifulSoup can do its own encoding detection.
	        self.body = response.content

	        soup = BeautifulSoup(self.body, 'html.parser')

	        # soup.title.string is None for an empty <title> or one that contains
	        # nested markup, so fall back in that case as well as when the tag is
	        # missing. str() detaches the value from the parse tree.
	        title_text = soup.title.string if soup.title else None
	        self.title = str(title_text) if title_text else "No title found"

	        # Defaults used when the document has no <body>, so both attributes
	        # always exist on the instance.
	        self.text = ""
	        self.images = []

	        if soup.body:
	            # Drop elements whose content is not readable page text.
	            for irrelevant in soup.body(["script", "style", "input"]):
	                irrelevant.decompose()

	            # Store image sources as plain strings, mirroring how links are kept.
	            sources = [img.get('src') for img in soup.body.find_all('img')]
	            self.images = [src for src in sources if src]

	            # Extract text with newlines as separator and surrounding whitespace stripped.
	            self.text = soup.body.get_text(separator="\n", strip=True)

	        # Collect href values from anchor tags, skipping anchors without one.
	        hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
	        self.links = [href for href in hrefs if href]

	    def get_contents(self) -> str:
	        """Return the page title and cleaned text as one formatted string."""
	        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

	    def display_markdown(self) -> None:
	        """Placeholder with no behavior yet; always returns None."""
	        return None