Skip to content

Instantly share code, notes, and snippets.

@martin-martin
Created March 29, 2021 13:37
Show Gist options
  • Save martin-martin/beb8a5576ff062a062ed979f3889f008 to your computer and use it in GitHub Desktop.
Save martin-martin/beb8a5576ff062a062ed979f3889f008 to your computer and use it in GitHub Desktop.
Example scraper and scraper test solution
import requests
from bs4 import BeautifulSoup
# URL of the recipe index page that the scraper starts from (note the
# trailing slash).
BASE_URL = "https://codingnomads.github.io/recipes/"
def get_page_content(url, timeout=10):
    """Perform an HTTP GET request and return the response object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely on a server
            that never answers.

    Returns:
        The response object produced by ``requests.get``; callers read
        ``.status_code`` and ``.text`` from it.
    """
    page = requests.get(url, timeout=timeout)
    return page
def get_html_content(url):
    """Fetch ``url`` and return the body of the response as text."""
    response = get_page_content(url)
    return response.text
def make_soup(html):
    """Parse an HTML string into a BeautifulSoup object.

    The ``"html.parser"`` backend is requested explicitly.
    """
    return BeautifulSoup(html, "html.parser")
def get_recipe_links(soup):
    """Collect the ``href`` value of every link on a page.

    Args:
        soup: Parsed page exposing ``find_all`` (a bs4 object).

    Returns:
        A list with the ``href`` of each ``<a>`` element, in document
        order. Anchors that carry no ``href`` attribute are skipped
        instead of raising ``KeyError``.
    """
    links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        # <a> elements without an href (e.g. named anchors) have no target
        # to follow.
        if href is not None:
            links.append(href)
    return links
def get_author(soup):
    """Extract the name of the author of a recipe.

    Reads the text of the ``<p class="author">`` element and removes a
    leading ``"by "`` marker if one is present.

    Args:
        soup: Parsed recipe page exposing ``find`` (a bs4 object).

    Returns:
        The author name with surrounding whitespace removed.
    """
    text = soup.find("p", class_="author").text.strip()
    prefix = "by "
    # str.strip("by ") would remove any of the characters 'b', 'y' and ' '
    # from BOTH ends (turning "by Sally" into "Sall"), so only the literal
    # prefix is cut off here.
    if text.startswith(prefix):
        text = text[len(prefix):].lstrip()
    return text
def get_recipe(soup):
    """Return the text of the ``<div class="md">`` element of a page.

    Args:
        soup: Parsed recipe page exposing ``find`` (a bs4 object).
    """
    container = soup.find("div", class_="md")
    return container.text
if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    from urllib.parse import urljoin

    # Fetch the index page and collect the links it contains.
    index_html = get_html_content(BASE_URL)
    index_soup = make_soup(index_html)
    recipe_links = get_recipe_links(index_soup)

    for r_link in recipe_links:
        # BASE_URL already ends with "/", so gluing the parts together with
        # another "/" produced a double slash; urljoin resolves the link
        # against the index URL correctly.
        recipe_url = urljoin(BASE_URL, r_link)
        soup = make_soup(get_html_content(recipe_url))
        author = get_author(soup)
        recipe = get_recipe(soup)
        print(f"({author})\t[{recipe}]\n\n\n")
import unittest
import rescrape
class TestRescrape(unittest.TestCase):
    """Tests for the ``rescrape`` module.

    Every test fetches pages from the live recipe site, so network access
    is required to run them.
    """

    def setUp(self):
        """Store the index URL and the URL of one known recipe page."""
        self.BASE_URL = "https://codingnomads.github.io/recipes/"
        self.url = f"{self.BASE_URL}recipes/11-making-my-own-baguet.html"

    def test_get_valid_html_response(self):
        """Both the index and the recipe page answer with HTTP 200."""
        index_page = rescrape.get_page_content(self.BASE_URL)
        page = rescrape.get_page_content(self.url)
        self.assertEqual(index_page.status_code, 200)
        self.assertEqual(page.status_code, 200)

    def test_get_html_content_returns_html_string(self):
        """The fetched text contains an HTML doctype declaration."""
        index_html = rescrape.get_html_content(self.BASE_URL)
        html = rescrape.get_html_content(self.url)
        self.assertIn("<!DOCTYPE html>", index_html)
        self.assertIn("<!DOCTYPE html>", html)

    def test_make_soup_makes_soup(self):
        """make_soup returns a BeautifulSoup instance."""
        # Local import: only this test needs the class itself.
        from bs4 import BeautifulSoup

        html = rescrape.get_html_content(self.url)
        soup = rescrape.make_soup(html)
        # isinstance check instead of comparing the repr of the type, which
        # is brittle and rejects subclasses.
        self.assertIsInstance(soup, BeautifulSoup)

    def test_get_recipe_links_gets_recipe_links(self):
        """The index page yields at least one link."""
        index_html = rescrape.get_html_content(self.BASE_URL)
        index_soup = rescrape.make_soup(index_html)
        self.assertGreater(len(rescrape.get_recipe_links(index_soup)), 0)

    def test_get_author_finds_author(self):
        """The known recipe page reports its expected author."""
        html = rescrape.get_html_content(self.url)
        soup = rescrape.make_soup(html)
        author = rescrape.get_author(soup)
        # Equality with a non-empty name already implies a non-empty result.
        self.assertEqual("Jadafaa", author)

    def test_get_recipe_gets_recipe_text(self):
        """The recipe text is a non-empty string."""
        html = rescrape.get_html_content(self.url)
        soup = rescrape.make_soup(html)
        recipe = rescrape.get_recipe(soup)
        self.assertIsInstance(recipe, str)
        self.assertGreater(len(recipe), 0)
# Run the test suite when this module is executed directly.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment