Skip to content

Instantly share code, notes, and snippets.

@patwooky
Last active January 14, 2025 05:32
Show Gist options
  • Save patwooky/57b901469ed53e34e3ea82cdb7270d8c to your computer and use it in GitHub Desktop.
Save patwooky/57b901469ed53e34e3ea82cdb7270d8c to your computer and use it in GitHub Desktop.
20250112 Python Web-scraping Test
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Fetch the Dionysus profile page and print its text content with the
# HTML markup stripped out by BeautifulSoup.
url = "http://olympus.realpython.org/profiles/dionysus"
# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read, even if read/decode raises.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text())
# python 3
# from realpython.com web scraping tutorial
# https://realpython.com/python-web-scraping-practical-introduction/
from urllib.request import urlopen
import re
# print(dir(ul))
# url = "http://olympus.realpython.org/profiles/aphrodite"
# url = "http://olympus.realpython.org/profiles/poseidon"
# page = urlopen(url)
# html_bytes = page.read()
# html = html_bytes.decode("utf-8")
# titleStartEnd = [html.find("<title>") + len("<title>"), html.find("</title>")]
# titleText = html[titleStartEnd[0]:titleStartEnd[1]]
# # print(titleText)
# pattern = "<title.*?>.*?</title.*?>"
# match_results = re.search(pattern, html, re.IGNORECASE)
# title = match_results.group()
# title = re.sub("<.*?>", "", title) # Remove HTML tags
# print(title)
# Write a program that grabs the full HTML from the following URL:
url = "http://olympus.realpython.org/profiles/dionysus"
# Then use .find() to display the text following Name: and Favorite Color: (not including any leading spaces or trailing HTML tags that might appear on the same line).
# NOTE: this script uses a regular expression rather than .find(), and it
# only extracts the Name: value; Favorite Color: is not handled here.

# Use the response as a context manager so the connection is always closed.
with urlopen(url) as page:
    html_bytes = page.read()
html = html_bytes.decode("utf-8")
print(html)

# Match a heading tag (<h1>..<h6>, optionally with attributes) whose text
# starts with "Name:" and capture what follows, up to the closing heading
# tag. The earlier pattern quantified the literal ">" and ":" ("<h\d>*",
# "Name:*"), which accepted malformed input such as "<h2Name" and rejected
# headings carrying attributes.
pattern = r"<h\d[^>]*>\s*Name:\s*(.*?)\s*</h\d>"
match_results = re.search(pattern, html, re.IGNORECASE)
print(f'match_results {match_results}')
if match_results is None:
    # re.search returns None when nothing matches; report that instead of
    # failing with AttributeError on .group().
    print('Name: <not found>')
else:
    print(f'Name: {match_results.group(1).strip()}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment