Last active
January 14, 2025 05:32
-
-
Save patwooky/57b901469ed53e34e3ea82cdb7270d8c to your computer and use it in GitHub Desktop.
20250112 Python Web-scraping Test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch a profile page and print its visible text (all HTML tags stripped)
# using BeautifulSoup's built-in "html.parser" backend.
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = "http://olympus.realpython.org/profiles/dionysus"

# Use the response as a context manager so the connection is always closed,
# and pass a timeout so an unresponsive server cannot hang the script forever.
with urlopen(url, timeout=10) as page:
    # Honor the charset declared in the Content-Type header; fall back to
    # UTF-8 (the previously hard-coded encoding) when none is declared.
    charset = page.headers.get_content_charset() or "utf-8"
    html = page.read().decode(charset)

soup = BeautifulSoup(html, "html.parser")
print(soup.get_text())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python 3
# from realpython.com web scraping tutorial
# https://realpython.com/python-web-scraping-practical-introduction/
#
# Exercise: grab the full HTML from the profile URL below, then display the
# text following "Name:" and "Favorite Color:" (not including any leading
# spaces or trailing HTML tags that might appear on the same line).
import re
from typing import Optional
from urllib.request import urlopen

# Earlier experiment (title extraction), kept for reference:
# print(dir(ul))
# url = "http://olympus.realpython.org/profiles/aphrodite"
# url = "http://olympus.realpython.org/profiles/poseidon"
# page = urlopen(url)
# html_bytes = page.read()
# html = html_bytes.decode("utf-8")
# titleStartEnd = [html.find("<title>") + len("<title>"), html.find("</title>")]
# titleText = html[titleStartEnd[0]:titleStartEnd[1]]
# # print(titleText)
# pattern = "<title.*?>.*?</title.*?>"
# match_results = re.search(pattern, html, re.IGNORECASE)
# title = match_results.group()
# title = re.sub("<.*?>", "", title)  # Remove HTML tags
# print(title)

url = "http://olympus.realpython.org/profiles/dionysus"

# Labels whose values the exercise asks us to display.
FIELD_LABELS = ("Name:", "Favorite Color:")


def extract_field(html: str, label: str) -> Optional[str]:
    """Return the text that follows *label* on the same line of *html*.

    The match is case-insensitive. The value runs from the end of the label
    up to the next HTML tag or the end of the line, whichever comes first,
    with surrounding whitespace removed.

    Args:
        html: The page markup to search.
        label: The literal label text to look for, e.g. ``"Name:"``.

    Returns:
        The stripped value (possibly an empty string), or ``None`` when the
        label does not occur in *html*.
    """
    # re.escape keeps the label literal; "[^<\r\n]*" stops the capture at the
    # first tag or line break so trailing markup is never included.
    match = re.search(
        re.escape(label) + r"[ \t]*([^<\r\n]*)", html, re.IGNORECASE
    )
    if match is None:
        return None
    return match.group(1).strip()


def main() -> None:
    """Download the profile page, print its HTML, then print each field."""
    # The context manager guarantees the HTTP response is closed; the timeout
    # prevents the script from hanging on an unresponsive server.
    with urlopen(url, timeout=10) as page:
        # Prefer the charset declared by the server, defaulting to UTF-8.
        charset = page.headers.get_content_charset() or "utf-8"
        html = page.read().decode(charset)
    print(html)

    for label in FIELD_LABELS:
        value = extract_field(html, label)
        if value is None:
            # Report a missing field instead of crashing on a None match.
            print(f"{label} <not found>")
        else:
            print(f"{label} {value}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment