Last active
December 4, 2024 06:03
-
-
Save jessefmoore/89cebd049e3fa219d2f6032530e07808 to your computer and use it in GitHub Desktop.
Getwebpagetext.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 12/03/2024 Marko
# Inspired by Dfir-jesseee
# sudo apt upgrade -y
# sudo apt install virtualenv
# mkdir ~/python-environments && cd ~/python-environments
# virtualenv aidev
# ls aidev/lib
# source aidev/bin/activate
# pip install bs4
# pip install requests
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Script purpose: fetch an index page, follow the links of the paragraphs
# whose <strong> label is listed in `toget`, and append the plain text of
# each linked page to a single output file.

# Index page whose "entry-content" div holds one <p> per linked page.
init_url = "https://policy.uw.edu/directory/aps/section-00/"

# Browser-like request headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'br' is intentionally not advertised: requests only decodes Brotli
    # when an optional brotli package is installed, and the setup for this
    # script installs only bs4 and requests. Advertising it could yield
    # undecoded bytes in response.content.
    'Accept-Encoding': 'gzip, deflate',
}

output_file_name = "webpage_output.txt"

# Seconds to wait on each HTTP request; without a timeout requests.get
# can block forever on an unresponsive server.
request_timeout = 30

#Create our list of target pages
toget = ["2.2", "2.4", "2.5", "2.6"]

# Fetch and parse the index page. A failure here is fatal because there is
# nothing to crawl without it, so HTTP errors are allowed to propagate.
init_response = requests.get(init_url, headers=headers, timeout=request_timeout)
init_response.raise_for_status()
soup = BeautifulSoup(init_response.content, 'html.parser')

content_div = soup.find("div", class_="entry-content")
if content_div is None:
    raise SystemExit(f"No 'entry-content' div found at {init_url}")
init_url_nodes = content_div.find_all("p")

print(f"Outputting to {output_file_name}")
# The context manager closes the file even if a page fails mid-run;
# explicit UTF-8 avoids locale-dependent UnicodeEncodeError on write.
with open(output_file_name, "w", encoding="utf-8") as f:
    for node in init_url_nodes:
        # Paragraphs without a <strong> label cannot match a target.
        label = node.find("strong")
        if label is None or label.string not in toget:
            continue

        link = node.find("a")
        if link is None or not link.get("href"):
            print(f"Skipping {label.string}: no link found")
            continue

        # Resolve relative hrefs against the index page URL.
        url_to_travel_to = urljoin(init_url, link["href"])
        print(f"Found: {url_to_travel_to}")

        # One failing page should not discard the pages already written.
        try:
            response = requests.get(
                url_to_travel_to, headers=headers, timeout=request_timeout
            )
            response.raise_for_status()
        except requests.RequestException as err:
            print(f"Failed to fetch {url_to_travel_to}: {err}")
            continue

        # Use a separate name so the index page's soup is not overwritten.
        page_soup = BeautifulSoup(response.content, 'html.parser')
        # Collapse each page onto a single line; pages are separated by a
        # blank line in the output file.
        text = page_soup.get_text().replace('\r', '').replace('\n', '')
        f.write(f"{text}\n\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment