Getwebpagetext.py
# 12/03/2024 Marko
# Inspired by Dfir-jesseee
# sudo apt upgrade -y
# sudo apt install virtualenv
# mkdir ~/python-environments && cd ~/python-environments
# virtualenv aidev
# ls aidev/lib
# source aidev/bin/activate
# pip install bs4
# pip install requests
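# (A minimal alternative, assuming Python 3 is installed: the standard-library
#  venv module can stand in for the virtualenv package:
#  python3 -m venv aidev && source aidev/bin/activate)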
from bs4 import BeautifulSoup
import requests
# Index page listing UW Administrative Policy Statements, Section 00
init_url = "https://policy.uw.edu/directory/aps/section-00/"

# Browser-like request headers; some servers reject the default
# python-requests User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
}
init_response = requests.get(init_url, headers=headers)
init_response.raise_for_status()  # fail fast on HTTP errors
soup = BeautifulSoup(init_response.content, 'html.parser')

output_file_name = "webpage_output.txt"

# Each policy entry on the index page is a <p> inside the entry-content div
init_url_nodes = soup.find("div", class_="entry-content").find_all("p")
# Policy numbers to fetch (matched against each paragraph's <strong> text)
toget = ["2.2", "2.4", "2.5", "2.6"]
print(f"Outputting to {output_file_name}")
with open(output_file_name, "w") as f:
    for node in init_url_nodes:
        strong = node.find("strong")
        link = node.find("a")
        # Skip paragraphs that lack a policy number or a link
        if strong is None or link is None or strong.string not in toget:
            continue
        url_to_travel_to = link["href"]
        print(f"Found: {url_to_travel_to}")
        response = requests.get(url_to_travel_to, headers=headers)
        response.raise_for_status()
        page = BeautifulSoup(response.content, 'html.parser')
        # Flatten each policy page to a single line of text
        text = page.get_text().replace('\r', '').replace('\n', '')
        f.write(f"{text}\n\n")
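
# A possible extension, sketched against the same imports and variables as the
# script above: write each policy to its own file and pause briefly between
# requests. The slugify helper and the one-second delay are illustrative
# assumptions, not part of the original gist.

import re
import time

def slugify(section):
    # Hypothetical helper: turn a policy number like "2.2" into "aps-2-2.txt"
    return "aps-" + re.sub(r"[^0-9A-Za-z]+", "-", section).strip("-") + ".txt"

for node in init_url_nodes:
    strong = node.find("strong")
    link = node.find("a")
    if strong is None or link is None or strong.string not in toget:
        continue
    response = requests.get(link["href"], headers=headers)
    response.raise_for_status()
    page_text = BeautifulSoup(response.content, 'html.parser').get_text()
    with open(slugify(strong.string), "w") as out:
        out.write(page_text)
    time.sleep(1)  # assumption: a short delay keeps the scrape polite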