For when you want to scrape a publicly available thread (including images) on forums.somethingawful.com
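
A minimal way to run it, assuming the script is saved as SA_public_thread_scrape.py (requests and beautifulsoup4 are the only third-party packages it imports):

    pip install requests beautifulsoup4
    python SA_public_thread_scrape.py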
### Python script using bs4 to scrape a Something Awful forums thread. ###
#
# ! THIS WILL ONLY WORK ON THREADS THAT ARE PUBLICLY ACCESSIBLE, IE ONES YOU DON'T NEED TO LOG IN TO SEE !
# (I didn't want to deal with auth stuff..)
#
# Make sure you replace the thread_url variable with the link you want to scrape, same as the example in the code here.
#
# This will dump to an HTML file with decent formatting (a bit too wall-of-text atm, but I have too many other things to tend to in life).
#
# If you want better file naming that includes something like the ThreadID, do this:
# $ python SA_public_thread_scrape.py && thread_id=$(sed -n 's/.*threadid=\([0-9]*\).*/\1/p' SA_public_thread_scrape.py) && mv scraped_data.html scraped_data_${thread_id}.html
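#
# (A hedged alternative to the sed line: the thread ID could be pulled out in Python
# itself, e.g. re.search(r'threadid=(\d+)', thread_url).group(1), and used when naming
# the output file. That regex is my sketch, not part of the original script.)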
import requests
from bs4 import BeautifulSoup

def scrape_page(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
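
    # SA renders each post as a <table class="post">, so collect them all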
    posts = soup.find_all('table', class_='post')
    for post in posts:
        # Extract user information
        userinfo = post.find('dl', class_='userinfo')
        try:
            # Extract 'author' (user)
            author_text = userinfo.find('dt', class_='author').get_text()
        except AttributeError:
            author_text = "Unknown"

        # Extract 'registered' date
        try:
            registered_text = userinfo.find('dd', class_='registered').get_text()
        except AttributeError:
            registered_text = "Unknown"

        # Format the user data with a "-" between
        user_data = f"<strong>User:</strong> {author_text} - Registered: {registered_text}<br>"

        # Extract the timestamp of the post
        timestamp = post.find('td', class_='postdate').get_text()

        # Extract the body of the post
        content = post.find('td', class_='postbody').get_text(strip=True)

        # Extract and format any images in the post, minus avatars and tagsigns and such
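        # (passing a function as class_ is supported by bs4; as I understand it, the
        # function is tested against each CSS class, keeping only inline/thumbnail images)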
        images = post.find_all('img', class_=lambda x: x in ['img', 'timg', 'complete'])
        image_data = ""
        for img in images:
            image_src = img.get('src')
            if ("safs/titles" in image_src or "images/gangtags" in image_src or "images/svgs" in image_src or
                    "customtitles" in image_src or "images/avatars" in image_src or "images/newbie.gif" in image_src
                    or "images/title-banned.gif" in image_src):
                continue
            image_data += f'<img src="{image_src}" alt="Image"><br>'
        # Combine the user data, post data, and images
        post_data = (
            user_data
            + f"<strong>Timestamp:</strong> {timestamp}<br>"
            + f"<strong>Content:</strong> {content}<br>"
            + image_data
            + "<hr><br>"
        )
        scraped_data.append(post_data)
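

# The "next page" arrow on SA thread pages is an anchor whose text is '›';
# returning None when it is missing ends the crawl on the last page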
def get_next_page(current_page):
    response = requests.get(current_page)
    soup = BeautifulSoup(response.content, 'html.parser')
    next_page_link = soup.find('a', string='›')  # find an anchor element whose string content is '›'
    if next_page_link is not None:
        next_page_url = 'https://forums.somethingawful.com/' + next_page_link['href']
        return next_page_url
    else:
        return None
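

# Walk the thread page by page, scraping each one until get_next_page() returns None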
thread_url = 'https://forums.somethingawful.com/showthread.php?threadid=4048837'  # THE SOMETHINGAWFUL FORUM URL YOU WANT TO SCRAPE GOES HERE!

scraped_data = []
current_page = thread_url
while current_page is not None:
    scrape_page(current_page)
    current_page = get_next_page(current_page)

# Save the scraped data to an HTML file
with open('scraped_data.html', 'w', encoding='utf-8') as f:
    f.write("<html>")
    f.write("<head>")
    f.write("<title>Scraped Data</title>")
    f.write("</head>")
    f.write("<body>")
    for data in scraped_data:
        f.write(data)
    f.write("</body>")
    f.write("</html>")

print("Scraped data has been saved to scraped_data.html")