Created April 17, 2024 03:04
A basic Python script to archive posts from a vBulletin forum into a local DB. Specify the URL and range of threads you want to crawl. It detects multiple pages and saves the content of each post along with the post metadata (user, date/time, post ID, thread ID) to a SQLite DB. It does not extract media. Created as a proof of concept for a larger scale for…
import re
import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime

# Function to create a SQLite database and tables for storing scrape details
def create_scrape_table():
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS scrape_info
                 (thread_id TEXT, scrape_datetime TEXT, status TEXT)''')
    c.execute('''CREATE TABLE IF NOT EXISTS posts
                 (thread_id TEXT, thread_title TEXT, username TEXT, member_id TEXT, post_date_time TEXT, post_pos TEXT, post_content_html TEXT, post_id TEXT PRIMARY KEY)''')
    conn.commit()
    conn.close()

# Function to insert scrape details into the SQLite database
def insert_scrape_data(thread_id, scrape_datetime, status):
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute("INSERT INTO scrape_info (thread_id, scrape_datetime, status) VALUES (?, ?, ?)",
              (thread_id, scrape_datetime, status))
    conn.commit()
    conn.close()

# Function to insert post details into the SQLite database
def insert_post_data(thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id):
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute("INSERT OR REPLACE INTO posts (thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
              (thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id))
    conn.commit()
    conn.close()

# Create the scrape info tables if they don't exist
create_scrape_table()

# Function to scrape a thread and insert its details into the database
def scrape_thread(thread_url):
    response = requests.get(thread_url)
    print("Requested URL:", thread_url)  # Print the requested URL
    soup = BeautifulSoup(response.content, 'html.parser')

    # Check for a standard vBulletin error page (missing or inaccessible thread)
    standard_error_elem = soup.find('div', class_='standard_error')
    if standard_error_elem:
        print(f"Error: Thread {thread_url} does not exist or encountered an error.")
        insert_scrape_data(re.search(r'\d+', thread_url).group(), datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Error')
        return

    # Extract thread ID and title
    thread_id = re.search(r'\d+', thread_url).group()
    thread_title = soup.find('div', id='pagetitle').find('span', class_='threadtitle').text.strip()

    # Check if pagination info exists
    pagination_elem = soup.find('div', id='pagination_top')
    if pagination_elem:
        # Pagination info exists, extract total number of pages
        page_info_elem = pagination_elem.find('a', class_='popupctrl')
        if page_info_elem:
            page_info = page_info_elem.text
            num_pages = int(re.search(r'(\d+) of (\d+)', page_info).group(2))
            print(f"Detected {num_pages} pages.")
        else:
            num_pages = 1
            print("Detected single page.")
    else:
        # Pagination info not found, assume single page
        num_pages = 1
        print("Detected single page.")

    # Scrape each page
    for page_num in range(1, num_pages + 1):
        page_url = f"{thread_url}/page{page_num}" if page_num > 1 else thread_url
        print(f"Scraping page {page_num}: {page_url}")
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        main_posts_container = soup.find('ol', class_='posts')

        # Extract details from each main post and insert into the database
        for post in main_posts_container.find_all(class_='postcontainer'):
            post_id = post.get('id').split('_')[-1]

            # Extract member ID from the profile URL in the username link
            username_link_elem = post.find(class_='username offline popupctrl')
            if username_link_elem:
                username_link = username_link_elem.get('href')
                member_id = re.search(r'\d+', username_link).group()
            else:
                member_id = 'Member ID not available'

            # Extract post date and time if available
            date_elem = post.find(class_='date')
            post_date_time = date_elem.get_text(strip=True) if date_elem else 'Date and Time not available'

            # Extract post position if available
            post_pos = post.find('a', class_='postcounter').get_text(strip=True).lstrip('#') if post.find('a', class_='postcounter') else 'Post Position not available'

            # Extract username
            username_container = post.find(class_='username_container')
            username = username_container.find('strong').get_text() if (username_container and username_container.find('strong')) else 'Unknown'

            # Extract post content if available
            post_content_elem = post.find(class_='content')
            post_content_html = 'Content not available'
            if post_content_elem:
                blockquote_elem = post_content_elem.find('blockquote', class_='postcontent restore')
                if blockquote_elem:
                    # Keep the blockquote element's HTML verbatim
                    post_content_html = str(blockquote_elem)

            # Insert data into the database
            insert_post_data(thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id)

    # Insert scrape success info into the database
    insert_scrape_data(thread_id, datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Success')

# Function to scrape every thread ID in the given (inclusive) range
def scrape_threads_in_range(start_thread_id, end_thread_id):
    for thread_id in range(start_thread_id, end_thread_id + 1):
        thread_url = f"https://url.tld/showthread.php?{thread_id}"
        scrape_thread(thread_url)

# Example usage: Scrape threads from thread ID 1 to 100
scrape_threads_in_range(1, 100)
Note: this script is a proof of concept for a larger web forum archiving project.
It does what it says on the tin, with little to no error detection.
It will pull pages as fast as your connection and terminal allow, with no rate limiting and no regard for the web server or bot detection; a possible mitigation is sketched below.
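If you want to be gentler with the server, a request helper along these lines could replace the bare requests.get calls. This is only a sketch, not part of the original script; the 2-second delay, 3 attempts, and 30-second timeout are arbitrary assumptions you would tune for the target forum.

import time
import requests

# Hypothetical polite fetch: a fixed delay between requests plus basic retries.
def polite_get(url, delay=2.0, attempts=3):
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            time.sleep(delay)  # Pause so we do not hammer the forum
            return response
        except requests.RequestException as exc:
            print(f"Attempt {attempt} for {url} failed: {exc}")
            time.sleep(delay * attempt)  # Back off a little longer each retry
    return None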
The post content is saved as HTML. It does not save any attached media; that will be done in another script.
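As a rough idea of what that media script could start from, the sketch below (an assumption about the follow-up, not the actual script) lists the image URLs referenced in the archived HTML; it reads the same forum_scrape_info.db this script writes.

import sqlite3
from bs4 import BeautifulSoup

# Walk the archived posts and print any image URLs found in the stored HTML.
conn = sqlite3.connect('forum_scrape_info.db')
c = conn.cursor()
for post_id, html in c.execute("SELECT post_id, post_content_html FROM posts"):
    post_soup = BeautifulSoup(html, 'html.parser')
    for img in post_soup.find_all('img'):
        src = img.get('src')
        if src:
            print(post_id, src)  # A real media script would download and store these
conn.close()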
There is no login/cookie support in this script either, so you won't be able to archive threads that are not public.
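If you do need authenticated access, one untested option is to reuse the cookies from a browser session where you are already logged in. The bb_userid/bb_password names below are the vBulletin defaults, but the cookie prefix is configurable per forum, so check your own browser's cookies before assuming they match.

import requests

# Hypothetical login support via copied browser cookies.
session = requests.Session()
session.cookies.set('bb_userid', 'YOUR_USER_ID')
session.cookies.set('bb_password', 'YOUR_COOKIE_PASSWORD_HASH')

# Use session.get(...) in place of requests.get(...) inside scrape_thread
response = session.get('https://url.tld/showthread.php?1')
print(response.status_code)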
It does not detect or save extra post content such as Thanks.
It has been tested against a basic vBulletin 4.2.5 forum and uses BeautifulSoup4 to parse the HTML and extract the contents.
It might not work on all vBulletin 4 forums, and it probably needs reworking for v5 onwards or for custom themes.
Run at your own risk. The SQL inserts are parameterized, but the scraped HTML is stored verbatim and should be treated as untrusted when you read it back, so it is probably not a good idea to run this on your main terminal where you do your internet banking...
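A minimal sketch of reading the archive back while treating the stored markup as untrusted; flattening it to plain text with BeautifulSoup is just one option.

import sqlite3
from bs4 import BeautifulSoup

# Print a short plain-text preview of the first few archived posts.
conn = sqlite3.connect('forum_scrape_info.db')
c = conn.cursor()
c.execute("SELECT thread_id, username, post_date_time, post_content_html FROM posts LIMIT 10")
for thread_id, username, post_date_time, html in c.fetchall():
    text = BeautifulSoup(html, 'html.parser').get_text(' ', strip=True)  # Strip all tags
    print(f"[{thread_id}] {username} @ {post_date_time}: {text[:120]}")
conn.close()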