
@Jackster
Created April 17, 2024 03:04
A basic Python script to archive posts from a vBulletin forum into a local DB. Specify the URL and range of threads you want to crawl. Detects multiple pages and saves the content of each post along with the post data (user, date/time, post ID, thread ID) to a SQLite DB. Does not extract media. Created as a proof of concept for a larger scale for…
import re
import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime

# Function to create a SQLite database and tables for storing scrape details
def create_scrape_table():
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS scrape_info
                 (thread_id TEXT, scrape_datetime TEXT, status TEXT)''')
    c.execute('''CREATE TABLE IF NOT EXISTS posts
                 (thread_id TEXT, thread_title TEXT, username TEXT, member_id TEXT, post_date_time TEXT, post_pos TEXT, post_content_html TEXT, post_id TEXT PRIMARY KEY)''')
    conn.commit()
    conn.close()

# Function to insert scrape details into the SQLite database
def insert_scrape_data(thread_id, scrape_datetime, status):
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute("INSERT INTO scrape_info (thread_id, scrape_datetime, status) VALUES (?, ?, ?)",
              (thread_id, scrape_datetime, status))
    conn.commit()
    conn.close()

# Function to insert post details into the SQLite database
def insert_post_data(thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id):
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute("INSERT OR REPLACE INTO posts (thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
              (thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id))
    conn.commit()
    conn.close()

# Create the scrape info tables if they don't exist
create_scrape_table()

# Function to scrape a thread and insert its details into the database
def scrape_thread(thread_url):
    response = requests.get(thread_url)
    print("Requested URL:", thread_url)  # Print the requested URL
    soup = BeautifulSoup(response.content, 'html.parser')

    # Check for standard error
    standard_error_elem = soup.find('div', class_='standard_error')
    if standard_error_elem:
        print(f"Error: Thread {thread_url} does not exist or encountered an error.")
        insert_scrape_data(re.search(r'\d+', thread_url).group(), datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Error')
        return

    # Extract thread ID and title
    thread_id = re.search(r'\d+', thread_url).group()
    thread_title = soup.find('div', id='pagetitle').find('span', class_='threadtitle').text.strip()

    # Check if pagination info exists
    pagination_elem = soup.find('div', id='pagination_top')
    if pagination_elem:
        # Pagination info exists, extract total number of pages
        page_info_elem = pagination_elem.find('a', class_='popupctrl')
        if page_info_elem:
            page_info = page_info_elem.text
            num_pages = int(re.search(r'(\d+) of (\d+)', page_info).group(2))
            print(f"Detected {num_pages} pages.")
        else:
            num_pages = 1
            print("Detected single page.")
    else:
        # Pagination info not found, assume single page
        num_pages = 1
        print("Detected single page.")

    # Scrape each page
    for page_num in range(1, num_pages + 1):
        page_url = f"{thread_url}/page{page_num}" if page_num > 1 else thread_url
        print(f"Scraping page {page_num}: {page_url}")
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        main_posts_container = soup.find('ol', class_='posts')

        # Extract details from each main post and insert into the database
        for post in main_posts_container.find_all(class_='postcontainer'):
            post_id = post.get('id').split('_')[-1]

            # Extract member ID from URL
            username_link_elem = post.find(class_='username offline popupctrl')
            if username_link_elem:
                username_link = username_link_elem.get('href')
                member_id = re.search(r'\d+', username_link).group()
            else:
                member_id = 'Member ID not available'

            # Extract post date and time if available
            date_elem = post.find(class_='date')
            post_date_time = date_elem.get_text(strip=True) if date_elem else 'Date and Time not available'

            # Extract post position if available
            post_pos = post.find('a', class_='postcounter').get_text(strip=True).lstrip('#') if post.find('a', class_='postcounter') else 'Post Position not available'

            # Extract username
            username_container = post.find(class_='username_container')
            username = username_container.find('strong').get_text() if (username_container and username_container.find('strong')) else 'Unknown'

            # Extract post content if available
            post_content_elem = post.find(class_='content')
            post_content_html = 'Content not available'
            if post_content_elem:
                blockquote_elem = post_content_elem.find('blockquote', class_='postcontent restore')
                if blockquote_elem:
                    # Extract inner HTML of blockquote element
                    post_content_html = str(blockquote_elem)

            # Insert data into the database
            insert_post_data(thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id)

    # Insert scrape success info into the database
    insert_scrape_data(thread_id, datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Success')

def scrape_threads_in_range(start_thread_id, end_thread_id):
    for thread_id in range(start_thread_id, end_thread_id + 1):
        thread_url = f"https://url.tld/showthread.php?{thread_id}"
        scrape_thread(thread_url)

# Example usage: Scrape threads from thread ID 1 to 100
scrape_threads_in_range(1, 100)

Jackster commented Apr 17, 2024

Note: this script is a proof of concept for a larger web forum archiving project.
It does what it says on the tin, with little if any error handling.
It will fetch pages as fast as your connection can pull them from the website, with no rate limiting and no regard for the web server's load or for bot detection (a quick way to slow it down is sketched below).
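If you want to be gentler on the forum, a minimal sketch like the following adds a fixed pause between threads; the wrapper name and the one-second default are placeholder choices, not part of the original script.

import time

def scrape_threads_in_range_politely(start_thread_id, end_thread_id, delay_seconds=1.0):
    # Same loop as scrape_threads_in_range, but sleep between threads
    for thread_id in range(start_thread_id, end_thread_id + 1):
        thread_url = f"https://url.tld/showthread.php?{thread_id}"
        scrape_thread(thread_url)
        time.sleep(delay_seconds)  # pause so the forum isn't hammered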

The post content is saved as raw HTML. It does not save any attached media; that will be handled in another script.
There is no login/cookie support in this script either, so you won't be able to archive threads that are not public (one possible workaround is sketched after this paragraph).
It does not detect or save extra post content such as "Thanks" reactions.
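If you do need members-only threads, one rough approach, not part of this script, is to reuse the cookies from a logged-in browser session via requests.Session. The cookie names below are placeholders; the actual names depend on the forum's configured cookie prefix.

session = requests.Session()
# Placeholder cookie names/values; copy the real ones from your logged-in browser
session.cookies.set('bbuserid', 'YOUR_USER_ID')
session.cookies.set('bbpassword', 'YOUR_COOKIE_HASH')
# Then call session.get(...) instead of requests.get(...) inside scrape_thread()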

This has been tested against a basic vBulletin 4.2.5 forum. It uses BeautifulSoup4 to parse the HTML and locate the post contents.
It might not work on every vBulletin 4 forum, and it will probably need reworking for v5 onwards or for custom themes.

Run at your own risk. The scraped HTML is written to the database verbatim, with no sanitization or validation, so treat the archived data as untrusted content; probably not a good idea to run this on your main terminal where you do your internet banking...
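For completeness, a minimal sketch of reading the archived posts back out of the SQLite file; the column names match the schema the script creates, and the thread ID '1' is just an example value.

import sqlite3

conn = sqlite3.connect('forum_scrape_info.db')
c = conn.cursor()
# List the archived posts of one thread, ordered by post position
query = ("SELECT post_pos, username, post_date_time, post_content_html "
         "FROM posts WHERE thread_id = ? ORDER BY CAST(post_pos AS INTEGER)")
for post_pos, username, post_date_time, post_content_html in c.execute(query, ('1',)):
    print(post_pos, username, post_date_time)
conn.close()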
