Created April 17, 2024 03:04
A basic Python script to archive posts from a vBulletin forum into a local DB. Specify the URL and range of threads you want to crawl. It detects multiple pages and saves the content of each post along with the post metadata (user, date/time, post ID, thread ID) to a SQLite DB. It does not extract media. Created as a proof of concept for a larger scale for…
import re
import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime

# Function to create a SQLite database and tables for storing scrape details
def create_scrape_table():
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS scrape_info
                 (thread_id TEXT, scrape_datetime TEXT, status TEXT)''')
    c.execute('''CREATE TABLE IF NOT EXISTS posts
                 (thread_id TEXT, thread_title TEXT, username TEXT, member_id TEXT, post_date_time TEXT, post_pos TEXT, post_content_html TEXT, post_id TEXT PRIMARY KEY)''')
    conn.commit()
    conn.close()

# Function to insert scrape details into the SQLite database
def insert_scrape_data(thread_id, scrape_datetime, status):
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute("INSERT INTO scrape_info (thread_id, scrape_datetime, status) VALUES (?, ?, ?)",
              (thread_id, scrape_datetime, status))
    conn.commit()
    conn.close()

# Function to insert post details into the SQLite database
def insert_post_data(thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id):
    conn = sqlite3.connect('forum_scrape_info.db')
    c = conn.cursor()
    c.execute("INSERT OR REPLACE INTO posts (thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
              (thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id))
    conn.commit()
    conn.close()

# Create the scrape info tables if they don't exist
create_scrape_table()

# Function to scrape a thread and insert its details into the database
def scrape_thread(thread_url):
    response = requests.get(thread_url)
    print("Requested URL:", thread_url)  # Print the requested URL
    soup = BeautifulSoup(response.content, 'html.parser')

    # Check for a standard vBulletin error page (missing or inaccessible thread)
    standard_error_elem = soup.find('div', class_='standard_error')
    if standard_error_elem:
        print(f"Error: Thread {thread_url} does not exist or encountered an error.")
        insert_scrape_data(re.search(r'\d+', thread_url).group(), datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Error')
        return

    # Extract thread ID and title
    thread_id = re.search(r'\d+', thread_url).group()
    thread_title = soup.find('div', id='pagetitle').find('span', class_='threadtitle').text.strip()

    # Check if pagination info exists
    pagination_elem = soup.find('div', id='pagination_top')
    if pagination_elem:
        # Pagination info exists, extract total number of pages
        page_info_elem = pagination_elem.find('a', class_='popupctrl')
        if page_info_elem:
            page_info = page_info_elem.text
            num_pages = int(re.search(r'(\d+) of (\d+)', page_info).group(2))
            print(f"Detected {num_pages} pages.")
        else:
            num_pages = 1
            print("Detected single page.")
    else:
        # Pagination info not found, assume single page
        num_pages = 1
        print("Detected single page.")

    # Scrape each page
    for page_num in range(1, num_pages + 1):
        page_url = f"{thread_url}/page{page_num}" if page_num > 1 else thread_url
        print(f"Scraping page {page_num}: {page_url}")
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        main_posts_container = soup.find('ol', class_='posts')

        # Extract details from each main post and insert into the database
        for post in main_posts_container.find_all(class_='postcontainer'):
            post_id = post.get('id').split('_')[-1]

            # Extract member ID from the profile URL in the username link
            username_link_elem = post.find(class_='username offline popupctrl')
            if username_link_elem:
                username_link = username_link_elem.get('href')
                member_id = re.search(r'\d+', username_link).group()
            else:
                member_id = 'Member ID not available'

            # Extract post date and time if available
            date_elem = post.find(class_='date')
            post_date_time = date_elem.get_text(strip=True) if date_elem else 'Date and Time not available'

            # Extract post position if available
            post_pos = post.find('a', class_='postcounter').get_text(strip=True).lstrip('#') if post.find('a', class_='postcounter') else 'Post Position not available'

            # Extract username
            username_container = post.find(class_='username_container')
            username = username_container.find('strong').get_text() if (username_container and username_container.find('strong')) else 'Unknown'

            # Extract post content if available
            post_content_elem = post.find(class_='content')
            post_content_html = 'Content not available'
            if post_content_elem:
                blockquote_elem = post_content_elem.find('blockquote', class_='postcontent restore')
                if blockquote_elem:
                    # Keep the blockquote element's HTML verbatim
                    post_content_html = str(blockquote_elem)

            # Insert data into the database
            insert_post_data(thread_id, thread_title, username, member_id, post_date_time, post_pos, post_content_html, post_id)

    # Insert scrape success info into the database
    insert_scrape_data(thread_id, datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'Success')

# Function to scrape every thread ID in the given (inclusive) range
def scrape_threads_in_range(start_thread_id, end_thread_id):
    for thread_id in range(start_thread_id, end_thread_id + 1):
        thread_url = f"https://url.tld/showthread.php?{thread_id}"
        scrape_thread(thread_url)

# Example usage: Scrape threads from thread ID 1 to 100
scrape_threads_in_range(1, 100)
Note: this script is a proof of concept for a larger web forum archiving project.
It does what it says on the tin, with little to no error detection.
It will pull pages as fast as your connection and terminal allow, with no rate limiting and no regard for the web server or bot detection; a possible mitigation is sketched below.
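If you want to be gentler with the server, a request helper along these lines could replace the bare requests.get calls. This is only a sketch, not part of the original script; the 2-second delay, 3 attempts, and 30-second timeout are arbitrary assumptions you would tune for the target forum.

import time
import requests

# Hypothetical polite fetch: a fixed delay between requests plus basic retries.
def polite_get(url, delay=2.0, attempts=3):
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            time.sleep(delay)  # Pause so we do not hammer the forum
            return response
        except requests.RequestException as exc:
            print(f"Attempt {attempt} for {url} failed: {exc}")
            time.sleep(delay * attempt)  # Back off a little longer each retry
    return None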
The post content is saved as HTML. It does not save any attached media; that will be done in another script.
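As a rough idea of what that media script could start from, the sketch below (an assumption about the follow-up, not the actual script) lists the image URLs referenced in the archived HTML; it reads the same forum_scrape_info.db this script writes.

import sqlite3
from bs4 import BeautifulSoup

# Walk the archived posts and print any image URLs found in the stored HTML.
conn = sqlite3.connect('forum_scrape_info.db')
c = conn.cursor()
for post_id, html in c.execute("SELECT post_id, post_content_html FROM posts"):
    post_soup = BeautifulSoup(html, 'html.parser')
    for img in post_soup.find_all('img'):
        src = img.get('src')
        if src:
            print(post_id, src)  # A real media script would download and store these
conn.close()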
There is no login/cookie support in this script either, so you won't be able to archive threads that are not public.
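If you do need authenticated access, one untested option is to reuse the cookies from a browser session where you are already logged in. The bb_userid/bb_password names below are the vBulletin defaults, but the cookie prefix is configurable per forum, so check your own browser's cookies before assuming they match.

import requests

# Hypothetical login support via copied browser cookies.
session = requests.Session()
session.cookies.set('bb_userid', 'YOUR_USER_ID')
session.cookies.set('bb_password', 'YOUR_COOKIE_PASSWORD_HASH')

# Use session.get(...) in place of requests.get(...) inside scrape_thread
response = session.get('https://url.tld/showthread.php?1')
print(response.status_code)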
It does not detect or save extra post content such as Thanks.
It has been tested against a basic vBulletin 4.2.5 forum and uses BeautifulSoup4 to parse the HTML and extract the contents.
It might not work on all vBulletin 4 forums, and it probably needs reworking for v5 onwards or for custom themes.
Run at your own risk. The SQL inserts are parameterized, but the scraped HTML is stored verbatim and should be treated as untrusted when you read it back, so it is probably not a good idea to run this on your main terminal where you do your internet banking...
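A minimal sketch of reading the archive back while treating the stored markup as untrusted; flattening it to plain text with BeautifulSoup is just one option.

import sqlite3
from bs4 import BeautifulSoup

# Print a short plain-text preview of the first few archived posts.
conn = sqlite3.connect('forum_scrape_info.db')
c = conn.cursor()
c.execute("SELECT thread_id, username, post_date_time, post_content_html FROM posts LIMIT 10")
for thread_id, username, post_date_time, html in c.fetchall():
    text = BeautifulSoup(html, 'html.parser').get_text(' ', strip=True)  # Strip all tags
    print(f"[{thread_id}] {username} @ {post_date_time}: {text[:120]}")
conn.close()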