theoknock · April 22, 2024 20:43
diff --git a/Site Crawler.py b/Site Crawler.py
 import requests
 from bs4 import BeautifulSoup
 import json

 def fetch_posts(url, visited_urls, base_url, file):
     """Crawl a blog starting at ``url`` and append each page's posts to ``file``.

     Pages are visited depth-first, following links in the order they appear
     on each page. For every page that contains at least one
     ``<div class="post">`` element, the stripped text of those elements is
     written to ``file`` as one JSON array followed by a newline.

     Args:
         url: Address of the first page to fetch.
         visited_urls: Set of URLs already fetched. It is updated in place so
             that no page is requested twice, including across separate calls.
         base_url: Prefix (scheme and host) a link must start with to be
             treated as internal and followed.
         file: Writable text file object that receives the JSON lines.
     """
     # An explicit stack replaces recursion: nesting one call per followed
     # link raises RecursionError once a blog has enough chained pages.
     pending = [url]
     while pending:
         current = pending.pop()
         if current in visited_urls:
             continue

         print(f"Visiting {current}")
         visited_urls.add(current)
         try:
             # requests waits indefinitely on a stalled server unless a
             # timeout is supplied.
             response = requests.get(current, timeout=30)
             response.raise_for_status()
         except requests.RequestException as error:
             # One unreachable or failing page must not abort the whole crawl.
             print(f"Skipping {current}: {error}")
             continue
         soup = BeautifulSoup(response.text, 'html.parser')

         # Collect the text of every post on the current page
         posts = [
             post_div.text.strip()
             for post_div in soup.find_all('div', class_='post')
         ]

         # One JSON array per page, newline-terminated for easier reading
         if posts:
             json.dump(posts, file)
             file.write("\n")

         internal_links = []
         for link in soup.find_all('a'):
             href = link.get('href')
             # Prefix match rather than substring match: an external share or
             # redirect link can embed the blog URL in its query string.
             if href and href.startswith(base_url):
                 internal_links.append(href)

         # Push in reverse so links are popped in document order, keeping the
         # same depth-first visiting order as the recursive version.
         pending.extend(reversed(internal_links))

 # Address that fetch_posts compares links against to keep the crawl on this blog
 base_url = 'https://demonicactivity.blogspot.com'

 # Front page where crawling begins (swap in your own blog's URL)
 start_url = 'https://demonicactivity.blogspot.com'

 # Pages already fetched; guards against revisiting and link cycles
 visited_urls = set()

 # Posts are streamed into the output file while the crawl runs
 with open('blog_posts.json', 'w') as file:
     fetch_posts(start_url, visited_urls, base_url, file)

 print("Completed. Posts have been saved to 'blog_posts.json'.")
	import requests
	from bs4 import BeautifulSoup
	import json

	def fetch_posts(url, visited_urls, base_url, file):
	    """Crawl a blog starting at ``url`` and append each page's posts to ``file``.

	    Pages are visited depth-first, following links in the order they appear
	    on each page. For every page that contains at least one
	    ``<div class="post">`` element, the stripped text of those elements is
	    written to ``file`` as one JSON array followed by a newline.

	    Args:
	        url: Address of the first page to fetch.
	        visited_urls: Set of URLs already fetched. It is updated in place so
	            that no page is requested twice, including across separate calls.
	        base_url: Prefix (scheme and host) a link must start with to be
	            treated as internal and followed.
	        file: Writable text file object that receives the JSON lines.
	    """
	    # An explicit stack replaces recursion: nesting one call per followed
	    # link raises RecursionError once a blog has enough chained pages.
	    pending = [url]
	    while pending:
	        current = pending.pop()
	        if current in visited_urls:
	            continue

	        print(f"Visiting {current}")
	        visited_urls.add(current)
	        try:
	            # requests waits indefinitely on a stalled server unless a
	            # timeout is supplied.
	            response = requests.get(current, timeout=30)
	            response.raise_for_status()
	        except requests.RequestException as error:
	            # One unreachable or failing page must not abort the whole crawl.
	            print(f"Skipping {current}: {error}")
	            continue
	        soup = BeautifulSoup(response.text, 'html.parser')

	        # Collect the text of every post on the current page
	        posts = [
	            post_div.text.strip()
	            for post_div in soup.find_all('div', class_='post')
	        ]

	        # One JSON array per page, newline-terminated for easier reading
	        if posts:
	            json.dump(posts, file)
	            file.write("\n")

	        internal_links = []
	        for link in soup.find_all('a'):
	            href = link.get('href')
	            # Prefix match rather than substring match: an external share or
	            # redirect link can embed the blog URL in its query string.
	            if href and href.startswith(base_url):
	                internal_links.append(href)

	        # Push in reverse so links are popped in document order, keeping the
	        # depth-first visiting order a recursive crawl would produce.
	        pending.extend(reversed(internal_links))

	# URL of the blog's main page, where the crawl starts
	start_url = 'https://demonicactivity.blogspot.com' # Replace with your blog's URL

	# Set of already-visited URLs, shared with fetch_posts to avoid loops
	visited_urls = set()

	# Base URL that fetch_posts compares links against to keep the crawl internal
	base_url = 'https://demonicactivity.blogspot.com'

	# Open the output file once and let fetch_posts write posts as they are
	# collected; the call must be indented inside the with-block so the file
	# is still open while the crawl runs.
	with open('blog_posts.json', 'w') as file:
	    fetch_posts(start_url, visited_urls, base_url, file)

	print("Completed. Posts have been saved to 'blog_posts.json'.")