@theoknock
Last active April 22, 2024 20:43
Collects post content from a given site by matching tag and attribute criteria (here, div elements with class 'post'), following internal links recursively and saving the results to a JSON Lines file.
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def fetch_posts(url, visited_urls, base_url, file):
    if url in visited_urls:
        return
    print(f"Visiting {url}")
    visited_urls.add(url)
    response = requests.get(url, timeout=10)  # Timeout so a stalled request can't hang the crawl
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect the text of every post on the current page
    posts = []
    for post_div in soup.find_all('div', class_='post'):
        post_content = post_div.text.strip()
        posts.append(post_content)

    # Write the collected posts to the JSON file, one array per page
    if posts:
        json.dump(posts, file)
        file.write("\n")  # Newline separates entries for easier reading (JSON Lines)

    # Find all links on the page and recurse into internal ones only.
    # Note: a very deep link graph can exceed Python's default recursion limit.
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            href = urljoin(url, href)  # Resolve relative links against the current page
            if href.startswith(base_url):  # Checks that the link is internal
                fetch_posts(href, visited_urls, base_url, file)


# URL of the blog's main page
start_url = 'https://demonicactivity.blogspot.com'  # Replace with your blog's URL

# Set to keep track of visited URLs to avoid loops
visited_urls = set()

# Base URL that links must start with to count as internal
base_url = 'https://demonicactivity.blogspot.com'

# Open a file and write the posts as they are collected
with open('blog_posts.json', 'w') as file:
    fetch_posts(start_url, visited_urls, base_url, file)

print("Completed. Posts have been saved to 'blog_posts.json'.")