Last active
April 22, 2024 20:43
-
-
Save theoknock/9392c0c979e6ccb22db6af3a65d83d64 to your computer and use it in GitHub Desktop.
Collects data from a given site based on tag and parameter criteria
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
# Seconds to wait on a single HTTP request; without a timeout `requests`
# may block forever on an unresponsive server.
_REQUEST_TIMEOUT = 30


def _is_internal(target, base_url):
    """Return True if *target* is *base_url* itself or a URL underneath it."""
    root = base_url.rstrip('/')
    # Require a path or query separator after the root so that look-alike
    # hosts (root + ".example.org") and external URLs that merely embed the
    # blog address in a query string are rejected.
    return target == root or target.startswith((root + '/', root + '?'))


def fetch_posts(url, visited_urls, base_url, file):
    """Crawl a site starting at *url* and write each page's posts to *file*.

    Every ``<div class="post">`` found on a page is reduced to its stripped
    text. If a page has at least one post, the list of post texts is written
    to *file* as one JSON array followed by a newline, so the output is one
    JSON document per line rather than a single JSON document.

    Links are followed depth-first in document order. Relative links are
    resolved against the page they appear on, and ``#fragment`` parts are
    dropped so the same page is not fetched once per anchor.

    Args:
        url: Address of the first page to visit.
        visited_urls: Mutable set of already visited URLs. It is updated in
            place; URLs already present are never fetched.
        base_url: Prefix identifying internal links; only URLs equal to it
            or located under it are followed.
        file: Writable text file object that receives the output.

    Returns:
        None.
    """
    # An explicit stack replaces recursion: one Python frame per page would
    # hit the interpreter's recursion limit on a long chain of pages.
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited_urls:
            continue
        print(f"Visiting {current}")
        visited_urls.add(current)

        try:
            response = requests.get(current, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable or failing page should not abort the crawl.
            print(f"Skipping {current}: {exc}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect and write posts on the current page to the output file.
        posts = [
            post_div.text.strip()
            for post_div in soup.find_all('div', class_='post')
        ]
        if posts:
            json.dump(posts, file)
            file.write("\n")  # Newline separates one page's posts from the next

        # Gather internal links, normalised to absolute, fragment-free URLs.
        internal_links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            target = urldefrag(urljoin(current, href)).url
            if _is_internal(target, base_url):
                internal_links.append(target)

        # Push in reverse so the first link on the page is popped first,
        # matching the visit order of a recursive depth-first crawl.
        pending.extend(reversed(internal_links))
# Prefix a link must contain to be treated as part of this blog
base_url = 'https://demonicactivity.blogspot.com'
# Page where the crawl begins
start_url = 'https://demonicactivity.blogspot.com'  # Replace with your blog's URL
# Pages already fetched, so the crawl never loops back onto itself
visited_urls = set()

# Posts are written to disk while the crawl runs rather than held in memory
with open('blog_posts.json', 'w') as file:
    fetch_posts(start_url, visited_urls, base_url, file)

print("Completed. Posts have been saved to 'blog_posts.json'.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment