Yet Another Web to RSS Scraper
#! /usr/bin/env python3

import requests
from bs4 import BeautifulSoup
import PyRSS2Gen
import datetime
import html
from urllib.parse import urljoin
import json

# Define the URL of the webpage to load
url = 'https://www.sfchronicle.com/sports/'
filepath = '/path/to/your/webserver/'
slug = 'sportinggreen'
feedTitle = 'Sporting Green - SF Chronicle'

links = set()

# Load previously saved links from file
try:
    with open(f'{filepath}{slug}.json', 'r') as f:
        json_links = json.load(f)
    links_set = set(map(tuple, json_links))
    for item in links_set:
        links.add(item)
    print('Found ' + str(len(links)) + ' previous items.')
except FileNotFoundError:
    print('No data file found.')

# Get current time
current_time = datetime.datetime.utcnow()

dupes = 0
nopes = 0

# Load the webpage content
response = requests.get(url)
html_str = response.content.decode('utf-8')  # Convert bytes to string

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_str, 'html.parser')

# Extract the title of the webpage
webpage_title = soup.title.string.strip()

# Extract the links and their linked text from the webpage
# This is SF Chronicle specific, you will need to adjust
for link in soup.find_all('a', class_='hdn-analytics'):
    href = link.get('href')
    # This is a Chronicle-specific URL path, you may need to rejigger
    if href and '/article/' in href:
        href = urljoin(url, href)  # Prepend the domain to relative links
        if link.string:
            # Check whether this URL already appears in any of the saved tuples
            is_present = any(href in sublist for sublist in links)
            if is_present:
                dupes += 1
            else:
                nopes += 1
                title = link.string.strip()
                title = html.escape(title, quote=True)  # Escape special characters
                title = title.encode('ascii', 'xmlcharrefreplace').decode()  # Convert 8-bit characters to HTML entities
                timestamp = current_time.isoformat()
                links.add((href, title, timestamp))

print('Found ' + str(nopes) + ' new items and ' + str(dupes) + ' duplicates.')

# Save unique links to file
links_list = list(links)
with open(f'{filepath}{slug}.json', 'w') as f:
    json.dump(links_list[-250:], f)

# Generate an RSS feed from the links
rss_items = []
for link, title, date in links:
    rss_item = PyRSS2Gen.RSSItem(
        title=title,
        link=link,
        guid=link,
        # Hand PyRSS2Gen a datetime (rather than the stored ISO string)
        # so it can format the pubDate itself
        pubDate=datetime.datetime.fromisoformat(date)
    )
    rss_items.append(rss_item)

rss_feed = PyRSS2Gen.RSS2(
    title=feedTitle,
    link=url,
    description='RSS feed of the unique links on {}'.format(url),
    lastBuildDate=datetime.datetime.now(),
    items=rss_items,
)

# Save the RSS feed to a file
filename = f'{filepath}{slug}.rss'
with open(filename, 'w', encoding='utf-8') as f:
    rss_feed.write_xml(f)
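
For a quick sanity check after a run, a minimal sketch like the one below (standard library only, not part of the scraper itself) re-reads the generated feed and reports what it contains. The path reuses the placeholder filepath and slug values from the script above, so adjust it to match your setup.

# Sanity-check sketch: re-read the feed that was just written and report
# how many items it contains. The path is the placeholder from the script.
import xml.etree.ElementTree as ET

feed_path = '/path/to/your/webserver/sportinggreen.rss'
tree = ET.parse(feed_path)
items = tree.findall('./channel/item')
print(f'{len(items)} items in the feed')
if items:
    print('First item:', items[0].findtext('title'))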