Yet Another Web to RSS Scraper
#! /usr/bin/env python3

import requests
from bs4 import BeautifulSoup
import PyRSS2Gen
import datetime
import html
from urllib.parse import urljoin
import json

# Define the URL of the webpage to load
url = 'https://www.sfchronicle.com/sports/'
filepath = '/path/to/your/webserver/'
slug = 'sportinggreen'
feedTitle = 'Sporting Green - SF Chronicle'

links = set()

# Load previously saved links from file
try:
    with open(f'{filepath}{slug}.json', 'r') as f:
        json_links = json.load(f)
    links_set = set(map(tuple, json_links))
    for item in links_set:
        links.add(item)
    print('Found ' + str(len(links)) + ' previous items.')
except FileNotFoundError:
    print('No data file found.')

# Get current time
current_time = datetime.datetime.utcnow()

dupes = 0
nopes = 0

# Load the webpage content
response = requests.get(url)
html_str = response.content.decode('utf-8')  # Convert bytes to string

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_str, 'html.parser')

# Extract the title of the webpage
webpage_title = soup.title.string.strip()

# Extract the links and their linked text from the webpage
# This is SF Chronicle specific, you will need to adjust
for link in soup.find_all('a', class_='hdn-analytics'):
    href = link.get('href')
    # This is a Chronicle-specific URL path, you may need to rejigger
    if href and '/article/' in href:
        href = urljoin(url, href)  # Prepend the domain to relative links
        if link.string:
            # Check whether this URL already appears in any of the saved tuples
            is_present = any(href in sublist for sublist in links)
            if is_present:
                dupes += 1
            else:
                nopes += 1
                title = link.string.strip()
                title = html.escape(title, quote=True)  # Escape special characters
                title = title.encode('ascii', 'xmlcharrefreplace').decode()  # Convert 8-bit characters to HTML entities
                timestamp = current_time.isoformat()
                links.add((href, title, timestamp))

print('Found ' + str(nopes) + ' new items and ' + str(dupes) + ' duplicates.')

# Save unique links to file
links_list = list(links)
with open(f'{filepath}{slug}.json', 'w') as f:
    json.dump(links_list[-250:], f)

# Generate an RSS feed from the links
rss_items = []
for link, title, date in links:
    rss_item = PyRSS2Gen.RSSItem(
        title=title,
        link=link,
        guid=link,
        # Hand PyRSS2Gen a datetime (rather than the stored ISO string)
        # so it can format the pubDate itself
        pubDate=datetime.datetime.fromisoformat(date)
    )
    rss_items.append(rss_item)

rss_feed = PyRSS2Gen.RSS2(
    title=feedTitle,
    link=url,
    description='RSS feed of the unique links on {}'.format(url),
    lastBuildDate=datetime.datetime.now(),
    items=rss_items,
)

# Save the RSS feed to a file
filename = f'{filepath}{slug}.rss'
with open(filename, 'w', encoding='utf-8') as f:
    rss_feed.write_xml(f)
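
For a quick sanity check after a run, a minimal sketch like the one below (standard library only, not part of the scraper itself) re-reads the generated feed and reports what it contains. The path reuses the placeholder filepath and slug values from the script above, so adjust it to match your setup.

# Sanity-check sketch: re-read the feed that was just written and report
# how many items it contains. The path is the placeholder from the script.
import xml.etree.ElementTree as ET

feed_path = '/path/to/your/webserver/sportinggreen.rss'
tree = ET.parse(feed_path)
items = tree.findall('./channel/item')
print(f'{len(items)} items in the feed')
if items:
    print('First item:', items[0].findtext('title'))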