DavesCodeMusings · July 14, 2023 14:59
diff --git a/rssinator.ini b/rssinator.ini
 [site]
 url = https://www.your-site.org/

 [rss]
 description = News and Updates
 default_author = author@your-site.org (Hot Shot Author)
diff --git a/rssinator.py b/rssinator.py
 #!/usr/bin/env python

 #
 # Scrape a web site looking for <article> tags and use info
 # within to construct an rss.xml feed.
 #
 # Example web site html fragment:
 # <article id="page-anchor">
 #   <h2>Title</h2>
 #   <p>First Paragraph</p>
 #   <p>Second Paragraph</p>
 #   ...
 #   <p>Last Paragraph</p>
 #   <address><a href="mailto:author@your-site.org">Author Name</a></address>
 #   <time itemprop="datePublished" datetime="2023-07-14">July 14, 2023</time>
 # </article>
 #

 from os import path
 from configparser import ConfigParser
 from sys import exit
 from requests import get
 from bs4 import BeautifulSoup
 from re import sub

 # Locate the settings file: same base name as this script with an .ini
 # extension. Only the base name is kept, so the file is looked up relative
 # to the current working directory.
 script_name = path.basename(__file__)
 config_file = path.splitext(script_name)[0] + '.ini'
 config = ConfigParser()
 # ConfigParser.read() silently skips files it cannot open, so every lookup
 # below passes fallback=None rather than indexing a section that may not
 # exist (which would raise KeyError before the check further down).
 config.read(config_file)
 site_url = config.get('site', 'url', fallback=None)
 rss_description = config.get('rss', 'description', fallback=None)
 default_author = config.get('rss', 'default_author', fallback=None)

 # The site URL is the only setting the script cannot work without.
 if site_url is None:
    print("No url parameter found in", config_file)
    exit(-1)

 # Download the site's front page; the feed is built from its <article> tags.
 # site_url is concatenated as-is, so it is expected to end with "/".
 # A timeout is passed because requests otherwise waits indefinitely on an
 # unresponsive server.
 reply = get(site_url + "index.html", timeout=30)
 # Decode the body with the configured encoding (UTF-8 unless overridden).
 reply.encoding = config['site'].get('encoding', 'utf-8')
 if reply.status_code != 200:
    print("Status code:", reply.status_code)
    # NOTE(review): the HTTP status is used as the process exit code; values
    # above 255 are typically truncated by the OS - confirm callers rely on it.
    exit(reply.status_code)

 # Parse the downloaded page and start an empty XML document for the feed.
 html = BeautifulSoup(reply.text, "html.parser")
 xml = BeautifulSoup(features="xml")

 # Skeleton: <rss version="2.0"><channel>...</channel></rss>
 rss = xml.new_tag("rss", version="2.0")
 channel = xml.new_tag("channel")
 xml.append(rss)
 rss.append(channel)

 # The page's own <title> element doubles as the channel title.
 channel.append(html.title)

 # Remaining channel metadata, in the order it appears in the feed.
 for tag_name, tag_text in (
    ("link", site_url),
    ("description", rss_description),
    ("language", "en-US"),
 ):
    element = xml.new_tag(tag_name)
    element.string = tag_text
    channel.append(element)

 # Turn every <article> on the page into one RSS <item>, then print the feed.
 for article in html.find_all("article"):
    heading = article.h2
    paragraphs = article.find_all("p")
    # RSS 2.0 requires each item to carry at least a title or a description,
    # so an article offering neither is left out of the feed.
    if heading is None and not paragraphs:
        continue

    item = xml.new_tag("item")

    # Title comes from the article's <h2>, when it has one.
    if heading is not None:
        article_title = xml.new_tag("title")
        article_title.string = heading.text
        item.append(article_title)

    # Link to the article's anchor on the page; without an id attribute the
    # best available target is the page itself.
    article_link = xml.new_tag("link")
    article_id = article.get("id")
    if article_id:
        article_link.string = site_url + "#" + article_id
    else:
        article_link.string = site_url
    item.append(article_link)

    # Description is the first paragraph with runs of spaces collapsed.
    if paragraphs:
        article_description = xml.new_tag("description")
        article_description.string = sub(" +", " ", paragraphs[0].text)
        item.append(article_description)

    # Author: "email (Name)" when <address> wraps a mailto: link, the bare
    # address text otherwise, and the configured default_author when the
    # article names nobody.
    article_author = default_author
    if article.address is not None:
        # get_text() is used instead of .string, which is None as soon as
        # <address> holds more than a single child node.
        author_name = article.address.get_text(strip=True)
        anchor = article.address.a
        href = anchor.get("href") if anchor is not None else None
        if href and href.startswith("mailto:"):
            article_author = href[len("mailto:"):] + " (" + author_name + ")"
        elif author_name:
            article_author = author_name
    if article_author:
        author = xml.new_tag("author")
        author.string = article_author
        item.append(author)

    # Publication date is copied verbatim from <time itemprop="datePublished">.
    # NOTE(review): RSS 2.0 specifies RFC 822 dates for pubDate; the datetime
    # attribute is passed through unconverted - confirm readers accept it.
    if article.time and article.time.get("itemprop") == "datePublished":
        article_pub_date = xml.new_tag("pubDate")
        article_pub_date.string = article.time.get("datetime")
        item.append(article_pub_date)

    xml.rss.channel.append(item)

 # Emit the finished feed on standard output.
 print(xml.prettify(formatter="minimal"))
	[site]
	url = https://www.your-site.org/

	[rss]
	description = News and Updates
	default_author = author@your-site.org (Hot Shot Author)
	#!/usr/bin/env python

	#
	# Scrape a web site looking for <article> tags and use info
	# within to construct an rss.xml feed.
	#
	# Example web site html fragment:
	# <article id="page-anchor">
	# <h2>Title</h2>
	# <p>First Paragraph</p>
	# <p>Second Paragraph</p>
	# ...
	# <p>Last Paragraph</p>
	# <address><a href="mailto:author@your-site.org">Author Name</a></address>
	# <time itemprop="datePublished" datetime="2023-07-14">July 14, 2023</time>
	# </article>
	#

	from os import path
	from configparser import ConfigParser
	from sys import exit
	from requests import get
	from bs4 import BeautifulSoup
	from re import sub

	# Locate the settings file: same base name as this script with an .ini
	# extension. Only the base name is kept, so the file is looked up relative
	# to the current working directory.
	script_name = path.basename(__file__)
	config_file = path.splitext(script_name)[0] + '.ini'
	config = ConfigParser()
	# ConfigParser.read() silently skips files it cannot open, so every lookup
	# below passes fallback=None rather than indexing a section that may not
	# exist (which would raise KeyError before the check further down).
	config.read(config_file)
	site_url = config.get('site', 'url', fallback=None)
	rss_description = config.get('rss', 'description', fallback=None)
	default_author = config.get('rss', 'default_author', fallback=None)

	# The site URL is the only setting the script cannot work without.
	if site_url is None:
	    print("No url parameter found in", config_file)
	    exit(-1)

	# Download the site's front page; the feed is built from its <article> tags.
	# site_url is concatenated as-is, so it is expected to end with "/".
	# A timeout is passed because requests otherwise waits indefinitely on an
	# unresponsive server.
	reply = get(site_url + "index.html", timeout=30)
	# Decode the body with the configured encoding (UTF-8 unless overridden).
	reply.encoding = config['site'].get('encoding', 'utf-8')
	if reply.status_code != 200:
	    print("Status code:", reply.status_code)
	    # NOTE(review): the HTTP status is used as the process exit code; values
	    # above 255 are typically truncated by the OS - confirm callers rely on it.
	    exit(reply.status_code)

	# Parse the downloaded page and start an empty XML document for the feed.
	html = BeautifulSoup(reply.text, "html.parser")
	xml = BeautifulSoup(features="xml")

	# Skeleton: <rss version="2.0"><channel>...</channel></rss>
	rss = xml.new_tag("rss", version="2.0")
	channel = xml.new_tag("channel")
	xml.append(rss)
	rss.append(channel)

	# The page's own <title> element doubles as the channel title.
	channel.append(html.title)

	# Remaining channel metadata, in the order it appears in the feed.
	for tag_name, tag_text in (
	    ("link", site_url),
	    ("description", rss_description),
	    ("language", "en-US"),
	):
	    element = xml.new_tag(tag_name)
	    element.string = tag_text
	    channel.append(element)

	# Turn every <article> on the page into one RSS <item>, then print the feed.
	for article in html.find_all("article"):
	    heading = article.h2
	    paragraphs = article.find_all("p")
	    # RSS 2.0 requires each item to carry at least a title or a description,
	    # so an article offering neither is left out of the feed.
	    if heading is None and not paragraphs:
	        continue

	    item = xml.new_tag("item")

	    # Title comes from the article's <h2>, when it has one.
	    if heading is not None:
	        article_title = xml.new_tag("title")
	        article_title.string = heading.text
	        item.append(article_title)

	    # Link to the article's anchor on the page; without an id attribute the
	    # best available target is the page itself.
	    article_link = xml.new_tag("link")
	    article_id = article.get("id")
	    if article_id:
	        article_link.string = site_url + "#" + article_id
	    else:
	        article_link.string = site_url
	    item.append(article_link)

	    # Description is the first paragraph with runs of spaces collapsed.
	    if paragraphs:
	        article_description = xml.new_tag("description")
	        article_description.string = sub(" +", " ", paragraphs[0].text)
	        item.append(article_description)

	    # Author: "email (Name)" when <address> wraps a mailto: link, the bare
	    # address text otherwise, and the configured default_author when the
	    # article names nobody.
	    article_author = default_author
	    if article.address is not None:
	        # get_text() is used instead of .string, which is None as soon as
	        # <address> holds more than a single child node.
	        author_name = article.address.get_text(strip=True)
	        anchor = article.address.a
	        href = anchor.get("href") if anchor is not None else None
	        if href and href.startswith("mailto:"):
	            article_author = href[len("mailto:"):] + " (" + author_name + ")"
	        elif author_name:
	            article_author = author_name
	    if article_author:
	        author = xml.new_tag("author")
	        author.string = article_author
	        item.append(author)

	    # Publication date is copied verbatim from <time itemprop="datePublished">.
	    # NOTE(review): RSS 2.0 specifies RFC 822 dates for pubDate; the datetime
	    # attribute is passed through unconverted - confirm readers accept it.
	    if article.time and article.time.get("itemprop") == "datePublished":
	        article_pub_date = xml.new_tag("pubDate")
	        article_pub_date.string = article.time.get("datetime")
	        item.append(article_pub_date)

	    xml.rss.channel.append(item)

	# Emit the finished feed on standard output.
	print(xml.prettify(formatter="minimal"))