Skip to content

Instantly share code, notes, and snippets.

@DavesCodeMusings
Last active July 14, 2023 14:59
Show Gist options
  • Select an option

  • Save DavesCodeMusings/ec0217aae21aeaa600312e75089f3e0c to your computer and use it in GitHub Desktop.

Select an option

Save DavesCodeMusings/ec0217aae21aeaa600312e75089f3e0c to your computer and use it in GitHub Desktop.
Scrape your website's article tags to create XML for an RSS feed
[site]
url = https://www.your-site.org/
[rss]
description = News and Updates
default_author = author@your-site.org (Hot Shot Author)
#!/usr/bin/env python
#
# Scrape a web site looking for <article> tags and use info
# within to construct an rss.xml feed.
#
# Example web site html fragment:
# <article id="page-anchor">
# <h2>Title</h2>
# <p>First Paragraph</p>
# <p>Second Paragraph</p>
# ...
# <p>Last Paragraph</p>
# <address><a href="mailto:author@your-site.org">Author Name</a></address>
# <time itemprop="datePublished" datetime="2023-07-14">July 14, 2023</time>
# </article>
#
from os import path
from configparser import ConfigParser
from sys import exit
from requests import get
from bs4 import BeautifulSoup
from re import sub
# The configuration file shares the script's base name: foo.py -> foo.ini.
# Only the base name is used, so the file is looked up in the current
# working directory.
script_name = path.basename(__file__)
config_file = path.splitext(script_name)[0] + '.ini'

config = ConfigParser()
config.read(config_file)

# ConfigParser.read() silently ignores a missing file, and indexing a
# missing section raises KeyError, so check up front and fail with a
# readable message instead of a traceback.
for required_section in ('site', 'rss'):
    if not config.has_section(required_section):
        print("No [" + required_section + "] section found in", config_file)
        exit(-1)

site_url = config['site'].get('url')
rss_description = config['rss'].get('description')
default_author = config['rss'].get('default_author')

# The url is the only mandatory setting; everything else has a fallback.
if site_url is None:
    print("No url parameter found in", config_file)
    exit(-1)
# Download the site's index page.
# - Normalise the trailing slash so a url configured without one still
#   yields ".../index.html" rather than "...hostindex.html".
# - Pass a timeout: without it the request can block indefinitely when
#   the server stops responding.
reply = get(site_url.rstrip("/") + "/index.html", timeout=30)

# Decode the body with the configured encoding (utf-8 when not set).
reply.encoding = config['site'].get('encoding', 'utf-8')

# Anything other than 200 OK aborts, using the HTTP status as exit code.
if reply.status_code != 200:
    print("Status code:", reply.status_code)
    exit(reply.status_code)
# Parse the downloaded page and start an empty XML document for the feed.
html = BeautifulSoup(reply.text, "html.parser")
xml = BeautifulSoup(features="xml")

# <rss version="2.0"><channel> ... </channel></rss>
rss = xml.new_tag("rss", version="2.0")
xml.append(rss)
channel = xml.new_tag("channel")
rss.append(channel)

# The page's own <title> element is moved into the channel. A page
# without a <title> would make append() fail, so fall back to the site
# url as the channel title in that case.
if html.title is not None:
    channel.append(html.title)
else:
    channel_title = xml.new_tag("title")
    channel_title.string = site_url
    channel.append(channel_title)

link = xml.new_tag("link")
link.string = site_url
channel.append(link)

# An unset description arrives as None, which cannot be assigned to
# .string; emit an empty element instead.
description = xml.new_tag("description")
description.string = rss_description or ""
channel.append(description)

# Language may be overridden with an optional "language" key in the
# [rss] section; the default keeps the previous hard-coded value.
language = xml.new_tag("language")
language.string = config['rss'].get('language', 'en-US')
channel.append(language)
# Emit one <item> per <article> element found on the page, then print
# the finished feed to standard output.
for article in html.find_all("article"):
    heading = article.h2
    blurb = article.find_all("p")

    # An RSS item needs at least a title or a description; an article
    # offering neither is skipped rather than crashing the whole run.
    if heading is None and not blurb:
        continue

    item = xml.new_tag("item")

    # Title: text of the article's first <h2>.
    if heading is not None:
        article_title = xml.new_tag("title")
        article_title.string = heading.text
        item.append(article_title)

    # Link: the page url plus the article's id as fragment. Articles
    # without an id link to the page itself.
    article_id = article.get("id")
    article_link = xml.new_tag("link")
    if article_id:
        article_link.string = site_url + "#" + article_id
    else:
        article_link.string = site_url
    item.append(article_link)

    # Description: first paragraph, with runs of spaces collapsed.
    if blurb:
        article_description = xml.new_tag("description")
        first_blurb_para = blurb[0].text
        article_description.string = sub(" +", " ", first_blurb_para)
        item.append(article_description)

    # Author: taken from <address>. With a mailto: link the result is
    # "email (Name)"; without one it is the address text alone. Articles
    # lacking <address> use the configured default author.
    address = article.address
    if address is None:
        article_author = default_author
    else:
        # get_text() equals .string for a single text node but does not
        # turn into None when <address> holds several child nodes.
        author_name = address.get_text()
        mail_href = address.a.get("href") if address.a else None
        if mail_href and mail_href.startswith("mailto:"):
            article_author = mail_href.replace("mailto:", "") + " (" + author_name + ")"
        else:
            article_author = author_name

    # <author> is optional; omit it when nothing is known, since None
    # cannot be assigned to .string.
    if article_author:
        author = xml.new_tag("author")
        author.string = article_author
        item.append(author)

    # Publication date: the datetime attribute of
    # <time itemprop="datePublished">, passed through unchanged.
    # NOTE(review): RSS 2.0 specifies RFC 822 dates for pubDate; an ISO
    # value such as "2023-07-14" is copied as-is here.
    published = article.time
    if published is not None and published.get("itemprop") == "datePublished":
        published_value = published.get("datetime")
        if published_value:
            article_pub_date = xml.new_tag("pubDate")
            article_pub_date.string = published_value
            item.append(article_pub_date)

    xml.rss.channel.append(item)

print(xml.prettify(formatter="minimal"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment