Last active
July 14, 2023 14:59
-
-
Save DavesCodeMusings/ec0217aae21aeaa600312e75089f3e0c to your computer and use it in GitHub Desktop.
Scrape your website's article tags to create XML for an RSS feed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[site]
url = https://www.your-site.org/

[rss]
description = News and Updates
default_author = [email protected] (Hot Shot Author)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Scrape a web site looking for <article> tags and use info
# within to construct an rss.xml feed.
#
# Configuration comes from an .ini file named after this script
# (e.g. scrape.py reads scrape.ini):
#   [site]
#   url = https://www.your-site.org/
#   [rss]
#   description = News and Updates
#   default_author = author@your-site.org (Hot Shot Author)
#
# Example web site html fragment:
# <article id="page-anchor">
#   <h2>Title</h2>
#   <p>First Paragraph</p>
#   <p>Second Paragraph</p>
#   ...
#   <p>Last Paragraph</p>
#   <address><a href="mailto:author@site.org">Author Name</a></address>
#   <time itemprop="datePublished" datetime="2023-07-14">July 14, 2023</time>
# </article>
#
from os import path
from configparser import ConfigParser
from sys import exit
from requests import get
from bs4 import BeautifulSoup
from re import sub

# Config file shares the script's base name with an .ini extension.
script_name = path.basename(__file__)
config_file = path.splitext(script_name)[0] + '.ini'
config = ConfigParser()
config.read(config_file)

# fallback= avoids NoSectionError/KeyError when the .ini file is
# missing or incomplete (config.read silently ignores a missing file).
site_url = config.get('site', 'url', fallback=None)
rss_description = config.get('rss', 'description', fallback=None)
default_author = config.get('rss', 'default_author', fallback=None)
if site_url is None:
    print("No url parameter found in", config_file)
    exit(-1)

reply = get(site_url + "index.html")
reply.encoding = config.get('site', 'encoding', fallback='utf-8')
if reply.status_code != 200:
    print("Status code:", reply.status_code)
    exit(reply.status_code)

html = BeautifulSoup(reply.text, "html.parser")

# Build the RSS skeleton: <rss version="2.0"><channel>...</channel></rss>
xml = BeautifulSoup(features="xml")
rss = xml.new_tag("rss", version="2.0")
xml.append(rss)
channel = xml.new_tag("channel")
rss.append(channel)

# Reuse the page's <title> element as the channel title.
channel.append(html.title)
link = xml.new_tag("link")
link.string = site_url
channel.append(link)
description = xml.new_tag("description")
description.string = rss_description
channel.append(description)
language = xml.new_tag("language")
language.string = "en-US"
channel.append(language)

# One <item> per <article> found on the page.
for article in html.find_all("article"):
    item = xml.new_tag("item")

    article_title = xml.new_tag("title")
    article_title.string = article.h2.text
    item.append(article_title)

    # Link directly to the article's anchor on the page.  Default the
    # id to "" so a missing attribute can't break the concatenation.
    article_link = xml.new_tag("link")
    article_link.string = site_url + "#" + article.get("id", "")
    item.append(article_link)

    # First paragraph serves as the item description, with runs of
    # spaces collapsed.  Skip the element when the article has no <p>.
    paragraphs = article.find_all("p")
    if paragraphs:
        article_description = xml.new_tag("description")
        article_description.string = sub(" +", " ", paragraphs[0].text)
        item.append(article_description)

    # Author: prefer <address><a href="mailto:...">Name</a></address>,
    # formatted per RSS convention as "email (Name)".  Fall back to the
    # address text, or to the configured default_author when the
    # article carries no <address> at all.
    if not article.address:
        # BUG FIX: was `rss_author`, an undefined name (NameError).
        article_author = default_author
    else:
        mail_link = article.address.a
        if mail_link and mail_link.get("href", "").startswith("mailto:"):
            article_author = (mail_link.get("href").replace("mailto:", "")
                              + " (" + article.address.string + ")")
        else:
            article_author = article.address.string
    author = xml.new_tag("author")
    author.string = article_author
    item.append(author)

    # Publication date, only when the <time> element is explicitly
    # marked up as itemprop="datePublished".
    if article.time and article.time.get("itemprop") == "datePublished":
        article_pub_date = xml.new_tag("pubDate")
        article_pub_date.string = article.time.get("datetime")
        item.append(article_pub_date)

    channel.append(item)

print(xml.prettify(formatter="minimal"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment