Created
December 30, 2022 18:11
-
-
Save johnowhitaker/741e9c9f16f7eb9c879bc0dbcd780b59 to your computer and use it in GitHub Desktop.
app to summarize an RSS feed and write to a new RSS feed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import feedparser
import requests
import trafilatura
from bs4 import BeautifulSoup
from feedgenerator import DefaultFeed, Enclosure
# Hosted inference endpoint for the facebook/bart-large-cnn model.
API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
# The bearer token is read from the HF_TOKEN environment variable so the secret
# does not have to be pasted into the source; when the variable is unset the
# literal placeholder "HF_TOKEN" is sent, exactly as before.
headers = {"Authorization": "Bearer " + os.environ.get("HF_TOKEN", "HF_TOKEN")}
def query(payload, timeout=30):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait on the server. Without a timeout a stalled
            connection would block the script indefinitely.

    Returns:
        The decoded JSON body of the response, whatever the endpoint sent
        (the HTTP status is not inspected here).

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()
def summarize(text):
    """Return a summary of ``text`` produced by the model behind ``query``.

    Args:
        text: The article text to summarise, or ``None``.

    Returns:
        The ``summary_text`` string of the first result, or ``None`` when
        ``text`` is ``None`` (no request is made in that case).

    Raises:
        RuntimeError: If the endpoint replies with an error object instead of
            a list of results.
    """
    if text is None:
        return None
    output = query({
        "inputs": text,  # TODO pick max size
        # Generation options belong under "parameters"; placed at the top
        # level of the payload they are not treated as generation settings.
        "parameters": {
            "max_length": 300,
            "min_length": 30,
            "do_sample": False,
        },
    })
    # A failed request comes back as a dict rather than a list of results;
    # surface its message instead of failing on output[0] with KeyError: 0.
    if isinstance(output, dict):
        raise RuntimeError("Summarization request failed: " + str(output.get("error", output)))
    return output[0]['summary_text']
# Source feed: the Hacker News front page.
HN_Feed = feedparser.parse('https://hnrss.org/frontpage')
# Feed written by a previous run, if any.
# NOTE(review): Prev_Feed is parsed but not read anywhere in the visible script.
Prev_Feed = feedparser.parse('feed.xml')
# Output feed that the summarised entries are added to.
Out_Feed = DefaultFeed(
    title="DistilHN Feed",
    link="http://example.com/rss",
    description="Front Page articles from HN, summarized with AI",
)
# Build one output item per front-page entry: fetch and summarise the linked
# article where possible, pick a thumbnail, then append the item to Out_Feed.
for p in HN_Feed.entries:
    print(p['title'])

    # Default thumbnail: the HN favicon for links on ycombinator, otherwise a
    # placeholder image that is replaced when the article declares og:image.
    im_url = 'https://news.ycombinator.com/favicon.ico'
    if 'ycombinator' not in p['link']:
        im_url = 'https://placekitten.com/g/300/200'

    # Summarize
    summary = p['summary']
    if 'Article URL' in summary:
        failure_note = "Summary failed. Article URL: " + p['link']
        # Pre-set the failure text so it is what remains when no text can be
        # extracted from the page.
        summary = failure_note
        try:
            downloaded = trafilatura.fetch_url(p['link'])
            if downloaded is None:
                raise ValueError("could not download " + p['link'])
            text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
            if text is not None:
                # Get summary:
                summary = summarize(text)
            # Get image URL
            # NOTE(review): the flattened source does not show whether this
            # lookup was nested under the `if` above; it is kept independent
            # so a page without extractable text can still supply an image.
            soup = BeautifulSoup(downloaded, 'html.parser')
            im = soup.find("meta", property="og:image")
            # .get() so a meta tag without a content attribute keeps the
            # default image instead of discarding a successful summary.
            if im and im.get('content'):
                im_url = im['content']
        except Exception as err:
            # Best effort: any failure for one article must not abort the
            # whole feed, but Ctrl-C / SystemExit are no longer swallowed.
            print("Summary failed for " + p['link'] + ": " + repr(err))
            summary = failure_note
            im_url = 'None'
    else:
        # Truncate & Remove HTML (for askHN and similar)
        if len(summary) > 10:
            # trafilatura.extract returns None when it finds no main text;
            # fall back to a plain tag strip rather than crashing on None[:240].
            plain = trafilatura.extract(summary)
            if plain is None:
                plain = BeautifulSoup(summary, 'html.parser').get_text()
            summary = plain[:240] + '...'

    # Special rules: these link types are not summarised, only labelled.
    # YouTube
    if 'youtube' in p['link']:
        summary = "YouTube Video: " + p['link']
        im_url = 'None'
    # Mastodon
    if 'mastodon' in p['link'] or 'mastodon' in p['summary']:
        summary = "Mastodon Post: " + p['link']
        im_url = 'None'
    # Twitter
    if 'twitter' in p['link']:
        summary = "Twitter Post: " + p['link']
        im_url = 'None'

    # Add to feed
    Out_Feed.add_item(
        title=p['title'],
        link=p['link'],
        description=summary,
        # .get() so an entry without a comments link does not abort the run.
        comments=p.get('comments'),
        # The enclosure length and MIME type are fixed placeholder values.
        enclosure=Enclosure(im_url, '1234', 'image/jpeg'),
    )
# Generate the RSS feed XML
rss = Out_Feed.writeString('utf-8')
# Save the RSS feed to a file. The encoding is given explicitly so the bytes
# on disk match the utf-8 the feed was generated for, instead of depending on
# the platform's default text encoding.
with open('feed.xml', 'w', encoding='utf-8') as f:
    f.write(rss)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment