johnowhitaker · December 30, 2022 18:11
diff --git a/make_rss.py b/make_rss.py
 import trafilatura
 import feedparser
 import requests
 from bs4 import BeautifulSoup
 from feedgenerator import DefaultFeed, Enclosure


 # Hugging Face hosted-inference endpoint for the facebook/bart-large-cnn model.
 API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
 # NOTE(review): "HF_TOKEN" is sent literally as the bearer token; presumably it is
 # substituted with a real token before this script runs -- confirm.
 headers = {"Authorization": "Bearer HF_TOKEN"}

 def query(payload, timeout=60):
    """POST *payload* as JSON to the inference endpoint and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled call
            would hang the whole feed build.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

 def summarize(text):
    """Return a summary of *text* from the inference API, or None if *text* is None.

    The generation settings are sent under the "parameters" key, which is
    where the hosted inference API reads them from; as top-level keys next to
    "inputs" they are not applied.

    Raises:
        KeyError / IndexError / TypeError: if the reply is not a non-empty
            list of objects carrying 'summary_text' (e.g. the API answered
            with an error object).
    """
    if text is None:
        return None
    output = query({
        "inputs": text,  # TODO pick max size
        "parameters": {
            "max_length": 300,
            "min_length": 30,
            "do_sample": False,
        },
    })
    return output[0]['summary_text']

 # Source feed: the current Hacker News front page.
 HN_Feed = feedparser.parse('https://hnrss.org/frontpage')
 # Previously generated output; not referenced in the visible part of the script.
 Prev_Feed = feedparser.parse('feed.xml')
 # Output feed that receives one item per front-page entry.
 Out_Feed = DefaultFeed(
    title="DistilHN Feed",
    link="http://example.com/rss",
    # Spelling fixed ("sumarized" -> "summarized"): this text is published to readers.
    description="Front Page articles from HN, summarized with AI"
 )

 # Build one output item per Hacker News front-page entry.
 for p in HN_Feed.entries:

    print(p['title'])

    # Default thumbnail: the HN favicon for on-site links, a placeholder otherwise.
    im_url = 'https://news.ycombinator.com/favicon.ico'
    if 'ycombinator' not in p['link']:
        im_url = 'https://placekitten.com/g/300/200'

    # Summarize
    summary = p['summary']
    if 'Article URL' in summary:
        # Link post: start from the failure text so it is what gets published
        # when no article text can be extracted.
        summary = "Summary failed. Article URL: " + p['link']
        try:
            downloaded = trafilatura.fetch_url(p['link'])
            text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
            if text is not None:

                # Get summary:
                summary = summarize(text)

                # Get image URL
                soup = BeautifulSoup(downloaded, 'html.parser')
                im = soup.find("meta", property="og:image")
                im_url = im['content'] if im else im_url
        except Exception:
            # Best effort: any download/parse/API failure degrades to the
            # failure text. `Exception` (not a bare except) lets Ctrl-C and
            # SystemExit still stop the script.
            summary = "Summary failed. Article URL: " + p['link']
            im_url = 'None'
    else:
        # Truncate & Remove HTML (for askHN and similar)
        if len(summary) > 10:
            # trafilatura.extract() may return None (nothing extractable);
            # slicing None would crash the whole run, so fall back to a plain
            # tag-stripping pass with BeautifulSoup.
            cleaned = trafilatura.extract(summary)
            if not cleaned:
                cleaned = BeautifulSoup(summary, 'html.parser').get_text()
            summary = cleaned[:240] + '...'

    # Special rules: these override whatever was produced above.

    # YouTube
    if 'youtube' in p['link']:
        summary = "YouTube Video: " + p['link']
        im_url = 'None'

    # Mastodon
    # NOTE(review): the second test looks for the misspelling 'mastadon' in the
    # entry summary; left unchanged because correcting it would relabel text
    # posts that merely mention Mastodon -- confirm the intent.
    if 'mastodon' in p['link'] or 'mastadon' in p.summary:
        summary = "Mastodon Post: " + p['link']
        im_url = 'None'

    # Twitter
    if 'twitter' in p['link']:
        summary = "Twitter Post: " + p['link']
        im_url = 'None'

    # Add to feed
    Out_Feed.add_item(
        title=p['title'],
        link=p['link'],
        description=summary,
        comments=p['comments'],
        # Length and MIME type are fixed placeholder values.
        enclosure=Enclosure(im_url, '1234', 'image/jpeg'),
    )

 # Generate the RSS feed XML
 rss = Out_Feed.writeString('utf-8')

 # Save the RSS feed to a file. The XML is generated as UTF-8, so write it with
 # that encoding explicitly instead of relying on the platform default.
 with open('feed.xml', 'w', encoding='utf-8') as f:
    f.write(rss)
	import trafilatura
	import feedparser
	import requests
	from bs4 import BeautifulSoup
	from feedgenerator import DefaultFeed, Enclosure


	# Hugging Face hosted-inference endpoint for the facebook/bart-large-cnn model.
	API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
	# NOTE(review): "HF_TOKEN" is sent literally as the bearer token; presumably it is
	# substituted with a real token before this script runs -- confirm.
	headers = {"Authorization": "Bearer HF_TOKEN"}

	def query(payload, timeout=60):
	    """POST *payload* as JSON to the inference endpoint and return the decoded reply.

	    Args:
	        payload: JSON-serialisable request body.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests`` waits indefinitely, so a single stalled call
	            would hang the whole feed build.

	    Returns:
	        The response body decoded from JSON.

	    Raises:
	        requests.exceptions.Timeout: if the server does not answer in time.
	    """
	    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
	    return response.json()

	def summarize(text):
	    """Return a summary of *text* from the inference API, or None if *text* is None.

	    The generation settings are sent under the "parameters" key, which is
	    where the hosted inference API reads them from; as top-level keys next to
	    "inputs" they are not applied.

	    Raises:
	        KeyError / IndexError / TypeError: if the reply is not a non-empty
	            list of objects carrying 'summary_text' (e.g. the API answered
	            with an error object).
	    """
	    if text is None:
	        return None
	    output = query({
	        "inputs": text,  # TODO pick max size
	        "parameters": {
	            "max_length": 300,
	            "min_length": 30,
	            "do_sample": False,
	        },
	    })
	    return output[0]['summary_text']

	# Source feed: the current Hacker News front page.
	HN_Feed = feedparser.parse('https://hnrss.org/frontpage')
	# Previously generated output; not referenced in the visible part of the script.
	Prev_Feed = feedparser.parse('feed.xml')
	# Output feed that receives one item per front-page entry.
	Out_Feed = DefaultFeed(
	    title="DistilHN Feed",
	    link="http://example.com/rss",
	    # Spelling fixed ("sumarized" -> "summarized"): this text is published to readers.
	    description="Front Page articles from HN, summarized with AI"
	)

	# Build one output item per Hacker News front-page entry.
	for p in HN_Feed.entries:

	    print(p['title'])

	    # Default thumbnail: the HN favicon for on-site links, a placeholder otherwise.
	    im_url = 'https://news.ycombinator.com/favicon.ico'
	    if 'ycombinator' not in p['link']:
	        im_url = 'https://placekitten.com/g/300/200'

	    # Summarize
	    summary = p['summary']
	    if 'Article URL' in summary:
	        # Link post: start from the failure text so it is what gets published
	        # when no article text can be extracted.
	        summary = "Summary failed. Article URL: " + p['link']
	        try:
	            downloaded = trafilatura.fetch_url(p['link'])
	            text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
	            if text is not None:

	                # Get summary:
	                summary = summarize(text)

	                # Get image URL
	                soup = BeautifulSoup(downloaded, 'html.parser')
	                im = soup.find("meta", property="og:image")
	                im_url = im['content'] if im else im_url
	        except Exception:
	            # Best effort: any download/parse/API failure degrades to the
	            # failure text. `Exception` (not a bare except) lets Ctrl-C and
	            # SystemExit still stop the script.
	            summary = "Summary failed. Article URL: " + p['link']
	            im_url = 'None'
	    else:
	        # Truncate & Remove HTML (for askHN and similar)
	        if len(summary) > 10:
	            # trafilatura.extract() may return None (nothing extractable);
	            # slicing None would crash the whole run, so fall back to a plain
	            # tag-stripping pass with BeautifulSoup.
	            cleaned = trafilatura.extract(summary)
	            if not cleaned:
	                cleaned = BeautifulSoup(summary, 'html.parser').get_text()
	            summary = cleaned[:240] + '...'

	    # Special rules: these override whatever was produced above.

	    # YouTube
	    if 'youtube' in p['link']:
	        summary = "YouTube Video: " + p['link']
	        im_url = 'None'

	    # Mastodon
	    # NOTE(review): the second test looks for the misspelling 'mastadon' in the
	    # entry summary; left unchanged because correcting it would relabel text
	    # posts that merely mention Mastodon -- confirm the intent.
	    if 'mastodon' in p['link'] or 'mastadon' in p.summary:
	        summary = "Mastodon Post: " + p['link']
	        im_url = 'None'

	    # Twitter
	    if 'twitter' in p['link']:
	        summary = "Twitter Post: " + p['link']
	        im_url = 'None'

	    # Add to feed
	    Out_Feed.add_item(
	        title=p['title'],
	        link=p['link'],
	        description=summary,
	        comments=p['comments'],
	        # Length and MIME type are fixed placeholder values.
	        enclosure=Enclosure(im_url, '1234', 'image/jpeg'),
	    )

	# Generate the RSS feed XML
	rss = Out_Feed.writeString('utf-8')

	# Save the RSS feed to a file. The XML is generated as UTF-8, so write it with
	# that encoding explicitly instead of relying on the platform default.
	with open('feed.xml', 'w', encoding='utf-8') as f:
	    f.write(rss)