LLM-powered RSS digest

import os
from datetime import datetime, timezone

import feedparser
import openai
import requests
from bs4 import BeautifulSoup

def parse_rss_feed(feed_url):
    """Parse a single RSS feed and extract articles."""
    # Fetch the RSS feed
    response = requests.get(feed_url, timeout=30)
    response.raise_for_status()

    # Parse the XML content
    feed = feedparser.parse(response.content)

    articles = []
    for entry in feed.entries:
        # Extract publication date
        pub_date = None
        if hasattr(entry, 'published_parsed') and entry.published_parsed:
            pub_date = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)

        # Get article content (summary from RSS)
        content = getattr(entry, 'summary', '')

        article = {
            'title': entry.get('title', 'No Title'),
            'url': entry.get('link', ''),
            'content': content,
            'published': pub_date,
            'author': entry.get('author', 'Unknown'),
        }
        articles.append(article)

    return articles
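
# Each entry comes back as a plain dict, e.g. (illustrative values only):
# {
#     'title': 'Example headline',
#     'url': 'https://example.com/story',
#     'content': 'Short RSS summary text',
#     'published': datetime(2025, 8, 24, 8, 57, tzinfo=timezone.utc),
#     'author': 'Unknown',
# }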

def extract_article_content(url):
    """Fetch and extract main content from a web page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; RSS Reader/1.0)'
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        # Extract text content
        content = soup.get_text(separator=' ', strip=True)

        # Clean up whitespace
        content = ' '.join(content.split())

        # Limit content length for LLM processing
        max_chars = 8000  # Roughly 2000 tokens
        if len(content) > max_chars:
            content = content[:max_chars] + "..."

        return content if len(content) > 100 else None
    except Exception as e:
        print(f"Error extracting content from {url}: {e}")
        return None
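
# Usage sketch for the extractor (the URL is just an illustration):
#     text = extract_article_content("https://example.com/story")
# Returns up to ~8,000 characters of page text, or None for pages under
# 100 characters and on any fetch or parse error.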

def enhance_articles(articles):
    """Add full content to articles, keeping the RSS summary as a fallback."""
    for article in articles:
        if article['url']:
            full_content = extract_article_content(article['url'])
            if full_content and len(full_content) > len(article['content']):
                article['content'] = full_content
                print(f"Enhanced: {article['title']}")
    return articles

class DigestGenerator:
    def __init__(self, api_key):
        self.client = openai.OpenAI(api_key=api_key)

    def create_digest(self, articles):
        """Generate an AI-powered digest of articles."""
        # Prepare articles for the prompt
        article_summaries = []
        for article in articles[:10]:  # Limit to 10 articles
            summary = {
                'title': article['title'],
                'content': article['content'][:1500],  # Truncate for token limits
                'url': article['url'],
                'published': article['published'].strftime('%Y-%m-%d') if article['published'] else 'Unknown',
            }
            article_summaries.append(summary)

        # Create the prompt
        prompt = self._build_digest_prompt(article_summaries)

        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=2000,
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error generating digest: {e}")
            return self._fallback_digest(article_summaries)
    def _build_digest_prompt(self, articles):
        """Build the prompt for AI digest generation."""
        prompt = f"""Create a comprehensive daily digest from these {len(articles)} articles.

Instructions:
- Summarize the key themes and trends
- Group related topics together
- Highlight the most important developments
- Keep it engaging and informative
- Use markdown formatting

Articles:
"""
        for i, article in enumerate(articles, 1):
            prompt += f"""
## Article {i}: {article['title']}
**URL:** {article['url']}
**Published:** {article['published']}

{article['content']}

---
"""
        return prompt
    def _fallback_digest(self, articles):
        """Simple fallback digest if the AI call fails."""
        digest = f"# Daily Digest - {datetime.now().strftime('%Y-%m-%d')}\n\n"
        for article in articles:
            digest += f"## {article['title']}\n"
            digest += f"**Published:** {article['published']}\n"
            digest += f"**Link:** {article['url']}\n\n"
            digest += f"{article['content'][:200]}...\n\n---\n\n"
        return digest
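
# Standalone usage sketch for the generator (assumes OPENAI_API_KEY is set;
# `articles` is a list of dicts shaped like the output of parse_rss_feed):
#     generator = DigestGenerator(os.getenv("OPENAI_API_KEY"))
#     markdown_digest = generator.create_digest(articles)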

def main():
    RSS_FEEDS = [
        "https://feeds.bbci.co.uk/news/rss.xml",
        "http://rss.cnn.com/rss/cnn_latest.rss",
    ]

    print("Fetching articles from RSS feeds...")
    all_articles = []

    # Step 1: Parse RSS feeds
    for feed_url in RSS_FEEDS:
        try:
            articles = parse_rss_feed(feed_url)
            all_articles.extend(articles)
            print(f"Fetched {len(articles)} articles from {feed_url}")
        except Exception as e:
            print(f"Error processing {feed_url}: {e}")

    if not all_articles:
        print("No articles found!")
        return

    # Sort by publication date (newest first); use an aware sentinel so that
    # undated articles compare cleanly against timezone-aware dates
    oldest = datetime.min.replace(tzinfo=timezone.utc)
    all_articles.sort(key=lambda x: x['published'] or oldest, reverse=True)

    # Step 2: Enhance with full content
    print("Extracting full article content...")
    enhanced_articles = enhance_articles(all_articles[:15])  # Process top 15

    # Step 3: Generate AI digest
    print("Generating AI digest...")
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        print("OPENAI_API_KEY is not set; cannot generate the digest.")
        return
    generator = DigestGenerator(openai_api_key)
    digest = generator.create_digest(enhanced_articles)

    # Save and display results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"digest_{timestamp}.md"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(digest)

    print(f"\n✅ Digest saved to {filename}")
    print("\n" + "=" * 60)
    print(digest)
    print("=" * 60)


if __name__ == "__main__":
    main()
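
A minimal sketch of driving the same pieces interactively, say from a REPL, assuming feedparser, requests, beautifulsoup4, and openai are installed and OPENAI_API_KEY is exported; the single-feed setup here is just an illustration:

articles = parse_rss_feed("https://feeds.bbci.co.uk/news/rss.xml")
enhanced = enhance_articles(articles[:5])  # fetch full text for the top five
generator = DigestGenerator(os.getenv("OPENAI_API_KEY"))
print(generator.create_digest(enhanced))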