Created
October 4, 2025 16:35
-
-
Save duarteocarmo/4869cae3f8c5bd5c95a556cc3a70ece3 to your computer and use it in GitHub Desktop.
Categorize your OPML feed with an LLM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.13" | |
| # dependencies = [ | |
| # "pydantic", | |
| # "feedparser", | |
| # "tqdm", | |
| # "litellm", | |
| # ] | |
| # /// | |
| import argparse | |
| import copy | |
| import random | |
| import xml.etree.ElementTree as ET | |
| from concurrent.futures import ThreadPoolExecutor | |
| from datetime import datetime | |
| from enum import Enum | |
| import feedparser | |
| from litellm import completion | |
| from pydantic import BaseModel, Field | |
| from tqdm import tqdm | |
# Labels the LLM may assign to a feed. The order is preserved in the prompt
# and in the enum below.
categories = [
    "Substack",
    "Machine Learning/AI",
    "Personal Blogs",
    "Substack feed",
    "Newsletter",
    "Reddit",
    "Portugal",
    "World News",
    "Inactive",
    "Technology",
    "Other",
]

# Enum built from the list above: each member's name and value are both the
# label text, so ``CategoryEnum("Reddit").value == "Reddit"``.
CategoryEnum = Enum("CategoryEnum", [(label, label) for label in categories])
class CategoryResponse(BaseModel):
    # Schema of the model's structured reply: it is passed as ``response_format``
    # to the completion call in ``categorize`` and used there to validate the
    # returned JSON.

    # The chosen label; must be one of the ``CategoryEnum`` members.
    category: CategoryEnum = Field(..., description="The category of the RSS feed.")
    # Free-text justification returned alongside the category.
    short_reason: str
def _entry_timestamp(entry: dict) -> tuple:
    """Return the entry's parsed publish (or update) time as a sortable tuple.

    Entries without a parsed date yield ``()``, which compares lower than any
    real timestamp, so they end up last in a newest-first sort.
    """
    parsed = entry.get("published_parsed") or entry.get("updated_parsed")
    return tuple(parsed) if parsed else ()


def _entry_content(entry: dict) -> str:
    """Return the text of the entry's first content block, or "N/A" if absent."""
    blocks = entry.get("content")
    if not blocks:
        return "N/A"
    first = blocks[0]
    if isinstance(first, dict) and "value" in first:
        return str(first["value"])
    # Unexpected shape: fall back to the raw representation rather than fail.
    return str(blocks)


def get_last_entries_from(rss_url: str, n: int = 5) -> str:
    """Fetch the ``n`` most recent entries of a feed, formatted as prompt text.

    Args:
        rss_url: URL handed to ``feedparser.parse``.
        n: Maximum number of entries to include.

    Returns:
        One "Title / Date / Summary / Content" paragraph per entry, separated
        by blank lines. An empty string when the feed has no entries.
    """
    feed = feedparser.parse(rss_url)

    # Sort on parsed timestamps only. Raw date strings such as
    # "Mon, 01 Jan 2024" order by weekday name, not chronologically, and a
    # None key makes list.sort raise. The sort is stable, so undated entries
    # keep their feed order after the dated ones.
    entries = sorted(feed.entries, key=_entry_timestamp, reverse=True)

    sections = []
    for entry in entries[:n]:
        title = entry.get("title", "N/A")[:100]
        summary = entry.get("summary", "N/A")[:240] + "..."
        # Feeds that only carry an update date still get a visible date.
        date = (entry.get("published") or entry.get("updated") or "N/A")[:16]
        content = _entry_content(entry)[:240] + "..."
        sections.append(
            f"Title: {title}\nDate: {date}\nSummary: {summary}\nContent: {content}"
        )
    return "\n\n".join(sections).strip()
def categorize(entry: ET.Element) -> CategoryEnum:
    """Ask the LLM to assign one of ``categories`` to a single OPML feed.

    Args:
        entry: An OPML ``<outline>`` element with an ``xmlUrl`` attribute.
            ``title`` and ``text`` are optional and default to "N/A".

    Returns:
        The ``CategoryEnum`` member chosen by the model.

    Raises:
        KeyError: If ``entry`` has no ``xmlUrl`` attribute.
    """
    url = entry.attrib["xmlUrl"]
    last_n_entries = get_last_entries_from(url)
    today_date_as_string = datetime.now().strftime("%Y-%m-%d")
    title = entry.attrib.get("title", "N/A")

    s_prompt = f"""
- Your task is to categorize the following RSS feed into one of the following categories: {", ".join(categories)}.
- If none of the categories fit, use "Other".
- Kill the newsletter is just a mirror I use to read newsletters; might be a blog or something else
- Substack links go on Substack category
- Today is {today_date_as_string}, feeds that have not published in the last year should be categorized as "Inactive".
- If no entries are found, categorize as "Inactive".
"""
    # ``last_n_entries`` is already one formatted string. Joining it with
    # "\n" would put a newline between every character.
    u_prompt = f"""
RSS Feed Title: {title}
RSS Feed Description: {entry.attrib.get("text", "N/A")}
RSS Feed URL: {url}
<most_recent_entries>
{last_n_entries}
</most_recent_entries>
"""
    messages = [
        {"role": "system", "content": s_prompt},
        {"role": "user", "content": u_prompt},
    ]
    resp = completion(
        model="gpt-5-mini",
        messages=messages,
        response_format=CategoryResponse,
    )
    raw_response = resp.choices[0].message.content
    # Validate the JSON reply once and return only the category.
    return CategoryResponse.model_validate_json(raw_response).category
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the OPML categorizer.

    Returns:
        A namespace with ``input_path``, ``output`` (required) and ``limit``
        (an int, or ``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser(
        description="Categorize RSS feeds inside an OPML file."
    )
    cli.add_argument("input_path", help="Path to the original OPML file.")
    cli.add_argument(
        "--output", required=True, help="Path to store the categorized OPML file."
    )
    cli.add_argument(
        "--limit", type=int, help="Number of feeds to categorize after shuffling."
    )
    namespace = cli.parse_args()
    return namespace
def collect_feeds(root: ET.Element) -> list[ET.Element]:
    """Return every feed outline found under the OPML ``<body>``.

    A feed outline is any ``<outline>`` element, at any nesting depth, that
    carries an ``xmlUrl`` attribute. Results are in document order.

    Raises:
        ValueError: If ``root`` has no direct ``<body>`` child.
    """
    body = root.find("body")
    if body is None:
        raise ValueError("OPML file missing body section")

    found: list[ET.Element] = []
    for outline in body.iter("outline"):
        if "xmlUrl" in outline.attrib:
            found.append(outline)
    return found
def get_category_outline(body: ET.Element, category: str) -> ET.Element:
    """Return the folder outline for ``category``, creating it if needed.

    A folder is a direct ``<outline>`` child of ``body`` whose ``text`` equals
    ``category`` and which has no ``xmlUrl``, so a feed that happens to share
    the name is never reused. A new folder is appended to ``body`` with both
    ``text`` and ``title`` set to ``category``.
    """
    existing = next(
        (
            child
            for child in body.findall("outline")
            if "xmlUrl" not in child.attrib and child.attrib.get("text") == category
        ),
        None,
    )
    # Compare with None explicitly: an Element without children is falsy.
    if existing is not None:
        return existing
    return ET.SubElement(body, "outline", {"text": category, "title": category})
def write_categorized_opml(
    root: ET.Element,
    output_path: str,
    feeds: list[ET.Element],
    categories_for_feeds: list[CategoryEnum],
) -> None:
    """Write a new OPML file with each feed grouped under its category folder.

    The output reuses the original root tag and attributes and a copy of the
    original ``<head>`` if there is one. Its ``<body>`` holds one folder
    outline per category, created in order of first appearance, containing
    deep copies of the feeds assigned to it. The input tree is not modified.

    Args:
        root: Root element of the source OPML document.
        output_path: Destination file path.
        feeds: Feed outlines to write.
        categories_for_feeds: Category of each feed, paired by position.
    """
    new_root = ET.Element(root.tag, root.attrib)
    original_head = root.find("head")
    if original_head is not None:
        new_root.append(copy.deepcopy(original_head))
    body = ET.SubElement(new_root, "body")

    folders: dict[str, ET.Element] = {}
    for feed, category in zip(feeds, categories_for_feeds):
        name = category.value
        folder = folders.get(name)
        if folder is None:
            folder = get_category_outline(body, name)
            folders[name] = folder
        folder.append(copy.deepcopy(feed))

    document = ET.ElementTree(new_root)
    document.write(output_path, encoding="UTF-8", xml_declaration=True)
def categorize_feeds(selected_feeds: list[ET.Element]) -> list[CategoryEnum]:
    """Categorize feeds concurrently and return the results in input order.

    Runs ``categorize`` on up to 8 worker threads and shows a progress bar.
    An empty input returns an empty list without starting a pool.
    """
    if not selected_feeds:
        return []

    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = pool.map(categorize, selected_feeds)
        progress = tqdm(pending, total=len(selected_feeds), desc="Categorizing")
        # Drain the iterator inside the ``with`` so every task has finished
        # before the pool shuts down.
        return list(progress)
def main(input_path: str, output_path: str, limit: int | None) -> None:
    """Read an OPML file, categorize its feeds and write the grouped result.

    Args:
        input_path: Path of the OPML file to read (decoded as UTF-8).
        output_path: Path of the categorized OPML file to write.
        limit: ``None`` processes every feed in document order. A value below
            1 processes none. Otherwise the feeds are shuffled and the first
            ``limit`` of them are processed.
    """
    with open(input_path, "r", encoding="utf-8") as handle:
        xml_text = handle.read()
    root = ET.fromstring(xml_text)

    all_feeds = collect_feeds(root)
    if limit is None:
        selected_feeds = list(all_feeds)
    elif limit < 1:
        selected_feeds = []
    else:
        # Shuffle a copy so the list returned by collect_feeds stays untouched.
        shuffled = list(all_feeds)
        random.shuffle(shuffled)
        selected_feeds = shuffled[:limit]

    assigned = categorize_feeds(selected_feeds)
    write_categorized_opml(root, output_path, selected_feeds, assigned)
if __name__ == "__main__":
    # Script entry point: parse the CLI flags and run the pipeline.
    cli_args = parse_args()
    main(
        input_path=cli_args.input_path,
        output_path=cli_args.output,
        limit=cli_args.limit,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment