Skip to content

Instantly share code, notes, and snippets.

@duarteocarmo
Created October 4, 2025 16:35
Show Gist options
  • Save duarteocarmo/4869cae3f8c5bd5c95a556cc3a70ece3 to your computer and use it in GitHub Desktop.
Categorize your OPML feed with an LLM
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "pydantic",
#     "feedparser",
#     "tqdm",
#     "litellm",
# ]
# ///
import argparse
import copy
import random
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from enum import Enum
import feedparser
from litellm import completion
from pydantic import BaseModel, Field
from tqdm import tqdm
# Closed set of labels the LLM is allowed to assign to a feed. The list is
# interpolated into the system prompt, and the matching Enum constrains the
# structured response schema.
categories = [
    "Substack",
    "Machine Learning/AI",
    "Personal Blogs",
    "Substack feed",
    "Newsletter",
    "Reddit",
    "Portugal",
    "World News",
    "Inactive",
    "Technology",
    "Other",
]

# Functional Enum API: each member's name and value are the category string.
CategoryEnum = Enum("CategoryEnum", [(name, name) for name in categories])
class CategoryResponse(BaseModel):
    # Structured-output schema handed to the LLM via `response_format` in
    # `categorize()`. Do not reword the Field description casually: it is
    # part of the JSON schema the model sees.
    # `category` is restricted to the closed CategoryEnum label set;
    # `short_reason` is a brief free-text justification (debugging only).
    category: CategoryEnum = Field(..., description="The category of the RSS feed.")
    short_reason: str
def get_last_entries_from(rss_url: str, n: int = 5) -> str:
    """Fetch and format the most recent *n* entries of an RSS feed.

    Args:
        rss_url: URL of the RSS/Atom feed to fetch with feedparser.
        n: Maximum number of entries to include in the summary.

    Returns:
        A text summary (title/date/summary/content per entry) of the newest
        ``n`` entries, newest first; an empty string when the feed has none.
    """
    feed = feedparser.parse(rss_url)
    entries = list(feed.entries)

    # Sort newest-first by the *parsed* publication time only. The original
    # code re-sorted on the raw "published" string, which compares
    # lexicographically (e.g. "Wed, ..." vs "Fri, ...") and gave wrong order.
    # Entries without a parsed date keep their feed order, after dated ones,
    # so a None key can never raise inside sort().
    dated = [e for e in entries if e.get("published_parsed")]
    undated = [e for e in entries if not e.get("published_parsed")]
    dated.sort(key=lambda e: e["published_parsed"], reverse=True)
    entries = dated + undated

    chunks: list[str] = []
    for entry in entries[:n]:
        title = entry.get("title", "N/A")[:100]
        summary = entry.get("summary", "N/A")[:240] + "..."
        date = entry.get("published", "N/A")[:16]
        content = str(entry.get("content", [{"value": "N/A"}]))[:240] + "..."
        chunks.append(
            f"Title: {title}\nDate: {date}\nSummary: {summary}\nContent: {content}\n\n"
        )
    # join() instead of += keeps the build linear even for large feeds.
    return "".join(chunks).strip()
def categorize(entry: ET.Element) -> CategoryEnum:
    """Ask the LLM to assign exactly one category to an OPML feed entry.

    Args:
        entry: An OPML ``<outline>`` element carrying an ``xmlUrl`` attribute
            (and optionally ``title``/``text``).

    Returns:
        The CategoryEnum member chosen by the model's structured response.
    """
    url = entry.attrib["xmlUrl"]
    last_n_entries = get_last_entries_from(url)
    today_date_as_string = datetime.now().strftime("%Y-%m-%d")
    title = entry.attrib.get("title", "N/A")
    s_prompt = f"""
- Your task is to categorize the following RSS feed into one of the following categories: {", ".join(categories)}.
- If none of the categories fit, use "Other".
- Kill the newsletter is just a mirror I use to read newsletters; might be a blog or something else
- Substack links go on Substack category
- Today is {today_date_as_string}, feeds that have not published in the last year should be categorized as "Inactive".
- If no entries are found, categorize as "Inactive".
"""
    # BUG FIX: the original interpolated "\n".join(last_n_entries), but
    # last_n_entries is a *string*, so join() inserted a newline between
    # every single character. The already-formatted text goes in directly.
    u_prompt = f"""
RSS Feed Title: {title}
RSS Feed Description: {entry.attrib.get("text", "N/A")}
RSS Feed URL: {url}
<most_recent_entries>
{last_n_entries}
</most_recent_entries>
"""
    messages = [
        {"role": "system", "content": s_prompt},
        {"role": "user", "content": u_prompt},
    ]
    resp = completion(
        model="gpt-5-mini",
        messages=messages,
        response_format=CategoryResponse,
    )
    # Validate the JSON payload once (the original parsed it twice).
    parsed = CategoryResponse.model_validate_json(resp.choices[0].message.content)
    # print(f"Title: {title}\nCategory: {parsed.category}\nUrl: {url}\nReason: {parsed.short_reason}\n")
    return parsed.category
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        A Namespace with ``input_path`` (positional), ``output`` (required)
        and ``limit`` (optional int).
    """
    cli = argparse.ArgumentParser(
        description="Categorize RSS feeds inside an OPML file."
    )
    cli.add_argument("input_path", help="Path to the original OPML file.")
    cli.add_argument(
        "--output", required=True, help="Path to store the categorized OPML file."
    )
    cli.add_argument(
        "--limit", type=int, help="Number of feeds to categorize after shuffling."
    )
    return cli.parse_args()
def collect_feeds(root: ET.Element) -> list[ET.Element]:
    """Collect every feed ``<outline>`` (any depth) under the OPML ``<body>``.

    An outline counts as a feed when it carries an ``xmlUrl`` attribute;
    pure grouping outlines are skipped.

    Raises:
        ValueError: if the document has no ``<body>`` element.
    """
    body = root.find("body")
    if body is None:
        raise ValueError("OPML file missing body section")
    return [node for node in body.iter("outline") if "xmlUrl" in node.attrib]
def get_category_outline(body: ET.Element, category: str) -> ET.Element:
    """Return the grouping ``<outline>`` for *category*, creating it if absent.

    Only direct children of *body* are considered, and feed outlines (those
    with ``xmlUrl``) never match even if their text equals the category name.
    """
    existing = next(
        (
            node
            for node in body.findall("outline")
            if node.attrib.get("text") == category and "xmlUrl" not in node.attrib
        ),
        None,
    )
    if existing is not None:
        return existing
    fresh = ET.Element("outline", {"text": category, "title": category})
    body.append(fresh)
    return fresh
def write_categorized_opml(
    root: ET.Element,
    output_path: str,
    feeds: list[ET.Element],
    categories_for_feeds: list[CategoryEnum],
) -> None:
    """Write a new OPML document grouping each feed under its category.

    The original ``<head>`` is copied verbatim; the ``<body>`` is rebuilt
    with one grouping ``<outline>`` per category, each holding deep copies
    of the feeds assigned to it. *feeds* and *categories_for_feeds* are
    parallel lists.
    """
    new_root = ET.Element(root.tag, root.attrib)
    head = root.find("head")
    if head is not None:
        new_root.append(copy.deepcopy(head))
    body = ET.Element("body")
    new_root.append(body)

    # Cache group nodes so repeated categories reuse the same outline.
    groups: dict[str, ET.Element] = {}
    for feed, label in zip(feeds, categories_for_feeds):
        name = label.value
        group = groups.get(name)
        if group is None:
            group = get_category_outline(body, name)
            groups[name] = group
        group.append(copy.deepcopy(feed))

    ET.ElementTree(new_root).write(
        output_path,
        encoding="UTF-8",
        xml_declaration=True,
    )
def categorize_feeds(selected_feeds: list[ET.Element]) -> list[CategoryEnum]:
    """Categorize feeds concurrently with 8 worker threads, preserving order.

    Returns one CategoryEnum per input feed, in the same order; an empty
    list when no feeds are given (avoids spinning up an executor for nothing).
    """
    if not selected_feeds:
        return []
    with ThreadPoolExecutor(max_workers=8) as pool:
        # executor.map keeps input order; tqdm just wraps it for progress.
        progress = tqdm(
            pool.map(categorize, selected_feeds),
            total=len(selected_feeds),
            desc="Categorizing",
        )
        return list(progress)
def main(input_path: str, output_path: str, limit: int | None) -> None:
    """Read an OPML file, categorize its feeds, and write the grouped result.

    Args:
        input_path: Path of the source OPML file.
        output_path: Where the categorized OPML is written.
        limit: If given, categorize only this many randomly chosen feeds;
            a value below 1 categorizes nothing.
    """
    with open(input_path, "r", encoding="utf-8") as handle:
        tree = ET.ElementTree(ET.fromstring(handle.read()))
    root = tree.getroot()

    feeds = collect_feeds(root)
    selected = list(feeds)
    if limit is not None:
        if limit < 1:
            selected = []
        else:
            # Shuffle first so --limit takes a random sample, not a prefix.
            random.shuffle(selected)
            selected = selected[:limit]

    labels = categorize_feeds(selected)
    write_categorized_opml(root, output_path, selected, labels)
if __name__ == "__main__":
    # Script entry point: parse the CLI and run the categorization pipeline.
    cli_args = parse_args()
    main(cli_args.input_path, cli_args.output, cli_args.limit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment