duarteocarmo · October 4, 2025 16:35
diff --git a/opml_categorize.py b/opml_categorize.py
 # /// script
 # requires-python = ">=3.13"
 # dependencies = [
 # "pydantic",
 # "feedparser",
 # "tqdm",
 # "litellm",
 # ]
 # ///

 import argparse
 import copy
 import random
 import xml.etree.ElementTree as ET
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from enum import Enum

 import feedparser
 from litellm import completion
 from pydantic import BaseModel, Field
 from tqdm import tqdm


 # Closed set of labels the LLM may assign to a feed.  The same list is
 # interpolated into the system prompt and used to build CategoryEnum.
 categories = [
    "Substack",
    "Machine Learning/AI",
    "Personal Blogs",
    "Substack feed",
    "Newsletter",
    "Reddit",
    "Portugal",
    "World News",
    "Inactive",
    "Technology",
    "Other",
 ]

 # Each member's name and value are both the label text, so lookups work by
 # value (CategoryEnum("Reddit")) and by name (CategoryEnum["Reddit"]).
 CategoryEnum = Enum("CategoryEnum", dict(zip(categories, categories)))


 class CategoryResponse(BaseModel):
    # Structured reply expected from the LLM: `categorize` passes this model as
    # `response_format` and validates the returned JSON against it.
    # NOTE(review): documented with `#` comments rather than a docstring, since
    # pydantic may surface a class docstring in the generated schema — confirm.

    # The label chosen for the feed; restricted to the CategoryEnum members.
    category: CategoryEnum = Field(..., description="The category of the RSS feed.")
    # Free-text justification returned alongside the category.
    short_reason: str


 def get_last_entries_from(rss_url: str, n: int = 5) -> str:
    """Fetch the last n entries from the given RSS feed URL.

    The feed is parsed with ``feedparser``, its entries are ordered
    newest-first by ``published_parsed``, and the first ``n`` are rendered as
    a plain-text digest.

    Args:
        rss_url: Location of the feed, handed to ``feedparser.parse``.
        n: Maximum number of entries to include.

    Returns:
        One ``Title/Date/Summary/Content`` paragraph per entry, separated by
        blank lines; an empty string when the feed yields no entries.
    """

    feed = feedparser.parse(rss_url)

    def recency_key(entry) -> tuple:
        # Undated entries map to an empty tuple, which sorts after every real
        # timestamp under reverse=True instead of raising TypeError the way a
        # bare None key would.
        parsed = entry.get("published_parsed")
        return tuple(parsed) if parsed else ()

    # Order by the parsed timestamp only.  Sorting again on the raw
    # "published" text would compare dates as strings, not chronologically,
    # and undo this ordering.
    entries = sorted(feed.entries, key=recency_key, reverse=True)

    digests = []
    for entry in entries[:n]:
        title = entry.get("title", "N/A")[:100]
        summary = entry.get("summary", "N/A")[:240] + "..."
        date = entry.get("published", "N/A")[:16]
        content = str(entry.get("content", [{"value": "N/A"}]))[:240] + "..."
        digests.append(
            f"Title: {title}\nDate: {date}\nSummary: {summary}\nContent: {content}"
        )

    return "\n\n".join(digests)


 def categorize(entry: ET.Element) -> CategoryEnum:
    """Ask the LLM to pick one category for a single OPML feed outline.

    The feed's most recent entries are fetched with ``get_last_entries_from``
    and sent, together with the outline's title, text and URL, to the model.
    The reply is validated against ``CategoryResponse``.

    Args:
        entry: An ``<outline>`` element; it must carry an ``xmlUrl`` attribute.

    Returns:
        The category chosen by the model.

    Raises:
        KeyError: If ``entry`` has no ``xmlUrl`` attribute.
    """
    url = entry.attrib["xmlUrl"]
    last_n_entries = get_last_entries_from(url)
    today_date_as_string = datetime.now().strftime("%Y-%m-%d")
    title = entry.attrib.get("title", "N/A")
    s_prompt = f"""
    - Your task is to categorize the following RSS feed into one of the following categories: {", ".join(categories)}.
    - If none of the categories fit, use "Other".
    - Kill the newsletter is just a mirror I use to read newsletters; might be a blog or something else 
    - Substack links go on Substack category
    - Today is {today_date_as_string}, feeds that have not published in the last year should be categorized as "Inactive".
    - If no entries are found, categorize as "Inactive".
    """
    # last_n_entries is already one formatted string; joining it with "\n"
    # would insert a newline between every single character.
    u_prompt = f"""
    RSS Feed Title: {title}
    RSS Feed Description: {entry.attrib.get("text", "N/A")}
    RSS Feed URL: {url}
    <most_recent_entries>
    {last_n_entries}
    </most_recent_entries>
    """

    messages = [
        {"role": "system", "content": s_prompt},
        {"role": "user", "content": u_prompt},
    ]

    resp = completion(
        model="gpt-5-mini",
        messages=messages,
        response_format=CategoryResponse,
    )
    raw_response = resp.choices[0].message.content

    # Validate the JSON payload once; only the category is needed by callers.
    parsed = CategoryResponse.model_validate_json(raw_response)
    return parsed.category


 def parse_args() -> argparse.Namespace:
    """Read the command line and return the parsed options.

    Returns:
        A namespace with ``input_path`` (positional), ``output`` (required
        option) and ``limit`` (optional integer, ``None`` when omitted).
    """
    cli = argparse.ArgumentParser(
        description="Categorize RSS feeds inside an OPML file."
    )

    # (flags, keyword options) for each argument, registered in this order.
    argument_specs = (
        (("input_path",), {"help": "Path to the original OPML file."}),
        (
            ("--output",),
            {"required": True, "help": "Path to store the categorized OPML file."},
        ),
        (
            ("--limit",),
            {"type": int, "help": "Number of feeds to categorize after shuffling."},
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()


 def collect_feeds(root: ET.Element) -> list[ET.Element]:
    """Return every feed outline found under the OPML ``<body>``.

    Args:
        root: Root element of the parsed OPML document.

    Returns:
        All ``<outline>`` descendants of ``<body>`` that have an ``xmlUrl``
        attribute, in document order.

    Raises:
        ValueError: If the document has no ``<body>`` child.
    """
    body = root.find("body")
    if body is None:
        raise ValueError("OPML file missing body section")

    found: list[ET.Element] = []
    for outline in body.iter("outline"):
        # Outlines without xmlUrl are grouping folders, not feeds.
        if "xmlUrl" in outline.attrib:
            found.append(outline)
    return found


 def get_category_outline(body: ET.Element, category: str) -> ET.Element:
    """Return the folder outline for ``category``, creating it if absent.

    Args:
        body: The ``<body>`` element whose direct children are searched.
        category: Folder label, matched against each outline's ``text``.

    Returns:
        The first direct ``<outline>`` child whose ``text`` equals
        ``category`` and which has no ``xmlUrl``; otherwise a new outline
        (``text`` and ``title`` set to ``category``) appended to ``body``.
    """
    existing = next(
        (
            child
            for child in body.findall("outline")
            if "xmlUrl" not in child.attrib and child.attrib.get("text") == category
        ),
        None,
    )
    # Compare against None: an element without children is falsy.
    if existing is not None:
        return existing

    return ET.SubElement(body, "outline", {"text": category, "title": category})


 def write_categorized_opml(
    root: ET.Element,
    output_path: str,
    feeds: list[ET.Element],
    categories_for_feeds: list[CategoryEnum],
 ) -> None:
    """Write a new OPML file with the feeds grouped under category folders.

    Args:
        root: Root of the original document; its tag, attributes and
            ``<head>`` (when present) are copied into the output.
        output_path: Destination file path.
        feeds: Feed outlines to include; each is deep-copied.
        categories_for_feeds: Category for each feed, paired by position.
    """
    new_root = ET.Element(root.tag, root.attrib)

    original_head = root.find("head")
    if original_head is not None:
        new_root.append(copy.deepcopy(original_head))

    body = ET.SubElement(new_root, "body")

    # One folder outline per category, created on first use.
    folders: dict[str, ET.Element] = {}
    for feed, assigned in zip(feeds, categories_for_feeds):
        label = assigned.value
        try:
            folder = folders[label]
        except KeyError:
            folder = folders[label] = get_category_outline(body, label)
        folder.append(copy.deepcopy(feed))

    document = ET.ElementTree(new_root)
    document.write(output_path, encoding="UTF-8", xml_declaration=True)


 def categorize_feeds(selected_feeds: list[ET.Element]) -> list[CategoryEnum]:
    """Categorize the given feeds concurrently, showing a progress bar.

    Args:
        selected_feeds: Feed outlines to pass to ``categorize``.

    Returns:
        One category per feed, in the same order as ``selected_feeds``; an
        empty list when there is nothing to categorize.
    """
    if not selected_feeds:
        return []

    with ThreadPoolExecutor(max_workers=8) as pool:
        # map() yields results in input order, so positions line up with feeds.
        outcomes = pool.map(categorize, selected_feeds)
        progress = tqdm(outcomes, total=len(selected_feeds), desc="Categorizing")
        return list(progress)


 def main(input_path: str, output_path: str, limit: int | None) -> None:
    """Categorize the feeds of an OPML file and write the regrouped result.

    Args:
        input_path: OPML file to read (decoded as UTF-8).
        output_path: Where the categorized OPML is written.
        limit: ``None`` categorizes every feed in document order; a value
            below 1 selects no feeds; otherwise the feeds are shuffled and
            the first ``limit`` are used.
    """
    with open(input_path, "r", encoding="utf-8") as handle:
        xml_text = handle.read()

    root = ET.fromstring(xml_text)

    # Work on a separate list so the selection never mutates collect_feeds' result.
    selected_feeds = list(collect_feeds(root))
    if limit is not None and limit < 1:
        selected_feeds = []
    elif limit is not None:
        random.shuffle(selected_feeds)
        del selected_feeds[limit:]

    assigned = categorize_feeds(selected_feeds)
    write_categorized_opml(root, output_path, selected_feeds, assigned)


 if __name__ == "__main__":
    # Script entry point: forward the parsed command-line options to main().
    args = parse_args()
    main(
        input_path=args.input_path,
        output_path=args.output,
        limit=args.limit,
    )
	# /// script
	# requires-python = ">=3.13"
	# dependencies = [
	# "pydantic",
	# "feedparser",
	# "tqdm",
	# "litellm",
	# ]
	# ///

	import argparse
	import copy
	import random
	import xml.etree.ElementTree as ET
	from concurrent.futures import ThreadPoolExecutor
	from datetime import datetime
	from enum import Enum

	import feedparser
	from litellm import completion
	from pydantic import BaseModel, Field
	from tqdm import tqdm


	# Closed set of labels the LLM may assign to a feed.  The same list is
	# interpolated into the system prompt and used to build CategoryEnum.
	categories = [
	    "Substack",
	    "Machine Learning/AI",
	    "Personal Blogs",
	    "Substack feed",
	    "Newsletter",
	    "Reddit",
	    "Portugal",
	    "World News",
	    "Inactive",
	    "Technology",
	    "Other",
	]

	# Each member's name and value are both the label text.
	CategoryEnum = Enum("CategoryEnum", {category: category for category in categories})


	class CategoryResponse(BaseModel):
	    # Structured reply expected from the LLM: `categorize` passes this model
	    # as `response_format` and validates the returned JSON against it.
	    # NOTE(review): documented with `#` comments rather than a docstring,
	    # since pydantic may surface a class docstring in the schema — confirm.

	    # The label chosen for the feed; restricted to the CategoryEnum members.
	    category: CategoryEnum = Field(..., description="The category of the RSS feed.")
	    # Free-text justification returned alongside the category.
	    short_reason: str


	def get_last_entries_from(rss_url: str, n: int = 5) -> str:
	    """Fetch the last n entries from the given RSS feed URL.

	    The feed is parsed with ``feedparser``, its entries are ordered
	    newest-first by ``published_parsed``, and the first ``n`` are rendered
	    as a plain-text digest.

	    Args:
	        rss_url: Location of the feed, handed to ``feedparser.parse``.
	        n: Maximum number of entries to include.

	    Returns:
	        One ``Title/Date/Summary/Content`` paragraph per entry, separated
	        by blank lines; an empty string when the feed yields no entries.
	    """

	    feed = feedparser.parse(rss_url)

	    def recency_key(entry) -> tuple:
	        # Undated entries map to an empty tuple, which sorts after every
	        # real timestamp under reverse=True instead of raising TypeError
	        # the way a bare None key would.
	        parsed = entry.get("published_parsed")
	        return tuple(parsed) if parsed else ()

	    # Order by the parsed timestamp only.  Sorting again on the raw
	    # "published" text would compare dates as strings, not
	    # chronologically, and undo this ordering.
	    entries = sorted(feed.entries, key=recency_key, reverse=True)

	    digests = []
	    for entry in entries[:n]:
	        title = entry.get("title", "N/A")[:100]
	        summary = entry.get("summary", "N/A")[:240] + "..."
	        date = entry.get("published", "N/A")[:16]
	        content = str(entry.get("content", [{"value": "N/A"}]))[:240] + "..."
	        digests.append(
	            f"Title: {title}\nDate: {date}\nSummary: {summary}\nContent: {content}"
	        )

	    return "\n\n".join(digests)


	def categorize(entry: ET.Element) -> CategoryEnum:
	    """Ask the LLM to pick one category for a single OPML feed outline.

	    The feed's most recent entries are fetched with
	    ``get_last_entries_from`` and sent, together with the outline's title,
	    text and URL, to the model.  The reply is validated against
	    ``CategoryResponse``.

	    Args:
	        entry: An ``<outline>`` element; it must carry ``xmlUrl``.

	    Returns:
	        The category chosen by the model.

	    Raises:
	        KeyError: If ``entry`` has no ``xmlUrl`` attribute.
	    """
	    url = entry.attrib["xmlUrl"]
	    last_n_entries = get_last_entries_from(url)
	    today_date_as_string = datetime.now().strftime("%Y-%m-%d")
	    title = entry.attrib.get("title", "N/A")
	    s_prompt = f"""
	- Your task is to categorize the following RSS feed into one of the following categories: {", ".join(categories)}.
	- If none of the categories fit, use "Other".
	- Kill the newsletter is just a mirror I use to read newsletters; might be a blog or something else
	- Substack links go on Substack category
	- Today is {today_date_as_string}, feeds that have not published in the last year should be categorized as "Inactive".
	- If no entries are found, categorize as "Inactive".
	"""
	    # last_n_entries is already one formatted string; joining it with "\n"
	    # would insert a newline between every single character.
	    u_prompt = f"""
	RSS Feed Title: {title}
	RSS Feed Description: {entry.attrib.get("text", "N/A")}
	RSS Feed URL: {url}
	<most_recent_entries>
	{last_n_entries}
	</most_recent_entries>
	"""

	    messages = [
	        {"role": "system", "content": s_prompt},
	        {"role": "user", "content": u_prompt},
	    ]

	    resp = completion(
	        model="gpt-5-mini",
	        messages=messages,
	        response_format=CategoryResponse,
	    )
	    raw_response = resp.choices[0].message.content

	    # Validate the JSON payload once; only the category is needed by callers.
	    parsed = CategoryResponse.model_validate_json(raw_response)
	    return parsed.category


	def parse_args() -> argparse.Namespace:
	    """Read the command line and return the parsed options.

	    Returns:
	        A namespace with ``input_path`` (positional), ``output`` (required
	        option) and ``limit`` (optional integer, ``None`` when omitted).
	    """
	    parser = argparse.ArgumentParser(
	        description="Categorize RSS feeds inside an OPML file."
	    )
	    parser.add_argument("input_path", help="Path to the original OPML file.")
	    parser.add_argument(
	        "--output",
	        required=True,
	        help="Path to store the categorized OPML file.",
	    )
	    parser.add_argument(
	        "--limit",
	        type=int,
	        help="Number of feeds to categorize after shuffling.",
	    )
	    return parser.parse_args()


	def collect_feeds(root: ET.Element) -> list[ET.Element]:
	    """Return every feed outline found under the OPML ``<body>``.

	    Args:
	        root: Root element of the parsed OPML document.

	    Returns:
	        All ``<outline>`` descendants of ``<body>`` that have an ``xmlUrl``
	        attribute, in document order.

	    Raises:
	        ValueError: If the document has no ``<body>`` child.
	    """
	    body = root.find("body")
	    if body is None:
	        msg = "OPML file missing body section"
	        raise ValueError(msg)

	    # Outlines without xmlUrl are grouping folders, not feeds.
	    return [entry for entry in body.iter("outline") if "xmlUrl" in entry.attrib]


	def get_category_outline(body: ET.Element, category: str) -> ET.Element:
	    """Return the folder outline for ``category``, creating it if absent.

	    Args:
	        body: The ``<body>`` element whose direct children are searched.
	        category: Folder label, matched against each outline's ``text``.

	    Returns:
	        The first direct ``<outline>`` child whose ``text`` equals
	        ``category`` and which has no ``xmlUrl``; otherwise a new outline
	        (``text`` and ``title`` set to ``category``) appended to ``body``.
	    """
	    for outline in body.findall("outline"):
	        if outline.attrib.get("text") == category and "xmlUrl" not in outline.attrib:
	            return outline

	    node = ET.Element("outline", {"text": category, "title": category})
	    body.append(node)
	    return node


	def write_categorized_opml(
	    root: ET.Element,
	    output_path: str,
	    feeds: list[ET.Element],
	    categories_for_feeds: list[CategoryEnum],
	) -> None:
	    """Write a new OPML file with the feeds grouped under category folders.

	    Args:
	        root: Root of the original document; its tag, attributes and
	            ``<head>`` (when present) are copied into the output.
	        output_path: Destination file path.
	        feeds: Feed outlines to include; each is deep-copied.
	        categories_for_feeds: Category for each feed, paired by position.
	    """
	    new_root = ET.Element(root.tag, root.attrib)

	    head = root.find("head")
	    if head is not None:
	        new_root.append(copy.deepcopy(head))

	    body = ET.Element("body")
	    new_root.append(body)

	    # One folder outline per category, created on first use.
	    category_nodes: dict[str, ET.Element] = {}
	    for entry, category in zip(feeds, categories_for_feeds):
	        category_name = category.value
	        if category_name not in category_nodes:
	            category_nodes[category_name] = get_category_outline(body, category_name)
	        category_nodes[category_name].append(copy.deepcopy(entry))

	    ET.ElementTree(new_root).write(
	        output_path,
	        encoding="UTF-8",
	        xml_declaration=True,
	    )


	def categorize_feeds(selected_feeds: list[ET.Element]) -> list[CategoryEnum]:
	    """Categorize the given feeds concurrently, showing a progress bar.

	    Args:
	        selected_feeds: Feed outlines to pass to ``categorize``.

	    Returns:
	        One category per feed, in the same order as ``selected_feeds``; an
	        empty list when there is nothing to categorize.
	    """
	    if not selected_feeds:
	        return []

	    with ThreadPoolExecutor(max_workers=8) as executor:
	        # map() yields results in input order, so positions line up with feeds.
	        results = list(
	            tqdm(
	                executor.map(categorize, selected_feeds),
	                total=len(selected_feeds),
	                desc="Categorizing",
	            )
	        )

	    return results


	def main(input_path: str, output_path: str, limit: int | None) -> None:
	    """Categorize the feeds of an OPML file and write the regrouped result.

	    Args:
	        input_path: OPML file to read (decoded as UTF-8).
	        output_path: Where the categorized OPML is written.
	        limit: ``None`` categorizes every feed in document order; a value
	            below 1 selects no feeds; otherwise the feeds are shuffled and
	            the first ``limit`` are used.
	    """
	    with open(input_path, "r", encoding="utf-8") as file:
	        content = file.read()

	    tree = ET.ElementTree(ET.fromstring(content))
	    root = tree.getroot()

	    feeds = collect_feeds(root)
	    # Copy so shuffling never reorders the list returned by collect_feeds.
	    selected_feeds = feeds.copy()
	    if limit is not None:
	        if limit < 1:
	            selected_feeds = []
	        else:
	            random.shuffle(selected_feeds)
	            selected_feeds = selected_feeds[:limit]

	    categories_for_feeds = categorize_feeds(selected_feeds)

	    write_categorized_opml(root, output_path, selected_feeds, categories_for_feeds)


	if __name__ == "__main__":
	    # Script entry point: forward the parsed command-line options to main().
	    args = parse_args()
	    main(args.input_path, args.output, args.limit)