Last active
April 6, 2026 22:49
-
-
Save Sarverott/d84b4ad59f5f5c4abbd2d14009affb9e to your computer and use it in GitHub Desktop.
rfc-index-parsing.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "private_outputs": true, | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyNW4UR720ykODTRTLXfGYSz", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/Sarverott/d84b4ad59f5f5c4abbd2d14009affb9e/rfc-index-parsing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "_1BsWX0yekqZ" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "!pip install requests" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import requests\n", | |
| "\n", | |
| "x = requests.get(\"https://www.rfc-editor.org/rfc-index.txt\")\n", | |
| "\n", | |
| "x.text" | |
| ], | |
| "metadata": { | |
| "id": "kkYdrcFvev8Z" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import re\n", | |
| "splitterpattern = re.compile(r\"\\n\\~+\\n\")\n", | |
| "print(re.split(splitterpattern, x.text)[1])\n" | |
| ], | |
| "metadata": { | |
| "id": "ZPlwc6Cge5lp" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import pprint\n", | |
| "splitterpattern = re.compile(r\"\\n\\~+\\n\\n\\s+(RFC INDEX)\\n\\s+\\-+\")\n", | |
| "\n", | |
| "#pprint.pprint(re.split(splitterpattern, x.text)[2])\n", | |
| "\n", | |
| "rtf_entries_txt = re.split(splitterpattern, x.text)[4]\n" | |
| ], | |
| "metadata": { | |
| "id": "z4NpABhzfBHN" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "glossary_patterns = {\n", | |
| " \"RFC_ENTRY\":r\"\\n\\n\\d{4}\\s.+(\\n\\s{5}.+)*\",\n", | |
| " \"RFC_Matcher\":r\"(?P<rfc_number>\\d{4}) (?P<rfc_desc>.+(\\n\\s{5}.+)*)\",\n", | |
| " \"RFC_formats\":re.compile(r\"\\(Format\\:(?P<formats>((\\s|(\\n\\s{5}))[A-Z]+\\,?)+)\\)\"),\n", | |
| " \"RFC_doi\":re.compile(r\"\\(DOI\\:(\\s|(\\n\\s{5}))(?P<doi>\\d+\\.\\d+\\/RFC\\d{4})\\)\"),\n", | |
| " \"RFC_status\":re.compile(r\"\\(Status\\:((\\s|(\\n\\s{5}))(?P<status>\\w+)\\,?)+\\)\"),\n", | |
| " \"RFC_mentions\":re.compile(r\"RFC\\d{4}\")\n", | |
| "}\n", | |
| "\n", | |
| "entrypattern = re.compile(glossary_patterns[\"RFC_ENTRY\"])\n", | |
| "\n", | |
| "ENTRIES = entrypattern.findall(x.text)\n", | |
| "\n", | |
| "y = [re.match(glossary_patterns[\"RFC_Matcher\"], x).groupdict() for x in rtf_entries_txt.split(\"\\n\\n\") if re.match(glossary_patterns[\"RFC_Matcher\"], x) ]\n", | |
| "[len(y), len(ENTRIES)]\n", | |
| "print(glossary_patterns[\"RFC_formats\"].search(y[0][\"rfc_desc\"]).group(\"formats\"))" | |
| ], | |
| "metadata": { | |
| "id": "M_6kSWt4jhI_" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "H9yMNNjyyANS" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [], | |
| "metadata": { | |
| "id": "78cO40futK2y" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "z = {f\"RFC{aa[\"rfc_number\"]}\": aa[\"rfc_desc\"] for aa in y}\n", | |
| "re.compile(glossary_patterns[\"RFC_formats\"]).search(z[\"RFC0010\"])" | |
| ], | |
| "metadata": { | |
| "id": "4xh3LWh0pQTK" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "#z = {f\"RFC{aa[\"rfc_number\"]}\": aa[\"rfc_desc\"] for aa in y}\n", | |
| "re.compile(glossary_patterns[\"RFC_doi\"]).search(z[\"RFC0010\"])# .group(\"doi\")" | |
| ], | |
| "metadata": { | |
| "id": "f0x-IZO-h_xr" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "zz = {f\"RFC{aa[\"rfc_number\"]}\": {\n", | |
| " \"number\":int(aa[\"rfc_number\"]),\n", | |
| " \"url\":f\"https://www.rfc-editor.org/rfc/rfc{str(int(aa[\"rfc_number\"]))}.txt\" if glossary_patterns[\"RFC_mentions\"].findall(aa[\"rfc_desc\"]) else None,\n", | |
| " \"title\": aa[\"rfc_desc\"].split(\".\")[0],\n", | |
| " \"description\": \" \".join([bb.strip() for bb in aa[\"rfc_desc\"].split(\"\\n\")]),\n", | |
| " \"related\":glossary_patterns[\"RFC_mentions\"].findall(aa[\"rfc_desc\"]),\n", | |
| " \"doi\":glossary_patterns[\"RFC_doi\"].search(aa[\"rfc_desc\"]).group(\"doi\") if glossary_patterns[\"RFC_doi\"].search(aa[\"rfc_desc\"]) else None,\n", | |
| " \"status\":glossary_patterns[\"RFC_status\"].search(aa[\"rfc_desc\"]).group(\"status\") if glossary_patterns[\"RFC_status\"].search(aa[\"rfc_desc\"]) else None,\n", | |
| " \"formats\":[bb.strip().lower() for bb in glossary_patterns[\"RFC_formats\"].search(aa[\"rfc_desc\"]).group(\"formats\").split(\",\") if bb.strip()] if glossary_patterns[\"RFC_formats\"].search(aa[\"rfc_desc\"]) else None,\n", | |
| " }\n", | |
| " for aa in y}\n", | |
| "#re.compile(glossary_patterns[\"RFC_formats\"]).search(z[\"RFC0001\"])\n" | |
| ], | |
| "metadata": { | |
| "id": "ZVwig2_byRGu" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "pprint.pprint(zz[\"RFC0010\"])" | |
| ], | |
| "metadata": { | |
| "id": "CBNu3qrizozy" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import json\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "print(json.dumps(zz[\"RFC0010\"], indent=4))" | |
| ], | |
| "metadata": { | |
| "id": "KkrdULYY5so2" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "3o-BGALy1-P3" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://gist.github.com/Sarverott/c21ec17ac458228b3a0da27ee56c12af