Skip to content

Instantly share code, notes, and snippets.

@Sarverott
Last active April 6, 2026 22:49
Show Gist options
  • Select an option

  • Save Sarverott/d84b4ad59f5f5c4abbd2d14009affb9e to your computer and use it in GitHub Desktop.

Select an option

Save Sarverott/d84b4ad59f5f5c4abbd2d14009affb9e to your computer and use it in GitHub Desktop.
rfc-index-parsing.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"private_outputs": true,
"provenance": [],
"authorship_tag": "ABX9TyNW4UR720ykODTRTLXfGYSz",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Sarverott/d84b4ad59f5f5c4abbd2d14009affb9e/rfc-index-parsing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_1BsWX0yekqZ"
},
"outputs": [],
"source": [
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install requests"
]
},
{
"cell_type": "code",
"source": [
"import requests\n",
"\n",
"# Download the plain-text RFC index. The timeout keeps a stalled connection\n",
"# from hanging the kernel, and raise_for_status() stops the notebook on an\n",
"# HTTP error instead of letting later cells parse an error page.\n",
"x = requests.get(\"https://www.rfc-editor.org/rfc-index.txt\", timeout=30)\n",
"x.raise_for_status()\n",
"\n",
"# Preview only the head of the file rather than dumping the whole index.\n",
"x.text[:1000]"
],
"metadata": {
"id": "kkYdrcFvev8Z"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import re\n",
"\n",
"# Sections of the index are delimited by a line consisting only of tildes.\n",
"splitterpattern = re.compile(r\"\\n\\~+\\n\")\n",
"\n",
"# Show the piece that follows the first tilde line.\n",
"print(splitterpattern.split(x.text)[1])\n"
],
"metadata": {
"id": "ZPlwc6Cge5lp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pprint\n",
"\n",
"# A section header is a tilde line, a blank line, then an indented\n",
"# \"RFC INDEX\" title underlined with dashes.\n",
"splitterpattern = re.compile(r\"\\n\\~+\\n\\n\\s+(RFC INDEX)\\n\\s+\\-+\")\n",
"\n",
"# Because the pattern has a capturing group, split() interleaves the captured\n",
"# title with the surrounding text: [before, title, text, title, text, ...].\n",
"# Index 4 is therefore the text after the second header.\n",
"# NOTE(review): assumes the citation list is what follows the second header - confirm.\n",
"rtf_entries_txt = splitterpattern.split(x.text)[4]\n"
],
"metadata": {
"id": "z4NpABhzfBHN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Regular expressions for the pieces of one RFC citation. A citation starts\n",
"# with a 4-digit number; continuation lines start with five whitespace characters.\n",
"glossary_patterns = {\n",
"    # Non-capturing group so findall() yields whole citations instead of only\n",
"    # the last continuation line of each one.\n",
"    \"RFC_ENTRY\":r\"\\n\\n\\d{4}\\s.+(?:\\n\\s{5}.+)*\",\n",
"    \"RFC_Matcher\":r\"(?P<rfc_number>\\d{4}) (?P<rfc_desc>.+(\\n\\s{5}.+)*)\",\n",
"    \"RFC_formats\":re.compile(r\"\\(Format\\:(?P<formats>((\\s|(\\n\\s{5}))[A-Z]+\\,?)+)\\)\"),\n",
"    \"RFC_doi\":re.compile(r\"\\(DOI\\:(\\s|(\\n\\s{5}))(?P<doi>\\d+\\.\\d+\\/RFC\\d{4})\\)\"),\n",
"    # The named group spans every word of the status. When it sat inside a\n",
"    # repeated group it kept only the last word (\"PROPOSED STANDARD\" -> \"STANDARD\").\n",
"    \"RFC_status\":re.compile(r\"\\(Status\\:\\s+(?P<status>\\w+(?:\\,?\\s+\\w+)*)\\,?\\)\"),\n",
"    \"RFC_mentions\":re.compile(r\"RFC\\d{4}\")\n",
"}\n",
"\n",
"entrypattern = re.compile(glossary_patterns[\"RFC_ENTRY\"])\n",
"\n",
"# Strategy 1: scan the whole downloaded text for citation-shaped blocks.\n",
"ENTRIES = entrypattern.findall(x.text)\n",
"\n",
"# Strategy 2: split the entries section on blank lines and parse each chunk.\n",
"# Every chunk is matched once; chunks that do not start with an RFC number\n",
"# (for example empty strings produced by the split) are dropped.\n",
"rfc_matcher = re.compile(glossary_patterns[\"RFC_Matcher\"])\n",
"y = [\n",
"    found.groupdict()\n",
"    for found in map(rfc_matcher.match, rtf_entries_txt.split(\"\\n\\n\"))\n",
"    if found\n",
"]\n",
"\n",
"# Show both counts so the two strategies can be compared.\n",
"print([len(y), len(ENTRIES)])\n",
"print(glossary_patterns[\"RFC_formats\"].search(y[0][\"rfc_desc\"]).group(\"formats\"))"
],
"metadata": {
"id": "M_6kSWt4jhI_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "H9yMNNjyyANS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "78cO40futK2y"
}
},
{
"cell_type": "code",
"source": [
"# Map \"RFCnnnn\" -> raw (possibly multi-line) citation text.\n",
"# Single quotes inside the f-string keep it valid on Python older than 3.12,\n",
"# where reusing the enclosing quote character is a SyntaxError.\n",
"z = {f\"RFC{aa['rfc_number']}\": aa[\"rfc_desc\"] for aa in y}\n",
"\n",
"# The stored pattern is already compiled, so it can be searched directly.\n",
"glossary_patterns[\"RFC_formats\"].search(z[\"RFC0010\"])"
],
"metadata": {
"id": "4xh3LWh0pQTK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Search the raw citation of RFC0010 for its DOI. The result is a match\n",
"# object; .group(\"doi\") on it yields just the identifier.\n",
"glossary_patterns[\"RFC_doi\"].search(z[\"RFC0010\"])"
],
"metadata": {
"id": "f0x-IZO-h_xr"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def _parse_rfc_entry(entry):\n",
"    \"\"\"Build the structured record for one citation.\n",
"\n",
"    Args:\n",
"        entry: dict with \"rfc_number\" (digit string) and \"rfc_desc\" (the\n",
"            citation text, possibly wrapped over several lines).\n",
"\n",
"    Returns:\n",
"        dict with the keys number, url, title, description, related, doi,\n",
"        status and formats. url, doi, status and formats are None when the\n",
"        corresponding pattern finds nothing; related is then an empty list.\n",
"    \"\"\"\n",
"    raw = entry[\"rfc_desc\"]\n",
"    number = int(entry[\"rfc_number\"])\n",
"    # Undo the hard line wrapping of the index: one space between lines.\n",
"    description = \" \".join(piece.strip() for piece in raw.split(\"\\n\"))\n",
"    # Run every pattern once and reuse the result below.\n",
"    mentions = glossary_patterns[\"RFC_mentions\"].findall(raw)\n",
"    doi = glossary_patterns[\"RFC_doi\"].search(raw)\n",
"    status = glossary_patterns[\"RFC_status\"].search(raw)\n",
"    formats = glossary_patterns[\"RFC_formats\"].search(raw)\n",
"    return {\n",
"        \"number\": number,\n",
"        # A link is emitted only for citations that mention some RFCnnnn id.\n",
"        \"url\": f\"https://www.rfc-editor.org/rfc/rfc{number}.txt\" if mentions else None,\n",
"        # Text up to the first period, taken from the unwrapped description so\n",
"        # a title spanning two lines carries no embedded newline or indent.\n",
"        # NOTE(review): a title that itself contains a period is still cut short.\n",
"        \"title\": description.split(\".\")[0],\n",
"        \"description\": description,\n",
"        # NOTE(review): presumably includes the RFC's own id via its DOI - confirm that is wanted.\n",
"        \"related\": mentions,\n",
"        \"doi\": doi.group(\"doi\") if doi else None,\n",
"        # Collapse whitespace in case the status was wrapped across lines.\n",
"        \"status\": \" \".join(status.group(\"status\").split()) if status else None,\n",
"        \"formats\": [fmt.strip().lower() for fmt in formats.group(\"formats\").split(\",\") if fmt.strip()] if formats else None,\n",
"    }\n",
"\n",
"\n",
"# Single quotes inside the f-string keep the key expression valid on Python\n",
"# older than 3.12.\n",
"zz = {f\"RFC{aa['rfc_number']}\": _parse_rfc_entry(aa) for aa in y}\n"
],
"metadata": {
"id": "ZVwig2_byRGu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Spot check: pretty-print the parsed record for RFC0010.\n",
"print(pprint.pformat(zz[\"RFC0010\"]))"
],
"metadata": {
"id": "CBNu3qrizozy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import json\n",
"\n",
"# Dump the same record as JSON with a four-space indent.\n",
"print(json.dumps(zz[\"RFC0010\"], indent=4))"
],
"metadata": {
"id": "KkrdULYY5so2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "3o-BGALy1-P3"
},
"execution_count": null,
"outputs": []
}
]
}
@Sarverott
Copy link
Copy Markdown
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment