SebDeclercq · April 26, 2022 09:31
diff --git a/convert_oreilly_export.ipynb b/convert_oreilly_export.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5b4430c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import annotations\n",
    "from collections import defaultdict\n",
    "from pathlib import Path\n",
    "from typing import ClassVar, TextIO\n",
    "import csv\n",
    "import re\n",
    "\n",
    "\n",
    "class Book:\n",
    "    '''Class representing a book w/ some of the metadata available\n",
    "    in the safari-annotations-export.csv file\n",
    "    \n",
    "    Attrs:\n",
    "        title: The title (duh)\n",
    "        slug: The slug to use for the filename (for instance)\n",
    "        html_filename: The filename to use for the HTML export\n",
    "        chapters: The **ordered** list of the book's chapters\n",
    "        highlights: A list of the highlights, chapter by chapter\n",
    "        exporter: The Exporter to use as formatter\n",
    "    '''\n",
    "    \n",
    "    def __init__(self, *, title: str, exporter: Exporter, slug: str = '') -> None:\n",
    "        self.title: str = title\n",
    "        self.slug: str = slug\n",
    "        self.chapters: list[str] = []\n",
    "        self.highlights: defaultdict[str, list[str]] = defaultdict(list)\n",
    "        if not self.slug:\n",
    "            self.slug = re.sub(r'[^\\w]+', '-', self.title.lower()).replace('--', '-')\n",
    "        self.exporter: Exporter = exporter\n",
    "\n",
    "    def get_highlights(self) -> defaultdict[str, list[str]]:\n",
    "        self.highlights = defaultdict(list)\n",
    "        return self.exporter.get_highlights(self).highlights\n",
    "            \n",
    "    @property\n",
    "    def to_file(self) -> None:\n",
    "        '''Export the highlights to HTML'''\n",
    "        self.exporter.export(self)\n",
    "    \n",
    "    @property\n",
    "    def filename(self) -> str:\n",
    "        return self.exporter.get_filename(self)\n",
    "            \n",
    "            \n",
    "class Exporter:\n",
    "    '''safari-annotations-export.csv file parser'''\n",
    "    OREILLY_EXPORT_FILENAME: ClassVar[Path] = Path(\"oreilly-annotations.csv\")\n",
    "    EXTENSION: ClassVar[str] = ''\n",
    "          \n",
    "    @classmethod\n",
    "    def get_highlights(cls, book: Book) -> Book:\n",
    "        '''Parse the csv file in order to collect all the highlights\n",
    "        for a specific book.\n",
    "        \n",
    "        Params:\n",
    "            book: The book to search for.\n",
    "            \n",
    "        Returns:\n",
    "            The book fed w/ its chapters & highlights.\n",
    "        \n",
    "        '''\n",
    "        with cls.OREILLY_EXPORT_FILENAME.open(encoding='utf8') as file:\n",
    "            reader: csv.DictReader = csv.DictReader(file)\n",
    "            for record in reader:\n",
    "                if record['Book Title'] == book.title:\n",
    "                    chapter: str = record['Chapter Title']\n",
    "                    highlight: str = record['Highlight']\n",
    "                    if chapter not in book.chapters:\n",
    "                        book.chapters.insert(0, chapter)\n",
    "                    book.highlights[chapter].append(highlight)\n",
    "        return book\n",
    "\n",
    "    @classmethod\n",
    "    def get_filename(cls, book: Book) -> str:\n",
    "        return Path(f\"{book.slug}.{cls.EXTENSION}\")\n",
    "    \n",
    "    @classmethod\n",
    "    def export(cls, book: Book) -> Path:\n",
    "        raise NotImplementedError()\n",
    "\n",
    "    \n",
    "class OReillyCSV2HTML(Exporter):    \n",
    "    HEADER: ClassVar[str] = '''<!DOCTYPE html>\n",
    "    <html>\n",
    "    <head>\n",
    "        <meta http-equiv=\"Content-type\" content=\"text/html;charset=UTF-8\">\n",
    "        <title>HTML_TITLE</title>\n",
    "        <link rel=\"stylesheet\" href=\"https://cdn.jsdelivr.net/npm/katex@0.10.2/dist/katex.min.css\" integrity=\"sha384-yFRtMMDnQtDRO8rLpMIKrtPCD5jdktao2TV19YiZYWMDkUR5GQZR/NOVTdquEx1j\" crossorigin=\"anonymous\">\n",
    "        <link rel=\"stylesheet\" href=\"https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/markdown.css\">\n",
    "        <link rel=\"stylesheet\" href=\"https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/highlight.css\">\n",
    "        <link href=\"https://cdn.jsdelivr.net/npm/katex-copytex@latest/dist/katex-copytex.min.css\" rel=\"stylesheet\" type=\"text/css\">\n",
    "        <style>\n",
    ".task-list-item { list-style-type: none; } .task-list-item-checkbox { margin-left: -20px; vertical-align: middle; }\n",
    "</style>\n",
    "        <style>\n",
    "            body {\n",
    "                font-family: -apple-system, BlinkMacSystemFont, 'Segoe WPC', 'Segoe UI', 'Ubuntu', 'Droid Sans', sans-serif;\n",
    "                font-size: 14px;\n",
    "                line-height: 1.6;\n",
    "            }\n",
    "        </style>\n",
    "        \n",
    "        <script src=\"https://cdn.jsdelivr.net/npm/katex-copytex@latest/dist/katex-copytex.min.js\"></script>\n",
    "    </head>\n",
    "    <body>'''\n",
    "\n",
    "    FOOTER: ClassVar[str] = '''\n",
    "        </body>\n",
    "        </html>'''\n",
    "    \n",
    "    EXTENSION: ClassVar[str] = \".html\"\n",
    "    \n",
    "    @classmethod\n",
    "    def export(cls, book: Book) -> Path:\n",
    "        '''Export a book to HTML.\n",
    "        \n",
    "        Params:\n",
    "            book: The book to export\n",
    "        '''\n",
    "        with book.filename.open('w', encoding='utf8') as file:\n",
    "            file.write(cls.HEADER.replace('HTML_TITLE', book.title))\n",
    "            file.write(f'<h1>{book.title}</h1>\\n')\n",
    "            for chapter in book.chapters:\n",
    "                file.write(f'<h2>{chapter}</h2>\\n')\n",
    "                file.write('<ul>\\n')\n",
    "                for highlight in book.highlights[chapter]:\n",
    "                    file.write('<li>\\n<blockquote>\\n')\n",
    "                    fmt_highlight = ''.join([f'<p>{line}</p>\\n' for line in highlight.split('\\n')])\n",
    "                    file.write(fmt_highlight)\n",
    "                    file.write('</blockquote>\\n</li>\\n')\n",
    "                file.write('</ul>\\n')\n",
    "            file.write(cls.FOOTER)\n",
    "        return book.filename\n",
    "        \n",
    "            \n",
    "            \n",
    "class OReillyCSV2MD(Exporter):\n",
    "    '''safari-annotations-export.csv file parser'''\n",
    "    \n",
    "    EXTENSION: ClassVar[str] = \".md\"\n",
    "    \n",
    "    @classmethod\n",
    "    def format(cls, string: str) -> str:\n",
    "        return f\"{string}\\n\\n\"\n",
    "    \n",
    "    @classmethod\n",
    "    def print(cls, file: TextIO, string: str) -> None:\n",
    "        file.write(cls.format(string))\n",
    "    \n",
    "    @classmethod\n",
    "    def export(cls, book: Book) -> Path:\n",
    "        book.get_highlights()\n",
    "        with book.filename.open(\"w\", encoding=\"utf8\") as file:\n",
    "            for chapter in book.chapters:\n",
    "                cls.print(file, f\"## {chapter}\")\n",
    "                for highlight in book.highlights[chapter]:\n",
    "                    cleaned_highligth: str = '\\n>'.join(highlight.split('\\n'))\n",
    "                    cls.print(file, f\"- > {cleaned_highligth}\")\n",
    "        return book.filename\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "167fb71b",
   "metadata": {},
   "outputs": [],
   "source": [
    "titles: list[str] = [\n",
    "    \"Extreme Programming Explained: Embrace Change, Second Edition\",\n",
    "    \"Planning Extreme Programming\",\n",
    "]\n",
    "for title in titles:\n",
    "    Book(title=title, exporter=OReillyCSV2MD).to_file"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "5b4430c9",
	"metadata": {},
	"outputs": [],
	"source": [
	"from __future__ import annotations\n",
	"from collections import defaultdict\n",
	"from pathlib import Path\n",
	"from typing import ClassVar, TextIO\n",
	"import csv\n",
	"import re\n",
	"\n",
	"\n",
	"class Book:\n",
	" '''Class representing a book w/ some of the metadata available\n",
	" in the safari-annotations-export.csv file\n",
	" \n",
	" Attrs:\n",
	" title: The title (duh)\n",
	" slug: The slug to use for the filename (for instance)\n",
	" html_filename: The filename to use for the HTML export\n",
	" chapters: The ordered list of the book's chapters\n",
	" highlights: A list of the highlights, chapter by chapter\n",
	" exporter: The Exporter to use as formatter\n",
	" '''\n",
	" \n",
	" def __init__(self, *, title: str, exporter: Exporter, slug: str = '') -> None:\n",
	" self.title: str = title\n",
	" self.slug: str = slug\n",
	" self.chapters: list[str] = []\n",
	" self.highlights: defaultdict[str, list[str]] = defaultdict(list)\n",
	" if not self.slug:\n",
	" self.slug = re.sub(r'[^\\w]+', '-', self.title.lower()).replace('--', '-')\n",
	" self.exporter: Exporter = exporter\n",
	"\n",
	" def get_highlights(self) -> defaultdict[str, list[str]]:\n",
	" self.highlights = defaultdict(list)\n",
	" return self.exporter.get_highlights(self).highlights\n",
	" \n",
	" @property\n",
	" def to_file(self) -> None:\n",
	" '''Export the highlights to HTML'''\n",
	" self.exporter.export(self)\n",
	" \n",
	" @property\n",
	" def filename(self) -> str:\n",
	" return self.exporter.get_filename(self)\n",
	" \n",
	" \n",
	"class Exporter:\n",
	" '''safari-annotations-export.csv file parser'''\n",
	" OREILLY_EXPORT_FILENAME: ClassVar[Path] = Path(\"oreilly-annotations.csv\")\n",
	" EXTENSION: ClassVar[str] = ''\n",
	" \n",
	" @classmethod\n",
	" def get_highlights(cls, book: Book) -> Book:\n",
	" '''Parse the csv file in order to collect all the highlights\n",
	" for a specific book.\n",
	" \n",
	" Params:\n",
	" book: The book to search for.\n",
	" \n",
	" Returns:\n",
	" The book fed w/ its chapters & highlights.\n",
	" \n",
	" '''\n",
	" with cls.OREILLY_EXPORT_FILENAME.open(encoding='utf8') as file:\n",
	" reader: csv.DictReader = csv.DictReader(file)\n",
	" for record in reader:\n",
	" if record['Book Title'] == book.title:\n",
	" chapter: str = record['Chapter Title']\n",
	" highlight: str = record['Highlight']\n",
	" if chapter not in book.chapters:\n",
	" book.chapters.insert(0, chapter)\n",
	" book.highlights[chapter].append(highlight)\n",
	" return book\n",
	"\n",
	" @classmethod\n",
	" def get_filename(cls, book: Book) -> str:\n",
	" return Path(f\"{book.slug}.{cls.EXTENSION}\")\n",
	" \n",
	" @classmethod\n",
	" def export(cls, book: Book) -> Path:\n",
	" raise NotImplementedError()\n",
	"\n",
	" \n",
	"class OReillyCSV2HTML(Exporter): \n",
	" HEADER: ClassVar[str] = '''<!DOCTYPE html>\n",
	" <html>\n",
	" <head>\n",
	" <meta http-equiv=\"Content-type\" content=\"text/html;charset=UTF-8\">\n",
	" <title>HTML_TITLE</title>\n",
	" <link rel=\"stylesheet\" href=\"https://cdn.jsdelivr.net/npm/katex@0.10.2/dist/katex.min.css\" integrity=\"sha384-yFRtMMDnQtDRO8rLpMIKrtPCD5jdktao2TV19YiZYWMDkUR5GQZR/NOVTdquEx1j\" crossorigin=\"anonymous\">\n",
	" <link rel=\"stylesheet\" href=\"https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/markdown.css\">\n",
	" <link rel=\"stylesheet\" href=\"https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/highlight.css\">\n",
	" <link href=\"https://cdn.jsdelivr.net/npm/katex-copytex@latest/dist/katex-copytex.min.css\" rel=\"stylesheet\" type=\"text/css\">\n",
	" <style>\n",
	".task-list-item { list-style-type: none; } .task-list-item-checkbox { margin-left: -20px; vertical-align: middle; }\n",
	"</style>\n",
	" <style>\n",
	" body {\n",
	" font-family: -apple-system, BlinkMacSystemFont, 'Segoe WPC', 'Segoe UI', 'Ubuntu', 'Droid Sans', sans-serif;\n",
	" font-size: 14px;\n",
	" line-height: 1.6;\n",
	" }\n",
	" </style>\n",
	" \n",
	" <script src=\"https://cdn.jsdelivr.net/npm/katex-copytex@latest/dist/katex-copytex.min.js\"></script>\n",
	" </head>\n",
	" <body>'''\n",
	"\n",
	" FOOTER: ClassVar[str] = '''\n",
	" </body>\n",
	" </html>'''\n",
	" \n",
	" EXTENSION: ClassVar[str] = \".html\"\n",
	" \n",
	" @classmethod\n",
	" def export(cls, book: Book) -> Path:\n",
	" '''Export a book to HTML.\n",
	" \n",
	" Params:\n",
	" book: The book to export\n",
	" '''\n",
	" with book.filename.open('w', encoding='utf8') as file:\n",
	" file.write(cls.HEADER.replace('HTML_TITLE', book.title))\n",
	" file.write(f'<h1>{book.title}</h1>\\n')\n",
	" for chapter in book.chapters:\n",
	" file.write(f'<h2>{chapter}</h2>\\n')\n",
	" file.write('<ul>\\n')\n",
	" for highlight in book.highlights[chapter]:\n",
	" file.write('<li>\\n<blockquote>\\n')\n",
	" fmt_highlight = ''.join([f'<p>{line}</p>\\n' for line in highlight.split('\\n')])\n",
	" file.write(fmt_highlight)\n",
	" file.write('</blockquote>\\n</li>\\n')\n",
	" file.write('</ul>\\n')\n",
	" file.write(cls.FOOTER)\n",
	" return book.filename\n",
	" \n",
	" \n",
	" \n",
	"class OReillyCSV2MD(Exporter):\n",
	" '''safari-annotations-export.csv file parser'''\n",
	" \n",
	" EXTENSION: ClassVar[str] = \".md\"\n",
	" \n",
	" @classmethod\n",
	" def format(cls, string: str) -> str:\n",
	" return f\"{string}\\n\\n\"\n",
	" \n",
	" @classmethod\n",
	" def print(cls, file: TextIO, string: str) -> None:\n",
	" file.write(cls.format(string))\n",
	" \n",
	" @classmethod\n",
	" def export(cls, book: Book) -> Path:\n",
	" book.get_highlights()\n",
	" with book.filename.open(\"w\", encoding=\"utf8\") as file:\n",
	" for chapter in book.chapters:\n",
	" cls.print(file, f\"## {chapter}\")\n",
	" for highlight in book.highlights[chapter]:\n",
	" cleaned_highligth: str = '\\n>'.join(highlight.split('\\n'))\n",
	" cls.print(file, f\"- > {cleaned_highligth}\")\n",
	" return book.filename\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "167fb71b",
	"metadata": {},
	"outputs": [],
	"source": [
	"titles: list[str] = [\n",
	" \"Extreme Programming Explained: Embrace Change, Second Edition\",\n",
	" \"Planning Extreme Programming\",\n",
	"]\n",
	"for title in titles:\n",
	" Book(title=title, exporter=OReillyCSV2MD).to_file"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found