|  | #!/usr/bin/env python | 
        
          |  | # encoding: utf-8 | 
        
          |  | """ | 
        
          |  | *Parse kindle book highlights from exported kindle HTML file and embed them in PDF version of the same book* | 
        
          |  |  | 
        
          |  | :Author: | 
        
          |  | David Young | 
        
          |  |  | 
        
          |  | :Date Created: | 
        
          |  | August 21, 2021 | 
        
          |  |  | 
        
          |  | Usage: | 
        
          |  | kindle_highlights_to_pdf <htmlPath> <pdfPath> | 
        
          |  |  | 
        
          |  | Options: | 
        
          |  | -h, --help            show this help message | 
        
          |  | htmlPath              path to the kindle export of book highlights | 
        
          |  | pdfPath               path to the PDF version of the book | 
        
          |  | """ | 
        
          |  | ################# GLOBAL IMPORTS #################### | 
        
          |  | import sys | 
        
          |  | import os | 
        
          |  | from fundamentals import tools | 
        
          |  | from os.path import expanduser | 
        
          |  | import codecs | 
        
          |  | from bs4 import BeautifulSoup | 
        
          |  | from fuzzysearch import find_near_matches | 
        
          |  | import fitz | 
        
          |  | import pandas as pd | 
        
          |  | from tabulate import tabulate | 
        
          |  | import sqlite3 as sql | 
        
          |  | import numpy as np | 
        
          |  | import time | 
        
          |  | rgbColors = { | 
        
          |  | "blue": (0.75, 0.8, 0.95), | 
        
          |  | "yellow": (1, 1, 0.6), | 
        
          |  | "orange": (1, 0.75, 0.5), | 
        
          |  | "pink": (1, 0.7, 0.9) | 
        
          |  | } | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def main(arguments=None): | 
        
          |  | """ | 
        
          |  | *The main function used when ``kindle_highlights_to_pdf.py`` is run as a single script from the cl* | 
        
          |  | """ | 
        
          |  |  | 
        
          |  | # SETUP THE COMMAND-LINE UTIL SETTINGS | 
        
          |  | su = tools( | 
        
          |  | arguments=arguments, | 
        
          |  | docString=__doc__, | 
        
          |  | logLevel="WARNING", | 
        
          |  | options_first=False, | 
        
          |  | projectName=False | 
        
          |  | ) | 
        
          |  | arguments, settings, log, dbConn = su.setup() | 
        
          |  |  | 
        
          |  | # UNPACK REMAINING CL ARGUMENTS USING `EXEC` TO SETUP THE VARIABLE NAMES | 
        
          |  | # AUTOMATICALLY | 
        
          |  | a = {} | 
        
          |  | for arg, val in list(arguments.items()): | 
        
          |  | if arg[0] == "-": | 
        
          |  | varname = arg.replace("-", "") + "Flag" | 
        
          |  | else: | 
        
          |  | varname = arg.replace("<", "").replace(">", "") | 
        
          |  | a[varname] = val | 
        
          |  | if arg == "--dbConn": | 
        
          |  | dbConn = val | 
        
          |  | a["dbConn"] = val | 
        
          |  | log.debug('%s = %s' % (varname, val,)) | 
        
          |  |  | 
        
          |  | htmlPath = a['htmlPath'] | 
        
          |  | pdfPath = a['pdfPath'] | 
        
          |  | # MAKE RELATIVE HOME PATH ABSOLUTE | 
        
          |  |  | 
        
          |  | home = expanduser("~") | 
        
          |  | if htmlPath[0] == "~": | 
        
          |  | htmlPath = htmlPath.replace("~", home) | 
        
          |  | if pdfPath[0] == "~": | 
        
          |  | pdfPath = pdfPath.replace("~", home) | 
        
          |  |  | 
        
          |  | highlights = parse_html_highlights( | 
        
          |  | log=log, | 
        
          |  | htmlPath=htmlPath) | 
        
          |  |  | 
        
          |  | output = embed_highlights_in_pdf( | 
        
          |  | log=log, | 
        
          |  | highlights=highlights, | 
        
          |  | pdfPath=pdfPath) | 
        
          |  |  | 
        
          |  | print(f"Highlighted PDF is here: '{output}'") | 
        
          |  |  | 
        
          |  | return | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def parse_html_highlights( | 
        
          |  | log, | 
        
          |  | htmlPath): | 
        
          |  | """*return a list of highlight content to embed in PDF* | 
        
          |  |  | 
        
          |  | **Key Arguments:** | 
        
          |  |  | 
        
          |  | - ``log`` -- logger | 
        
          |  | - ``htmlPath`` -- path to the kindle export of book highlights | 
        
          |  |  | 
        
          |  | **Usage:** | 
        
          |  |  | 
        
          |  | ```python | 
        
          |  | highlights = parse_html_highlights( | 
        
          |  | log=log, | 
        
          |  | htmlPath=htmlPath | 
        
          |  | ) | 
        
          |  | ``` | 
        
          |  | """ | 
        
          |  | log.debug('starting the ``parse_html_highlights`` function') | 
        
          |  |  | 
        
          |  | with codecs.open(htmlPath, encoding='utf-8', mode='r') as readFile: | 
        
          |  | thisData = readFile.read() | 
        
          |  | soup = BeautifulSoup(thisData, 'html.parser') | 
        
          |  |  | 
        
          |  | highlights = soup.find_all("div", {"class": "noteText"}) | 
        
          |  | headings = soup.find_all("h3", {"class": "noteHeading"}) | 
        
          |  |  | 
        
          |  | colors = [h.find("span").text for h in headings] | 
        
          |  | highlights = [h.text.split("Highlight (")[0].strip() for h in highlights] | 
        
          |  |  | 
        
          |  | # REMOVE HIGHLIGHT LESS THAN 4 WORDS OR 20 CHARACTERS | 
        
          |  | colors = [c for h, c in zip(highlights, colors) | 
        
          |  | if len(h) > 19 and len(h.split()) > 3] | 
        
          |  | highlights = [h for h in highlights if len(h) > 19 and len(h.split()) > 3] | 
        
          |  |  | 
        
          |  | # CREATE DATA FRAME FROM A DICTIONARY OF LISTS | 
        
          |  | highlights = {"highlight": highlights, "color": colors} | 
        
          |  | highlights['page'] = 0 | 
        
          |  | highlights['page_min'] = 0 | 
        
          |  | highlights['page_max'] = 0 | 
        
          |  | highlights['match'] = "" | 
        
          |  | highlights['split_match_bot'] = "" | 
        
          |  | highlights['split_match_top'] = "" | 
        
          |  | highlights['length'] = 0 | 
        
          |  | highlights = pd.DataFrame(highlights) | 
        
          |  |  | 
        
          |  | log.debug('completed the ``parse_html_highlights`` function') | 
        
          |  | return highlights | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def embed_highlights_in_pdf( | 
        
          |  | log, | 
        
          |  | highlights, | 
        
          |  | pdfPath): | 
        
          |  | """*summary of function* | 
        
          |  |  | 
        
          |  | **Key Arguments:** | 
        
          |  |  | 
        
          |  | - ``log`` -- logger | 
        
          |  | - ``highlights`` -- list of highlighted text content | 
        
          |  | - ``pdfPath`` -- path to the PDF version of the book | 
        
          |  |  | 
        
          |  | **Usage:** | 
        
          |  |  | 
        
          |  | ```eval_rst | 
        
          |  | .. todo:: | 
        
          |  |  | 
        
          |  | add usage info | 
        
          |  | create a sublime snippet for usage | 
        
          |  | ``` | 
        
          |  |  | 
        
          |  | ```python | 
        
          |  | usage code | 
        
          |  | ``` | 
        
          |  | """ | 
        
          |  | log.debug('starting the ``embed_highlights_in_pdf`` function') | 
        
          |  |  | 
        
          |  | cleanPDF = fitz.open(pdfPath) | 
        
          |  | # cleanPDF.save("/tmp/kindle_book.pdf", garbage=4, deflate=True, clean=True) | 
        
          |  | # EMPTY PDF PLACEHOLDER FOR THE SOON-TO-BE HIGHLIGHTED PDF | 
        
          |  | annotatedPDF = fitz.open() | 
        
          |  | annotatedPDF.insertPDF( | 
        
          |  | cleanPDF, | 
        
          |  | from_page=0, | 
        
          |  | to_page=cleanPDF.pageCount) | 
        
          |  | cleanPDF.close() | 
        
          |  |  | 
        
          |  | total = len(highlights) | 
        
          |  | count = 0 | 
        
          |  |  | 
        
          |  | # FIND SINGLE PAGE MATCHES | 
        
          |  | percent = 0 | 
        
          |  | matchTolerance = 4 | 
        
          |  | while percent < 90 and matchTolerance < 40: | 
        
          |  | time.sleep(2) | 
        
          |  | highlights = highlights.apply( | 
        
          |  | find_in_single_page_hightlight, axis=1, pdf=annotatedPDF, matchTolerance=matchTolerance) | 
        
          |  | # FILTER DATA FRAME | 
        
          |  | # FIRST CREATE THE MASK | 
        
          |  | mask = (highlights['match'] != "") | 
        
          |  | totalMatched = len(highlights.loc[mask]) | 
        
          |  | total = len(highlights.index) | 
        
          |  | percent = int(totalMatched * 100 / total) | 
        
          |  | print(f"Matched {totalMatched}/{total} annotations ({percent}%) (max levenshtein distance = {matchTolerance})") | 
        
          |  | pages = np.copy(highlights["page"].values) | 
        
          |  | pages[pages == 0] = -999999 | 
        
          |  | matchTolerance = matchTolerance + 5 | 
        
          |  | theseMin = [] | 
        
          |  | theseMin[:] = [np.max(pages[:i + 1]) | 
        
          |  | for i in range(len(highlights.index))] | 
        
          |  | pages[pages == -999999] = 999999 | 
        
          |  | theseMax = [] | 
        
          |  | theseMax[:] = [i for i in range(len(highlights.index))] | 
        
          |  | theseMax[:] = [np.min(pages[i:]) | 
        
          |  | for i in range(len(highlights.index))] | 
        
          |  | highlights['page_min'] = theseMin | 
        
          |  | highlights['page_max'] = theseMax | 
        
          |  | if totalMatched == total: | 
        
          |  | break | 
        
          |  |  | 
        
          |  | matchTolerance = 4 | 
        
          |  | while matchTolerance < 40: | 
        
          |  | if totalMatched == total: | 
        
          |  | break | 
        
          |  | time.sleep(0.5) | 
        
          |  | highlights = highlights.apply( | 
        
          |  | find_in_over_multiple_pages, axis=1, pdf=annotatedPDF, matchTolerance=matchTolerance) | 
        
          |  | # FILTER DATA FRAME | 
        
          |  | # FIRST CREATE THE MASK | 
        
          |  | mask = ((highlights['split_match_bot'] != | 
        
          |  | "") & (highlights['split_match_top'] != "")) | 
        
          |  | highlights.loc[mask, 'match'] = highlights[ | 
        
          |  | 'split_match_bot'] + " " + highlights['split_match_top'] | 
        
          |  | mask = (highlights['match'] != "") | 
        
          |  | totalMatched = len(highlights.loc[mask]) | 
        
          |  | total = len(highlights.index) | 
        
          |  | percent = int(totalMatched * 100 / total) | 
        
          |  | print(f"Matched {totalMatched}/{total} annotations ({percent}%) (max levenshtein distance = {matchTolerance})") | 
        
          |  | if totalMatched == total: | 
        
          |  | break | 
        
          |  | matchTolerance = matchTolerance + 2 | 
        
          |  |  | 
        
          |  | outfile = pdfPath.replace(".pdf", "_highlighted.pdf") | 
        
          |  | annotatedPDF.save(outfile, garbage=4, deflate=True, clean=True) | 
        
          |  | annotatedPDF.close() | 
        
          |  |  | 
        
          |  | # CONNECT TO THE DATABASE | 
        
          |  | conn = sql.connect("highlights_export.db") | 
        
          |  | # SEND TO DATABASE | 
        
          |  | highlights.to_sql('highlights', con=conn, | 
        
          |  | index=False, if_exists='replace') | 
        
          |  |  | 
        
          |  | log.debug('completed the ``embed_highlights_in_pdf`` function') | 
        
          |  | return outfile | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def find_in_single_page_hightlight( | 
        
          |  | series, | 
        
          |  | pdf, | 
        
          |  | matchTolerance): | 
        
          |  | """*given a highlighted text, find its location in the PDF (doesn't find highlights spanning multiple pages)* | 
        
          |  |  | 
        
          |  | **Key Arguments:** | 
        
          |  |  | 
        
          |  | - ``series`` -- the dataframe row/series to apply work on | 
        
          |  | - ``pdf`` -- the fitz PDF object | 
        
          |  | - ``matchTolerance`` -- lower tolerance is stricter, matching less, but faster | 
        
          |  | """ | 
        
          |  | h = series['highlight'] | 
        
          |  | c = series['color'] | 
        
          |  | color = rgbColors[c] | 
        
          |  | series['length'] = len(h) | 
        
          |  | matchText = None | 
        
          |  |  | 
        
          |  | if len(series['match']): | 
        
          |  | return series | 
        
          |  |  | 
        
          |  | page_min = series['page_min'] | 
        
          |  | page_max = series['page_max'] + 1 | 
        
          |  |  | 
        
          |  | if page_max == 1 or page_max > 99998: | 
        
          |  | page_max = pdf.pageCount | 
        
          |  | if page_min < 1: | 
        
          |  | page_min = 1 | 
        
          |  |  | 
        
          |  | for p in range(page_min, page_max): | 
        
          |  | page = pdf[p] | 
        
          |  | pageText = page.getText("text") | 
        
          |  | matches = find_near_matches( | 
        
          |  | h, pageText, max_l_dist=matchTolerance) | 
        
          |  | for match in matches: | 
        
          |  | matchText = match.matched | 
        
          |  | clMatch = matchText.replace("\n", " ")[:30] | 
        
          |  | print(f"    MATCH: {clMatch}...") | 
        
          |  | textOnPage = page.searchFor(matchText) | 
        
          |  | while not textOnPage: | 
        
          |  | textOnPage = page.searchFor(matchText) | 
        
          |  | start = textOnPage[0].top_left | 
        
          |  | end = textOnPage[-1].bottom_right | 
        
          |  | series["page"] = p | 
        
          |  | series['match'] = matchText | 
        
          |  | annot = page.addHighlightAnnot(None, start=start, stop=end) | 
        
          |  | annot.set_colors({"stroke": color}) | 
        
          |  |  | 
        
          |  | # annot.update() | 
        
          |  | if matchText: | 
        
          |  | break | 
        
          |  |  | 
        
          |  | return series | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def find_in_over_multiple_pages( | 
        
          |  | series, | 
        
          |  | pdf, | 
        
          |  | matchTolerance): | 
        
          |  | """*given a highlighted text, find its location in the PDF (doesn't find highlights spanning multiple pages)* | 
        
          |  |  | 
        
          |  | **Key Arguments:** | 
        
          |  |  | 
        
          |  | - ``series`` -- the dataframe row/series to apply work on | 
        
          |  | - ``pdf`` -- the fitz PDF object | 
        
          |  | - ``matchTolerance`` -- lower tolerance is stricter, matching less, but faster | 
        
          |  | """ | 
        
          |  | h = series['highlight'] | 
        
          |  | c = series['color'] | 
        
          |  | color = rgbColors[c] | 
        
          |  | series['length'] = len(h) | 
        
          |  | matchText = None | 
        
          |  |  | 
        
          |  | if len(series['match']): | 
        
          |  | return series | 
        
          |  |  | 
        
          |  | page_min = series['page_min'] | 
        
          |  | page_max = series['page_max'] + 1 | 
        
          |  |  | 
        
          |  | if page_max == 1 or page_max > 99998: | 
        
          |  | page_max = pdf.pageCount | 
        
          |  | if page_min < 1: | 
        
          |  | page_min = 1 | 
        
          |  |  | 
        
          |  | if not len(series['split_match_bot']): | 
        
          |  |  | 
        
          |  | snippet = h[:40] | 
        
          |  | for p in range(page_min, page_max): | 
        
          |  | if len(series['split_match_bot']): | 
        
          |  | break | 
        
          |  | page = pdf[p] | 
        
          |  | pageText = page.getText("text") | 
        
          |  | matches = find_near_matches( | 
        
          |  | snippet, pageText, max_l_dist=matchTolerance) | 
        
          |  | for match in matches: | 
        
          |  | matchText = match.matched | 
        
          |  | if len(matchText): | 
        
          |  | matchText = matchText + pageText.split(matchText)[-1] | 
        
          |  | else: | 
        
          |  | matchText = None | 
        
          |  |  | 
        
          |  | if matchText: | 
        
          |  | matches = find_near_matches( | 
        
          |  | matchText, pageText, max_l_dist=4) | 
        
          |  | for match in matches: | 
        
          |  | matchText = match.matched | 
        
          |  | clMatch = matchText.replace("\n", " ")[:30] | 
        
          |  | print(f"    MATCH: {clMatch}...") | 
        
          |  | textOnPage = page.searchFor(matchText) | 
        
          |  | while not textOnPage: | 
        
          |  | textOnPage = page.searchFor(matchText) | 
        
          |  | start = textOnPage[0].top_left | 
        
          |  | end = textOnPage[-1].bottom_right | 
        
          |  | series["page"] = p | 
        
          |  | # series['match'] = matchText | 
        
          |  | series["split_match_bot"] = matchText | 
        
          |  | annot = page.addHighlightAnnot( | 
        
          |  | None, start=start, stop=end) | 
        
          |  | annot.set_colors({"stroke": color}) | 
        
          |  |  | 
        
          |  | series['page_min'] = p | 
        
          |  | page_min = p | 
        
          |  | if matchText: | 
        
          |  | break | 
        
          |  |  | 
        
          |  | if not len(series['split_match_top']): | 
        
          |  |  | 
        
          |  | matchText = None | 
        
          |  | snippet = h[-40:] | 
        
          |  | for p in range(page_min, page_max): | 
        
          |  | if len(series['split_match_top']): | 
        
          |  | break | 
        
          |  | page = pdf[p] | 
        
          |  | pageText = page.getText("text") | 
        
          |  | matches = find_near_matches( | 
        
          |  | snippet, pageText, max_l_dist=matchTolerance) | 
        
          |  | for match in matches: | 
        
          |  | matchText = match.matched | 
        
          |  | matchText = pageText.split(matchText)[0] + matchText | 
        
          |  | if matchText: | 
        
          |  | matches = find_near_matches( | 
        
          |  | matchText, pageText, max_l_dist=4) | 
        
          |  | for match in matches: | 
        
          |  | matchText = match.matched | 
        
          |  | clMatch = matchText.replace("\n", " ")[:30] | 
        
          |  | print(f"    MATCH: {clMatch}...") | 
        
          |  | textOnPage = page.searchFor(matchText) | 
        
          |  | while not textOnPage: | 
        
          |  | textOnPage = page.searchFor(matchText) | 
        
          |  | start = textOnPage[0].top_left | 
        
          |  | end = textOnPage[-1].bottom_right | 
        
          |  | series["page"] = p | 
        
          |  | # series['match'] = matchText | 
        
          |  | series["split_match_top"] = matchText | 
        
          |  | annot = page.addHighlightAnnot( | 
        
          |  | None, start=start, stop=end) | 
        
          |  | annot.set_colors({"stroke": color}) | 
        
          |  |  | 
        
          |  | series['page_max'] = p | 
        
          |  | page_max = p + 1 | 
        
          |  | if matchText: | 
        
          |  | break | 
        
          |  |  | 
        
          |  | return series | 
        
          |  |  | 
        
          |  | # use the tab-trigger below for new function | 
        
          |  | # xt-def-function | 
        
          |  |  | 
        
          |  | if __name__ == '__main__': | 
        
          |  | main() | 
  
PyMuPDF has changed its method naming from “camelCase” to “snake_case” (see https://pymupdf.readthedocs.io/en/latest/znames.html). That part is easy to fix.
I still have some problems with your nice script, though. For example, there are lines in my exported HTML on which the script seems to hang (samples enclosed), and find_in_over_multiple_pages creates strange highlighting in the target PDF file.