spewil · June 1, 2020 17:26
diff --git a/parse_clippings.py b/parse_clippings.py
 import pathlib
 import re
 import sys

 # TODO
 # - check if we're parsing raw clippings into
 #     - a new author file
 #     or
 #     - appending to existing author store
 # - use markdown tree so we can add clips to existing books
 # - make a version that works for pre-processed notes


 def clippings_to_dict(clippings):
    """
        Group the highlights of a Kindle "My Clippings.txt" file by author and title.

        Parameters:
        - clippings: list of the file's lines (e.g. from readlines()). Every
          entry is read relative to its "==========" separator line: the
          title/author line sits 4 lines above it, the metadata line 3 lines
          above it and the clipped text 1 line above it.

        Returns a dictionary with the following structure:

        dictionary =
            {author: {
                title: [
                          {
                             "highlight": "",
                             "note": ""
                          },
                          {
                             "highlight": ""
                          }
                       ]
            }
            }

        - author and title keys are lower-cased, stripped of punctuation and
          use underscores instead of spaces
        - "highlight" holds the clipped line exactly as it was passed in
        - "note" is only present when the entry right after the highlight is
          a note whose title line names the same author
        - entries without a recognisable author, and entries that are not
          highlights, are skipped

        Subtleties :
        - Use Calibre to access your "My Clippings.txt" file
        - Edit your book metadata to work with this parser:
        - Each book needs a title and one author
        - Author names must be 3 words or less including hyphenations, etc
        - Currently supports only a single author per title
        - Authors with multiple titles must be formatted identically

    """

    def slugify(text):
        # remove anything non-space, non-alphanumeric
        text = re.sub(r'([^ \w])+', '', text)
        # replace any number of spaces with single underscore
        return re.sub(r'( )+', '_', text).lower()

    def parse_title(title_line):
        # the title is everything before the first " ("
        title_matches = re.findall(r"^(.*?) \(", title_line)
        if not title_matches:
            return None
        return slugify(title_matches[0])

    def parse_author(author_line):
        # up to three author names only
        # TODO: make this work for multiple author pattern
        author_matches = re.findall(r"\(((?:.\w*){,3})\)", author_line)
        if not author_matches:  # no parenthesised name on this line
            return None
        raw_name = author_matches[0]
        # the author name must be 3 words or less
        if len(raw_name.split(' ')) > 3:
            return None
        # the author name must have at least 2 capital letters
        if sum(1 for c in raw_name if c.isupper()) < 2:
            return None
        # the author name must be longer than 4 characters
        if len(raw_name) <= 4:
            return None
        return slugify(raw_name)

    clippings_dict = {}
    clipping_sep = "=========="
    for i, line in enumerate(clippings):
        if clipping_sep not in line:
            continue
        # a complete entry needs four lines before its separator; with fewer,
        # the negative indices below would wrap around to the end of the list
        if i < 4:
            continue
        title_author_line = clippings[i - 4]
        metadata_line = clippings[i - 3]
        clipping_line = clippings[i - 1]

        author = parse_author(title_author_line)
        if not author or "Highlight" not in metadata_line:
            continue

        clip = dict(highlight=clipping_line)
        # look ahead to check for a note; at the end of the file there is
        # no complete following entry, so nothing can be attached
        if i + 4 < len(clippings):
            next_metadata_line = clippings[i + 2]
            # the author is parsed from the *next* entry's own title line so
            # that a note on somebody else's book is not attached here
            if ("Note" in next_metadata_line
                    and author == parse_author(clippings[i + 1])):
                clip["note"] = clippings[i + 4]

        title = parse_title(title_author_line)
        # create the author dict / title list if new, then add the clipping
        clippings_dict.setdefault(author, {}).setdefault(title, []).append(clip)
    return clippings_dict


 def clippings_dict_to_markdown(clippings_dict):
    """
        Write one markdown file per author into the "parsed_clippings/" directory.

        Parameters:
        - clippings_dict: {author: {title: [clip, ...]}} where every clip is a
          dict with a "highlight" entry and an optional "note" entry

        For every author the file "parsed_clippings/<author>.md" is written
        from scratch (an existing file is overwritten). Underscores in author
        and title keys are shown as spaces in the headings. Returns None.

        - TODO: enable adding to existing titles.
            - would do this like:
                if line-beginning with "##" == title:
                    append to before next line beginning with "##"
        - Formatted like:
            # author

            ## title

            > highlight 1

            note

            ---

            > highlight 2

            ---

    """

    parent = pathlib.Path("parsed_clippings/")
    for author, titles in clippings_dict.items():
        # exist_ok makes this safe when the directory is already there,
        # without a separate check-then-create step
        parent.mkdir(exist_ok=True)
        path = parent / (author + ".md")
        # mode 'w' creates the file when it is missing; the explicit encoding
        # keeps non-ASCII highlights independent of the platform default
        with path.open('w', encoding="utf-8") as output_file:
            output_file.write("# " + author.replace("_", " ") + "\n\n")
            for title, clips in titles.items():
                output_file.write("## " + title.replace("_", " ") + "\n\n")
                for clip in clips:
                    output_file.write("> " + clip['highlight'] + "\n\n")
                    if clip.get("note") is not None:
                        output_file.write(clip["note"] + "\n\n")
                    output_file.write("---\n\n")


 if __name__ == '__main__':
    # Script entry point: parse the clippings file named on the command line
    # and write the per-author markdown files.
    if len(sys.argv) < 2:
        # exit with a readable message instead of an IndexError traceback
        sys.exit("usage: parse_clippings.py <path to My Clippings.txt>")
    clippings_path = sys.argv[1]
    # utf-8-sig decodes UTF-8 and drops a leading byte-order mark if the file
    # has one (NOTE(review): Kindle clippings files are assumed to be UTF-8)
    with open(clippings_path, encoding="utf-8-sig") as clipping_file:
        clippings = clipping_file.readlines()
    clippings_dict = clippings_to_dict(clippings)
    clippings_dict_to_markdown(clippings_dict)

 # first = True
 # start_of_block = True
 # print("<meta charset='utf-8'/>")
 # print("<link rel='stylesheet' href='style.css'>")
 # print("<div class='block'>")
 # for line in sys.stdin:
 #     if line == '\n':
 #         print("</div>")
 #         print("<div class='block'>")
 #         start_of_block = True
 #         continue
 #     if line.startswith('<'):
 #         start_of_block = False
 #         print(line)
 #         continue
 #     if start_of_block:
 #         print("<div class='direction'>%s</div>" % line.replace('\n', ''))
 #         continue
 #     else:
 #         print('<p>%s</p>' % line.replace('\n', ''))
 #         start_of_block = False
	import pathlib
	import re
	import sys

	# TODO
	# - check if we're parsing raw clippings into
	# - a new author file
	# or
	# - appending to existing author store
	# - use markdown tree so we can add clips to existing books
	# - make a version that works for pre-processed notes


	def clippings_to_dict(clippings):
	    """
	    Group the highlights of a Kindle "My Clippings.txt" file by author and title.

	    Parameters:
	    - clippings: list of the file's lines (e.g. from readlines()). Every
	      entry is read relative to its "==========" separator line: the
	      title/author line sits 4 lines above it, the metadata line 3 lines
	      above it and the clipped text 1 line above it.

	    Returns a dictionary with the following structure:

	    dictionary =
	        {author: {
	            title: [
	                      {
	                         "highlight": "",
	                         "note": ""
	                      },
	                      {
	                         "highlight": ""
	                      }
	                   ]
	        }
	        }

	    - author and title keys are lower-cased, stripped of punctuation and
	      use underscores instead of spaces
	    - "highlight" holds the clipped line exactly as it was passed in
	    - "note" is only present when the entry right after the highlight is
	      a note whose title line names the same author
	    - entries without a recognisable author, and entries that are not
	      highlights, are skipped

	    Subtleties :
	    - Use Calibre to access your "My Clippings.txt" file
	    - Edit your book metadata to work with this parser:
	    - Each book needs a title and one author
	    - Author names must be 3 words or less including hyphenations, etc
	    - Currently supports only a single author per title
	    - Authors with multiple titles must be formatted identically

	    """

	    def slugify(text):
	        # remove anything non-space, non-alphanumeric
	        text = re.sub(r'([^ \w])+', '', text)
	        # replace any number of spaces with single underscore
	        return re.sub(r'( )+', '_', text).lower()

	    def parse_title(title_line):
	        # the title is everything before the first " ("
	        title_matches = re.findall(r"^(.*?) \(", title_line)
	        if not title_matches:
	            return None
	        return slugify(title_matches[0])

	    def parse_author(author_line):
	        # up to three author names only
	        # TODO: make this work for multiple author pattern
	        author_matches = re.findall(r"\(((?:.\w*){,3})\)", author_line)
	        if not author_matches:  # no parenthesised name on this line
	            return None
	        raw_name = author_matches[0]
	        # the author name must be 3 words or less
	        if len(raw_name.split(' ')) > 3:
	            return None
	        # the author name must have at least 2 capital letters
	        if sum(1 for c in raw_name if c.isupper()) < 2:
	            return None
	        # the author name must be longer than 4 characters
	        if len(raw_name) <= 4:
	            return None
	        return slugify(raw_name)

	    clippings_dict = {}
	    clipping_sep = "=========="
	    for i, line in enumerate(clippings):
	        if clipping_sep not in line:
	            continue
	        # a complete entry needs four lines before its separator; with fewer,
	        # the negative indices below would wrap around to the end of the list
	        if i < 4:
	            continue
	        title_author_line = clippings[i - 4]
	        metadata_line = clippings[i - 3]
	        clipping_line = clippings[i - 1]

	        author = parse_author(title_author_line)
	        if not author or "Highlight" not in metadata_line:
	            continue

	        clip = dict(highlight=clipping_line)
	        # look ahead to check for a note; at the end of the file there is
	        # no complete following entry, so nothing can be attached
	        if i + 4 < len(clippings):
	            next_metadata_line = clippings[i + 2]
	            # the author is parsed from the *next* entry's own title line so
	            # that a note on somebody else's book is not attached here
	            if ("Note" in next_metadata_line
	                    and author == parse_author(clippings[i + 1])):
	                clip["note"] = clippings[i + 4]

	        title = parse_title(title_author_line)
	        # create the author dict / title list if new, then add the clipping
	        clippings_dict.setdefault(author, {}).setdefault(title, []).append(clip)
	    return clippings_dict


	def clippings_dict_to_markdown(clippings_dict):
	    """
	    Write one markdown file per author into the "parsed_clippings/" directory.

	    Parameters:
	    - clippings_dict: {author: {title: [clip, ...]}} where every clip is a
	      dict with a "highlight" entry and an optional "note" entry

	    For every author the file "parsed_clippings/<author>.md" is written
	    from scratch (an existing file is overwritten). Underscores in author
	    and title keys are shown as spaces in the headings. Returns None.

	    - TODO: enable adding to existing titles.
	        - would do this like:
	            if line-beginning with "##" == title:
	                append to before next line beginning with "##"
	    - Formatted like:
	        # author

	        ## title

	        > highlight 1

	        note

	        ---

	        > highlight 2

	        ---

	    """

	    parent = pathlib.Path("parsed_clippings/")
	    for author, titles in clippings_dict.items():
	        # exist_ok makes this safe when the directory is already there,
	        # without a separate check-then-create step
	        parent.mkdir(exist_ok=True)
	        path = parent / (author + ".md")
	        # mode 'w' creates the file when it is missing; the explicit encoding
	        # keeps non-ASCII highlights independent of the platform default
	        with path.open('w', encoding="utf-8") as output_file:
	            output_file.write("# " + author.replace("_", " ") + "\n\n")
	            for title, clips in titles.items():
	                output_file.write("## " + title.replace("_", " ") + "\n\n")
	                for clip in clips:
	                    output_file.write("> " + clip['highlight'] + "\n\n")
	                    if clip.get("note") is not None:
	                        output_file.write(clip["note"] + "\n\n")
	                    output_file.write("---\n\n")


	if __name__ == '__main__':
	    # Script entry point: parse the clippings file named on the command line
	    # and write the per-author markdown files.
	    if len(sys.argv) < 2:
	        # exit with a readable message instead of an IndexError traceback
	        sys.exit("usage: parse_clippings.py <path to My Clippings.txt>")
	    clippings_path = sys.argv[1]
	    # utf-8-sig decodes UTF-8 and drops a leading byte-order mark if the file
	    # has one (NOTE(review): Kindle clippings files are assumed to be UTF-8)
	    with open(clippings_path, encoding="utf-8-sig") as clipping_file:
	        clippings = clipping_file.readlines()
	    clippings_dict = clippings_to_dict(clippings)
	    clippings_dict_to_markdown(clippings_dict)

	# first = True
	# start_of_block = True
	# print("<meta charset='utf-8'/>")
	# print("<link rel='stylesheet' href='style.css'>")
	# print("<div class='block'>")
	# for line in sys.stdin:
	# if line == '\n':
	# print("</div>")
	# print("<div class='block'>")
	# start_of_block = True
	# continue
	# if line.startswith('<'):
	# start_of_block = False
	# print(line)
	# continue
	# if start_of_block:
	# print("<div class='direction'>%s</div>" % line.replace('\n', ''))
	# continue
	# else:
	# print('<p>%s</p>' % line.replace('\n', ''))
	# start_of_block = False