Skip to content

Instantly share code, notes, and snippets.

@spewil
Created June 1, 2020 17:26
Show Gist options
  • Save spewil/9bd75e589b93afe9620568fb2bc73119 to your computer and use it in GitHub Desktop.
Save spewil/9bd75e589b93afe9620568fb2bc73119 to your computer and use it in GitHub Desktop.
slightly better Kindle clippings parser
import pathlib
import re
import sys
# TODO
# - check if we're parsing raw clippings into
# - a new author file
# or
# - appending to existing author store
# - use markdown tree so we can add clips to existing books
# - make a version that works for pre-processed notes
def clippings_to_dict(clippings):
    """
    Group the highlights of a Kindle "My Clippings.txt" file by author and title.

    ``clippings`` is the content of the file as a list of lines (for example
    the result of ``readlines()``). Every entry is expected to occupy five
    lines: the "Title (Author)" line, a metadata line, a blank line, the
    clipped text and a ``==========`` separator line.

    The returned dictionary has the following structure:

    dictionary =
        {author: {
            title: [
                {
                    "highlight": "",
                    "note": ""
                },
                {
                    "highlight": ""
                }
            ]
        }
    }

    - ``author`` and ``title`` are lower-cased, stripped of punctuation and
      have their spaces replaced by underscores.
    - ``"highlight"`` and ``"note"`` hold the text lines exactly as they
      appear in ``clippings`` (a trailing newline is kept).
    - ``"note"`` is only present when the entry directly after a highlight is
      a note by the same author.
    - Entries whose author cannot be parsed, and entries that are not
      highlights, are skipped.

    Subtleties :
    - Use Calibre to access your "My Clippings.txt" file
    - Edit your book metadata to work with this parser:
        - Each book needs a title and one author
        - Author names must be 3 words or less including hyphenations, etc
        - Currently supports only a single author per title
        - Authors with multiple titles must be formatted identically
    """

    def slugify(text):
        # remove anything non-space, non-alphanumeric
        text = re.sub(r'([^ \w])+', '', text)
        # replace any number of spaces with single underscore
        return re.sub(r'( )+', '_', text).lower()

    def parse_title(title_line):
        # Everything before the first " (" is taken as the title.
        title_matches = re.findall(r"^(.*?) \(", title_line)
        if not title_matches:
            return None
        return slugify(title_matches[0])

    def parse_author(author_line):
        # up to three author names only
        # TODO: make this work for multiple author pattern
        author_matches = re.findall(r"\(((?:.\w*){,3})\)", author_line)
        if not author_matches:
            return None
        raw_name = author_matches[0]
        # the author name must be 3 words or less
        if len(raw_name.split(' ')) > 3:
            return None
        # the author name must have at least 2 capital letters
        if sum(1 for c in raw_name if c.isupper()) < 2:
            return None
        # the author name must be longer than 4 characters
        if len(raw_name) <= 4:
            return None
        return slugify(raw_name)

    clippings_dict = {}
    clipping_sep = "=========="
    for i, line in enumerate(clippings):
        if clipping_sep not in line:
            continue
        if i < 4:
            # Fewer than four lines precede this separator, so there is no
            # complete entry here; negative indices would silently wrap
            # around and read lines from the end of the list.
            continue
        title_author_line = clippings[i - 4]
        metadata_line = clippings[i - 3]
        clipping_line = clippings[i - 1]
        # look ahead to check for a note; the next entry is only used when
        # all of its lines up to the text line exist
        if i + 4 < len(clippings):
            next_title_author_line = clippings[i + 1]
            next_metadata_line = clippings[i + 2]
            next_clipping_line = clippings[i + 4]
        else:  # end of file
            next_title_author_line = ""
            next_metadata_line = ""
            next_clipping_line = ""
        author = parse_author(title_author_line)
        if not author or "Highlight" not in metadata_line:
            continue
        next_author = parse_author(next_title_author_line)
        title = parse_title(title_author_line)
        clip = dict(highlight=clipping_line)
        if "Note" in next_metadata_line and author == next_author:
            clip["note"] = next_clipping_line
        # create the author dict / title list if new, then add the clipping
        clippings_dict.setdefault(author, {}).setdefault(title, []).append(clip)
    return clippings_dict
def clippings_dict_to_markdown(clippings_dict):
    """
    Write one Markdown file per author into the "parsed_clippings/" directory.

    ``clippings_dict`` maps author -> title -> list of clip dictionaries,
    each with a "highlight" entry and an optional "note" entry. For every
    author the file "parsed_clippings/<author>.md" is (re)written from
    scratch as UTF-8; underscores in author and title are shown as spaces.
    The directory is created on demand, relative to the current working
    directory. Nothing is returned.

    - Formatted like:
        # author
        ## title
        > highlight 1
        note
        ---
        > highlight 2
        ---
    """
    # TODO: enable adding to existing titles instead of overwriting the file.
    #   - read the existing file and collect its "# author" / "## title" lines
    #   - if a line beginning with "##" equals the title, insert the new
    #     clips before the next line beginning with "##"
    parent = pathlib.Path("parsed_clippings/")
    for author, titles in clippings_dict.items():
        # exist_ok avoids a separate exists() check (and the race it implies)
        parent.mkdir(exist_ok=True)
        path = parent / (author + ".md")
        # Mode 'w' creates the file when missing, so no separate "touch" is
        # needed. The encoding is explicit because highlights may contain
        # characters the platform's default codec cannot encode.
        with path.open('w', encoding="utf-8") as output_file:
            output_file.write("# " + author.replace("_", " ") + "\n\n")
            for title, clips in titles.items():
                output_file.write("## " + title.replace("_", " ") + "\n\n")
                for clip in clips:
                    output_file.write("> " + clip['highlight'] + "\n\n")
                    if clip.get("note") is not None:
                        output_file.write(clip["note"] + "\n\n")
                    output_file.write("---\n\n")
if __name__ == '__main__':
    # Script entry point: parse the clippings file named by the first
    # command-line argument and write one Markdown file per author.
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: " + sys.argv[0] + " <path to My Clippings.txt>")
    clippings_path = sys.argv[1]
    # "utf-8-sig" reads UTF-8 with or without a leading byte-order mark.
    # NOTE(review): Kindle clippings files are presumably UTF-8 with a BOM;
    # confirm against a real export.
    with open(clippings_path, encoding="utf-8-sig") as clipping_file:
        clippings = clipping_file.readlines()
    clippings_dict = clippings_to_dict(clippings)
    clippings_dict_to_markdown(clippings_dict)
# first = True
# start_of_block = True
# print("<meta charset='utf-8'/>")
# print("<link rel='stylesheet' href='style.css'>")
# print("<div class='block'>")
# for line in sys.stdin:
# if line == '\n':
# print("</div>")
# print("<div class='block'>")
# start_of_block = True
# continue
# if line.startswith('<'):
# start_of_block = False
# print(line)
# continue
# if start_of_block:
# print("<div class='direction'>%s</div>" % line.replace('\n', ''))
# continue
# else:
# print('<p>%s</p>' % line.replace('\n', ''))
# start_of_block = False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment