Created
June 1, 2020 17:26
-
-
Save spewil/9bd75e589b93afe9620568fb2bc73119 to your computer and use it in GitHub Desktop.
slightly better Kindle clippings parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pathlib | |
import re | |
import sys | |
# TODO | |
# - check if we're parsing raw clippings into | |
# - a new author file | |
# or | |
# - appending to existing author store | |
# - use markdown tree so we can add clips to existing books | |
# - make a version that works for pre-processed notes | |
def clippings_to_dict(clippings):
    """
    Takes the lines of a Kindle "My Clippings.txt" file and puts the contents
    into a dictionary with the following structure:

    dictionary =
        {author: {
            title: [
                {
                    "highlight": "",
                    "note": ""
                },
                {
                    "highlight": "",
                    "note": ""
                }
            ]
        }
    }

    Args:
        clippings: list of lines (e.g. from ``file.readlines()``). Every
            entry is expected to span five lines: the "Title (Author)" line,
            a metadata line, one more line that is ignored, the clipped
            text, and a line containing ``==========``.

    Returns:
        The nested dictionary described above. Author and title keys are
        lowercased, stripped of punctuation, and use underscores instead of
        spaces. The title key is None when no text followed by " (" is found
        on the title line. "highlight" and "note" hold the raw input lines
        unchanged; "note" is only present when the entry directly after a
        highlight is a note for the same author and title.

    Subtleties :
        - Use Calibre to access your "My Clippings.txt" file
        - Edit your book metadata to work with this parser:
            - Each book needs a title and one author
            - Author names must be 3 words or less including hyphenations, etc
            - Currently supports only a single author per title
            - Authors with multiple titles must be formatted identically
    """

    def parse_title(title_line):
        """Return the normalised title from a "Title (Author)" line, or None."""
        # the title is everything before the first " (" on the line
        title_matches = re.findall(r"^(.*?) \(", title_line)
        if not title_matches:
            return None
        # remove anything non-space, non-alphanumeric
        title = re.sub(r'([^ \w])+', '', title_matches[0])
        # replace any number of spaces with single underscore
        return re.sub(r'( )+', '_', title).lower()

    def parse_author(author_line):
        """Return the normalised author from a "Title (Author)" line, or None."""
        # up to three author names only
        # TODO: make this work for multiple author pattern
        author_matches = re.findall(r"\(((?:.\w*){,3})\)", author_line)
        if not author_matches:
            return None
        raw_name = author_matches[0]
        # the author name must be 3 words or less
        if len(raw_name.split(' ')) > 3:
            return None
        # the author name must have at least 2 capital letters
        if sum(1 for c in raw_name if c.isupper()) < 2:
            return None
        # the author name must be longer than 4 characters
        if len(raw_name) <= 4:
            return None
        # remove anything non-space, non-alphanumeric
        author = re.sub(r'([^ \w])+', '', raw_name)
        # replace any number of spaces with single underscore
        return re.sub(r'( )+', '_', author).lower()

    clippings_dict = {}
    clipping_sep = "=========="
    for i, line in enumerate(clippings):
        if clipping_sep not in line:
            continue
        if i < 4:
            # Fewer than four lines precede this separator, so there is no
            # complete entry; negative indices would silently wrap around
            # to the end of the file.
            continue
        title_author_line = clippings[i - 4]
        metadata_line = clippings[i - 3]
        clipping_line = clippings[i - 1]
        if "Highlight" not in metadata_line:
            continue
        author = parse_author(title_author_line)
        if not author:
            continue
        title = parse_title(title_author_line)
        clip = dict(highlight=clipping_line)
        # look ahead to check for a note
        try:
            next_title_author_line = clippings[i + 1]
            next_metadata_line = clippings[i + 2]
            next_clipping_line = clippings[i + 4]
        except IndexError:  # end of file, nothing follows this entry
            pass
        else:
            # Only attach the note when it belongs to the same book; the
            # next entry's own title/author line is what gets parsed here.
            if ("Note" in next_metadata_line
                    and author == parse_author(next_title_author_line)
                    and title == parse_title(next_title_author_line)):
                clip["note"] = next_clipping_line
        # create author dict / title list if new, then add the clipping
        clippings_dict.setdefault(author, {}).setdefault(title, []).append(clip)
    return clippings_dict
def clippings_dict_to_markdown(clippings_dict):
    """
    Write one Markdown file per author into the "parsed_clippings/" directory
    (relative to the current working directory).

    Each file is named "<author>.md" and is overwritten if it already exists.
    The directory is created on demand; nothing is created when
    ``clippings_dict`` is empty.

    Args:
        clippings_dict: mapping of the form
            {author: {title: [{"highlight": str, "note": str}, ...]}}
            where the "note" key is optional. Underscores in author and
            title keys are turned back into spaces for the headings. A
            title key of None is written under the heading "untitled".

    Returns:
        None.

    - TODO: enable adding to existing titles.
        - would do this like:
            if line-beginning with "##" == title:
                append to before next line beginning with "##"
    - Formatted like:
        # author
        ## title
        > highlight 1
        note
        ---
        > highlight 2
        ---
    """
    if not clippings_dict:
        return
    parent = pathlib.Path("parsed_clippings/")
    # exist_ok avoids a check-then-create race and makes reruns harmless
    parent.mkdir(parents=True, exist_ok=True)
    for author, titles in clippings_dict.items():
        path = parent / (author + ".md")
        # Explicit UTF-8 so non-ASCII highlight text does not depend on the
        # platform's default encoding. Mode 'w' creates the file if needed.
        with path.open('w', encoding="utf-8") as output_file:
            output_file.write("# " + author.replace("_", " ") + "\n\n")
            for title, clips in titles.items():
                heading = "untitled" if title is None else title.replace("_", " ")
                output_file.write("## " + heading + "\n\n")
                for clip in clips:
                    output_file.write("> " + clip['highlight'] + "\n\n")
                    if clip.get("note") is not None:
                        output_file.write(clip["note"] + "\n\n")
                    output_file.write("---\n\n")
if __name__ == '__main__':
    # Command-line entry point: parse the clippings file given as the first
    # argument and write one Markdown file per author.
    if len(sys.argv) < 2:
        # exit with a usage hint instead of an IndexError traceback
        sys.exit("usage: " + sys.argv[0] + " <path to My Clippings.txt>")
    clippings_path = sys.argv[1]
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if one is
    # present, instead of relying on the platform's default encoding.
    with open(clippings_path, encoding="utf-8-sig") as clipping_file:
        clippings = clipping_file.readlines()
    clippings_dict = clippings_to_dict(clippings)
    clippings_dict_to_markdown(clippings_dict)
# first = True | |
# start_of_block = True | |
# print("<meta charset='utf-8'/>") | |
# print("<link rel='stylesheet' href='style.css'>") | |
# print("<div class='block'>") | |
# for line in sys.stdin: | |
# if line == '\n': | |
# print("</div>") | |
# print("<div class='block'>") | |
# start_of_block = True | |
# continue | |
# if line.startswith('<'): | |
# start_of_block = False | |
# print(line) | |
# continue | |
# if start_of_block: | |
# print("<div class='direction'>%s</div>" % line.replace('\n', '')) | |
# continue | |
# else: | |
# print('<p>%s</p>' % line.replace('\n', '')) | |
# start_of_block = False |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment