Last active
April 11, 2021 20:05
-
-
Save kanjieater/ba3aa450ba7d6e7c5dc4c53077b6134f to your computer and use it in GitHub Desktop.
Kindle Highlights to Anki Exporter for Japanese
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Kindle Highlights to Anki exporter for Japanese clippings files."""
import locale
import re
from collections import namedtuple
from datetime import datetime

from bs4 import BeautifulSoup

from anki.notes import Note
from aqt import mw
from aqt.qt import QAction
from aqt.utils import getFile, showInfo, showText

# The Japanese locale is required so that strptime's %A directive can parse
# the Japanese weekday names embedded in Kindle clipping timestamps
# (see parse_clipping_added).  Raises locale.Error if ja_JP is unavailable.
locale.setlocale(locale.LC_ALL, 'ja_JP')
def main():
    """Add an 'Import Kindle highlights...' entry to Anki's Tools menu."""
    menu_action = QAction('Import Kindle highlights...', mw)
    menu_action.triggered.connect(import_highlights)
    mw.form.menuTools.addAction(menu_action)
def import_highlights():
    """Prompt for a Kindle clippings file and import new highlights as notes.

    Accepts either a plain-text 'My Clippings.txt' or an exported HTML
    clippings file.  Creates one note per highlight added since the last
    import and stores the newest highlight timestamp in the add-on config
    so subsequent imports skip already-imported entries.

    Raises:
        RuntimeError: if the chosen file has neither a .txt nor .html suffix.
    """
    path = getFile(mw, 'Open Kindle clippings', cb=None, filter='Clippings file (*.txt *.html)', key='KindleHighlights')

    with open(path, encoding='utf-8') as file:
        lower_path = path.lower()
        if lower_path.endswith('txt'):
            clippings, bad_clippings = parse_text_clippings(file)
        elif lower_path.endswith('html'):
            clippings, bad_clippings = parse_html_clippings(file)
        else:
            raise RuntimeError(f'Unknown extension in path: {path!r}')

    if bad_clippings:
        showText(
            f'The following {len(bad_clippings)} clippings could not be parsed:\n\n' +
            '\n==========\n'.join(bad_clippings))

    config = mw.addonManager.getConfig(__name__)

    highlight_clippings = list(highlights_only(clippings))
    clippings_to_add = after_last_added(highlight_clippings, last_added_datetime(config))

    model = mw.col.models.byName(config['model_name'])
    last_added = None

    for clipping in clippings_to_add:
        note = Note(mw.col, model)
        # Tag each note with the source document so imports stay traceable.
        note.addTag(clipping.document)
        note.fields = list(fields(clipping, model, config))
        mw.col.addNote(note)
        if clipping.added:
            last_added = clipping.added
        # BUG FIX: the original read `note.flush` — a bare attribute access
        # that never called the method, so field changes were not persisted.
        note.flush()

    if last_added:
        # Persist the newest timestamp so the next run imports only newer ones.
        config['last_added'] = parse_clipping_added(last_added).isoformat()
        mw.addonManager.writeConfig(__name__, config)

    def info():
        # Yield the human-readable pieces of the import summary.
        if clippings_to_add:
            yield f'{len(clippings_to_add)} new highlights imported'
        num_old_highlights = len(highlight_clippings) - len(clippings_to_add)
        if num_old_highlights:
            yield f'{num_old_highlights} old highlights ignored'
        num_not_highlights = len(clippings) - len(highlight_clippings)
        if num_not_highlights:
            yield f'{num_not_highlights} non-highlight clippings ignored'

    info_strings = list(info())
    if info_strings:
        showInfo(', '.join(info_strings) + '.')
    elif bad_clippings:
        showInfo('No other clippings found.')
    else:
        showInfo('No clippings found.')
Clipping = namedtuple('Clipping', ('kind', 'document', 'page', 'location', 'added', 'content')) | |
def parse_text_clippings(file):
    """Split a 'My Clippings.txt' stream into parsed and unparsable entries.

    Entries are separated by '==========' lines.  Returns a pair
    (clippings, bad_clippings): Clipping tuples for entries that match
    CLIPPING_PATTERN, and the raw text of entries that do not.
    """
    parsed = []
    unparsable = []
    pending = []

    for line in file:
        if line != '==========\n':
            pending.append(line)
            continue
        # Separator reached: the accumulated lines form one entry.
        entry = ''.join(pending)
        pending = []
        clipping = parse_text_clipping(entry)
        if clipping is None:
            unparsable.append(entry)
        else:
            parsed.append(clipping)

    # Trailing lines with no closing separator cannot be a valid entry.
    if pending:
        unparsable.append(''.join(pending))

    return parsed, unparsable
def parse_text_clipping(string):
    """Parse one clippings-file entry; return a Clipping, or None on no match."""
    match = re.fullmatch(CLIPPING_PATTERN, string)
    return Clipping(**match.groupdict()) if match else None
# Pattern for entries in an English-locale 'My Clippings.txt'.
# NOTE: the original code assigned this to CLIPPING_PATTERN and then
# immediately overwrote it with the Japanese pattern below, so it was dead
# code; it is kept under its own name for reference / future use.
CLIPPING_PATTERN_EN = r'''\ufeff?(?P<document>.*)
- Your (?P<kind>.*) on (?:page (?P<page>.*) \| )?(?:Location (?P<location>.*) \| )?Added on (?P<added>.*)
(?P<content>.*)
?'''

# Pattern actually used: entries written by a Japanese-locale Kindle, e.g.
# "- 3ページ|位置No. 100のハイライト |作成日: 2021年1月1日金曜日 12:00:00".
CLIPPING_PATTERN = r'''\ufeff?(?P<document>.*)
- (?P<page>.*)?ページ\|位置No\. (?P<location>.*)?の(?:(?P<kind>.*) \|)?作成日: (?P<added>.*)
(?P<content>.*)
?'''
def parse_html_clippings(file):
    """Parse an exported Kindle HTML clippings file.

    Walks the flat sequence of styled paragraphs, tracking the current book
    title/authors/section and the most recent note heading, and emits a
    Clipping for every 'noteText' paragraph.  Returns a pair
    (clippings, bad_clippings) where bad_clippings holds the raw text of
    headings or notes that could not be interpreted.
    """
    clippings = []
    bad_clippings = []

    soup = BeautifulSoup(file, 'html.parser')

    title = None
    authors = None
    section = None
    kind = None
    subsection = None
    location = None

    for paragraph in soup.find_all(class_=True):
        classes = paragraph['class']
        text = paragraph.get_text().strip()

        if 'bookTitle' in classes:
            title = text
        if 'authors' in classes:
            authors = text
        if 'sectionHeading' in classes:
            section = text

        if 'noteHeading' in classes:
            match = re.fullmatch(NOTE_HEADING_PATTERN, text)
            if not match:
                # Unrecognized heading: reset state so following noteText
                # paragraphs are reported rather than mis-attributed.
                bad_clippings.append(text)
                kind = None
                location = None
                subsection = None
            else:
                kind = match['kind'].strip()
                location = match['location'].strip()
                if match['subsection']:
                    subsection = match['subsection'].strip()
                else:
                    subsection = None

        if 'noteText' in classes:
            content = text
        else:
            # Only noteText paragraphs produce clippings.
            continue

        if not kind or not location:
            bad_clippings.append(text)
            continue

        if title and authors:
            document = f'{title} ({authors})'
        elif title:
            document = title
        elif authors:
            document = authors
        else:
            # BUG FIX: the original left `document` unbound on this path,
            # raising NameError for files lacking bookTitle/authors
            # paragraphs before the first note.
            document = ''

        if section:
            document += ' ' + section + ','
        if subsection:
            document += ' ' + subsection + ','

        clippings.append(Clipping(
            kind=kind,
            document=document,
            page=None,       # HTML exports carry no page numbers
            location=location,
            added=None,      # ...and no timestamps
            content=content,
        ))

    return clippings, bad_clippings
NOTE_HEADING_PATTERN = r'(?P<kind>.*)\s*-\s*(?:(?P<subsection>.*)\s*>\s*)?Location\s*(?P<location>.*)' | |
def after_last_added(clippings, last_added):
    """Return only the clippings added strictly after `last_added`.

    Scans from the end (newest entries last in the file) and stops at the
    first clipping whose parsed timestamp is on or before `last_added`;
    the result preserves the input order.  With a falsy `last_added`, the
    input is returned unchanged.
    """
    if not last_added:
        return clippings

    newest_first = []
    for clipping in reversed(clippings):
        if clipping.added:
            clipping_added = parse_clipping_added(clipping.added)
            if clipping_added and clipping_added <= last_added:
                # Everything older than this point was already imported.
                break
        newest_first.append(clipping)

    newest_first.reverse()
    return newest_first
def parse_clipping_added(clipping_added):
    """Parse a Kindle Japanese 'added' timestamp string into a datetime.

    The %A directive parses the weekday name under the current locale
    (ja_JP, set at module load).
    """
    added_format = '%Y年%m月%d日%A %H:%M:%S'
    return datetime.strptime(clipping_added, added_format)
def last_added_datetime(config):
    """Return the stored 'last_added' config value as a datetime, or None."""
    stored = config['last_added']
    if not stored:
        return None
    return datetime.strptime(stored, '%Y-%m-%dT%H:%M:%S')
def highlights_only(clippings):
    """Yield only the clippings whose kind marks them as highlights (ハイライト)."""
    return (c for c in clippings if 'ハイライト' in c.kind.lower())
def fields(clipping, model, config):
    """Yield one value per model field: the content, the source, or blank.

    The content field receives the stripped highlight text; the source field
    receives the page and timestamp (when present); every other field is
    empty.  Raises ValueError once exhausted if the model is missing the
    configured content or source field.
    """
    content_found = False
    source_found = False

    for field_name in mw.col.models.fieldNames(model):
        if field_name == config['content_field']:
            content_found = True
            yield clipping.content.strip()
        elif field_name == config['source_field']:
            source_found = True
            page_part = 'ページ' + clipping.page if clipping.page is not None else ''
            added_part = ' ' + clipping.added if clipping.added is not None else ''
            yield '{page}{added}'.format(page=page_part, added=added_part)
        else:
            yield ''

    if not (content_found and source_found):
        raise ValueError('Could not find content and/or source fields in model.')
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I have listed out the various methods I've tried in this tweet:
https://twitter.com/kanjieater/status/1348044229159825411