Created
December 14, 2020 05:04
-
-
Save AlexApps99/7732e746ac791959a46d0a00fc4d5d6a to your computer and use it in GitHub Desktop.
Generate an Anki deck from a PDF file of Remembering the Kanji 1, 6th edition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# By AlexApps99
# This Python script will generate an Anki deck from a PDF file of Remembering the Kanji 1, 6th edition.
# To use it, run the command "python3 create_rtk_deck.py RTK.pdf RTK.apkg"
# Feel free to customize/modify it however you want; this is public domain code.
# There is unused functionality for getting stories/text too.
# If you have any questions on how to use this, feel free to contact me.
# Dependencies:
# - genanki
# - beautifulsoup4
# - Poppler (for the pdftohtml command)
import unicodedata | |
from bs4 import BeautifulSoup | |
def is_kanji(text):
    """Return True if *text* is a single CJK or Kangxi-radical character.

    The decision is based on the Unicode character name: names starting
    with "CJK " or "KANGXI " count as kanji.

    Anything that is not exactly one named character (empty string, longer
    string, non-string value, or a character without a Unicode name such
    as a control character) yields False instead of raising.
    """
    try:
        name = unicodedata.name(text)
    except (TypeError, ValueError):
        # TypeError: not a single-character string.
        # ValueError: the character has no Unicode name.
        return False
    return name.startswith(("CJK ", "KANGXI "))
# Values of the "font" attribute (as strings) on <text> elements whose
# content parse() accumulates into an entry's "text" field.
# NOTE(review): presumably specific to pdftohtml's output for this PDF
# edition -- confirm if the input changes.
text_fonts = ["6", "7", "11", "12", "16", "25", "28", "29", "33"]

# Values of the "font" attribute on <text> elements whose content parse()
# accumulates into an entry's "annotation" field.
annotation_fonts = ["9", "10", "17", "18", "21", "24", "32"]
# Some kanji are not formatted properly in the source document and are not
# picked up by the font-based detection in parse(); they are filled in by
# hand.  Each pair is (position in the parsed list, kanji character).
_MANUAL_KANJI_FIXES = (
    (128, "\u55c5"),
    (307, "\u55bb"),
    (679, "\u60e7"),
    (1010, "\u41f3"),
    (1393, "\u9699"),
    (2004, "\u540e"),
)


def parse(f):
    """Parse XML markup into a list of kanji entries.

    Every <text> element is classified by its "font" attribute:

    - font "3" with a purely decimal body starts a new entry ("id");
    - font "4" or "26" sets the entry's "keyword";
    - font "5" with a kanji body sets the entry's "kanji";
    - fonts in ``text_fonts`` are appended (space-separated) to "text";
    - fonts in ``annotation_fonts`` are appended to "annotation".

    Elements seen before the first entry number are ignored.

    Args:
        f: Markup (file object or string) handed to BeautifulSoup's "xml"
            parser.

    Returns:
        A list of dicts in document order.  Each dict has an integer "id"
        and, when found, "keyword", "kanji", "text" and "annotation"
        strings.  The list is empty when no entry number was found.
    """
    soup = BeautifulSoup(f, "xml")
    parsed = []
    current = {}
    for text in soup.find_all("text"):
        t = text.get_text().strip()
        # .get() so an element without a font attribute is simply skipped.
        font = text.get("font")

        if font == "3" and t.isdecimal():
            # A new entry number closes the entry collected so far.
            if current:
                parsed.append(current)
            current = {"id": int(t)}
            continue

        if "id" not in current:
            # Nothing can be attached until the first entry has started.
            continue

        if font == "4" or font == "26":
            current["keyword"] = t
        elif font == "5" and is_kanji(t):
            current["kanji"] = t
        elif font in text_fonts:
            current["text"] = current["text"] + " " + t if "text" in current else t
        elif font in annotation_fonts:
            if "annotation" in current:
                current["annotation"] += " " + t
            else:
                current["annotation"] = t

    # Flush the last entry, but never emit an empty placeholder dict.
    if current:
        parsed.append(current)

    # Apply the manual fixes only where the entry exists and still lacks a
    # kanji, so short or partial documents no longer raise IndexError.
    for index, kanji in _MANUAL_KANJI_FIXES:
        if index < len(parsed):
            parsed[index].setdefault("kanji", kanji)
    return parsed
if __name__ == "__main__":
    # Script entry point: convert the PDF to XML with pdftohtml, parse the
    # entries, and write them out as an Anki package.
    import subprocess
    import sys

    import genanki

    if len(sys.argv) != 3:
        print(
            "Usage:", sys.argv[0], "[RTK1 6th edition].pdf [Anki package].apkg",
            file=sys.stderr,
        )
        # Stop here; continuing would fail on the missing arguments.
        sys.exit(1)

    pdf_path, apkg_path = sys.argv[1], sys.argv[2]

    # Convert pages 20-401 of the PDF to a single XML file ("rtk.xml" in the
    # current directory).  check=True aborts if pdftohtml fails.
    subprocess.run(
        ["pdftohtml", "-f", "20", "-l", "401", "-q", "-s", "-i", "-xml", pdf_path, "rtk.xml"],
        check=True,
    )

    # Read as UTF-8 explicitly so the result does not depend on the platform
    # default encoding.
    # NOTE(review): assumes pdftohtml writes UTF-8 -- confirm.
    with open("rtk.xml", encoding="utf-8") as f:
        p = parse(f)

    deck = genanki.Deck(
        1196528216,
        "RTK1",
        "Generated from Remembering the Kanji 1, 6th edition",
    )
    model = genanki.Model(
        1530649655,
        "RTK1",
        fields=[
            {"name": "Index"},
            {"name": "Keyword"},
            {"name": "Kanji"},
        ],
        templates=[{
            "name": "Recall",
            "qfmt": "{{Keyword}}",
            "afmt": "{{FrontSide}}<hr id=\"answer\">{{Kanji}}<br><small>{{Index}}</small>",
        }],
        css="",
    )

    # One note per parsed entry: keyword on the front, kanji and index on
    # the back.
    for e in p:
        deck.add_note(genanki.Note(
            model=model,
            fields=[str(e["id"]), e["keyword"], e["kanji"]],
            sort_field=e["id"],
        ))

    genanki.Package(deck).write_to_file(apkg_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment