#!/usr/bin/env python3
from lxml import html
import requests
from bs4 import BeautifulSoup
import sys
import itertools
import genanki
import glob
import shutil
import os.path
import re
import hashlib
import struct

def writeCsvLine(f, t1, t2):
    f.write("\"{}\",\"{}\"\n".format(t1, t2))

origin_language = sys.argv[1].upper()
target_language = sys.argv[2].upper()
url = "https://www.goethe-verlag.com/book2"
target_url = f"{url}/{origin_language}/{origin_language}{target_language}/{origin_language}{target_language}"

def pad_number(n):
    if n < 10:
        return "00" + str(n)
    elif n < 100:
        return "0" + str(n)
    else:
        return str(n)

my_model = genanki.Model(
    1091735104,
    "50Languages_Import",
    fields=[
        {"name": "L2"},
        {"name": "L1"},
        {"name": "Audio"},  # audio field added alongside the two text fields
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "{{L2}}",  # front of the card shows the target-language sentence
            "afmt": '{{FrontSide}}<hr id="answer">{{L1}}<br>{{Audio}}',
        },
    ],
    css=""".card {
    font-family: arial;
    font-size: 20px;
    text-align: center;
    color: black;
    background-color: white;
}
.card1 { background-color: #FFFFFF; }
.card2 { background-color: #FFFFFF; }""",
)

# unique id for deck based on language pair and type -- probably overkill
h = hashlib.sha256()
h.update(origin_language.encode())
h.update(target_language.encode())
h.update(b'sentences')
my_deck = genanki.Deck(
    struct.unpack("<L", h.digest()[0:4])[0],
    f"Book2 {origin_language}-{target_language} (sentences)",
)

MIN_LESSON = 3    # 2 is the index page
MAX_LESSON = 102  # 103 is the youtube video
history = {}

f = open(origin_language + target_language + ".csv", "wt")
writeCsvLine(f, "Target language", "Native Language")

for i in range(MIN_LESSON, MAX_LESSON + 1):
    r = requests.get(f"{target_url}{pad_number(i)}.HTM")  # no slash unlike vocab scraping
    soup = BeautifulSoup(r.content, "html.parser")
    # header
    header_l1_sentences = [t.text for t in soup.find_all("span", {"class": "Stil36"})]
    header_l2_sentences = [t.text for t in soup.find_all("span", {"class": "Stil46"})]
    l2_audio = [t.find_all("source")[0]["src"] for t in soup.find_all("audio")]
    body_l1_sentences = [t.text.strip() for t in soup.find_all("div", {"class": "Stil35"})][:18]  # last element is some text about Alzheimer
    body_l2_sentences = [t.text.strip().split('\r\n\n')[1] for t in soup.find_all("div", {"class": "Stil45"})]
    l1_sentences = header_l1_sentences + body_l1_sentences
    l2_sentences = header_l2_sentences + body_l2_sentences
    for l1_s, l2_s, m in zip(l1_sentences, l2_sentences, l2_audio):
        l1_s = l1_s.strip()
        l2_s = l2_s.strip()
        # patch numbers -- hopefully this is sufficient
        if re.match(r"\d{1,} ?\[", l2_s):
            l2_s = l2_s.split('[', 1)[1].split(']')[0]
            l1_s = l1_s.split('[', 1)[1].split(']')[0]
        # avoid duplicates
        if l2_s in history:
            continue
        history[l2_s] = 1
        writeCsvLine(f, l2_s, l1_s)
        filename = f"sentence_{origin_language}{target_language}_" + m.split("/")[-1]
        if not os.path.isfile(filename):
            dl_file = requests.get(m, stream=True)
            print(m)
            with open(filename, "wb") as out_file:
                shutil.copyfileobj(dl_file.raw, out_file)
        my_note = genanki.Note(
            model=my_model, fields=[l2_s, l1_s, f"[sound:{filename}]"]
        )
        my_deck.add_note(my_note)
f.close()

my_package = genanki.Package(my_deck)
my_package.media_files = [m for m in glob.glob(f"sentence_{origin_language}{target_language}_*.mp3")]
my_package.write_to_file(f"book2_{origin_language}{target_language}_sentences.apkg")
If you need additional HTTP parameters, you can pass them after the URL in the requests.get call. You do need to provide the two-character language codes for the origin and target languages, which are used to generate a path (e.g. https://www.goethe-verlag.com/book2/EM/EMJA/EMJA003.HTM). Admittedly, I haven't touched this since May; it was mostly written because I don't like production decks or isolated vocabulary decks.
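As a minimal sketch of both points (the script filename book2_sentences.py and the example parameter/header values below are my own placeholders, not anything the site is known to require): the two language codes are positional arguments, and extra HTTP parameters are passed to requests.get as keyword arguments rather than spliced into the URL f-string.

# Invocation with origin/target codes, e.g. EM and JA as in the example URL above:
#   python3 book2_sentences.py em ja
#
# Hypothetical tweak inside the script: extra query parameters or headers
# go into requests.get itself, not into target_url's curly braces.
r = requests.get(
    f"{target_url}{pad_number(i)}.HTM",
    params={"example": "value"},           # placeholder: appended as ?example=value
    headers={"User-Agent": "Mozilla/5.0"},  # placeholder header, only if needed
)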
Great script! Do I just put the HTTP request parameters between target_url's curly braces and run it?