@bemitc
Last active February 18, 2022 21:23
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import sys
import genanki
import glob
import shutil
import os.path
import re
import hashlib
import struct


def writeCsvLine(f, t1, t2):
    # write one quoted "target","native" row to the CSV export
    f.write("\"{}\",\"{}\"\n".format(t1, t2))
# two-character book2 language codes passed on the command line, e.g. "em" and "ja"
origin_language = sys.argv[1].upper()
target_language = sys.argv[2].upper()
url = "https://www.goethe-verlag.com/book2"
target_url = f"{url}/{origin_language}/{origin_language}{target_language}/{origin_language}{target_language}"


def pad_number(n):
    # zero-pad the lesson number to three digits, as used in the page filenames
    if n < 10:
        return "00" + str(n)
    elif n < 100:
        return "0" + str(n)
    else:
        return str(n)
my_model = genanki.Model(
    1091735104,
    "50Languages_Import",
    fields=[
        {"name": "L2"},
        {"name": "L1"},
        {"name": "Audio"},  # holds the [sound:...] reference for the downloaded mp3
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "{{L2}}",  # front of the card: target-language sentence only
            "afmt": '{{FrontSide}}<hr id="answer">{{L1}}<br>{{Audio}}',
        },
    ],
    css=""".card {
    font-family: arial;
    font-size: 20px;
    text-align: center;
    color: black;
    background-color: white;
}
.card1 { background-color: #FFFFFF; }
.card2 { background-color: #FFFFFF; }""",
)
# unique id for deck based on language pair and type -- probably overkill
h = hashlib.sha256()
h.update(origin_language.encode())
h.update(target_language.encode())
h.update(b'sentences')

my_deck = genanki.Deck(
    struct.unpack("<L", h.digest()[0:4])[0],
    f"Book2 {origin_language}-{target_language} (sentences)",
)

MIN_LESSON = 3    # 2 is the index page
MAX_LESSON = 102  # 103 is the youtube video
history = {}

f = open(origin_language + target_language + ".csv", "wt")
writeCsvLine(f, "Target language", "Native Language")
for i in range(MIN_LESSON, MAX_LESSON + 1):
    r = requests.get(f"{target_url}{pad_number(i)}.HTM")  # no slash unlike vocab scraping
    soup = BeautifulSoup(r.content, "html.parser")
    # header sentences at the top of each lesson page
    header_l1_sentences = [t.text for t in soup.find_all("span", {"class": "Stil36"})]
    header_l2_sentences = [t.text for t in soup.find_all("span", {"class": "Stil46"})]
    l2_audio = [t.find_all("source")[0]["src"] for t in soup.find_all("audio")]
    body_l1_sentences = [t.text.strip() for t in soup.find_all("div", {"class": "Stil35"})][:18]  # last element is some text about Alzheimer
    body_l2_sentences = [t.text.strip().split('\r\n\n')[1] for t in soup.find_all("div", {"class": "Stil45"})]
    l1_sentences = header_l1_sentences + body_l1_sentences
    l2_sentences = header_l2_sentences + body_l2_sentences
    for l1_s, l2_s, m in zip(l1_sentences, l2_sentences, l2_audio):
        l1_s = l1_s.strip()
        l2_s = l2_s.strip()
        # patch numbers -- hopefully this is sufficient
        if re.match(r"\d{1,} ?\[", l2_s):
            l2_s = l2_s.split('[', 1)[1].split(']')[0]
            l1_s = l1_s.split('[', 1)[1].split(']')[0]
        # avoid duplicates
        if l2_s in history:
            continue
        history[l2_s] = 1
        writeCsvLine(f, l2_s, l1_s)
        # download the audio file once and reference it from the note
        filename = f"sentence_{origin_language}{target_language}_" + m.split("/")[-1]
        if not os.path.isfile(filename):
            dl_file = requests.get(m, stream=True)
            print(m)
            with open(filename, "wb") as out_file:
                shutil.copyfileobj(dl_file.raw, out_file)
        my_note = genanki.Note(
            model=my_model, fields=[l2_s, l1_s, f"[sound:{filename}]"]
        )
        my_deck.add_note(my_note)

f.close()

my_package = genanki.Package(my_deck)
my_package.media_files = glob.glob(f"sentence_{origin_language}{target_language}_*.mp3")
my_package.write_to_file(f"book2_{origin_language}{target_language}_sentences.apkg")
bemitc commented Oct 12, 2021

If you need additional HTTP parameters, you can add them after the URL in the requests.get call. You do need to provide the two-character language codes for the origin and target languages, which are used to generate a path (e.g. https://www.goethe-verlag.com/book2/EM/EMJA/EMJA003.HTM). Admittedly, I haven't touched this since May; it was mostly written because I don't like production decks or isolated vocabulary decks.
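
As a quick sketch of that path construction (the filename book2_sentences.py below is hypothetical; the gist doesn't name the script), invoking it for English (EM) to Japanese (JA) builds the lesson URLs like so:

# run as, e.g.: python3 book2_sentences.py em ja   (hypothetical filename)
origin_language, target_language = "EM", "JA"
url = "https://www.goethe-verlag.com/book2"
target_url = f"{url}/{origin_language}/{origin_language}{target_language}/{origin_language}{target_language}"
print(target_url + "003.HTM")
# -> https://www.goethe-verlag.com/book2/EM/EMJA/EMJA003.HTM (lesson 3, the first real lesson)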
