# Gist ilius/6f67b94cebd7b2991bcb04d8ceb41442, file: appledict-bin-to-slob.py
# Created: November 10, 2023 07:59
# GitHub notice: this file contains hidden or bidirectional Unicode text that
# may be interpreted or compiled differently than what appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
#!/usr/bin/env python
import sys
from collections.abc import Iterator
from os.path import extsep, join, splitext

from pyglossary import slob
from pyglossary.core import cacheDir
from pyglossary.entry import Entry
from pyglossary.glossary_v2 import Glossary
from pyglossary.plugins.aard2_slob import Writer as SlobWriter
from pyglossary.plugins.appledict_bin import KeyData, Reader
class NewEntry(Entry):
    """Entry rebuilt from another entry, with a slot for keyword anchors.

    Only the word list (``l_word``) and the definition (``defi``) of the
    source entry are carried over; every other ``Entry`` constructor
    argument keeps its default.
    """

    def __init__(self, entry):
        """Copy ``entry.l_word`` and ``entry.defi`` into a fresh entry."""
        super().__init__(
            word=entry.l_word,
            defi=entry.defi,
        )
        # Keyword -> anchor mapping. NewReader.__iter__ overwrites this per
        # article; the empty default keeps the attribute always readable.
        self.anchorByKeyword = {}
class NewReader(Reader):
    """AppleDict binary reader whose article entries also carry anchors."""

    def __iter__(self) -> Iterator:
        """Yield resource entries first, then each article as a ``NewEntry``.

        Every yielded ``NewEntry`` gets an ``anchorByKeyword`` dict mapping
        each keyword of the article to its anchor; keywords whose anchor is
        falsy are left out.
        """
        # Resource files: the contents directory, then its "Resources" folder.
        # NOTE(review): both walks are recursive, so the first presumably
        # already visits "Resources" -- confirm whether the second pass is
        # intentional (e.g. to expose the same files under another path).
        yield from self.readResDir(
            self._contentsPath,
            recurse=True,
        )
        yield from self.readResDir(
            join(self._contentsPath, "Resources"),
            recurse=True,
        )

        keyTextFieldOrder = self._properties.key_text_variable_fields
        for entryBytes, articleAddress in self.yieldEntryBytes(
            self._file,
            self._properties,
        ):
            entry = self.createEntry(entryBytes, articleAddress)
            if entry is None:
                continue
            entry = NewEntry(entry)

            # Parse every raw key record stored for this article address.
            keyDataList = [
                KeyData.fromRaw(rawKeyData, keyTextFieldOrder)
                for rawKeyData in self._keyTextData.get(articleAddress, [])
            ]
            # For a repeated keyword the last record with an anchor wins.
            entry.anchorByKeyword = {
                keyData.keyword: keyData.anchor
                for keyData in keyDataList
                if keyData.anchor
            }
            yield entry
# MIME type passed as ``content_type`` for every article blob that addEntry
# hands to the slob writer.
content_type = "text/html; charset=utf-8"

# When true, addEntry prepends glos.wordTitleStr(<headword>) to definitions
# whose detected format is "h" or "m".
word_title: bool = False
def addEntry(entry) -> None:
    """Add one article entry to the module-level slob ``writer``.

    The UTF-8 encoded definition is added once under the headword (first
    item of ``entry.l_word``) and once more for every alternate word, keyed
    as ``"<alt>, <headword>"``. Each key is paired with the anchor stored for
    that word in ``entry.anchorByKeyword``, or ``""`` when there is none.

    Reads the module globals ``word_title``, ``content_type``, ``glos`` and
    ``writer``.
    """
    words = entry.l_word
    b_defi = entry.defi.encode("utf-8")

    entry.detectDefiFormat()
    defiFormat = entry.defiFormat

    if word_title and defiFormat in ("h", "m"):
        # A title is prepended, so a "m" definition is handled as "h" below.
        if defiFormat == "m":
            defiFormat = "h"
        title = glos.wordTitleStr(
            words[0],
        )
        b_defi = title.encode("utf-8") + b_defi

    if defiFormat == "h":
        # Strip the bword:// scheme from quoted link targets (both quote styles).
        b_defi = b_defi.replace(b'"bword://', b'"')
        b_defi = b_defi.replace(b"'bword://", b"'")

    anchorByKeyword = entry.anchorByKeyword
    headword, *alts = words

    writer.add(
        b_defi,
        (headword, anchorByKeyword.get(headword, "")),
        content_type=content_type,
    )
    for alt in alts:
        writer.add(
            b_defi,
            (f"{alt}, {headword}", anchorByKeyword.get(alt, "")),
            content_type=content_type,
        )
def addDataEntry(entry) -> None:
    """Add one resource (data) entry to the module-level slob ``writer``.

    The content type is looked up in ``SlobWriter.resourceMimeTypes`` by the
    lower-cased file extension of ``entry.s_word``. Entries with an unknown
    extension, or whose path cannot be encoded with ``writer.encoding``, are
    reported with ``print`` and skipped.
    """
    rel_path = entry.s_word
    _, ext = splitext(rel_path)
    ext = ext.lstrip(extsep).lower()

    content_type = SlobWriter.resourceMimeTypes.get(ext)
    if not content_type:
        print(f"Aard2 slob: unknown content type for {rel_path!r}")
        return

    content = entry.data
    key = rel_path
    # Check up front that the key can be encoded; the result is discarded.
    try:
        key.encode(writer.encoding)
    except UnicodeEncodeError:
        print(f'Failed to add, broken unicode in key: {key!a}')
        return

    writer.add(content, key, content_type=content_type)
# Driver: convert the AppleDict binary dictionary given as the first argument
# into the Aard2 slob file given as the second argument.
if len(sys.argv) < 3:
    # Fail with a usage message instead of a bare IndexError.
    sys.exit(f"Usage: {sys.argv[0]} INPUT_APPLEDICT OUTPUT_SLOB")

inputFilename = sys.argv[1]
outputFilename = sys.argv[2]

Glossary.init()
glos = Glossary()

reader = NewReader(glos)
# open() is consumed as an iterator; the items it yields are not used here.
for _ in reader.open(inputFilename):
    pass

writer = slob.Writer(
    outputFilename,
    workdir=cacheDir,
    compression="zlib",
    version_info=False,
)
# writer.tag("label", self._glos.getInfo("name") + namePostfix)

# Resource entries and article entries take different paths into the writer.
for entry in reader:
    if entry.isData():
        addDataEntry(entry)
    else:
        addEntry(entry)

writer.finalize()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.