Dump images in Kindle's AZW6 (.azw.res) file (Python3 version of this script: https://www.mobileread.com/forums/showpost.php?p=3114050&postcount=1145)
import argparse
import imghdr  # note: imghdr was removed from the standard library in Python 3.13, so this script targets earlier Python 3 releases
import os
import struct


def get_image_type(imgname, imgdata=None):
    imgtype = imghdr.what(imgname, imgdata)
    # horrible hack since imghdr detects jxr/wdp as tiffs
    if imgtype is not None and imgtype == "tiff":
        imgtype = "wdp"
    # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
    # with only the magic JPEG bytes out there...
    # ImageMagick handles those, so, do it too.
    if imgtype is None:
        if imgdata[0:2] == b"\xFF\xD8":
            # Get last non-null bytes
            last = len(imgdata)
            while imgdata[last - 1 : last] == b"\x00":
                last -= 1
            # Be extra safe, check the trailing bytes, too.
            if imgdata[last - 2 : last] == b"\xFF\xD9":
                imgtype = "jpeg"
    return imgtype
def processCRES(i, data):
    # the image payload starts after the 12-byte CRES record header
    data = data[12:]
    imgtype = get_image_type(None, data)
    if imgtype == "jpeg":
        imgtype = "jpg"
    if imgtype is None:
        print(" Warning: CRES Section %s does not contain a recognised resource" % i)
        imgtype = "dat"
    imgname = "HDimage%05d.%s" % (i, imgtype)
    imgdir = os.path.join(".", "azw6_images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)
    print(" Extracting HD image: {0:s} from section {1:d}".format(imgname, i))
    imgpath = os.path.join(imgdir, imgname)
    with open(imgpath, "wb") as f:
        f.write(data)


# this is just guesswork so far, making the big assumption that
# metavalue key numbers remain the same in the CONT EXTH
def dump_contexth(codec, extheader):
    # determine text encoding
    if extheader == b"":
        return None
    id_map_strings = {
        1: "Drm Server Id (1)",
        2: "Drm Commerce Id (2)",
        3: "Drm Ebookbase Book Id(3)",
        100: "Creator_(100)",
        101: "Publisher_(101)",
        102: "Imprint_(102)",
        103: "Description_(103)",
        104: "ISBN_(104)",
        105: "Subject_(105)",
        106: "Published_(106)",
        107: "Review_(107)",
        108: "Contributor_(108)",
        109: "Rights_(109)",
        110: "SubjectCode_(110)",
        111: "Type_(111)",
        112: "Source_(112)",
        113: "ASIN_(113)",
        114: "versionNumber_(114)",
        117: "Adult_(117)",
        118: "Price_(118)",
        119: "Currency_(119)",
        122: "fixed-layout_(122)",
        123: "book-type_(123)",
        124: "orientation-lock_(124)",
        126: "original-resolution_(126)",
        127: "zero-gutter_(127)",
        128: "zero-margin_(128)",
        129: "K8_Masthead/Cover_Image_(129)",
        132: "RegionMagnification_(132)",
        200: "DictShortName_(200)",
        208: "Watermark_(208)",
        501: "cdeType_(501)",
        502: "last_update_time_(502)",
        503: "Updated_Title_(503)",
        504: "ASIN_(504)",
        508: "Unknown_Title_Furigana?_(508)",
        517: "Unknown_Creator_Furigana?_(517)",
        522: "Unknown_Publisher_Furigana?_(522)",
        524: "Language_(524)",
        525: "primary-writing-mode_(525)",
        526: "Unknown_(526)",
        527: "page-progression-direction_(527)",
        528: "override-kindle_fonts_(528)",
        529: "Unknown_(529)",
        534: "Input_Source_Type_(534)",
        535: "Kindlegen_BuildRev_Number_(535)",
        536: "Container_Info_(536)",  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538: "Container_Resolution_(538)",
        539: "Container_Mimetype_(539)",
        542: "Unknown_but_changes_with_filename_only_(542)",
        543: "Container_id_(543)",  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544: "Unknown_(544)",
    }
    id_map_values = {
        115: "sample_(115)",
        116: "StartOffset_(116)",
        121: "K8(121)_Boundary_Section_(121)",
        125: "K8_Count_of_Resources_Fonts_Images_(125)",
        131: "K8_Unidentified_Count_(131)",
        201: "CoverOffset_(201)",
        202: "ThumbOffset_(202)",
        203: "Fake_Cover_(203)",
        204: "Creator_Software_(204)",
        205: "Creator_Major_Version_(205)",
        206: "Creator_Minor_Version_(206)",
        207: "Creator_Build_Number_(207)",
        401: "Clipping_Limit_(401)",
        402: "Publisher_Limit_(402)",
        404: "Text_to_Speech_Disabled_(404)",
    }
    id_map_hexstrings = {
        209: "Tamper_Proof_Keys_(209_in_hex)",
        300: "Font_Signature_(300_in_hex)",
    }
    _length, num_items = struct.unpack(">LL", extheader[4:12])
    extheader = extheader[12:]
    pos = 0
    # each EXTH record: 4-byte id, 4-byte size (size covers this 8-byte prefix plus the payload)
    for _ in range(num_items):
        id_, size = struct.unpack(">LL", extheader[pos : pos + 8])
        content = extheader[pos + 8 : pos + size]
        if id_ in id_map_strings:
            name = id_map_strings[id_]
            print('\n Key: "%s"\n Value: "%s"' % (name, str(content, codec)))
        elif id_ in id_map_values:
            name = id_map_values[id_]
            if size == 9:
                (value,) = struct.unpack("B", content)
                print('\n Key: "%s"\n Value: 0x%01x' % (name, value))
            elif size == 10:
                (value,) = struct.unpack(">H", content)
                print('\n Key: "%s"\n Value: 0x%02x' % (name, value))
            elif size == 12:
                (value,) = struct.unpack(">L", content)
                print('\n Key: "%s"\n Value: 0x%04x' % (name, value))
            else:
                print("\nError: Value for %s has unexpected size of %s" % (name, size))
        elif id_ in id_map_hexstrings:
            name = id_map_hexstrings[id_]
            print('\n Key: "%s"\n Value: 0x%s' % (name, content.hex()))
        else:
            print("\nWarning: Unknown metadata with id %s found" % id_)
            name = str(id_) + " (hex)"
            print(' Key: "%s"\n Value: 0x%s' % (name, content.hex()))
        pos += size
def sortedHeaderKeys(mheader):
    hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
    return hdrkeys


class dumpHeaderException(Exception):
    pass


class PalmDB:
    # important palmdb header offsets
    unique_id_seed = 68
    number_of_pdb_records = 76
    first_pdb_record = 78

    def __init__(self, palmdata):
        self.data = palmdata
        (self.nsec,) = struct.unpack_from(
            ">H", self.data, PalmDB.number_of_pdb_records
        )

    def getsecaddr(self, secno):
        (secstart,) = struct.unpack_from(
            ">L", self.data, PalmDB.first_pdb_record + secno * 8
        )
        if secno == self.nsec - 1:
            secend = len(self.data)
        else:
            (secend,) = struct.unpack_from(
                ">L", self.data, PalmDB.first_pdb_record + (secno + 1) * 8
            )
        return secstart, secend

    def readsection(self, secno):
        if secno < self.nsec:
            secstart, secend = self.getsecaddr(secno)
            return self.data[secstart:secend]
        return b""  # out-of-range sections yield empty bytes, not an empty str

    def getnumsections(self):
        return self.nsec
class HdrParser:
    cont_header = {
        "magic": (0x00, "4s", 4),
        "record_size": (0x04, ">L", 4),
        "type": (0x08, ">H", 2),
        "count": (0x0A, ">H", 2),
        "codepage": (0x0C, ">L", 4),
        "unknown0": (0x10, ">L", 4),
        "unknown1": (0x14, ">L", 4),
        "num_resc_recs": (0x18, ">L", 4),
        "num_wo_placeholders": (0x1C, ">L", 4),
        "offset_to_hrefs": (0x20, ">L", 4),
        "unknown2": (0x24, ">L", 4),
        "title_offset": (0x28, ">L", 4),
        "title_length": (0x2C, ">L", 4),
    }
    cont_header_sorted_keys = sortedHeaderKeys(cont_header)

    def __init__(self, header, start):
        self.header = header
        self.start = start
        self.hdr = {}
        # set it up for the proper header version
        self.header_sorted_keys = HdrParser.cont_header_sorted_keys
        self.cont_header = HdrParser.cont_header
        # parse the header information
        for key in self.header_sorted_keys:
            pos, format_str, _ = self.cont_header[key]
            if pos < 48:
                (val,) = struct.unpack_from(format_str, self.header, pos)
                self.hdr[key] = val
        self.exth = self.header[48:]
        self.title_offset = self.hdr["title_offset"]
        self.title_length = self.hdr["title_length"]
        self.title = self.header[
            self.title_offset : self.title_offset + self.title_length
        ]
        self.codec = "windows-1252"
        self.codec_map = {
            1252: "windows-1252",
            65001: "utf-8",
        }
        if self.hdr["codepage"] in self.codec_map:
            self.codec = self.codec_map[self.hdr["codepage"]]
        self.title = self.title.decode(self.codec).encode("utf-8")

    def dumpHeaderInfo(self) -> None:
        for key in self.cont_header_sorted_keys:
            pos, _, tot_len = self.cont_header[key]
            if pos < 48:
                if key != "magic":
                    fmt_string = (
                        " Field: %20s Offset: 0x%03x Width: %d Value: 0x%0"
                        + str(tot_len)
                        + "x"
                    )
                else:
                    fmt_string = " Field: %20s Offset: 0x%03x Width: %d Value: %s"
                print(fmt_string % (key, pos, tot_len, self.hdr[key]))
        print("\nEXTH Region Length: 0x%0x" % len(self.exth))
        print("EXTH MetaData: {}".format(self.title.decode("utf-8")))
        dump_contexth(self.codec, self.exth)
def main(infile):
    print("DumpAZW6 v01")
    infileext = os.path.splitext(infile)[1].upper()
    print(infile, infileext)
    if infileext not in [".AZW6", ".RES"]:
        print(
            "Error: first parameter must be a Kindle AZW6 HD container file with extension .azw6 or .res."
        )
    else:
        # make sure it is really an hd container file
        with open(infile, "rb") as fp:
            contdata = fp.read()
        palmheader = contdata[0:78]
        ident = palmheader[0x3C : 0x3C + 8]
        if ident != b"RBINCONT":
            raise dumpHeaderException("invalid file format")
        pp = PalmDB(contdata)
        header = pp.readsection(0)
        print("\nFirst Header Dump from Section %d" % 0)
        hp = HdrParser(header, 0)
        hp.dumpHeaderInfo()
        # now dump a basic sector map of the palmdb
        n = pp.getnumsections()
        dtmap = {
            b"FONT": "FONT",
            b"RESC": "RESC",
            b"CRES": "CRES",
            b"CONT": "CONT",
            b"\xA0\xA0\xA0\xA0": "Empty_Image/Resource_Placeholder",
            b"\xe9\x8e\r\n": "EOF_RECORD",
        }
        dtmap2 = {
            b"kindle:embed": "KINDLE:EMBED",
        }
        hp = None
        print("\nMap of Palm DB Sections")
        print(" Dec - Hex : Description")
        print(" ---- - ---- -----------")
        for i in range(n):
            pp.getsecaddr(i)
            data = pp.readsection(i)
            dlen = len(data)
            dt = data[0:4]
            dtext = data[0:12]
            desc = ""
            if dtext in dtmap2:
                # section holds a "|"-separated list of kindle:embed hrefs
                desc = data.decode("utf-8", "replace")
                linkhrefs = []
                hreflist = desc.split("|")
                for href in hreflist:
                    if href != "":
                        linkhrefs.append(" " + href)
                desc = "\n" + "\n".join(linkhrefs)
            elif dt in dtmap:
                desc = dtmap[dt]
                if dt == b"CONT":
                    desc = "Cont Header"
                elif dt == b"CRES":
                    processCRES(i, data)
            else:
                desc = dtext.hex() + " " + dtext.decode("windows-1252", "replace")
            if desc != "CONT":
                print(" %04d - %04x: %s [%d]" % (i, i, desc, dlen))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Dump the images from an AZW6 HD container file."
    )
    parser.add_argument("infile", type=str, help="azw6 file to dump the images from")
    args = parser.parse_args()
    main(args.infile)
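
A minimal usage sketch, not part of the original gist: the script filename dump_azw6.py and the container name example.azw.res are placeholders. Extracted HD images are written to ./azw6_images/ in the current working directory.

# Usage sketch (hypothetical filenames) -- run the script directly:
#   python3 dump_azw6.py example.azw.res
# or import it as a module and call main() yourself, assuming it was saved
# as dump_azw6.py; the argparse block above is skipped on import.
from dump_azw6 import main

main("example.azw.res")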