Last active
December 21, 2015 04:29
-
-
Save gabalese/6249780 to your computer and use it in GitHub Desktop.
Make a list of every img alt attribute in htmls and print it to stdout
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# file: attributealtfield.py
# List the alt attribute of every img element found in the XHTML documents of an EPUB and print the list to stdout.
# Usage: from the command line, python attributealtfield.py <epubfile>
from __future__ import print_function

import os
import posixpath
import sys
import zipfile as ZIP

try:
    from lxml import etree as ET
except ImportError:
    from xml.etree import ElementTree as ET
# Messages describing each <img> alt text found; appended to by altlist().
lista = []
# Copy of ``lista`` that the script entry point prints line by line.
new_list = []
# Manifest hrefs of the XHTML content documents to scan.
filelist = []
# Directory of the OPF inside the archive; assigned by parseInfo() and used
# to resolve manifest hrefs into archive member names.
root_folder = ""
# Namespace prefixes in "{uri}" form, ready to be prepended to a tag name
# when querying the OPF package document.
namespaces = {
    "opf": "{http://www.idpf.org/2007/opf}",
    "dc": "{http://purl.org/dc/elements/1.1/}",
}
def altlist(infile):
    """Record the alt text of every <img> in one content document of the EPUB.

    The EPUB is the archive named by ``sys.argv[1]``.  ``infile`` is resolved
    against the module-level ``root_folder`` to obtain the archive member to
    read.  For each image that carries an ``alt`` attribute one message is
    appended to the module-level ``lista``; an image without the attribute is
    reported on stdout immediately.

    Args:
        infile: href of the document, relative to ``root_folder``.

    Raises:
        KeyError: if the resolved member is not present in the archive.
    """
    # Zip member names use "/" and never start with one.  posixpath.join
    # leaves the href untouched when root_folder is empty (OPF stored at the
    # archive root) instead of producing an invalid "/href" name.
    member = posixpath.join(root_folder, infile)
    with ZIP.ZipFile(sys.argv[1]) as archive:
        markup = archive.read(member)

    # lxml offers a tolerant HTML parser; the xml.etree fallback has no
    # HTMLParser, so there the document is parsed with the default XML parser.
    parser = ET.HTMLParser() if hasattr(ET, "HTMLParser") else None
    html = ET.fromstring(markup, parser)

    for node in html.iter():
        tag = node.tag
        # Comments and processing instructions expose a non-string tag.
        if not isinstance(tag, str):
            continue
        # Ignore a "{namespace}" prefix, which the XML parser keeps on tags.
        if tag.rpartition("}")[2] != "img":
            continue
        # .get() returns None for a missing attribute; it never raises.
        alt = node.get("alt")
        if alt is None:
            print("img tag in '{}' does not have a alt attribute!".format(infile))
        else:
            # Report the tag name rather than the element's repr, which
            # would embed a memory address in the output.
            lista.append(
                "{} tag in {} contains alt text '{}'".format("img", infile, alt)
            )
def parseInfo(filename):
    """Locate the OPF (and, when declared, the NCX) inside an EPUB archive.

    Reads ``META-INF/container.xml`` to find the OPF path, stores the OPF's
    directory in the module-level ``root_folder``, then looks up the manifest
    item referenced by the spine's ``toc`` attribute.

    Args:
        filename: path of the EPUB (zip) file.

    Returns:
        dict: ``{"path_to_opf": ..., "path_to_ncx": ...}``.  ``path_to_ncx``
        is omitted when the spine has no ``toc`` attribute or it does not
        resolve to a manifest item with an href.  An empty dict is returned
        (after printing a message) when ``META-INF/container.xml`` is
        missing.
    """
    global root_folder
    info = {}
    # A single archive handle, closed on exit, serves both reads.
    with ZIP.ZipFile(filename) as archive:
        try:
            container_xml = archive.read("META-INF/container.xml")
        except KeyError:
            print("The %s file is not a valid OCF." % str(filename))
            return info
        container = ET.fromstring(container_xml)
        # First grandchild of the root: container/rootfiles/rootfile.
        info["path_to_opf"] = container[0][0].get("full-path")
        # Zip member names are "/"-separated on every platform.
        root_folder = posixpath.dirname(info["path_to_opf"])
        opf = ET.fromstring(archive.read(info["path_to_opf"]))

    spine = opf.find("{0}spine".format(namespaces["opf"]))
    toc_id = spine.get("toc") if spine is not None else None
    if toc_id is None:
        # No NCX is declared; the OPF path alone is still usable.
        return info
    toc_item = opf.find(".//*[@id='%s']" % toc_id)
    ncx_href = toc_item.get("href") if toc_item is not None else None
    if ncx_href:
        # posixpath.join avoids a leading "/" when root_folder is empty.
        info["path_to_ncx"] = posixpath.join(root_folder, ncx_href)
    return info
def parseOPF(filename):
    """Return the parsed root element of the EPUB's OPF package document.

    The OPF path is taken from the ``"path_to_opf"`` entry of
    ``parseInfo(filename)``.

    Args:
        filename: path of the EPUB (zip) file.

    Returns:
        The root element produced by ``ET.fromstring`` for the OPF.

    Errors raised while locating or reading the OPF propagate unchanged.
    """
    opf_path = parseInfo(filename)["path_to_opf"]
    # Close the archive as soon as the OPF bytes have been parsed.
    with ZIP.ZipFile(filename) as archive:
        return ET.fromstring(archive.read(opf_path))
if __name__ == "__main__":
    # Script entry point: list the alt text of every image in the EPUB
    # named on the command line.
    if len(sys.argv) != 2:
        print("USAGE: ./attributealtfield.py <epubfile>")
        # Exit non-zero so a calling shell can tell misuse from success.
        sys.exit(1)

    epub_path = sys.argv[1]
    opf = parseOPF(epub_path)

    # Every element declaring the XHTML media type is a content document.
    filelist.extend(
        item.get("href")
        for item in opf.iter()
        if item.get("media-type") == "application/xhtml+xml"
    )

    for href in filelist:
        altlist(href)

    # Keep new_list populated as before, then print one message per line.
    new_list.extend(lista)
    for message in new_list:
        print(message)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment