Skip to content

Instantly share code, notes, and snippets.

@AliMirlou
Created January 14, 2019 19:33
Show Gist options
  • Save AliMirlou/d9cf60a201994adba6c4e0c101ffd6fc to your computer and use it in GitHub Desktop.
Save AliMirlou/d9cf60a201994adba6c4e0c101ffd6fc to your computer and use it in GitHub Desktop.
A program that extracts the ISBN from an EPUB document
import argparse
import re
from zipfile import ZipFile
def get_id_by_tag(text: str) -> str:
    """Return the text of the element named by the ``unique-identifier`` attribute.

    The first ``unique-identifier`` attribute found in *text* supplies an id
    value; the text content of the first element whose ``id`` attribute equals
    that value is the result.

    Args:
        text: Contents of the package (.opf) document.

    Returns:
        The identifier text with surrounding whitespace removed.

    Raises:
        ValueError: If *text* has no ``unique-identifier`` attribute, or no
            element carries the ``id`` that attribute names.
    """
    # Accept either quote style; the back-reference forces the closing quote
    # to match the opening one.
    declared = re.search(r"""unique-identifier\s*=\s*(["'])(.*?)\1""", text)
    if declared is None:
        raise ValueError("Error: Document does not declare a unique-identifier")
    identifier_id = declared.group(2)

    # Match the id *attribute* rather than any occurrence of the value, skip
    # the remaining attributes up to the end of the start tag, then capture
    # the text up to the next tag.
    element = re.search(
        r"""\bid\s*=\s*(["'])""" + re.escape(identifier_id) + r"""\1[^>]*>([^<]*)""",
        text,
    )
    if element is None:
        raise ValueError(
            "Error: Document does not contain an element with id "
            + repr(identifier_id)
        )
    return element.group(2).strip()
def get_id_by_structure(text: str, structure: str = 'ISBN'):
    """Return the first number in *text* that has the shape of an identifier.

    Only the ``'ISBN'`` structure is supported. An ISBN is recognised purely
    by length: a run of 10 or 13 ASCII digits, optionally split into groups by
    single hyphens (``978-3-16-148410-0``). No checksum is verified, and an
    ISBN-10 ending in the check character ``X`` is not matched.

    Args:
        text: Text to scan.
        structure: Identifier structure to look for.

    Returns:
        The digits of the first match with hyphens removed, or ``None`` when
        *structure* is not ``'ISBN'``.

    Raises:
        ValueError: If *structure* is ``'ISBN'`` and no match is found.
    """
    if structure != 'ISBN':
        return None

    # ISBN is 13 digits since 2007 and was 10 digits before
    valid_lengths = (10, 13)

    # Each token is a digit run or several runs joined by single hyphens.
    for match in re.finditer(r'[0-9]+(?:-[0-9]+)*', text):
        token = match.group()
        compact = token.replace('-', '')
        if len(compact) in valid_lengths:
            return compact
        # The hyphenated whole is not ISBN-sized; still accept an individual
        # group that is, as a plain digit run would be.
        for piece in token.split('-'):
            if len(piece) in valid_lengths:
                return piece

    raise ValueError("Error: Document does not contain an ISBN")
if __name__ == '__main__':
    # Command-line entry point: read the first .opf member of the EPUB (a ZIP
    # archive) and print the identifier found by each extraction strategy.
    parser = argparse.ArgumentParser()
    parser.add_argument("file_path", help="EPUB file path")
    args = parser.parse_args()

    with ZipFile(args.file_path) as z:
        # Default to None so a missing .opf gives a clear message instead of a
        # bare StopIteration traceback.
        opf_name = next((n for n in z.namelist() if n.endswith('.opf')), None)
        if opf_name is None:
            raise SystemExit("Error: No .opf file found in " + args.file_path)
        text = z.read(opf_name).decode('utf-8')

    print("ID by ISBN structure:", get_id_by_structure(text))
    print("ID by identifier tag:", get_id_by_tag(text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment