Created
January 14, 2019 19:33
-
-
Save AliMirlou/d9cf60a201994adba6c4e0c101ffd6fc to your computer and use it in GitHub Desktop.
A program to extract ISBN from an EPUB document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import re | |
from zipfile import ZipFile | |
def get_id_by_tag(text: str):
    """Return the identifier referenced by the ``unique-identifier`` attribute.

    The ``unique-identifier`` attribute holds the ``id`` of another element
    in the document. This function reads that name, finds the first element
    after the attribute whose ``id`` attribute equals it, and returns the
    text between that element's opening tag and the next ``<``.

    Args:
        text: Contents of the OPF package document.

    Returns:
        The raw text content of the referenced element (not stripped).

    Raises:
        ValueError: If the ``unique-identifier`` attribute, the element it
            references, or that element's content cannot be located.
    """
    # Accept both quote styles and optional whitespace around "=".
    declaration = re.search(r'unique-identifier\s*=\s*(["\'])(.*?)\1', text)
    if declaration is None:
        raise ValueError("Error: Document does not declare a unique-identifier")
    id_name = declaration.group(2)

    # Match the name only as the value of an ``id`` attribute. A plain
    # substring search would also hit unrelated text (e.g. the name "id"
    # occurs inside "idpf.org" in namespace URIs) and pick the wrong element.
    id_attribute = re.compile(r'\bid\s*=\s*(["\'])' + re.escape(id_name) + r'\1')
    reference = id_attribute.search(text, declaration.end())
    if reference is None:
        raise ValueError(
            "Error: Document has no element with id '{}'".format(id_name))

    tag_end = text.find(">", reference.end())
    if tag_end == -1:
        raise ValueError("Error: Identifier element is not terminated")

    id_start = tag_end + 1
    id_end = text.find("<", id_start)
    if id_end == -1:
        raise ValueError("Error: Identifier element has no closing tag")
    return text[id_start:id_end]
def get_id_by_structure(text: str, structure: str = 'ISBN'):
    """Find an identifier in ``text`` by its shape rather than by markup.

    For ``structure == 'ISBN'`` the text is scanned for the first unbroken
    run of ASCII digits that is exactly 10 or 13 digits long. If there is
    none, hyphen-separated digit groups (e.g. ``978-1-4028-9462-6``) are
    tried and returned with the hyphens removed.

    Args:
        text: Text to search, e.g. the contents of an OPF document.
        structure: Kind of identifier to look for. Only ``'ISBN'`` is
            implemented.

    Returns:
        The matching digit string, or ``None`` when ``structure`` is not
        ``'ISBN'``.

    Raises:
        ValueError: If ``structure`` is ``'ISBN'`` and no candidate is found.
    """
    if structure != 'ISBN':
        # No other structure is implemented; keep returning None for them.
        return None

    # ISBN is 13 digits since 2007 and was 10 digits before
    isbn_lengths = (10, 13)

    # First pass: contiguous digit runs. This alone decides the result for
    # every input that contains one, so existing results do not change.
    for run in re.findall(r'[0-9]+', text):
        if len(run) in isbn_lengths:
            return run

    # Second pass: ISBNs are often printed with hyphens between their parts,
    # which the first pass sees as several short runs.
    for group in re.findall(r'[0-9]+(?:-[0-9]+)+', text):
        digits = group.replace('-', '')
        if len(digits) in isbn_lengths:
            return digits

    raise ValueError("Error: Document does not contain an ISBN")
if __name__ == '__main__':
    # Command-line entry point: read the first ``.opf`` entry of the EPUB
    # (a zip archive) given on the command line and print the identifier
    # found by each of the two strategies.
    parser = argparse.ArgumentParser()
    parser.add_argument("file_path", help="EPUB file path")
    args = parser.parse_args()

    with ZipFile(args.file_path) as archive:
        # Use a default so an archive without any .opf entry gives a clear
        # usage error instead of a bare StopIteration traceback.
        opf_name = next(
            (name for name in archive.namelist() if name.endswith('.opf')),
            None,
        )
        if opf_name is None:
            parser.error(
                "{}: no .opf package document found in archive".format(
                    args.file_path))
        text = archive.read(opf_name).decode('utf-8')

    print("ID by ISBN structure:", get_id_by_structure(text))
    print("ID by identifier tag:", get_id_by_tag(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment