#!/usr/bin/env python3

'''
calisuck: index, filter-out smartly and download ebooks from Calibre open directories

Installation:

You need python 3.6+ installed (the script uses f-strings).

Download the file as a zip, unzip it and get into the dir
OR
> git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
> mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
> cd calisuck

THEN
> python3 -m venv .
> . bin/activate
> pip install requests fire humanize langid iso639 beautifultable
> python calisuck.py --help
> python calisuck.py index-ebooks --help
> python calisuck.py download-ebooks --help
> python calisuck.py download-covers --help
'''
'''
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.
'''
import sys
import os
import time
import re
import glob
import shutil
import json
import urllib.parse

import requests
import urllib3
import fire
import iso639
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
from beautifultable import BeautifulTable

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

all_ordered_formats = ['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
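# identifier.classify(text) returns a (language, probability) pair, e.g. ('en', 0.99);
# it is used below to guess the language of books whose metadata doesn't provide one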
def load_metadata(path, uuid):
    filepath = path + '/' + uuid + '/metadata.json'
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        except Exception:
            print("Error loading metadata for:", uuid, "from path:", path)
            return 0
    else:
        # print("Metadata not found for:", uuid, "from path:", path)
        return 0
def save_metadata(path, book):
    filepath = path + '/' + book['uuid'] + '/metadata.json'
    # write to a .tmp file first, then rename, so a crash can't leave a truncated file
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath + ".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath + ".tmp", filepath)
    except Exception:
        print("Unable to rename .tmp file:", filepath + ".tmp")
def get_cover_path(path, uuid):
    filepath = path + '/' + uuid + '/cover.jpg'
    if os.path.isfile(filepath): return filepath
    else: return 0
def get_file_path(path, uuid, fileformat):
    files = os.listdir(path + '/' + uuid)
    for f in files:
        fname, ext = os.path.splitext(f)
        if ext == '.' + fileformat:
            return path + '/' + uuid + '/' + f
    return 0
def get_cover(path, book, map):
    url = book['source']['cover']
    if map:
        # remap the host part of the url (useful when the indexed site has moved)
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)
    print("Downloading cover from:", url)
    r = requests.get(url, timeout=(20, 3), verify=False)
    r.raise_for_status()
    filepath = path + '/' + book['uuid'] + '/cover.jpg'
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)
def download_covers(dir='my_books', server='', map=""):
    """Download the cover of each book"""
    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            book = load_metadata(root, d)
            if book:
                # if book['source']['status'] != "ignored":
                if True:
                    if not get_cover_path(root, book['uuid']):
                        print()
                        print("-->", d)
                        print(book['uuid'])
                        try:
                            get_cover(root, book, map)
                        except Exception:
                            print("Unable to get cover", book['uuid'])
                    else:
                        pass
                        # print("Cover already present:", book['uuid'])
                else:
                    print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
            else:
                print("No ebook metadata found in:", root)
def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url, verify=False)
    r.raise_for_status()
    size = r.headers['Content-Length']
    print("Size received=" + hsize(size))
    return int(size)
def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url = book['source']['formats'][format]['url']
    if map:
        # remap the host part of the url
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)
    if map_lib:
        # remap the library id (the last component of the url path)
        url_s = url.split("/")
        url_s = url_s[:-1] + [map_lib]
        url = '/'.join(url_s)

    print()
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25, 15), verify=False)
    r.raise_for_status()
    if 'Content-Length' in r.headers:
        print("Size received=" + hsize(r.headers['Content-Length']))
    else:
        print("File received")

    filename = re.findall(r'filename="(.*)"', r.headers['Content-Disposition'])
    if len(filename):
        filepath = path + '/' + uuid + '/' + filename[0]
    else:
        filepath = path + '/' + uuid + '/' + uuid + "." + format
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    # the .tmp file must be closed before the rename: on Windows, moving a file
    # that is still open fails with WinError 32 (see the comments below)
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)
def set_status(uuid, status, dir='.'):
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status + ":", book['uuid'], "(", book['title'], ")")
        else:
            print("Status unchanged,", status + ":", book['uuid'])
    else:
        print("No ebook metadata found for:", uuid)
def remove_book(uuid, path='.'):
    print(os.getcwd())
    bookpath = path + '/' + uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        except Exception:
            print("Problem while removing:", bookpath)
    else:
        print(uuid, "not found")
def update_done_status(book):
    source = book['source']
    if source['status'] != 'ignored':
        # a book is 'done' once every format offered by the source has been downloaded
        if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
            book['source']['status'] = "done"
        else:
            book['source']['status'] = "todo"
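# For example (illustrative values): with source formats {'epub', 'mobi'} and
# book['formats'] == ['epub'], the intersection {'epub'} differs from the source
# set, so the book stays 'todo'; once 'mobi' is downloaded too, it becomes 'done'.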
def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library.

    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize
    them or process them with the 'jq' program.

    They are stored in subdirectories with a UUID as a name. These directories match
    individual books and let you group all the different formats of the same book,
    and eventually its cover file. You can mix books from different sites without any
    (theoretical) collision.

    Params:
    --site=<string>                  : Url of the site to index (ex: http://123.123.123.123/)
    --library=<string> (default='')  : Id of the library to index. The default library is indexed by default.
                                       The id is the string following '&library_id=' in the url.
    --force-refresh (default=False)  : Force a refresh of the metadata. By default, metadata
                                       already gathered is skipped.
    --start=<int> (default=0)
    --stop=<int> (default=0)         : Index only the ebooks within this range.
    --inc=<int> (default=1000)       : Number of ebooks requested from the server per batch.
    """
    os.makedirs(dir, exist_ok=True)

    offset = 0 if not start else start - 1
    num = min(1000, inc)
    server = site.rstrip('/')
    api = server + '/ajax/'
    library = '/' + library if library else library

    print("Server:", server)
    url = api + 'search' + library + '?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url, verify=False)
        r.raise_for_status()
    except Exception:
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=", r.json()["total_num"])
    total_num = int(r.json()["total_num"])
    total_num = total_num if not stop else stop

    print()
    print("Start indexing")
    range = offset + 1  # running book counter (note: shadows the builtin 'range')
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # ask the server for a batch of book ids, newest first
        url = api + 'search' + library + '?num=' + str(remaining_num) + '&offset=' + str(offset) + '&sort=timestamp&sort_order=desc'
        r = requests.get(url, verify=False)
        # then fetch the metadata of the whole batch in one request
        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api + 'books' + library + '?ids=' + books_s
        r = requests.get(url, verify=False)
        for id, r_book in r.json().items():
            uuid = r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc = f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc = f"uuid={uuid} ({r_book['title']})"
            s = f"\r--> {range}/{total_num} - {desc}"
            s = '{:140.140}'.format(s)
            print(s, end='')

            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except Exception:
                    print()
                    print("Unable to get metadata from:", uuid)
                    range += 1
                    continue
                if book:
                    # print("Metadata already present for:", uuid)
                    range += 1
                    continue

            if not r_book['formats']:
                print()
                print("No format found for {}".format(r_book['uuid']))
                range += 1
                continue

            book = {}
            url = api + 'book/' + id
            book['title'] = r_book['title']
            book['authors'] = r_book['authors']
            book['series'] = r_book['series']
            book['series_index'] = r_book['series_index']
            book['edition'] = 0
            book['uuid'] = r_book['uuid']
            book['identifiers'] = r_book['identifiers']
            book['comments'] = r_book['comments']
            book['pubdate'] = r_book['pubdate']
            book['publisher'] = r_book['publisher']

            languages = r_book['languages']
            if not languages:
                # no language in the metadata: guess it from the comments, or the title
                if book['comments']:
                    text = book['comments']
                else:
                    text = book['title']
                s_language, prob = identifier.classify(text)
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages'] = [language]
                else:
                    book['languages'] = []
            else:
                book['languages'] = []
                for l in languages:
                    book['languages'].append(iso639.to_iso639_2(l))

            book['tags'] = r_book['tags']
            book['formats'] = []
            book['metadata_version'] = 0.1

            source = {}
            source['url'] = url + library
            source['id'] = id
            try:
                tmpbook = load_metadata(dir, uuid)
            except Exception:
                print("Unable to get metadata from:", uuid)
                range += 1
                continue
            # preserve a previously set 'ignored' status across refreshes
            if tmpbook and tmpbook['source']['status'] == "ignored":
                source['status'] = "ignored"
            else:
                source['status'] = "todo"
            source['cover'] = server + r_book['cover']
            source['timestamp'] = r_book['timestamp']

            format_sources = {}
            formats = r_book['formats']
            for f in formats:
                s = {}
                url = ''
                if f in r_book['main_format']:
                    url = r_book['main_format'][f]
                else:
                    url = r_book['other_formats'][f]
                s['url'] = server + url
                if 'size' in r_book['format_metadata'][f]:
                    s['size'] = int(r_book['format_metadata'][f]['size'])
                else:
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size'] = get_file_size(s['url'])
                    except Exception:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status'] = 'todo'
                format_sources[f] = s

            source['formats'] = format_sources
            book['source'] = source

            if not source['formats']:
                print("No format found for {}".format(r_book['uuid']))
                range += 1
                continue

            update_done_status(book)
            try:
                save_metadata(dir, book)
            except Exception:
                print()
                print("Unable to save book metadata", book['uuid'])
            range += 1
        offset = offset + num
    print()
    print("Done")
def has_languages(book, languages=[], ignore_empty_language=False):
    # print("Accepted languages", languages)
    # quick fix: older metadata files may be missing the 'languages' key
    if 'languages' not in book:
        book['languages'] = []
    if ignore_empty_language and not book['languages']:
        # print("'{}' ignored: language is empty".format(book['uuid']))
        return False
    if not ignore_empty_language and not book['languages']:
        # print("'{}' todo: language is empty".format(book['uuid']))
        return True
    expected_languages = list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        # print("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'], languages))
        return False
    # print("'{}' todo: expected languages {}".format(book['uuid'], expected_languages))
    return True
def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    # print("Accepted identifiers", identifiers)
    if ignore_empty_identifiers and not book['identifiers']:
        # print("'{}' ignored: identifiers are empty".format(book['uuid']))
        return False
    if not ignore_empty_identifiers and not book['identifiers']:
        # print("'{}' todo: identifiers are empty".format(book['uuid']))
        return True
    expected_identifiers = list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        # print("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers))
        return False
    # print("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    return True
def download_ebooks(dir='my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, timer=0, map="", map_lib=""):
    '''
    Download ebooks into matching subdirs.

    The different formats of the same book are grouped in the same directory,
    named by UUID, next to the metadata file (metadata.json).

    The status of the formats of a book, and its global status, are initially set
    to 'todo'. They move to 'done' after their download. This allows you to rerun
    the download and progressively collect books.

    You can use different options to filter the formats for the download
    by language, size, format and identifiers (isbn, ...).

    A report of the download is displayed at the end of the process. You can run
    this command in dry mode (--dry-run) with different settings to only display
    the report and prepare your effective run.

    Params:
    --min-size=<int> (default=0)
    --max-size=<int> (default=infinity)        : Delimit the size in MB for the accepted formats
    --dry-run (default=False)                  : Simulate the download
    --languages=<string>                       : Restrict the download to a list of specific languages
                                                 (Ex: --languages='["eng","ita"]')
    --ignore-empty-language (default=False)    : Ignore books with unidentified language
    --formats=<string>                         : Restrict the download to a list of specific formats
                                                 (Ex: --formats='["epub", "mobi", "pdf"]')
    --ignored-formats=<string>                 : Ignore a list of specific formats.
                                                 Can be combined with --formats.
                                                 (Ex: --ignored-formats='["mp3", "rar", "zip"]')
    --single-format (default=False)            : Limit the download to 1 format per book, with this
                                                 preference order: 'azw', 'azw3', 'cbr', 'chm', 'djvu',
                                                 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi',
                                                 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf',
                                                 'txt', 'zip', 'fb2'
    --identifiers=<string>                     : Restrict the download to a list of specific identifiers
                                                 (Ex: --identifiers='["isbn","asin"]')
    --ignore-empty-identifiers (default=False) : Ignore books without identifiers (often OCR)
    --map=<string>                             : Optional replacement for the host part of the stored urls
    --map-lib=<string>                         : Optional replacement for the library id in the stored urls
    '''
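    # Example invocation (illustrative values):
    #   python calisuck.py download-ebooks --dir=my_books --languages='["eng"]' \
    #     --single-format --max-size=10 --dry-run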
    print()
    if single_format:
        my_formats = formats if formats else all_ordered_formats
    else:
        my_formats = formats
    # print("formats=", my_formats)

    min_size = int(min_size) * 1024 * 1024
    max_size = int(max_size) * 1024 * 1024
    print("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_size = 0
    total_size_by_format = {}
    total_ebook_count = 0
    total_format_count = 0
    total_count_by_format = {}
    size_max = 0
    size_min = 0
    language_count = {}
    identifiers_count = {}

    s = requests.Session()
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status = book['source']['status']
                if status == "todo":
                    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                        continue
                    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                        continue
                    source = book['source']
                    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                    if not len(download_formats):
                        # print("'{}' ignored: no format available among the ones expected".format(uuid))
                        pass
                    else:
                        ebook_kept = False
                        for f in download_formats:
                            url = source['formats'][f]['url']
                            if url:
                                # It shouldn't occur: need to download again
                                if get_file_path(dir, uuid, f):
                                    # print("Format '{}' already present for {}: retrying".format(f, uuid))
                                    pass
                                # print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})")
                                if not dry_run:
                                    try:
                                        get_file(dir, book, f, s, map, map_lib)
                                        book['formats'].append(f)
                                        book['source']['formats'][f]['status'] = "done"
                                        if timer:
                                            print(f"Waiting {timer} seconds")
                                            time.sleep(timer)
                                    except Exception as msg:
                                        print("Unable to get book:", url)
                                        print(msg)
                                        time.sleep(5)
                                        continue
                                    save_metadata(dir, book)
                                ebook_kept = True
                                size = source['formats'][f]['size']
                                total_size += size
                                size_max = size if size > size_max else size_max
                                if not size_min:
                                    size_min = size
                                else:
                                    size_min = size if size < size_min else size_min
                                if f not in total_size_by_format:
                                    total_size_by_format[f] = size
                                else:
                                    total_size_by_format[f] += size
                                if f not in total_count_by_format:
                                    total_count_by_format[f] = 1
                                else:
                                    total_count_by_format[f] += 1
                                total_format_count += 1
                            else:
                                # print("Format '{}' ignored for {} ({}): no url".format(f, uuid, book['title']))
                                pass
                        if ebook_kept:
                            total_ebook_count += 1
                            if not book['languages']:
                                if '<unknown>' not in language_count:
                                    language_count['<unknown>'] = 1
                                else:
                                    language_count['<unknown>'] += 1
                            else:
                                for l in book['languages']:
                                    if l not in language_count:
                                        language_count[l] = 1
                                    else:
                                        language_count[l] += 1
                            if not book['identifiers']:
                                if '<unknown>' not in identifiers_count:
                                    identifiers_count['<unknown>'] = 1
                                else:
                                    identifiers_count['<unknown>'] += 1
                            else:
                                for l in book['identifiers'].keys():
                                    if l not in identifiers_count:
                                        identifiers_count[l] = 1
                                    else:
                                        identifiers_count[l] += 1
                            if not dry_run:
                                update_done_status(book)
                                if book['source']['status'] == "done":
                                    save_metadata(dir, book)
                                    print("Book done:", book['uuid'])
                                    print()
                else:
                    # print('{} in status "{}": skipped'.format(book['uuid'], status))
                    print(f'--> {counter} books handled', end="\r")
    print()
    print("Reporting ...")
    table_l = BeautifulTable()
    table_l.column_headers = ["Language", "Ebooks count"]
    for l, c in language_count.items():
        table_l.append_row([l, c])
    table_l.sort("Ebooks count", reverse=True)
    table_l = table_l[0:10]

    table_i = BeautifulTable()
    table_i.column_headers = ["Identifier", "Ebooks count"]
    for i, c in identifiers_count.items():
        table_i.append_row([i, c])
    table_i.sort("Ebooks count", reverse=True)
    table_i = table_i[0:10]

    print()
    print("Top 10 ebooks by language/identifier:")
    table = BeautifulTable()
    table.column_headers = ["Languages", "Identifiers"]
    table.append_row([table_l, table_i])
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)

    print()
    print("Total count of ebooks by format:")
    table = BeautifulTable()
    table.column_headers = ["Format", "Size", "Ebooks count"]
    for f in total_count_by_format.keys():
        table.append_row([f, hsize(total_size_by_format[f]), total_count_by_format[f]])
    table.sort("Ebooks count", reverse=True)
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)

    table_c = BeautifulTable()
    table_c.column_headers = ["", "Total count"]
    table_c.append_row(["Formats", total_format_count])
    table_c.append_row(["Ebooks", total_ebook_count])

    table_s = BeautifulTable()
    table_s.column_headers = ["", "Size"]
    # table_s.append_row(["Min", hsize(size_min)])
    table_s.append_row(["Biggest File", hsize(size_max)])
    table_s.append_row(["Total", hsize(total_size)])

    print()
    print("Summary:")
    table = BeautifulTable()
    table.column_headers = ["Total Count", "Total Size"]
    table.append_row([table_c, table_s])
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)
    print()
def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    source = book['source']
    # collect the formats still in 'todo' for this book
    my_formats = []
    for f, v in source['formats'].items():
        if v['status'] == 'todo':
            my_formats.append(f)

    formats = []
    if single_format:
        if accepted_formats:
            # keep only the first available format, in preference order
            for f in accepted_formats:
                if f in my_formats:
                    formats = [f]
                    break
        else:
            print("Need at least 1 format for ordering")
    else:
        if accepted_formats:
            formats = list(set(accepted_formats) & set(my_formats))
        elif ignored_formats:
            formats = list(set(my_formats) - set(ignored_formats))
        else:
            formats = my_formats

    # then apply the size filter
    download_formats = formats[:]
    for f in formats:
        if 'size' not in source['formats'][f]:
            # size unknown: skip the format when a maximum size is requested
            if max_size:
                download_formats.remove(f)
        else:
            size = source['formats'][f]['size']
            if size < min_size or (max_size and size > max_size):
                download_formats.remove(f)
    return download_formats
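# For example (illustrative values): with 'epub' and 'pdf' in 'todo',
# get_formats_to_download(book, accepted_formats=['epub', 'mobi', 'pdf'], single_format=True)
# returns ['epub'], the first match in preference order.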
def update_format_statuses(book, refresh_ignored):
    formats = book['source']['formats']
    for f, v in formats.items():
        if v['status'] == 'ignored' and not refresh_ignored:
            # print("Format '{}' ignored: {} ({})".format(f, book['uuid'], book['title']))
            pass
        else:
            book['source']['formats'][f]['status'] = 'todo'
def check_ebooks(dir='my_books', dry_run=True):
    '''
    Check the ebook files already on disk and mark the matching formats as 'done'.
    '''
    print("Checking ...")
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status = book['source']['status']
                if status == "todo":
                    print(status)
                    source = book['source']
                    update = False
                    for f, v in source["formats"].items():
                        print(uuid, f, v['status'])
                        if v['status'] == "todo":
                            formats = glob.glob(root + "/" + uuid + "/*." + f)
                            print(formats)
                            if formats:
                                print(book['uuid'], formats[0])
                                book['source']['formats'][f]['status'] = "done"
                                update = True
                    if not dry_run and update:
                        update_done_status(book)
                        save_metadata(dir, book)
                        print("Book done", book['uuid'])
                        print()
    print()
if __name__ == "__main__":
    fire.Fire({
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status,
        "check_ebooks": check_ebooks,
    })
Yes, I'm working on a new version: a true project, not a gist. It will probably include this fix. Is it blocking for this version? I'll take a look if it is.
Oh no, not blocking at all. It still prints the reports just fine; it's just a warning that people who pip install in the near future might run into trouble. So not that urgent, just an FYI :-)
Hi, any update on the new version status? I just happened to rediscover this project and saw the last comment was mine :D
I'm getting two copies of the ebooks downloaded, one with the correct format extension (e.g. .epub) and one with a .tmp extension (e.g. .epub.tmp). In the console, I'm seeing the following:
Downloading ebook: ***the ebook url***
Size expected (estimation): 1.0 MB
Size received=1.0 MB
Unable to get book: ***the ebook url***
[WinError 32] The process cannot access the file because it is being used by another process: '***the path to the ebook that it did actually download***.tmp'
This is on Windows 10 using Python 3.6
edit: If I change the indentation on lines 211 & 212, it seems to resolve the issue. The .tmp file needs to be closed before it can be copied and deleted.
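For reference, a minimal sketch of that fix in get_file (using the gist's variable names): dedenting the shutil.move out of the with block ensures the handle is closed before the rename:

    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath + ".tmp", filepath)  # now runs after fd is closed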
This might be something to look into too:
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.column_headers' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTColumnCollection.header' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.append_row' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.append' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.sort' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.sort' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.__getitem__' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BeautifulTable.{columns|rows}[key]' instead.
  warnings.warn(message, FutureWarning)
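For what it's worth, a minimal sketch of the replacement calls suggested by those warnings (untested against this script):

    table = BeautifulTable()
    table.columns.header = ["Language", "Ebooks count"]  # was: table.column_headers = ...
    table.rows.append(["eng", 42])                       # was: table.append_row(...)
    table.rows.sort("Ebooks count", reverse=True)        # was: table.sort(...)
    top10 = table.rows[0:10]                             # was: table[0:10]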