#!/usr/bin/env python3
'''
calisuck: index, smartly filter out, and download ebooks from Calibre open directories

Installation:

You need Python 3.6+ installed.

Download the file as a zip, unzip it and enter the directory, OR:

> git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
> mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
> cd calisuck

THEN:

> python3 -m venv .
> . bin/activate
> pip install requests fire humanize langid iso639 beautifultable
> python calisuck.py --help
> python calisuck.py index-ebooks --help
> python calisuck.py download-ebooks --help
> python calisuck.py download-covers --help
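
A typical session (the IP address is a placeholder; use any reachable Calibre server):

> python calisuck.py index-ebooks --site=http://123.123.123.123
> python calisuck.py download-ebooks --languages='["eng"]' --single-format --dry-run
> python calisuck.py download-ebooks --languages='["eng"]' --single-format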
'''

'''
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.
'''

import sys
import os
import time
import re
import glob
import shutil
import json
import urllib.parse

import requests
import urllib3
from requests.adapters import HTTPAdapter
import fire
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
import iso639
from beautifultable import BeautifulTable

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

all_ordered_formats = ['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def load_metadata(path, uuid):
    filepath = path + '/' + uuid + '/metadata.json'
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        except Exception:
            print("Error loading metadata for:", uuid, "from path:", path)
            return 0
    else:
        # Metadata not found for this uuid
        return 0

def save_metadata(path, book):
    filepath = path + '/' + book['uuid'] + '/metadata.json'
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Write to a .tmp file first, then rename: keeps metadata.json consistent
    # even if the write is interrupted
    with open(filepath + ".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath + ".tmp", filepath)
    except Exception:
        print("Unable to rename .tmp file:", filepath + ".tmp")

def get_cover_path(path, uuid):
    filepath = path + '/' + uuid + '/cover.jpg'
    if os.path.isfile(filepath):
        return filepath
    else:
        return 0


def get_file_path(path, uuid, fileformat):
    files = os.listdir(path + '/' + uuid)
    if files:
        for f in files:
            fname, ext = os.path.splitext(f)
            if ext == '.' + fileformat:
                return path + '/' + uuid + '/' + f
        return 0
    else:
        return 0

def get_cover(path, book, map):
    url = book['source']['cover']
    if map:
        # Remap the host part of the url (e.g. if the site moved)
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)
    print("Downloading cover from:", url)
    r = requests.get(url, timeout=(20, 3), verify=False)
    r.raise_for_status()
    filepath = path + '/' + book['uuid'] + '/cover.jpg'
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    # Rename only after the file is closed (required on Windows)
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)

def download_covers(dir='my_books', server='', map=""):
    """Download the cover of each book"""
    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            book = load_metadata(root, d)
            if book:
                # if book['source']['status'] != "ignored":
                if True:
                    if not get_cover_path(root, book['uuid']):
                        print()
                        print("-->", d)
                        print(book['uuid'])
                        try:
                            get_cover(root, book, map)
                        except Exception:
                            print("Unable to get cover", book['uuid'])
                    else:
                        # Cover already present
                        pass
                else:
                    print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
            else:
                print("No ebook metadata found in:", root)

def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url, verify=False)
    r.raise_for_status()
    size = r.headers['Content-Length']
    print("Size received=" + hsize(size))
    return int(size)

def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url = book['source']['formats'][format]['url']
    if map:
        # Remap the host part of the url (e.g. if the site moved)
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)
    if map_lib:
        # Replace the library id (last path segment) of the url
        url_s = url.split("/")
        url_s = url_s[:-1] + [map_lib]
        url = '/'.join(url_s)

    print()
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25, 15), verify=False)
    r.raise_for_status()

    if 'Content-Length' in r.headers:
        print("Size received=" + hsize(r.headers['Content-Length']))
    else:
        print("Size received: unknown (no Content-Length header)")

    filename = re.findall(r'filename="(.*)"', r.headers['Content-Disposition'])
    if len(filename):
        filepath = path + '/' + uuid + '/' + filename[0]
    else:
        filepath = path + '/' + uuid + '/' + uuid + "." + format
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    # Rename only after the file is closed: on Windows, moving a still-open
    # .tmp file fails with WinError 32
    shutil.move(filepath + ".tmp", filepath)
    print("Saved to:", filepath)

def set_status(uuid, status, dir='.'):
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status + ":", book['uuid'], "(", book['title'], ")")
        else:
            print("Status unchanged", status + ":", book['uuid'])
    else:
        print("No ebook metadata found for:", uuid)

def remove_book(uuid, path='.'):
    print(os.getcwd())
    bookpath = path + '/' + uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        except Exception:
            print("Unable to remove:", bookpath)
    else:
        print(uuid, "not found")

def update_done_status(book):
    source = book['source']
    if source['status'] != 'ignored':
        # 'done' when every format listed in the source has been downloaded
        if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
            book['source']['status'] = "done"
        else:
            book['source']['status'] = "todo"

def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library.

    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them
    or process them with the 'jq' program. They are stored in subdirectories named after the
    book's UUID; each directory groups all the formats of one book and eventually its cover
    file. You can mix books from different sites without any (theoretical) collision.

    Params:
    --site=<string>                 : URL of the site to index (ex: http://123.123.123.123/)
    --library=<string> (default="") : Id of the library to index; by default the script indexes
                                      the default library. The id is the string following
                                      '&library_id=' in the URL.
    --force-refresh (default=False) : Force a refresh of the metadata. By default, metadata
                                      already gathered is skipped.
    --start=<int> (default=0)
    --stop=<int> (default=0)        : Index only a range of the ebooks.
    --inc=<int> (default=1000)      : Number of ebooks requested from the server per batch.
    """
    os.makedirs(dir, exist_ok=True)

    offset = 0 if not start else start - 1
    num = min(1000, inc)
    server = site.rstrip('/')
    api = server + '/ajax/'
    library = '/' + library if library else library

    print("Server:", server)
    url = api + 'search' + library + '?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url, verify=False)
        r.raise_for_status()
    except Exception:
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=", r.json()["total_num"])
    total_num = int(r.json()["total_num"])
    total_num = total_num if not stop else stop

    print()
    print("Start indexing")
    rank = offset + 1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # Fetch a batch of book ids, newest first
        url = api + 'search' + library + '?num=' + str(remaining_num) + '&offset=' + str(offset) + '&sort=timestamp&sort_order=desc'
        r = requests.get(url, verify=False)
        # Fetch the metadata for this batch of ids
        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api + 'books' + library + '?ids=' + books_s
        r = requests.get(url, verify=False)
        for id, r_book in r.json().items():
            uuid = r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc = f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc = f"uuid={uuid} ({r_book['title']})"
            s = f"\r--> {rank}/{total_num} - {desc}"
            s = '{:140.140}'.format(s)
            print(s, end='')

            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except Exception:
                    print()
                    print("Unable to get metadata from:", uuid)
                    rank += 1
                    continue
                if book:
                    # Metadata already present
                    rank += 1
                    continue

            if not r_book['formats']:
                print()
                print("No format found for {}".format(r_book['uuid']))
                rank += 1
                continue

            book = {}
            url = api + 'book/' + id
            book['title'] = r_book['title']
            book['authors'] = r_book['authors']
            book['series'] = r_book['series']
            book['series_index'] = r_book['series_index']
            book['edition'] = 0
            book['uuid'] = r_book['uuid']
            book['identifiers'] = r_book['identifiers']
            book['comments'] = r_book['comments']
            book['pubdate'] = r_book['pubdate']
            book['publisher'] = r_book['publisher']

            languages = r_book['languages']
            if not languages:
                # No language in the source metadata: guess it from the
                # comments (or the title) with langid
                if book['comments']:
                    text = book['comments']
                else:
                    text = book['title']
                s_language, prob = identifier.classify(text)
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages'] = [language]
                else:
                    book['languages'] = []
            else:
                book['languages'] = []
                for l in languages:
                    book['languages'].append(iso639.to_iso639_2(l))

            book['tags'] = r_book['tags']
            book['formats'] = []
            book['metadata_version'] = 0.1

            source = {}
            source['url'] = url + library
            source['id'] = id
            try:
                tmpbook = load_metadata(dir, uuid)
            except Exception:
                print("Unable to get metadata from:", uuid)
                rank += 1
                continue
            # Preserve a previously set 'ignored' status across refreshes
            if tmpbook and tmpbook['source']['status'] == "ignored":
                source['status'] = "ignored"
            else:
                source['status'] = "todo"
            source['cover'] = server + r_book['cover']
            source['timestamp'] = r_book['timestamp']

            format_sources = {}
            formats = r_book['formats']
            for f in formats:
                s = {}
                url = ''
                if f in r_book['main_format']:
                    url = r_book['main_format'][f]
                else:
                    url = r_book['other_formats'][f]
                s['url'] = server + url
                if 'size' in r_book['format_metadata'][f]:
                    s['size'] = int(r_book['format_metadata'][f]['size'])
                else:
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size'] = get_file_size(s['url'])
                    except Exception:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status'] = 'todo'
                format_sources[f] = s

            source['formats'] = format_sources
            book['source'] = source

            if not source['formats']:
                print("No format found for {}".format(r_book['uuid']))
                rank += 1
                continue

            update_done_status(book)
            try:
                save_metadata(dir, book)
            except Exception:
                print()
                print("Unable to save book metadata", book['uuid'])
            rank += 1
        offset = offset + num
    print()
    print("Done")

def has_languages(book, languages=[], ignore_empty_language=False):
    # Quick fix ("rustine"): old metadata may lack the 'languages' key
    if 'languages' not in book:
        book['languages'] = []

    if ignore_empty_language and not book['languages']:
        # Ignored: language is empty
        return False
    if not ignore_empty_language and not book['languages']:
        # Todo: unknown language accepted
        return True
    expected_languages = list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        # Ignored: language not in the accepted list
        return False
    return True

def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    if ignore_empty_identifiers and not book['identifiers']:
        # Ignored: identifiers are empty
        return False
    if not ignore_empty_identifiers and not book['identifiers']:
        # Todo: unknown identifiers accepted
        return True
    expected_identifiers = list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        # Ignored: identifiers not in the accepted list
        return False
    return True

def download_ebooks(dir='my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, timer=0, map="", map_lib=""):
    '''
    Download ebooks into their matching subdirectories.

    The different formats of the same book are grouped in the same directory, named after
    the book's UUID, next to the metadata file (metadata.json). The status of each format
    of a book, and the book's global status, are initially set to 'todo' and move to 'done'
    after their download. This allows you to rerun the download and progressively collect
    books. You can use different options to filter the formats to download by language,
    size, format and identifiers (isbn, ...). A report of the download is displayed at the
    end of the process. You can run this command in dry mode (--dry-run) with different
    settings to only display the report and prepare your effective run.

    Params:
    --min-size=<int> (default=0)
    --max-size=<int> (default=infinity)        : Delimit the size in MB for the accepted formats
    --dry-run (default=False)                  : Simulate the download and only report
    --languages=<string>                       : Restrict the download to a list of specific languages
                                                 (Ex: --languages='["eng","ita"]')
    --ignore-empty-language (default=False)    : Ignore books with an unidentified language
    --formats=<string>                         : Restrict the download to a list of specific formats
                                                 (Ex: --formats='["epub", "mobi", "pdf"]')
    --ignored-formats=<string>                 : Ignore a list of specific formats.
                                                 Compatible with --formats.
                                                 (Ex: --ignored-formats='["mp3", "rar", "zip"]')
    --single-format (default=False)            : Limit the download to 1 format per book, with this
                                                 preference order:
                                                 'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx',
                                                 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub',
                                                 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2'
    --identifiers=<string>                     : Restrict the download to a list of specific identifiers
                                                 (Ex: --identifiers='["isbn","asin"]')
    --ignore-empty-identifiers (default=False) : Ignore books without identifiers (often OCR'd books)
    '''
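    # Example invocation (illustrative values; all flags are documented above):
    #   python calisuck.py download-ebooks --dir=my_books --languages='["eng"]' \
    #       --formats='["epub","mobi"]' --max-size=50 --ignore-empty-language --dry-run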

    print()
    if single_format:
        my_formats = formats if formats else all_ordered_formats
    else:
        my_formats = formats

    min_size = int(min_size) * 1024 * 1024
    max_size = int(max_size) * 1024 * 1024
    print("Format size expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_size = 0
    total_size_by_format = {}
    total_ebook_count = 0
    total_format_count = 0
    total_count_by_format = {}
    size_max = 0
    size_min = 0
    language_count = {}
    identifiers_count = {}

    s = requests.Session()
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status = book['source']['status']
                if status == "todo":
                    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                        continue
                    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                        continue
                    source = book['source']
                    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                    if not len(download_formats):
                        # No format left among the expected ones
                        pass
                    else:
                        ebook_kept = False
                        for f in download_formats:
                            url = source['formats'][f]['url']
                            if url:
                                if get_file_path(dir, uuid, f):
                                    # Format already present: shouldn't occur, retry the download
                                    pass
                                if not dry_run:
                                    try:
                                        get_file(dir, book, f, s, map, map_lib)
                                        book['formats'].append(f)
                                        book['source']['formats'][f]['status'] = "done"
                                        if timer:
                                            print(f"Waiting {timer} seconds")
                                            time.sleep(timer)
                                    except Exception as msg:
                                        print("Unable to get book:", url)
                                        print(msg)
                                        time.sleep(5)
                                        continue
                                    save_metadata(dir, book)
                                ebook_kept = True
                                size = source['formats'][f]['size']
                                total_size += size
                                size_max = size if size > size_max else size_max
                                if not size_min:
                                    size_min = size
                                else:
                                    size_min = size if size < size_min else size_min
                                if f not in total_size_by_format:
                                    total_size_by_format[f] = size
                                else:
                                    total_size_by_format[f] += size
                                if f not in total_count_by_format:
                                    total_count_by_format[f] = 1
                                else:
                                    total_count_by_format[f] += 1
                                total_format_count += 1
                            else:
                                # Format ignored: no url
                                pass
                        if ebook_kept:
                            total_ebook_count += 1
                            if not book['languages']:
                                if '<unknown>' not in language_count:
                                    language_count['<unknown>'] = 1
                                else:
                                    language_count['<unknown>'] += 1
                            else:
                                for l in book['languages']:
                                    if l not in language_count:
                                        language_count[l] = 1
                                    else:
                                        language_count[l] += 1
                            if not book['identifiers']:
                                if '<unknown>' not in identifiers_count:
                                    identifiers_count['<unknown>'] = 1
                                else:
                                    identifiers_count['<unknown>'] += 1
                            else:
                                for l in book['identifiers'].keys():
                                    if l not in identifiers_count:
                                        identifiers_count[l] = 1
                                    else:
                                        identifiers_count[l] += 1
                            if not dry_run:
                                update_done_status(book)
                                if book['source']['status'] == "done":
                                    save_metadata(dir, book)
                                    print("Book done:", book['uuid'])
                                    print()
                else:
                    # Book not in 'todo' status: skipped
                    print(f'--> {counter} books handled', end="\r")

    print()
print("Reporting ...") | |
table_l = BeautifulTable() | |
table_l.column_headers = ["Language", "Ebooks count"] | |
for l, c in language_count.items(): | |
table_l.append_row([l, c]) | |
table_l.sort("Ebooks count", reverse=True) | |
table_l=table_l[0:10] | |
table_i = BeautifulTable() | |
table_i.column_headers = ["Identifier", "Ebooks count"] | |
for i, c in identifiers_count.items(): | |
table_i.append_row([i, c]) | |
table_i.sort("Ebooks count", reverse=True) | |
table_i=table_i[0:10] | |
print() | |
print("Top 10 ebooks by language/identifier:") | |
table = BeautifulTable() | |
table.column_headers = ["Languages", "Identifiers"] | |
table.append_row([table_l, table_i]) | |
# table.set_style(BeautifulTable.STYLE_MARKDOWN) | |
print(table) | |
print() | |
print("Total count of ebooks by format:") | |
table = BeautifulTable() | |
table.column_headers = ["Format", "Size", "Ebooks count"] | |
for f in total_count_by_format.keys(): | |
table.append_row([f, hsize(total_size_by_format[f]),total_count_by_format[f]]) | |
table.sort("Ebooks count", reverse=True) | |
# table.set_style(BeautifulTable.STYLE_MARKDOWN) | |
print(table) | |
table_c = BeautifulTable() | |
table_c.column_headers = ["", "Total count"] | |
table_c.append_row(["Formats", total_format_count]) | |
table_c.append_row(["Ebooks", total_ebook_count]) | |
table_s = BeautifulTable() | |
table_s.column_headers = ["", "Size"] | |
# table.append_row(["Min", hsize(size_min)]) | |
table_s.append_row(["Biggest File", hsize(size_max)]) | |
table_s.append_row(["Total", hsize(total_size)]) | |
print() | |
print("Summary:") | |
table = BeautifulTable() | |
table.column_headers = ["Total Count", "Total Size"] | |
table.append_row([table_c, table_s]) | |
# table.set_style(BeautifulTable.STYLE_MARKDOWN) | |
print(table) | |
print() | |

def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    source = book['source']
    # Formats still in 'todo' for this book
    my_formats = []
    for f, v in source['formats'].items():
        if v['status'] == 'todo':
            my_formats.append(f)

    formats = []
    if single_format:
        # Keep only the first available format, in preference order
        if accepted_formats:
            for f in accepted_formats:
                if f in my_formats:
                    formats = [f]
                    break
        else:
            print("need at least 1 format for ordering")
    else:
        if accepted_formats:
            formats = list(set(accepted_formats) & set(my_formats))
        elif ignored_formats:
            formats = list(set(my_formats) - set(ignored_formats))
        else:
            formats = my_formats

    # Filter out the formats outside the size limits
    download_formats = formats[:]
    for f in formats:
        if 'size' not in source['formats'][f]:
            if max_size:
                # Size unknown: can't check the limit
                download_formats.remove(f)
        else:
            size = source['formats'][f]['size']
            if size < min_size or (max_size and size > max_size):
                download_formats.remove(f)
    return download_formats

def update_format_statuses(book, refresh_ignored):
    formats = book['source']['formats']
    for f, v in formats.items():
        if v['status'] == 'ignored' and not refresh_ignored:
            pass
        else:
            book['source']['formats'][f]['status'] = 'todo'

def check_ebooks(dir='my_books', dry_run=True):
    '''
    Check ebooks: mark as 'done' the formats already present on disk.
    '''
    print("Checking ...")
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status = book['source']['status']
                if status == "todo":
                    print(status)
                    source = book['source']
                    update = False
                    for f, v in source["formats"].items():
                        print(uuid, f, v['status'])
                        if v['status'] == "todo":
                            formats = glob.glob(root + "/" + uuid + "/*." + f)
                            print(formats)
                            if formats:
                                print(book['uuid'], formats[0])
                                book['source']['formats'][f]['status'] = "done"
                                update = True
                    if not dry_run and update:
                        update_done_status(book)
                        save_metadata(dir, book)
                        print("Book done", book['uuid'])
                        print()
    print()

if __name__ == "__main__":
    fire.Fire({
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status,
        "check_ebooks": check_ebooks,
    })
This might be something to look into too:
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.column_headers' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTColumnCollection.header' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.append_row' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.append' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.sort' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.sort' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.__getitem__' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BeautifulTable.{columns|rows}[key]' instead.
  warnings.warn(message, FutureWarning)
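In case it helps, the v1.x replacements look like this (a sketch, assuming beautifultable >= 1.0; only the calls flagged in the warnings change):

    from beautifultable import BeautifulTable

    table_l = BeautifulTable()
    table_l.columns.header = ["Language", "Ebooks count"]  # was: table_l.column_headers = ...
    table_l.rows.append(["eng", 42])                       # was: table_l.append_row(...)
    table_l.rows.sort("Ebooks count", reverse=True)        # was: table_l.sort(...)
    table_l = table_l.rows[0:10]                           # was: table_l[0:10]
    print(table_l)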
Yes, I'm working on a new version: a true project, not a gist. I'll probably release this fix with it. Is it blocking for this version? If so, I'll take a look.
Oh no, not blocking at all, it still prints the reports just fine. It's just a warning that the API will change in the near future, so the next people who pip install might get in trouble. Not that urgent, just a FYI :-)
Hi, any update on the new version status? I just happened to rediscover this project and saw the last comment was mine :D
I'm getting two copies of each ebook downloaded, one with the correct format extension (e.g. .epub) and one with a tmp extension (e.g. .epub.tmp). In the console, I'm seeing the following:
Downloading ebook: ***the ebook url***
Size expected (estimation): 1.0 MB
Size received=1.0 MB
Unable to get book: ***the ebook url***
[WinError 32] The process cannot access the file because it is being used by another process: '***the path to the ebook that it did actually download***.tmp'
This is on Windows 10 using Python 3.6
edit: If I change the indentation on lines 211 & 212, it seems to resolve the issue. The tmp file needs to be closed before it can be copied & deleted.
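For reference, the fix is a minimal dedent in get_file (and get_cover): the move has to run after the with block has closed the file, since on Windows renaming a still-open file fails with WinError 32:

    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    # dedented: runs once fd is closed, so Windows can rename the file
    shutil.move(filepath + ".tmp", filepath)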
Cool, thanks!