#!/usr/bin/env python3
'''
calisuck: index, smartly filter out, and download ebooks from Calibre open directories

Installation:

You need Python 3.5+ installed.

Download the file as a zip, unzip it and cd into the directory

OR

> git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
> mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
> cd calisuck

THEN

> python3 -m venv .
> . bin/activate
> pip install requests fire humanize langid iso639 beautifultable
> python calisuck.py --help
> python calisuck.py index-ebooks --help
> python calisuck.py download-ebooks --help
> python calisuck.py download-covers --help
'''
'''
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar <[email protected]>

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.
'''
import sys
import os
import time
import re
import glob
import shutil
import json
import urllib.parse

import requests
from requests.adapters import HTTPAdapter
import urllib3
import fire
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
import iso639
from beautifultable import BeautifulTable

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

all_ordered_formats = ['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']

# Language detector with normalized probabilities, used when a book has no language set
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def load_metadata(path, uuid):
    filepath = path+'/'+uuid+'/metadata.json'
    # print(filepath)
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        except:
            print("Error loading metadata for:", uuid, "from path:", path)
            return 0
    else:
        # print("Metadata not found for:", uuid, "from path:", path)
        return 0

def save_metadata(path, book):
    filepath = path+'/'+book['uuid']+'/metadata.json'
    # print("Saving book metadata for:", book['uuid'], "to:", filepath)
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    # Write to a .tmp file first, then rename, so an interrupted run never
    # leaves a half-written metadata.json behind
    with open(filepath+".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath+".tmp", filepath)
        # print("Saved to:", filepath)
    except:
        print("Unable to rename .tmp file:", filepath+".tmp")

def get_cover_path(path, uuid):
    filepath = path+'/'+uuid+'/cover.jpg'
    if os.path.isfile(filepath): return filepath
    else: return 0

def get_file_path(path, uuid, fileformat):
    files = os.listdir(path+'/'+uuid)
    if files:
        for f in files:
            fname, ext = os.path.splitext(f)
            if ext == '.'+fileformat:
                return path+'/'+uuid+'/'+f
        return 0
    else: return 0

def get_cover(path, book, map):
    url = book['source']['cover']
    if map:
        # Remap the host part of the url (useful when the site moved or is proxied)
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)

    print("Downloading cover from:", url)
    r = requests.get(url, timeout=(20, 3), verify=False)
    r.raise_for_status()

    filepath = path+'/'+book['uuid']+'/cover.jpg'
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def download_covers(dir='my_books', server='', map=""):
    """ Download the cover for each book """
    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            # print()
            # print("-->", d)
            book = load_metadata(root, d)
            if book:
                # if book['source']['status'] != "ignored":
                if True:
                    if not get_cover_path(root, book['uuid']):
                        print()
                        print("-->", d)
                        print(book['uuid'])
                        try:
                            get_cover(root, book, map)
                        except:
                            print("Unable to get cover", book['uuid'])
                    else:
                        pass
                        # print("Cover already present:", book['uuid'])
                else:
                    print('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
            else:
                print("No ebook metadata found in:", root)

def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url, verify=False)
    r.raise_for_status()
    size = r.headers['Content-Length']
    print("Size received="+hsize(size))
    return int(size)

def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url = book['source']['formats'][format]['url']
    if map:
        # Remap the host part of the url
        pu = urllib.parse.urlparse(url)
        pu = (pu[0], map, *pu[2:])
        print(pu)
        url = urllib.parse.urlunparse(pu)
    if map_lib:
        # Remap the library id (the last path segment) of the url
        url_s = url.split("/")
        url_s = url_s[:-1]+[map_lib]
        url = '/'.join(url_s)

    print()
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25, 15), verify=False)
    # headers = {"Range": "bytes=0-1023"}
    # r = requests.get(url, headers=headers)
    r.raise_for_status()
    # print(r.headers)

    if 'Content-Length' in r.headers:
        print("Size received="+hsize(r.headers['Content-Length']))
    else:
        print("Size received: unknown (no Content-Length header)")

    filename = re.findall(r'filename="(.*)"', r.headers['Content-Disposition'])
    # print(filename)
    if len(filename):
        filepath = path+'/'+uuid+'/'+filename[0]
    else:
        filepath = path+'/'+uuid+'/'+uuid+"."+format

    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    # The rename must happen outside the 'with' block: on Windows the .tmp file
    # cannot be moved while it is still open
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def set_status(uuid, status, dir='.'):
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status+":", book['uuid'], "(", book['title'], ")")
        else:
            print("Status already", status+":", book['uuid'])
    else:
        print("No ebook metadata found for:", uuid)

def remove_book(uuid, path='.'):
    print(os.getcwd())
    bookpath = path+'/'+uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        except:
            print("Unable to remove:", bookpath)
    else:
        print(uuid, "not found")

def update_done_status(book):
    source = book['source']
    if source['status'] != 'ignored':
        # A book is "done" once every format offered by the source has been downloaded
        if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
            book['source']['status'] = "done"
        else:
            book['source']['status'] = "todo"

def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library

    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with the 'jq' program.

    They are stored in subdirectories named by UUID. Each directory matches one book and groups all
    the different formats of that book, and eventually the cover file.
    You can mix books from different sites without any (theoretical) collisions.

    Params:
    --site=<string>       : Url of the site to index (ex: http://123.123.123.123/)
    --library=<string>    (default='') : Id of the library to index. The script indexes the default library by default.
                            The id is the string following '&library_id=' in the url
    --force-refresh       (default=False) : Force a refresh of the metadata. By default all the metadata
                            already gathered are ignored
    --start=<int>         (default=0)
    --stop=<int>          (default=0) : Restrict indexing to a range of ebooks
    --inc=<int>           (default=1000) : Set the number of ebooks fetched per request to the server
    """
    os.makedirs(dir, exist_ok=True)

    offset = 0 if not start else start-1
    num = min(1000, inc)
    server = site.rstrip('/')
    api = server+'/ajax/'
    library = '/'+library if library else library

    print("Server:", server)
    url = api+'search'+library+'?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url, verify=False)
        r.raise_for_status()
    except:
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=", r.json()["total_num"])

    total_num = int(r.json()["total_num"])
    total_num = total_num if not stop else stop
    print()
    print("Start indexing")

    count = offset+1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # print()
        # print("Downloading ids: offset="+str(offset), "num="+str(remaining_num))
        url = api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        # print("->", url)
        r = requests.get(url, verify=False)
        # print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))

        # print("\rDownloading metadata from", str(offset+1), "to", str(offset+remaining_num), end='')
        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api+'books'+library+'?ids='+books_s
        # print("->", url)
        r = requests.get(url, verify=False)
        # print(len(r.json()), "received")

        for id, r_book in r.json().items():
            uuid = r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc = f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc = f"uuid={uuid} ({r_book['title']})"
            s = f"\r--> {count}/{total_num} - {desc}"
            s = '{:140.140}'.format(s)
            print(s, end='')

            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except:
                    print()
                    print("Unable to get metadata from:", uuid)
                    count += 1
                    continue
                if book:
                    # print("Metadata already present for:", uuid)
                    count += 1
                    continue

            if not r_book['formats']:
                print()
                print("No format found for {}".format(r_book['uuid']))
                count += 1
                continue

            book = {}
            url = api+'book/'+id
            book['title'] = r_book['title']
            book['authors'] = r_book['authors']
            book['series'] = r_book['series']
            book['series_index'] = r_book['series_index']
            book['edition'] = 0
            book['uuid'] = r_book['uuid']
            book['identifiers'] = r_book['identifiers']
            book['comments'] = r_book['comments']
            book['pubdate'] = r_book['pubdate']
            book['publisher'] = r_book['publisher']

            languages = r_book['languages']
            if not languages:
                # No language in the metadata: guess it from the comments, or the title as a fallback
                if book['comments']:
                    text = book['comments']
                else:
                    text = book['title']
                s_language, prob = identifier.classify(text)
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages'] = [language]
                else:
                    book['languages'] = []
            else:
                book['languages'] = []
                for l in languages:
                    book['languages'].append(iso639.to_iso639_2(l))

            book['tags'] = r_book['tags']
            book['formats'] = []
            book['metadata_version'] = 0.1

            source = {}
            source['url'] = url+library
            source['id'] = id
            try:
                tmpbook = load_metadata(dir, uuid)
            except:
                print("Unable to get metadata from:", uuid)
                count += 1
                continue
            if tmpbook and tmpbook['source']['status'] == "ignored":
                source['status'] = "ignored"
            else:
                source['status'] = "todo"
            source['cover'] = server+r_book['cover']
            source['timestamp'] = r_book['timestamp']

            format_sources = {}
            formats = r_book['formats']
            for f in formats:
                s = {}
                url = ''
                if f in r_book['main_format']:
                    url = r_book['main_format'][f]
                else:
                    url = r_book['other_formats'][f]
                s['url'] = server+url
                if 'size' in r_book['format_metadata'][f]:
                    s['size'] = int(r_book['format_metadata'][f]['size'])
                else:
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size'] = get_file_size(s['url'])
                    except:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status'] = 'todo'
                format_sources[f] = s

            source['formats'] = format_sources
            book['source'] = source

            if not source['formats']:
                print("No format found for {}".format(r_book['uuid']))
                count += 1
                continue

            update_done_status(book)
            # print("Saving metadata for:", uuid)
            try:
                save_metadata(dir, book)
            except:
                print()
                print("Unable to save book metadata", book['uuid'])
            count += 1
        offset = offset+num
    print()
    print("Done")

def has_languages(book, languages=[], ignore_empty_language=False):
    # print("Accepted languages", languages)
    # quick fix ("rustine"): older metadata may lack the 'languages' key
    if not 'languages' in book:
        book['languages'] = []

    # print("Book languages", book['languages'])
    if ignore_empty_language and not book['languages']:
        # print("'{}' ignored: language is empty".format(book['uuid']))
        return False
    if not ignore_empty_language and not book['languages']:
        # print("'{}' todo: language is empty".format(book['uuid']))
        return True

    expected_languages = list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        # print("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'], languages))
        return False
    # print("'{}' todo: expected languages {}".format(book['uuid'], expected_languages))
    return True

def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    # print("Accepted identifiers", identifiers)
    # print("Book identifiers", book['identifiers'].keys())
    if ignore_empty_identifiers and not book['identifiers']:
        # print("'{}' ignored: identifiers are empty".format(book['uuid']))
        return False
    if not ignore_empty_identifiers and not book['identifiers']:
        # print("'{}' todo: identifiers are empty".format(book['uuid']))
        return True

    expected_identifiers = list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        # print("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers))
        return False
    # print("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    return True

def download_ebooks(dir='my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, timer=0, map="", map_lib=""):
    '''
    Download ebooks in matching subdirs:

    The different formats of the same book are grouped in the same directory,
    named by UUID, next to the metadata file (metadata.json).

    The status of each format of a book, and its global status, are initially set to 'todo'.
    They move to 'done' after their download. This allows you to rerun the download and progressively collect books.

    You can use different options to filter the formats for the download
    by language, size, format and identifiers (isbn, ...).

    A report of the download is displayed at the end of the process.
    You can run this command in dry mode (--dry-run) with different settings
    to only display the report and prepare your effective run.

    Params:
    --min-size=<int>             (default=0)
    --max-size=<int>             (default=infinity) : Delimit the size in MB for the accepted formats
    --dry-run                    (default=False) : Run the command to simulate the download
    --languages=<string>         : Restrict the download to a list of specific languages
                                   (Ex: --languages='["eng","ita"]')
    --ignore-empty-language      (default=False) : Ignore books with unidentified language
    --formats=<string>           : Restrict the download to a list of specific formats
                                   (Ex: --formats='["epub", "mobi", "pdf"]')
    --ignored-formats=<string>   : Ignore a list of specific formats.
                                   Can be combined with --formats.
                                   (Ex: --ignored-formats='["mp3", "rar", "zip"]')
    --single-format              (default=False) : Limit the download to 1 format per book with this preference order:
                                   'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub',
                                   'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar',
                                   'rtf', 'txt', 'zip', 'fb2'
    --identifiers=<string>       : Restrict the download to a list of specific identifiers
                                   (Ex: --identifiers='["isbn","asin"]')
    --ignore-empty-identifiers   (default=False) : Ignore books without identifiers (often OCR)
    '''
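    # Example invocation, combining flags documented above (preview with --dry-run first):
    #   python calisuck.py download-ebooks --single-format --languages='["eng"]' --max-size=20 --dry-run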
    print()
    if single_format: my_formats = formats if formats else all_ordered_formats
    else: my_formats = formats
    # print("formats=", my_formats)

    min_size = int(min_size)*1024*1024
    max_size = int(max_size)*1024*1024
    print("Format sizes expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_size = 0
    total_size_by_format = {}
    total_ebook_count = 0
    total_format_count = 0
    total_count_by_format = {}
    size_max = 0
    size_min = 0
    language_count = {}
    identifiers_count = {}
    s = requests.Session()
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status = book['source']['status']
                if status == "todo":
                    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                        continue
                    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                        continue

                    source = book['source']
                    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                    if not len(download_formats):
                        # print("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
                        pass
                    else:
                        ebook_kept = False
                        for f in download_formats:
                            url = source['formats'][f]['url']
                            if url:
                                # It shouldn't occur: download again if the file is already present
                                if get_file_path(dir, uuid, f):
                                    # print("Format '{}' already present for {}: Retrying".format(f, uuid))
                                    pass
                                # print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})")
                                if not dry_run:
                                    try:
                                        get_file(dir, book, f, s, map, map_lib)
                                        book['formats'].append(f)
                                        book['source']['formats'][f]['status'] = "done"
                                        if timer:
                                            print(f"Waiting {timer} seconds")
                                            time.sleep(timer)
                                    except Exception as msg:
                                        print("Unable to get book:", url)
                                        print(msg)
                                        time.sleep(5)
                                        continue
                                    save_metadata(dir, book)

                                # Update the download statistics
                                ebook_kept = True
                                size = source['formats'][f]['size']
                                total_size += size
                                size_max = size if size > size_max else size_max
                                if not size_min:
                                    size_min = size
                                else:
                                    size_min = size if size < size_min else size_min
                                if not f in total_size_by_format:
                                    total_size_by_format[f] = size
                                else: total_size_by_format[f] += size
                                if not f in total_count_by_format:
                                    total_count_by_format[f] = 1
                                else:
                                    total_count_by_format[f] += 1
                                total_format_count += 1
                            else:
                                # print("Format '{}' ignored for {} ({}): No url".format(f, uuid, book['title']))
                                pass

                        if ebook_kept:
                            total_ebook_count += 1
                            if not book['languages']:
                                if not '<unknown>' in language_count:
                                    language_count['<unknown>'] = 1
                                else:
                                    language_count['<unknown>'] += 1
                            else:
                                for l in book['languages']:
                                    if not l in language_count:
                                        language_count[l] = 1
                                    else:
                                        language_count[l] += 1
                            if not book['identifiers']:
                                if not '<unknown>' in identifiers_count:
                                    identifiers_count['<unknown>'] = 1
                                else:
                                    identifiers_count['<unknown>'] += 1
                            else:
                                for l in book['identifiers'].keys():
                                    if not l in identifiers_count:
                                        identifiers_count[l] = 1
                                    else:
                                        identifiers_count[l] += 1

                        if not dry_run:
                            update_done_status(book)
                            if book['source']['status'] == "done":
                                save_metadata(dir, book)
                                print("Book done:", book['uuid'])
                                print()
                else:
                    # print('{} in status "{}": skipped'.format(book['uuid'], status))
                    print(f'--> {counter} books handled', end="\r")
    print()
print("Reporting ...") | |
table_l = BeautifulTable() | |
table_l.column_headers = ["Language", "Ebooks count"] | |
for l, c in language_count.items(): | |
table_l.append_row([l, c]) | |
table_l.sort("Ebooks count", reverse=True) | |
table_l=table_l[0:10] | |
table_i = BeautifulTable() | |
table_i.column_headers = ["Identifier", "Ebooks count"] | |
for i, c in identifiers_count.items(): | |
table_i.append_row([i, c]) | |
table_i.sort("Ebooks count", reverse=True) | |
table_i=table_i[0:10] | |
print() | |
print("Top 10 ebooks by language/identifier:") | |
table = BeautifulTable() | |
table.column_headers = ["Languages", "Identifiers"] | |
table.append_row([table_l, table_i]) | |
# table.set_style(BeautifulTable.STYLE_MARKDOWN) | |
print(table) | |
print() | |
print("Total count of ebooks by format:") | |
table = BeautifulTable() | |
table.column_headers = ["Format", "Size", "Ebooks count"] | |
for f in total_count_by_format.keys(): | |
table.append_row([f, hsize(total_size_by_format[f]),total_count_by_format[f]]) | |
table.sort("Ebooks count", reverse=True) | |
# table.set_style(BeautifulTable.STYLE_MARKDOWN) | |
print(table) | |
table_c = BeautifulTable() | |
table_c.column_headers = ["", "Total count"] | |
table_c.append_row(["Formats", total_format_count]) | |
table_c.append_row(["Ebooks", total_ebook_count]) | |
table_s = BeautifulTable() | |
table_s.column_headers = ["", "Size"] | |
# table.append_row(["Min", hsize(size_min)]) | |
table_s.append_row(["Biggest File", hsize(size_max)]) | |
table_s.append_row(["Total", hsize(total_size)]) | |
print() | |
print("Summary:") | |
table = BeautifulTable() | |
table.column_headers = ["Total Count", "Total Size"] | |
table.append_row([table_c, table_s]) | |
# table.set_style(BeautifulTable.STYLE_MARKDOWN) | |
print(table) | |
print() | |
def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    # print("Accepted formats", accepted_formats)
    source = book['source']
    # print("Formats available in source: {}".format(list(source['formats'].keys())))
    my_formats = []
    for f, v in source['formats'].items():
        if v['status'] == 'todo':
            my_formats.append(f)
    # print("Formats in 'todo': {}".format(my_formats))

    formats = []
    if single_format:
        if accepted_formats:
            # Keep only the first available format in the preference order
            for f in accepted_formats:
                if f in my_formats:
                    formats = [f]
                    break
        else:
            print("need at least 1 format for ordering")
    else:
        if accepted_formats:
            formats = list(set(accepted_formats) & set(my_formats))
        elif ignored_formats:
            formats = list(set(my_formats) - set(ignored_formats))
        else:
            formats = my_formats
    # print("Formats expected: {}".format(formats))

    # Filter out the formats whose size is unknown or out of bounds
    download_formats = formats[:]
    for f in formats:
        if not 'size' in source['formats'][f] and max_size:
            # print("Format '{}' ignored for {}: Size unknown".format(f, book['uuid']))
            download_formats.remove(f)
        else:
            size = source['formats'][f]['size']
            if size < min_size or (max_size and size > max_size):
                download_formats.remove(f)
                # print("Format '{}' ignored for {}: size={} but expected between {} and {}".format(f, book['uuid'], hsize(size), hsize(min_size), hsize(max_size) if max_size else "infinity"))
    return download_formats

def update_format_statuses(book, refresh_ignored):
    formats = book['source']['formats']
    for f, v in formats.items():
        if v['status'] == 'ignored' and not refresh_ignored:
            # print("Format '{}' ignored: {} ({})".format(f, book['uuid'], book['title']))
            pass
        else:
            # print("Format '{}' todo: {} ({})".format(f, book['uuid'], book['title']))
            book['source']['formats'][f]['status'] = 'todo'

def check_ebooks(dir='my_books', dry_run=True):
    '''
    Check the ebook files already on disk and update their format statuses accordingly
    '''
    print("Checking ...")
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status = book['source']['status']
                if status == "todo":
                    print(status)
                    source = book['source']
                    update = False
                    for f, v in source["formats"].items():
                        print(uuid, f, v['status'])
                        if v['status'] == "todo":
                            formats = glob.glob(root+"/"+uuid+"/*."+f)
                            print(formats)
                            if formats:
                                print(book['uuid'], formats[0])
                                book['source']['formats'][f]['status'] = "done"
                                update = True
                    if not dry_run and update:
                        update_done_status(book)
                        save_metadata(dir, book)
                        print("Book done", book['uuid'])
                        print()
    print()

if __name__ == "__main__":
    fire.Fire({
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status,
        "check_ebooks": check_ebooks
    })
@Imm0rt4lDr3am3r
You may try entering the url directly with the login and password in the following format, as Calibre uses basic auth:
http://login:[email protected]:8080
If it fails, we can do it easily. We just need to add some parameters and some code.
If needed, I'll release a new version with this feature. If you're a bit familiar with Python you can also do it yourself:
1. create 2 variables, login and password
2. change the lines that call requests.get to pass the auth parameter, like:
r = requests.get('https://api.github.com/user', auth=('user', 'pass'))
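For instance, a minimal sketch of that change (the login/password values and the host are placeholders of mine; setting auth on a session authenticates every request made through it):

import requests
login = 'user'          # placeholder credentials
password = 'pass'
session = requests.Session()
session.auth = (login, password)   # basic auth on every request through this session
r = session.get('http://123.123.123.123/ajax/search?num=0', verify=False)
r.raise_for_status()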
I've tried doing it with the
http://login:[email protected]:8080
format, but it fails due to a special character '!' in the password.
I am a bit familiar with Python, so I did try to create the variables and add the auth parameter to the requests, but the script still failed to open the site. Although this may not be the reason, I'm not sure whether it's because of the self-signed certificate used by the site.
If possible, I hope you can provide a new version with this feature in case I did something wrong somewhere.
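(As an aside: a reserved character like '!' in the password has to be percent-encoded when it is embedded in the url itself. A quick sketch with the standard library, with a made-up password and host:

import urllib.parse
password = urllib.parse.quote('my!pass', safe='')    # -> 'my%21pass'
url = 'http://login:' + password + '@192.168.1.10:8080'

Passing the credentials through the auth parameter instead avoids the encoding problem entirely.)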
@Krazybug
I achieved my purpose (of backing up a remote server with known credentials) by tinkering around with the calibredb commands, but I will keep trying to make authentication work in this script. My end goal is to run the script regularly so that the backup is in sync with the remote server at all times.
Hoping to see a new version if you have the time to update the current script later on.
Thanks for the help.
You're welcome. I'll work on this feature very soon, then.
Hi,
Thanks for the script! Is there a way to say "I'd like the epub version of this book, but if that doesn't exist, then the mobi, and if that doesn't exist, the next one"? Or does it always download all versions of the book?
Hi, @Lambik
Yes, you can use the --single-format option which has this priority order:
all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
You can edit this line in your code if it doesn't fit your needs.
And you can combine it with the --ignored-formats option.
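For example, a run combining both flags might look like:
python calisuck.py download-ebooks --single-format --ignored-formats='["rar", "zip"]'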
Cool, thanks!
This might be something to look into too:

/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.column_headers' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTColumnCollection.header' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.append_row' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.append' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.sort' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.sort' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.__getitem__' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BeautifulTable.{columns|rows}[key]' instead.
  warnings.warn(message, FutureWarning)
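For reference, the 1.x replacements named in those warnings are reached through table.columns and table.rows; a minimal standalone sketch of the migration (not the script's exact reporting code):

from beautifultable import BeautifulTable
table = BeautifulTable()
table.columns.header = ["Language", "Ebooks count"]   # was: table.column_headers = [...]
table.rows.append(["eng", 42])                        # was: table.append_row([...])
table.rows.sort("Ebooks count", reverse=True)         # was: table.sort(...)
print(table)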
Yes, I'm working on a new version, a true project rather than a gist, and I'll probably release this fix with it. Is it blocking for you in this version? I'll take a look if so.
Oh no, not blocking at all, it still prints the reports just fine. It's just a warning that the API will change in the near future, so the next people who pip install might get in trouble. Not that urgent, just a FYI :-)
Hi, any update on the new version status? I just happened to rediscover this project and saw the last comment was mine :D
I'm getting two copies of the ebooks downloaded, one with the correct format extension (e.g. .epub) and one with a .tmp extension (e.g. .epub.tmp). In the console, I'm seeing the following:
Downloading ebook: ***the ebook url***
Size expected (estimation): 1.0 MB
Size received=1.0 MB
Unable to get book: ***the ebook url***
[WinError 32] The process cannot access the file because it is being used by another process: '***the path to the ebook that it did actually download***.tmp'
This is on Windows 10 using Python 3.6
edit: If I change the indentation on lines 211 & 212, it seems to resolve the issue. The tmp file needs to be closed before it can be copied & deleted.
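In other words, a sketch of that indentation fix: move the rename out of the with block so the file handle is closed before shutil.move runs:

with open(filepath + ".tmp", 'wb') as fd:
    fd.write(r.content)
# fd is closed here, so the rename no longer hits WinError 32
shutil.move(filepath + ".tmp", filepath)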