Last active
March 14, 2026 13:15
-
-
Save bemitc/fcbfaad89c3518eddab992f69ff51ad4 to your computer and use it in GitHub Desktop.
simple downloader class for bloom library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import json | |
| from urllib.parse import urlparse, urlunparse | |
| import os | |
| # Just a simple downloader | |
class BloomDownloader(object):
    """Query the Bloom Library book API and download the matching EPUB files.

    Typical use: call ``get_books(lang, level)`` to collect book records,
    then ``download_books()`` to fetch one ``.epub`` per record.
    """

    # Seconds to wait on any single HTTP request before giving up, so a
    # stalled connection cannot hang the whole run.
    _REQUEST_TIMEOUT = 20

    def __init__(self):
        """Set up the API endpoint, request headers and empty query state."""
        self._api_url = "https://server.bloomlibrary.org/parse/classes/books"
        self._headers = {
            "accept": "application/json, text/plain, */*",
            "content-type": "text/json",
            "origin": "https://bloomlibrary.org",
            "referer": "https://bloomlibrary.org/",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
            "x-parse-application-id": "R6qNTeumQXjJCMutAJYAwPtip1qBulkFyLefkCE5",
        }
        self._books = []   # raw book records returned by the last get_books()
        self._level = 1    # level used by the last get_books()
        self._lang = ""    # language code used by the last get_books()

    def get_books(self, lang, level=1):
        """Fetch all book records for ``lang`` at ``level`` into ``self._books``.

        The query selects books whose ``langPointers`` match a language with
        ``isoCode == lang``, that are tagged ``computedLevel:{level}`` or
        ``level:{level}``, and that are not tagged with any *other*
        ``level:N`` (N in 1..4). Results are fetched in pages of 100.

        Any previously fetched records are discarded. Paging stops at the
        first empty page, or at the first non-200 response (which is
        reported on stdout; records fetched so far are kept).

        Args:
            lang: Language code matched against the ``isoCode`` field.
            level: Reading level used to build the tag filters.
        """
        self._lang = lang
        self._books = []
        self._level = level

        wanted_tag = f"level:{level}"
        in_levels = [f"computedLevel:{level}", wanted_tag]
        # Exclude books explicitly tagged with a different level.
        nin_levels = [
            tag
            for tag in ("level:1", "level:2", "level:3", "level:4")
            if tag != wanted_tag
        ]

        limit = 100
        skip = 0
        while True:
            payload = {
                "_method": "GET",
                "keys": "title,baseUrl,objectId,tags",
                "include": "langPointers",
                "limit": limit,
                "skip": skip,
                "where": {
                    "langPointers": {
                        "$inQuery": {
                            "where": {"isoCode": lang},
                            "className": "language",
                        }
                    },
                    "$and": [
                        {"tags": {"$in": in_levels}},
                        {"tags": {"$nin": nin_levels}},
                    ],
                    "inCirculation": True,
                    "draft": False,
                    "rebrand": False,
                    "baseUrl": {"$exists": True},
                },
                "order": "-createdAt",
            }
            response = requests.post(
                self._api_url,
                headers=self._headers,
                json=payload,
                timeout=self._REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                # Report instead of stopping silently, so a truncated result
                # set is not mistaken for a complete one.
                print(f"  FAILED: Status {response.status_code} for {self._api_url}")
                break

            results = response.json().get("results", [])
            if not results:
                break
            self._books.extend(results)
            skip += limit

    @staticmethod
    def _epub_url(base_url):
        """Derive the EPUB URL from a book's ``baseUrl``.

        The last path segment of ``baseUrl`` is treated as the book name:
        ``.../<name>/`` becomes ``.../epub/<name>.epub``, and
        ``BloomLibraryBooks`` in the path is replaced by ``bloomharvest``.

        Returns an empty string when no book name can be extracted
        (missing, empty or path-less ``baseUrl``).
        """
        # The path separators arrive percent-encoded; hex digits in a
        # percent-escape may be upper or lower case, so handle both.
        decoded = base_url.replace('%2f', '/').replace('%2F', '/')
        parsed = urlparse(decoded)
        # Drop any trailing slash so the book name is always the last part,
        # whether or not baseUrl ends with a separator.
        parts = parsed.path.rstrip('/').split('/')
        book_name = parts[-1]
        if not book_name:
            return ""
        parts[-1:] = ['epub', book_name + '.epub']
        new_path = '/'.join(parts).replace('BloomLibraryBooks', 'bloomharvest')
        return urlunparse(parsed._replace(path=new_path))

    def get_book_urls(self):
        """Build download metadata for every fetched book.

        Returns:
            A list of dicts with keys ``id``, ``topic`` (text after the first
            ``topic:`` tag, or ``""``), ``title``, ``epub`` (derived EPUB
            URL) and ``bloom_url`` (book page on bloomlibrary.org). Books
            without a usable ``baseUrl`` are skipped, since no EPUB URL can
            be derived for them.
        """
        topic_prefix = "topic:"
        book_epubs = []
        for book in self._books:
            epub_url = self._epub_url(book.get("baseUrl") or "")
            if not epub_url:
                continue

            object_id = book.get("objectId")
            # ``tags`` may be absent or null in a record.
            tags = book.get("tags") or []
            topic_tag = next(
                (t for t in tags if isinstance(t, str) and t.startswith(topic_prefix)),
                "",
            )
            book_epubs.append(
                {
                    "id": object_id,
                    "topic": topic_tag[len(topic_prefix):],
                    "title": book.get("title"),
                    "epub": epub_url,
                    "bloom_url": f"https://bloomlibrary.org/language:{self._lang}/book/{object_id}?lang={self._lang}",
                }
            )
        return book_epubs

    def dump_json(self):
        """Return the fetched book records as a JSON string (4-space indent)."""
        return json.dumps(self._books, indent=4)

    def download_books(self, save_dir="bloom"):
        """Download the EPUB of every fetched book.

        Files are written to ``<save_dir>/<lang>/<level>/<objectId>.epub``.
        Downloading is best-effort: a failing book is reported on stdout and
        the loop continues with the next one.

        Args:
            save_dir: Root directory for downloads; created if missing.
        """
        save_dir = os.path.join(save_dir, str(self._lang), str(self._level))
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_dir, exist_ok=True)

        for book in self.get_book_urls():
            url = book['epub']
            filepath = os.path.join(save_dir, f'{book["id"]}.epub')
            # Stream into a temporary name and rename on success, so an
            # interrupted download never leaves a truncated .epub behind.
            partial_path = filepath + '.part'
            try:
                with requests.get(url, stream=True, timeout=self._REQUEST_TIMEOUT) as r:
                    if r.status_code != 200:
                        print(f"  FAILED: Status {r.status_code} for {url}")
                        continue
                    with open(partial_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, filepath)
            except Exception as e:
                # Deliberately broad: one bad book must not abort the batch.
                print(f"  ERROR downloading {book['title']}: {e}")
                try:
                    os.remove(partial_path)
                except OSError:
                    # Nothing to clean up (file never created or already gone).
                    pass
# Script entry point: fetch level-1 books for language code 'ceb' and
# download them under the default "bloom" directory. Guarded so that
# importing this module does not trigger network requests or file writes.
if __name__ == "__main__":
    bloom = BloomDownloader()
    bloom.get_books('ceb', 1)
    bloom.download_books()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment