# Setup:  pip install requests beautifulsoup4
# Usage:  python goodreads-list-scrape.py > books_data.js
# Before running, set GOODREADS_SHELF_URL below to the URL of your own Goodreads shelf.
| import time | |
| import json | |
| import sys | |
| import requests | |
| from bs4 import BeautifulSoup | |
# URL requested by fetch_page(); replace the placeholder with your own shelf URL.
GOODREADS_SHELF_URL = "https://www.goodreads.com/review/list/<<add-your-own-here>>"

# Sent as the "shelf" query parameter on every request.
SHELF_NAME = "read"

# Upper bound on the number of pages scrape_all_books() will request.
MAX_PAGES = 20

# Seconds passed to time.sleep() while paging through the shelf.
REQUEST_DELAY = 1.0

# HTTP headers sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; GoodreadsShelfScraper/1.0; "
        "+https://luckysingh.net)"
    ),
}
def is_book_cover(img):
    """Return True if *img* looks like a Goodreads book-cover image.

    Args:
        img: Any object exposing a dict-style ``get(name, default)`` for its
            attributes (extract_books_from_html passes BeautifulSoup ``<img>``
            tags).

    Returns:
        True only when the ``alt`` text is non-blank and the ``src`` contains
        both ``"gr-assets.com"`` and ``"/books/"``; False otherwise.
    """
    # Treat a missing attribute and an attribute whose value is None alike,
    # so neither .strip() nor the substring tests below can raise.
    src = img.get("src") or ""
    alt = (img.get("alt") or "").strip()

    # The alt text becomes the book title downstream, so it must be present.
    if not alt:
        return False

    return "gr-assets.com" in src and "/books/" in src
def normalize_cover_url(src):
    """Return the cover URL *src* exactly as given.

    Currently an identity function: no rewriting is performed.
    NOTE(review): presumably a hook for future URL clean-up -- confirm.
    """
    return src
def fetch_page(session, page):
    """Download one page of the configured shelf and return the response body.

    Args:
        session: Object whose ``get()`` performs the HTTP request
            (scrape_all_books passes a ``requests.Session``).
        page: Page number, sent as the ``page`` query parameter.

    Returns:
        The response's ``text`` attribute.

    Raises:
        Whatever ``session.get()`` or ``raise_for_status()`` raises, e.g. on a
        timeout or an HTTP error status.
    """
    response = session.get(
        GOODREADS_SHELF_URL,
        params={"shelf": SHELF_NAME, "page": page},
        headers=HEADERS,
        timeout=20,
    )
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    return response.text
def extract_books_from_html(html):
    """Collect the distinct book covers found in one page of shelf HTML.

    Every ``<img>`` accepted by is_book_cover() contributes a
    ``{"title": <alt text>, "cover": <src>}`` dict. Duplicate
    (title, cover) pairs are dropped, keeping the first occurrence and the
    original document order.

    Args:
        html: HTML markup of a shelf page.

    Returns:
        A list of ``{"title": ..., "cover": ...}`` dicts, possibly empty.
    """
    document = BeautifulSoup(html, "html.parser")

    # Dicts preserve insertion order, so keying on (title, cover) both
    # de-duplicates and keeps first-seen ordering.
    found = {}
    for tag in document.find_all("img"):
        if not is_book_cover(tag):
            continue
        name = tag.get("alt", "").strip()
        url = normalize_cover_url(tag.get("src", "").strip())
        if name and url:
            found.setdefault((name, url), {"title": name, "cover": url})

    return list(found.values())
def scrape_all_books():
    """Page through the shelf and return every distinct book found.

    Pages 1..MAX_PAGES are fetched in order. Paging stops early as soon as a
    page contributes no book that has not already been seen (this covers both
    an empty page and a page that merely repeats earlier results).

    Returns:
        A list of ``{"title": ..., "cover": ...}`` dicts, unique by
        (title, cover), in the order they were first encountered.

    Raises:
        Whatever fetch_page() raises for a failed request.
    """
    all_books = []
    seen_keys = set()

    # The context manager closes the session (and its pooled connections)
    # even when a request raises part-way through.
    with requests.Session() as session:
        for page in range(1, MAX_PAGES + 1):
            # Throttle only *between* requests: no delay before the first
            # page and none after the last one.
            if page > 1:
                time.sleep(REQUEST_DELAY)

            books = extract_books_from_html(fetch_page(session, page))

            new_books = []
            for b in books:
                key = (b["title"], b["cover"])
                if key not in seen_keys:
                    seen_keys.add(key)
                    new_books.append(b)

            # Nothing new on this page: we have run past the end of the shelf.
            if not new_books:
                break
            all_books.extend(new_books)

    return all_books
def print_books_js(books):
    """Print *books* to stdout as a JavaScript ``const BOOKS = [...]`` array.

    Args:
        books: Iterable of dicts with ``"title"`` and ``"cover"`` keys.

    Each book is emitted as one object-literal line; both values are encoded
    with json.dumps() so quotes and special characters are escaped.
    """
    print("const BOOKS = [")
    for entry in books:
        # Encode title first, then cover, as string literals.
        title, cover = (json.dumps(entry[field]) for field in ("title", "cover"))
        print(f" {{ title: {title}, cover: {cover} }},")
    print("];")
def main():
    """Scrape the shelf and print it as JavaScript on stdout.

    If scraping raises, the error is reported on stderr and the process exits
    with status 1 without printing any JavaScript.
    """
    try:
        scraped = scrape_all_books()
    except Exception as exc:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error while scraping: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        print_books_js(scraped)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()