the-code-rider · December 8, 2025 07:55
diff --git a/readme.md b/readme.md
diff --git a/goodreads-list-scrape.py b/goodreads-list-scrape.py
 import time
 import json
 import sys
 import requests
 from bs4 import BeautifulSoup

 # Shelf listing endpoint. The <<add-your-own-here>> part is a placeholder,
 # presumably to be replaced with your own list path before running.
 GOODREADS_SHELF_URL = "https://www.goodreads.com/review/list/<<add-your-own-here>>"
 # Value sent as the "shelf" query parameter on each request.
 SHELF_NAME = "read"
 # Highest page number requested in a single run.
 MAX_PAGES = 20
 # Pause, in seconds, between consecutive page requests.
 REQUEST_DELAY = 1.0

 # HTTP headers attached to every request.
 HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; GoodreadsShelfScraper/1.0; +https://luckysingh.net)",
 }


 def is_book_cover(img):
    """Tell whether an image element looks like a book cover.

    ``img`` only needs a mapping-style ``get`` (a parsed ``<img>`` tag is
    what the caller passes). It qualifies when its ``alt`` text is non-blank
    and its ``src`` contains both "gr-assets.com" and "/books/".

    Returns:
        True when all three conditions hold, otherwise False.
    """
    image_url = img.get("src") or ""  # a missing or None src counts as empty
    alt_text = img.get("alt", "").strip()
    required_markers = ("gr-assets.com", "/books/")
    return bool(alt_text) and all(marker in image_url for marker in required_markers)


 def normalize_cover_url(src):
    """Return the cover URL to record for a book.

    This is currently a pass-through: the value is handed back unchanged.
    """
    cover_url = src
    return cover_url


 def fetch_page(session, page):
    """Download one page of the configured shelf and return the response text.

    Args:
        session: object whose ``get`` performs the HTTP request (the caller
            in this file passes a ``requests.Session``).
        page: page number, sent as the ``page`` query parameter.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        Whatever ``session.get`` or ``raise_for_status`` raise; nothing is
        caught here.
    """
    response = session.get(
        GOODREADS_SHELF_URL,
        params={"shelf": SHELF_NAME, "page": page},
        headers=HEADERS,
        timeout=20,
    )
    # Surface HTTP-level failures instead of parsing an error page.
    response.raise_for_status()
    return response.text


 def extract_books_from_html(html):
    """Pull the distinct books out of one page of shelf HTML.

    Every ``<img>`` accepted by ``is_book_cover`` contributes a
    ``{"title": ..., "cover": ...}`` dict, built from its stripped ``alt``
    text and its stripped ``src`` passed through ``normalize_cover_url``.
    Entries with the same (title, cover) pair are reported once, in
    first-seen order.

    Returns:
        A list of ``{"title", "cover"}`` dicts; empty when nothing matched.
    """
    document = BeautifulSoup(html, "html.parser")
    already_emitted = set()
    found = []
    for tag in document.find_all("img"):
        if not is_book_cover(tag):
            continue
        name = tag.get("alt", "").strip()
        cover_url = normalize_cover_url(tag.get("src", "").strip())
        if not (name and cover_url):
            continue
        identity = (name, cover_url)
        if identity in already_emitted:
            continue
        already_emitted.add(identity)
        found.append({"title": name, "cover": cover_url})
    return found


 def scrape_all_books():
    """Collect every distinct book from the shelf, page by page.

    Pages 1 through ``MAX_PAGES`` are fetched in order. Paging stops early
    when a page yields no books, or when it adds no (title, cover) pair that
    has not already been collected.

    Returns:
        A list of ``{"title", "cover"}`` dicts in first-seen order.

    Raises:
        Anything raised by ``fetch_page`` or ``extract_books_from_html``;
        the HTTP session is closed before the exception propagates.
    """
    all_books = []
    seen_keys = set()

    # The context manager closes the session (and its pooled connections)
    # on every exit path, including when a page fetch raises.
    with requests.Session() as session:
        for page in range(1, MAX_PAGES + 1):
            html = fetch_page(session, page)
            books = extract_books_from_html(html)
            if not books:
                break

            new_count = 0
            for b in books:
                key = (b["title"], b["cover"])
                if key not in seen_keys:
                    seen_keys.add(key)
                    all_books.append(b)
                    new_count += 1

            # A page with nothing new means the listing has started repeating.
            if new_count == 0:
                break

            # Only pause when another request will actually follow.
            if page < MAX_PAGES:
                time.sleep(REQUEST_DELAY)

    return all_books


 def print_books_js(books):
    """Print ``books`` to stdout as a JavaScript ``const BOOKS = [...]`` array.

    Each element of ``books`` must provide ``"title"`` and ``"cover"`` keys.
    Both values are rendered with ``json.dumps`` so they come out as quoted,
    escaped string literals, one object per line.
    """
    print("const BOOKS = [")
    for entry in books:
        title_js, cover_js = (json.dumps(entry[field]) for field in ("title", "cover"))
        print(f"  {{ title: {title_js}, cover: {cover_js} }},")
    print("];")


 def main():
    """Scrape the shelf and print the result as a JavaScript array.

    If scraping raises, the error is reported on stderr and the process
    exits with status 1; nothing is printed to stdout in that case.
    """
    try:
        scraped = scrape_all_books()
    except Exception as exc:
        print(f"Error while scraping: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        print_books_js(scraped)


 # Run the scraper only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	import time
	import json
	import sys
	import requests
	from bs4 import BeautifulSoup

	# Shelf listing endpoint. The <<add-your-own-here>> part is a placeholder,
	# presumably to be replaced with your own list path before running.
	GOODREADS_SHELF_URL = "https://www.goodreads.com/review/list/<<add-your-own-here>>"
	# Value sent as the "shelf" query parameter on each request.
	SHELF_NAME = "read"
	# Highest page number requested in a single run.
	MAX_PAGES = 20
	# Pause, in seconds, between consecutive page requests.
	REQUEST_DELAY = 1.0

	# HTTP headers attached to every request.
	HEADERS = {
	    "User-Agent": "Mozilla/5.0 (compatible; GoodreadsShelfScraper/1.0; +https://luckysingh.net)",
	}


	def is_book_cover(img):
	    """Tell whether an image element looks like a book cover.

	    ``img`` only needs a mapping-style ``get``. It qualifies when its
	    ``alt`` text is non-blank and its ``src`` contains both
	    "gr-assets.com" and "/books/".

	    Returns:
	        True when all three conditions hold, otherwise False.
	    """
	    src = img.get("src") or ""  # a missing or None src counts as empty
	    alt = img.get("alt", "").strip()
	    if not alt:
	        return False
	    if "gr-assets.com" not in src:
	        return False
	    if "/books/" not in src:
	        return False
	    return True


	def normalize_cover_url(src):
	    """Return the cover URL to record for a book.

	    This is currently a pass-through: the value is handed back unchanged.
	    """
	    return src


	def fetch_page(session, page):
	    """Download one page of the configured shelf and return the response text.

	    Args:
	        session: object whose ``get`` performs the HTTP request (the caller
	            in this file passes a ``requests.Session``).
	        page: page number, sent as the ``page`` query parameter.

	    Returns:
	        The ``text`` attribute of the response.

	    Raises:
	        Whatever ``session.get`` or ``raise_for_status`` raise; nothing is
	        caught here.
	    """
	    params = {
	        "shelf": SHELF_NAME,
	        "page": page,
	    }
	    resp = session.get(GOODREADS_SHELF_URL, params=params, headers=HEADERS, timeout=20)
	    # Surface HTTP-level failures instead of parsing an error page.
	    resp.raise_for_status()
	    return resp.text


	def extract_books_from_html(html):
	    """Pull the distinct books out of one page of shelf HTML.

	    Every ``<img>`` accepted by ``is_book_cover`` contributes a
	    ``{"title": ..., "cover": ...}`` dict, built from its stripped ``alt``
	    text and its stripped ``src`` passed through ``normalize_cover_url``.
	    Entries with the same (title, cover) pair are reported once, in
	    first-seen order.

	    Returns:
	        A list of ``{"title", "cover"}`` dicts; empty when nothing matched.
	    """
	    soup = BeautifulSoup(html, "html.parser")
	    books = []
	    for img in soup.find_all("img"):
	        if not is_book_cover(img):
	            continue
	        title = img.get("alt", "").strip()
	        src = normalize_cover_url(img.get("src", "").strip())
	        if title and src:
	            books.append({"title": title, "cover": src})

	    # Drop repeats while keeping the first occurrence of each pair.
	    seen = set()
	    unique = []
	    for b in books:
	        key = (b["title"], b["cover"])
	        if key not in seen:
	            seen.add(key)
	            unique.append(b)
	    return unique


	def scrape_all_books():
	    """Collect every distinct book from the shelf, page by page.

	    Pages 1 through ``MAX_PAGES`` are fetched in order. Paging stops early
	    when a page yields no books, or when it adds no (title, cover) pair that
	    has not already been collected.

	    Returns:
	        A list of ``{"title", "cover"}`` dicts in first-seen order.

	    Raises:
	        Anything raised by ``fetch_page`` or ``extract_books_from_html``;
	        the HTTP session is closed before the exception propagates.
	    """
	    all_books = []
	    seen_keys = set()

	    # The context manager closes the session (and its pooled connections)
	    # on every exit path, including when a page fetch raises.
	    with requests.Session() as session:
	        for page in range(1, MAX_PAGES + 1):
	            html = fetch_page(session, page)
	            books = extract_books_from_html(html)
	            if not books:
	                break

	            new_count = 0
	            for b in books:
	                key = (b["title"], b["cover"])
	                if key not in seen_keys:
	                    seen_keys.add(key)
	                    all_books.append(b)
	                    new_count += 1

	            # A page with nothing new means the listing has started repeating.
	            if new_count == 0:
	                break

	            # Only pause when another request will actually follow.
	            if page < MAX_PAGES:
	                time.sleep(REQUEST_DELAY)

	    return all_books


	def print_books_js(books):
	    """Print ``books`` to stdout as a JavaScript ``const BOOKS = [...]`` array.

	    Each element of ``books`` must provide ``"title"`` and ``"cover"`` keys.
	    Both values are rendered with ``json.dumps`` so they come out as quoted,
	    escaped string literals, one object per line.
	    """
	    print("const BOOKS = [")
	    for b in books:
	        title = json.dumps(b["title"])
	        cover = json.dumps(b["cover"])
	        print(f" {{ title: {title}, cover: {cover} }},")
	    print("];")


	def main():
	    """Scrape the shelf and print the result as a JavaScript array.

	    If scraping raises, the error is reported on stderr and the process
	    exits with status 1; nothing is printed to stdout in that case.
	    """
	    try:
	        books = scrape_all_books()
	    except Exception as e:
	        print(f"Error while scraping: {e}", file=sys.stderr)
	        sys.exit(1)

	    print_books_js(books)


	# Run the scraper only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found