Created
April 7, 2025 12:13
-
-
Save 0187773933/0c57108e8759afb694d12c5726f7c1ac to your computer and use it in GitHub Desktop.
FSL Course PDF Downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import requests | |
from requests.adapters import HTTPAdapter | |
from urllib3.util.retry import Retry | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
import time | |
import json | |
from tqdm import tqdm | |
import base64 | |
from pathlib import Path | |
from pdfmerge import pdfmerge | |
from pypdf import PdfReader, PdfWriter | |
def decode_base64_json( encoded_str ):
    """Decode a base64-encoded UTF-8 JSON string into a Python object.

    Parameters:
        encoded_str: base64 text, e.g. produced by the companion browser
            snippet that collects the lecture PDF URLs.

    Returns:
        The decoded JSON value (here, a list of URL strings), or None when
        the input is not valid base64, UTF-8, or JSON.
    """
    try:
        utf8_bytes = base64.b64decode( encoded_str )
        json_str = utf8_bytes.decode( 'utf-8' )
        return json.loads( json_str )
    except ( ValueError, TypeError ) as e:
        # binascii.Error, UnicodeDecodeError and json.JSONDecodeError are all
        # ValueError subclasses; TypeError covers non-str/bytes input.
        # Narrowed from a bare `except Exception` so unrelated bugs surface.
        print( "Decode error:", e )
        return None
def create_session():
    """Build a requests.Session that retries transient HTTP failures.

    Retries up to 5 times with exponential backoff on connection errors and
    on 429/5xx responses, for GET requests only. `raise_on_status=False`
    leaves status handling to the caller (via raise_for_status).
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False,
    )
    http_adapter = HTTPAdapter( max_retries=retry_policy )
    session = requests.Session()
    # Same adapter serves both schemes.
    for scheme in ( "http://", "https://" ):
        session.mount( scheme, http_adapter )
    return session
def download_file(url, save_path, session, try_stream=False):
    """Download `url` to `save_path` using the shared retrying session.

    Parameters:
        url: file URL to fetch.
        save_path: pathlib.Path destination.
        session: requests.Session from create_session().
        try_stream: when True, stream with a tqdm progress bar and fall back
            to a plain full-body GET if streaming fails.

    FIX: a failed or incomplete download previously left a partial file on
    disk, which download_all()'s `save_path.exists()` check would then treat
    as complete on the next run. Partial files are now removed.
    """
    try:
        if not try_stream:
            # Quick, simple full-body request
            r = session.get(url, timeout=30)
            r.raise_for_status()
            save_path.write_bytes(r.content)
            print(f"✅ Downloaded (no stream): {save_path.name}")
            return
        # Streamed download (for big files / progress)
        with session.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            block_size = 1024
            t = tqdm(total=total_size, unit='iB', unit_scale=True, desc=save_path.name)
            with open(save_path, 'wb') as f:
                for data in r.iter_content(block_size):
                    t.update(len(data))
                    f.write(data)
            t.close()
            if total_size != 0 and t.n != total_size:
                print(f"❌ Incomplete download: {url}")
                save_path.unlink(missing_ok=True)  # don't leave a partial file behind
    except Exception as e:
        save_path.unlink(missing_ok=True)  # don't leave a partial file behind
        if try_stream:
            print(f"⚠️ Stream failed, retrying without stream: {url} ({e})")
            download_file(url, save_path, session, try_stream=False)
        else:
            print(f"❌ Final failure: {url} ({e})")
def download_all( urls , output_dir="downloads" , max_workers=5 ):
    """Download every URL into `output_dir` concurrently.

    Parameters:
        urls: iterable of file URLs; the final path segment becomes the
            local filename.
        output_dir: destination directory, created if missing.
        max_workers: thread-pool size.

    Returns:
        dict mapping url -> True on success / already-present, False on
        failure raised out of the worker.
    """
    session = create_session()
    out_dir = Path( output_dir )
    out_dir.mkdir( parents=True, exist_ok=True )
    results = {}
    with ThreadPoolExecutor( max_workers=max_workers ) as executor:
        future_to_url = {}
        for url in urls:
            save_path = out_dir / url.split( "/" )[-1]
            if save_path.exists():
                print( f"✅ Already exists: {save_path.name}" )
                results[ url ] = True
                continue
            future = executor.submit( download_file, url, save_path, session )
            future_to_url[ future ] = url
        # FIX: total was len(urls), which overcounts when files were skipped
        # as already present, leaving the bar stuck below 100%. Only the
        # actually-submitted futures are awaited here.
        for future in tqdm( as_completed( future_to_url ) , total=len( future_to_url ) , desc="Downloading all" ):
            url = future_to_url[ future ]
            try:
                future.result()
                results[ url ] = True
            except Exception as e:
                print( f"⚠️ {url} failed: {e}" )
                results[ url ] = False
    return results
def merge_pdfs( pdf_paths , output_path ):
    """Concatenate PDFs into one file with a top-level bookmark per input.

    Parameters:
        pdf_paths: ordered iterable of PDF file paths.
        output_path: where the merged PDF is written.

    Each source contributes one outline entry, titled with its filename stem
    and pointing at the first page of that source in the merged document.
    """
    writer = PdfWriter()
    first_page = 0
    for source in map( Path, pdf_paths ):
        reader = PdfReader( str( source ) )
        page_count = len( reader.pages )
        # Append all pages, then bookmark where this document begins.
        writer.append( reader )
        writer.add_outline_item(
            title=source.stem ,
            page_number=first_page
        )
        print( f"✅ Added {source.name} ({page_count} pages) at page offset {first_page}" )
        first_page += page_count
    with open( output_path, "wb" ) as out_f:
        writer.write( out_f )
    print( f"\n🎉 Merged PDF with bookmarks saved to: {output_path}" )
if __name__ == "__main__":
    # Usage: script.py <base64-encoded JSON list of PDF URLs>
    # (produced by the companion browser console snippet)
    # FIX: guard against a missing argument (IndexError) and against a
    # failed decode, which previously passed None into download_all().
    if len( sys.argv ) < 2:
        sys.exit( "Usage: python downloader.py <base64-json-url-list>" )
    urls = decode_base64_json( sys.argv[ 1 ] )
    if not urls:
        sys.exit( "Could not decode URL list from argument 1" )
    download_all( urls )
    save_paths = [ str( Path("downloads") / url.split("/")[-1] ) for url in urls ]
    # pdfmerge( save_paths , "merged_output.pdf" )
    merge_pdfs( save_paths , "merged_output.pdf" )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(() => {
    // target = https://open.win.ox.ac.uk/pages/fslcourse/website/online_materials.html
    // example = https://open.win.ox.ac.uk/pages/fslcourse/lectures/Reg_P1E1.pdf
    // <a class="card-link" href="../lectures/Struc_P1E4.pdf" target="_blank" style="font-family: Arial;">PDF slides</a>

    // Serialize a value to JSON (circular references become "[Circular]"),
    // then base64-encode its UTF-8 bytes for pasting into the Python script.
    function base64_encode( js_obj ) {
        const seen = new WeakSet();
        const replacer = ( key , value ) => {
            if ( typeof value !== "object" || value === null ) return value;
            if ( seen.has( value ) ) return "[Circular]";
            seen.add( value );
            return value;
        };
        const json_text = JSON.stringify( js_obj , replacer );
        const utf8_bytes = new TextEncoder().encode( json_text );
        let binary_str = "";
        for ( const byte of utf8_bytes ) binary_str += String.fromCharCode( byte );
        return btoa( binary_str );
    }

    // Collect every lecture-PDF link on the page and print the encoded list.
    const anchors = document.querySelectorAll( 'a.card-link[href^="../lectures/"][href$=".pdf"]' );
    const pdf_urls = Array.from( anchors , ( a ) => a.href );
    console.log( base64_encode( pdf_urls ) );
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment