@0187773933
Created April 7, 2025 12:13
FSL Course PDF Downloader
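
This gist downloads the FSL course lecture PDFs in parallel (with retry/backoff and skip-if-already-downloaded), then merges them into a single PDF with one top-level bookmark per lecture. The Python script takes a single argument: a base64-encoded JSON array of PDF URLs, which the browser snippet at the bottom of this page produces. Invocation is roughly: python3 <script>.py <BASE64_JSON_URLS> (the script name depends on how you save the gist).
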
#!/usr/bin/env python3
import sys
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import json
from tqdm import tqdm
import base64
from pathlib import Path
from pdfmerge import pdfmerge  # only needed for the commented-out pdfmerge() call below
from pypdf import PdfReader, PdfWriter

def decode_base64_json( encoded_str ):
    try:
        utf8_bytes = base64.b64decode(encoded_str)
        json_str = utf8_bytes.decode('utf-8')
        return json.loads(json_str)
    except Exception as e:
        print("Decode error:", e)
        return None

def create_session():
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

def download_file(url, save_path, session, try_stream=False):
    try:
        if not try_stream:
            # Quick, simple full-body request
            r = session.get(url, timeout=30)
            r.raise_for_status()
            save_path.write_bytes(r.content)
            print(f"✅ Downloaded (no stream): {save_path.name}")
            return
        else:
            # Streamed download (for big files / progress)
            with session.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                total_size = int(r.headers.get('content-length', 0))
                block_size = 1024
                t = tqdm(total=total_size, unit='iB', unit_scale=True, desc=save_path.name)
                with open(save_path, 'wb') as f:
                    for data in r.iter_content(block_size):
                        t.update(len(data))
                        f.write(data)
                t.close()
                if total_size != 0 and t.n != total_size:
                    print(f"❌ Incomplete download: {url}")
    except Exception as e:
        if try_stream:
            print(f"⚠️ Stream failed, retrying without stream: {url} ({e})")
            download_file(url, save_path, session, try_stream=False)
        else:
            print(f"❌ Final failure: {url} ({e})")

def download_all( urls , output_dir="downloads" , max_workers=5 ):
    session = create_session()
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {}
        for url in urls:
            filename = url.split("/")[-1]
            save_path = Path(output_dir) / filename
            if save_path.exists():
                print(f"✅ Already exists: {save_path.name}")
                results[url] = True
                continue
            future = executor.submit(download_file, url, save_path, session)
            future_to_url[ future ] = url
        # Progress over the downloads that were actually queued (already-existing files are skipped)
        for future in tqdm( as_completed( future_to_url ) , total=len( future_to_url ) , desc="Downloading all" ):
            url = future_to_url[ future ]
            try:
                future.result()
                results[url] = True
            except Exception as e:
                print( f"⚠️ {url} failed: {e}" )
                results[ url ] = False
    return results

def merge_pdfs( pdf_paths , output_path ):
    writer = PdfWriter()
    page_offset = 0
    for pdf_path in pdf_paths:
        path = Path( pdf_path )
        reader = PdfReader( str( path ) )
        num_pages = len( reader.pages )
        # Append all pages
        writer.append( reader )
        # Add top-level bookmark pointing to first page of this PDF
        writer.add_outline_item(
            title=path.stem ,
            page_number=page_offset
        )
        print( f"✅ Added {path.name} ({num_pages} pages) at page offset {page_offset}" )
        page_offset += num_pages
    with open( output_path, "wb" ) as out_f:
        writer.write( out_f )
    print( f"\n🎉 Merged PDF with bookmarks saved to: {output_path}" )

if __name__ == "__main__":
    urls = decode_base64_json( sys.argv[ 1 ] )
    download_all( urls )
    save_paths = [ str( Path("downloads") / url.split("/")[-1] ) for url in urls ]
    # pdfmerge( save_paths , "merged_output.pdf" )
    merge_pdfs( save_paths , "merged_output.pdf" )
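
The JavaScript snippet below is meant to be pasted into the browser console on the FSL course online-materials page (the target URL is in its comments). It collects the lecture PDF links (a.card-link elements pointing at ../lectures/*.pdf) and prints them as base64-encoded JSON, which is exactly the argument format decode_base64_json() above expects.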

( ()=> {
    // target = https://open.win.ox.ac.uk/pages/fslcourse/website/online_materials.html
    // example = https://open.win.ox.ac.uk/pages/fslcourse/lectures/Reg_P1E1.pdf
    // <a class="card-link" href="../lectures/Struc_P1E4.pdf" target="_blank" style="font-family: Arial;">PDF slides</a>
    function base64_encode( js_obj ) {
        const seen = new WeakSet();
        const safe_string = JSON.stringify( js_obj , ( key , value ) => {
            if ( typeof value === "object" && value !== null ) {
                if ( seen.has( value ) ) return "[Circular]";
                seen.add( value );
            }
            return value;
        });
        const utf8_bytes = new TextEncoder().encode( safe_string );
        const binary_str = Array.from( utf8_bytes , byte => String.fromCharCode( byte ) ).join( '' );
        return btoa( binary_str );
    }
    let pdf_elements = document.querySelectorAll( 'a.card-link[href^="../lectures/"][href$=".pdf"]' );
    let pdf_urls = [ ...pdf_elements ].map( x => x.href );
    let json_b64 = base64_encode( pdf_urls );
    console.log( json_b64 );
})();
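
For testing the Python side without a browser, the same payload can be produced directly in Python. This is a minimal sketch, not part of the original gist: the example URL is copied from the comment in the snippet above, and the file name fsl_downloader.py is just a placeholder for however you save the gist.

#!/usr/bin/env python3
import base64
import json

# One lecture URL, taken from the "example =" comment in the JavaScript snippet above.
urls = [ "https://open.win.ox.ac.uk/pages/fslcourse/lectures/Reg_P1E1.pdf" ]

# Same encoding the browser snippet performs: JSON string -> UTF-8 bytes -> base64.
payload = base64.b64encode( json.dumps( urls ).encode( "utf-8" ) ).decode( "ascii" )
print( payload )

# The printed string is what the downloader expects as its single argument, e.g.:
#   python3 fsl_downloader.py "<payload>"   (file name is hypothetical)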