Skip to content

Instantly share code, notes, and snippets.

@ehzawad
Created January 12, 2025 08:01
Show Gist options
  • Save ehzawad/012131cb391c0d01122f2e8758499dd4 to your computer and use it in GitHub Desktop.
Save ehzawad/012131cb391c0d01122f2e8758499dd4 to your computer and use it in GitHub Desktop.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
import time
import os
import base64
from PyPDF2 import PdfMerger
def capture_current_view_as_pdf(driver, output_path, page_num, total_pages,
                                settle_delay=0.5):
    """Print the page loaded in ``driver`` to ``output_path`` as an A4 PDF.

    Before printing, the function injects print-only CSS (page-break rules and
    a rule hiding ``.header``), converts every ``position: fixed`` element to
    ``position: relative`` and hides ``.header`` / ``.footer`` elements. These
    DOM changes are NOT reverted afterwards.

    NOTE(review): the function name says "current view", but the PDF is
    produced by the DevTools ``Page.printToPDF`` command, which lays the
    document out for print. Whether only the scrolled-to region ends up in
    the PDF depends on the page's own CSS -- confirm against the target page.

    Args:
        driver: Object exposing ``execute_script(script, *args)`` and
            ``execute_cdp_cmd(cmd, params)`` (the caller in this file passes
            a Selenium Chrome driver).
        output_path: File path the decoded PDF bytes are written to.
        page_num: Number of the page being captured; used only in the
            progress message.
        total_pages: Estimated page count; used only in the progress message.
        settle_delay: Seconds to wait after the DOM tweaks so layout can
            settle before printing. Defaults to 0.5.

    Returns:
        True if the CDP response contained a ``data`` payload and it was
        written to ``output_path``; False otherwise (nothing is written).
    """
    print(f"\nCapturing page {page_num}/{total_pages}")

    # Inject CSS for better page breaks. The <style> element carries an id so
    # repeated calls on the same document do not pile up duplicate copies.
    driver.execute_script("""
        if (!document.getElementById('pdf-capture-print-style')) {
            // Add page break styles
            var style = document.createElement('style');
            style.id = 'pdf-capture-print-style';
            style.textContent = `
                @media print {
                    .page-break {
                        display: block;
                        page-break-before: always;
                    }
                    h1, h2 { page-break-before: always; }
                    h1 + h2 { page-break-before: avoid; }
                    table, figure { page-break-inside: avoid; }
                    .header { display: none !important; }
                }
            `;
            document.head.appendChild(style);
        }
    """)

    # Ensure current view is properly laid out
    driver.execute_script("""
        // Clean up any overlap: fixed elements would otherwise repeat/overlay
        document.querySelectorAll('*').forEach(el => {
            if (window.getComputedStyle(el).position === 'fixed') {
                el.style.position = 'relative';
            }
        });
        // Remove duplicate headers/footers
        document.querySelectorAll('.header, .footer').forEach(el => {
            el.style.display = 'none';
        });
    """)

    if settle_delay > 0:
        time.sleep(settle_delay)  # Let layout settle

    # Generate the PDF. Paper size and margins are given in inches.
    pdf_data = driver.execute_cdp_cmd("Page.printToPDF", {
        "displayHeaderFooter": False,
        "printBackground": True,
        "paperWidth": 8.27,    # A4 width
        "paperHeight": 11.69,  # A4 height
        "marginTop": 0.4,      # Small margins to prevent content touching edges
        "marginBottom": 0.4,
        "marginLeft": 0.4,
        "marginRight": 0.4,
        "scale": 0.9,          # Slightly reduced scale to ensure content fits
        "preferCSSPageSize": True,
    })

    # Guard clause: an empty or malformed response means nothing to write.
    if not pdf_data or 'data' not in pdf_data:
        return False

    # The payload arrives base64-encoded; write the raw PDF bytes.
    with open(output_path, 'wb') as f:
        f.write(base64.b64decode(pdf_data['data']))
    return True
def save_full_page_as_pdf(url, output_pdf_path):
    """Capture the ``.content`` element of ``url`` slice by slice into one PDF.

    Starts headless Chrome, loads ``url``, scrolls the first element with
    class ``content`` in steps of 95% of its client height, calls
    ``capture_current_view_as_pdf`` after each step, and merges the per-step
    PDFs into ``output_pdf_path``.

    Intermediate PDFs live in a private temporary directory, so they cannot
    clobber files in the working directory and are removed even when the
    capture fails part-way.

    Args:
        url: Address of the page to load.
        output_pdf_path: Destination path of the merged PDF.

    Returns:
        True when the merged PDF exists at ``output_pdf_path``; False on any
        failure (the error message is printed, not raised).
    """
    import tempfile  # local import keeps this block self-contained

    print("Initializing Chrome setup...")
    # Bound before `try` so that `finally` is safe even if Chrome never starts.
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")  # Standard viewport

        with tqdm(total=100, desc="Setting up Chrome") as pbar:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            pbar.update(100)

        print("\nLoading webpage...")
        driver.get(url)
        time.sleep(3)  # Initial load

        # Get the main content element
        content_element = driver.find_element(By.CLASS_NAME, "content")

        # Calculate dimensions
        total_height = driver.execute_script(
            "return arguments[0].scrollHeight", content_element)
        viewport_height = driver.execute_script(
            "return arguments[0].clientHeight", content_element)

        # Slightly less than the viewport so consecutive slices break cleanly.
        page_height = int(viewport_height * 0.95)  # 95% of viewport height
        if page_height <= 0:
            # Would otherwise surface as an opaque ZeroDivisionError below.
            raise ValueError(
                f"Content element has no usable height "
                f"(clientHeight={viewport_height})")
        total_pages = (total_height + page_height - 1) // page_height  # ceil

        print(f"\nTotal content height: {total_height}px")
        print(f"Page height: {page_height}px")
        print(f"Estimated total pages: {total_pages}")

        temp_pdfs = []
        with tempfile.TemporaryDirectory(prefix="pdf_capture_") as temp_dir:
            current_scroll = 0
            page_number = 1

            with tqdm(total=total_height, desc="Capturing pages") as pbar:
                while current_scroll < total_height:
                    # Ensure clean scroll position
                    driver.execute_script(
                        "arguments[0].scrollTop = arguments[1];",
                        content_element,
                        current_scroll,
                    )
                    time.sleep(0.5)  # Let content settle

                    # Look for natural break points (headers, section breaks)
                    driver.execute_script("""
                        const viewport = arguments[0];
                        const headers = viewport.querySelectorAll('h1, h2, h3, .section-break');
                        headers.forEach(header => {
                            const rect = header.getBoundingClientRect();
                            if (rect.top > 0 && rect.top < window.innerHeight) {
                                header.style.pageBreakBefore = 'always';
                            }
                        });
                    """, content_element)

                    # Capture current view
                    temp_pdf = os.path.join(
                        temp_dir, f"temp_page_{page_number}.pdf")
                    if capture_current_view_as_pdf(
                            driver, temp_pdf, page_number, total_pages):
                        temp_pdfs.append(temp_pdf)
                        page_number += 1

                    # Advance the bar by the slice just handled; the last
                    # slice may be shorter than page_height.
                    pbar.update(min(page_height, total_height - current_scroll))
                    current_scroll += page_height

            if not temp_pdfs:
                # Do not report success for an output with no captured pages.
                raise RuntimeError("No pages were captured")

            print("\nMerging PDF pages...")
            merger = PdfMerger()
            try:
                for pdf in temp_pdfs:
                    if os.path.exists(pdf):
                        merger.append(pdf)
                merger.write(output_pdf_path)
            finally:
                # Close before the temporary directory is removed.
                merger.close()

        if not os.path.exists(output_pdf_path):
            raise Exception("Failed to create final PDF")

        print(f"\n✅ PDF saved successfully to: {output_pdf_path}")
        print(f"Final PDF size: {os.path.getsize(output_pdf_path)} bytes")
        print(f"Total pages captured: {len(temp_pdfs)}")
        return True

    except Exception as e:
        # Top-level boundary: report the failure and signal it via the return.
        print(f"\n❌ Error: {str(e)}")
        return False

    finally:
        if driver is not None:
            driver.quit()
        print("\nCleaned up resources.")
if __name__ == "__main__":
    # Script entry point: capture the hard-coded page into the current
    # working directory and exit non-zero on failure.
    url = "https://cdn.openai.com/spec/model-spec-2024-05-08.html"
    output_pdf_path = os.path.join(os.getcwd(), "model_spec_clean.pdf")
    print(f"Starting PDF capture from {url}")

    # No runtime `pip install` here: the module-level imports of these
    # packages have already succeeded by the time this line runs, so
    # installing now could not help and would only touch whichever `pip`
    # happens to be on PATH. Install the requirements beforehand with:
    #     pip install PyPDF2 selenium webdriver-manager tqdm
    success = save_full_page_as_pdf(url, output_pdf_path)
    if not success:
        print("\nTroubleshooting tips:")
        print("1. Try adjusting page height calculation")
        print("2. Check content loading and visibility")
        print("3. Verify all required packages are installed")
        # `exit()` is a helper added by the `site` module and is not
        # guaranteed to exist; SystemExit is the portable equivalent.
        raise SystemExit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment