ehzawad · January 12, 2025 08:01
diff --git a/web_to_pdf.py b/web_to_pdf.py
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from webdriver_manager.chrome import ChromeDriverManager
 from tqdm import tqdm
 import time
 import os
 import base64
 from PyPDF2 import PdfMerger

 def capture_current_view_as_pdf(driver, output_path, page_num, total_pages, settle_delay=0.5):
    """Print the page currently loaded in ``driver`` to a PDF file.

    Injects print-oriented CSS, neutralises fixed-position elements, hides
    ``.header`` / ``.footer`` elements, then asks Chrome (via the DevTools
    ``Page.printToPDF`` command) for an A4-sized PDF and writes the decoded
    bytes to ``output_path``.

    NOTE(review): no clip/viewport argument is passed to ``Page.printToPDF``,
    so it presumably renders the whole document rather than only the scrolled
    view -- confirm that successive captures actually differ.

    Args:
        driver: Selenium Chrome driver exposing ``execute_script`` and
            ``execute_cdp_cmd``.
        output_path: Destination path of the PDF file.
        page_num: Number of the page being captured (progress message only).
        total_pages: Estimated page count (progress message only).
        settle_delay: Seconds to wait after the DOM tweaks so layout can
            settle before printing. Values that are falsy or not positive
            skip the wait.

    Returns:
        True when a non-empty PDF payload was written to ``output_path``,
        False when Chrome returned no usable ``data`` (no file is written).
    """
    print(f"\nCapturing page {page_num}/{total_pages}")

    # Inject CSS for better page breaks
    driver.execute_script("""
        // Add the page-break styles only once: this function runs for every
        // captured page, and re-appending would pile up duplicate <style> tags.
        if (!document.getElementById('pdf-capture-print-style')) {
            var style = document.createElement('style');
            style.id = 'pdf-capture-print-style';
            style.textContent = `
                @media print {
                    .page-break { 
                        display: block;
                        page-break-before: always;
                    }
                    h1, h2 { page-break-before: always; }
                    h1 + h2 { page-break-before: avoid; }
                    table, figure { page-break-inside: avoid; }
                    .header { display: none !important; }
                }
            `;
            document.head.appendChild(style);
        }
    """)

    # Ensure current view is properly laid out
    driver.execute_script("""
        // Clean up any overlap
        document.querySelectorAll('*').forEach(el => {
            if (window.getComputedStyle(el).position === 'fixed') {
                el.style.position = 'relative';
            }
        });

        // Remove duplicate headers/footers
        document.querySelectorAll('.header, .footer').forEach(el => {
            el.style.display = 'none';
        });
    """)

    if settle_delay and settle_delay > 0:
        time.sleep(settle_delay)  # Let layout settle

    # Generate the PDF (paper size and margins are in inches)
    pdf_data = driver.execute_cdp_cmd("Page.printToPDF", {
        "displayHeaderFooter": False,
        "printBackground": True,
        "paperWidth": 8.27,  # A4 width
        "paperHeight": 11.69,  # A4 height
        "marginTop": 0.4,  # Small margins to prevent content touching edges
        "marginBottom": 0.4,
        "marginLeft": 0.4,
        "marginRight": 0.4,
        "scale": 0.9,  # Slightly reduced scale to ensure content fits
        "preferCSSPageSize": True
    })

    # An absent or empty payload is a failed capture; writing it out would
    # leave a zero-byte "PDF" for the caller to trip over when merging.
    encoded = pdf_data.get('data') if pdf_data else None
    if not encoded:
        return False

    with open(output_path, 'wb') as f:
        f.write(base64.b64decode(encoded))
    return True

 def save_full_page_as_pdf(url, output_pdf_path):
    """Load ``url`` in headless Chrome and save it as one merged PDF.

    The element with class ``content`` is scrolled in steps of 95% of its
    client height; after each step ``capture_current_view_as_pdf`` writes a
    temporary PDF, and all temporary PDFs are merged into ``output_pdf_path``.

    Args:
        url: Address of the page to capture.
        output_pdf_path: Path of the merged PDF to create.

    Returns:
        True when the merged PDF exists afterwards. False when any step
        inside the guarded section fails (the error is printed, not raised).
    """
    # Local import: only this function needs it, and it keeps the temp
    # files out of the current working directory.
    import tempfile

    print("Initializing Chrome setup...")

    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",  # Standard viewport
    ):
        chrome_options.add_argument(flag)

    # Bound before the try so the finally clause can tell whether Chrome was
    # ever started; otherwise a setup failure would surface as a NameError.
    driver = None
    try:
        with tqdm(total=100, desc="Setting up Chrome") as pbar:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            pbar.update(100)

        print("\nLoading webpage...")
        driver.get(url)
        time.sleep(3)  # Initial load

        # Get the main content element
        content_element = driver.find_element(By.CLASS_NAME, "content")

        # Calculate dimensions
        total_height = int(driver.execute_script("return arguments[0].scrollHeight", content_element))
        viewport_height = int(driver.execute_script("return arguments[0].clientHeight", content_element))

        # Calculate optimal page height (slightly less than viewport to ensure clean breaks)
        page_height = int(viewport_height * 0.95)  # 95% of viewport height
        if page_height <= 0:
            # A collapsed/hidden element would otherwise cause a division by
            # zero below with a far less helpful message.
            raise RuntimeError(
                f"Content element has no usable height (clientHeight={viewport_height})"
            )
        total_pages = (total_height + page_height - 1) // page_height

        print(f"\nTotal content height: {total_height}px")
        print(f"Page height: {page_height}px")
        print(f"Estimated total pages: {total_pages}")

        temp_pdfs = []

        # The temporary directory is removed on exit, so per-page files are
        # cleaned up even when capturing or merging fails part-way.
        with tempfile.TemporaryDirectory(prefix="web_to_pdf_") as temp_dir:
            with tqdm(total=total_height, desc="Capturing pages") as pbar:
                for current_scroll in range(0, total_height, page_height):
                    # Pass the offset as a script argument instead of
                    # formatting it into the JavaScript source.
                    driver.execute_script(
                        "arguments[0].scrollTop = arguments[1];",
                        content_element,
                        current_scroll,
                    )
                    time.sleep(0.5)  # Let content settle

                    # Look for natural break points (headers, section breaks)
                    driver.execute_script("""
                        const viewport = arguments[0];
                        const headers = viewport.querySelectorAll('h1, h2, h3, .section-break');
                        headers.forEach(header => {
                            const rect = header.getBoundingClientRect();
                            if (rect.top > 0 && rect.top < window.innerHeight) {
                                header.style.pageBreakBefore = 'always';
                            }
                        });
                    """, content_element)

                    # Pages are numbered by successful captures only.
                    page_number = len(temp_pdfs) + 1
                    temp_pdf = os.path.join(temp_dir, f"temp_page_{page_number}.pdf")
                    if capture_current_view_as_pdf(driver, temp_pdf, page_number, total_pages):
                        temp_pdfs.append(temp_pdf)

                    # The last step may cover less than a full page.
                    pbar.update(min(page_height, total_height - current_scroll))

            if not temp_pdfs:
                raise RuntimeError("No pages were captured")

            print("\nMerging PDF pages...")
            merger = PdfMerger()
            try:
                for pdf in temp_pdfs:
                    merger.append(pdf)
                merger.write(output_pdf_path)
            finally:
                # Close the merger before the temporary directory is removed.
                merger.close()

        if not os.path.exists(output_pdf_path):
            raise RuntimeError("Failed to create final PDF")

        print(f"\n✅ PDF saved successfully to: {output_pdf_path}")
        print(f"Final PDF size: {os.path.getsize(output_pdf_path)} bytes")
        print(f"Total pages captured: {len(temp_pdfs)}")
        return True

    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        return False

    finally:
        if driver is not None:
            driver.quit()
        print("\nCleaned up resources.")

 if __name__ == "__main__":
    # Script entry point: capture a fixed URL into a PDF in the current
    # working directory and exit with status 1 on failure.
    url = "https://cdn.openai.com/spec/model-spec-2024-05-08.html"
    output_pdf_path = os.path.join(os.getcwd(), "model_spec_clean.pdf")

    print(f"Starting PDF capture from {url}")

    # Dependencies (PyPDF2, selenium, webdriver-manager, tqdm) must be
    # installed before running. The previous in-script `pip install` ran
    # only after the imports at the top of the file had already succeeded,
    # so it could never fix a missing package.

    success = save_full_page_as_pdf(url, output_pdf_path)

    if not success:
        print("\nTroubleshooting tips:")
        print("1. Try adjusting page height calculation")
        print("2. Check content loading and visibility")
        print("3. Verify all required packages are installed")
        # `exit()` is a helper injected by the `site` module for interactive
        # use; raising SystemExit works in every run mode.
        raise SystemExit(1)
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from webdriver_manager.chrome import ChromeDriverManager
	from tqdm import tqdm
	import time
	import os
	import base64
	from PyPDF2 import PdfMerger

	def capture_current_view_as_pdf(driver, output_path, page_num, total_pages, settle_delay=0.5):
	    """Print the page currently loaded in ``driver`` to a PDF file.

	    Injects print-oriented CSS, neutralises fixed-position elements, hides
	    ``.header`` / ``.footer`` elements, then asks Chrome (via the DevTools
	    ``Page.printToPDF`` command) for an A4-sized PDF and writes the decoded
	    bytes to ``output_path``.

	    NOTE(review): no clip/viewport argument is passed to ``Page.printToPDF``,
	    so it presumably renders the whole document rather than only the scrolled
	    view -- confirm that successive captures actually differ.

	    Args:
	        driver: Selenium Chrome driver exposing ``execute_script`` and
	            ``execute_cdp_cmd``.
	        output_path: Destination path of the PDF file.
	        page_num: Number of the page being captured (progress message only).
	        total_pages: Estimated page count (progress message only).
	        settle_delay: Seconds to wait after the DOM tweaks so layout can
	            settle before printing. Values that are falsy or not positive
	            skip the wait.

	    Returns:
	        True when a non-empty PDF payload was written to ``output_path``,
	        False when Chrome returned no usable ``data`` (no file is written).
	    """
	    print(f"\nCapturing page {page_num}/{total_pages}")

	    # Inject CSS for better page breaks
	    driver.execute_script("""
	        // Add the page-break styles only once: this function runs for every
	        // captured page, and re-appending would pile up duplicate <style> tags.
	        if (!document.getElementById('pdf-capture-print-style')) {
	            var style = document.createElement('style');
	            style.id = 'pdf-capture-print-style';
	            style.textContent = `
	                @media print {
	                    .page-break {
	                        display: block;
	                        page-break-before: always;
	                    }
	                    h1, h2 { page-break-before: always; }
	                    h1 + h2 { page-break-before: avoid; }
	                    table, figure { page-break-inside: avoid; }
	                    .header { display: none !important; }
	                }
	            `;
	            document.head.appendChild(style);
	        }
	    """)

	    # Ensure current view is properly laid out
	    driver.execute_script("""
	        // Clean up any overlap
	        document.querySelectorAll('*').forEach(el => {
	            if (window.getComputedStyle(el).position === 'fixed') {
	                el.style.position = 'relative';
	            }
	        });

	        // Remove duplicate headers/footers
	        document.querySelectorAll('.header, .footer').forEach(el => {
	            el.style.display = 'none';
	        });
	    """)

	    if settle_delay and settle_delay > 0:
	        time.sleep(settle_delay)  # Let layout settle

	    # Generate the PDF (paper size and margins are in inches)
	    pdf_data = driver.execute_cdp_cmd("Page.printToPDF", {
	        "displayHeaderFooter": False,
	        "printBackground": True,
	        "paperWidth": 8.27,  # A4 width
	        "paperHeight": 11.69,  # A4 height
	        "marginTop": 0.4,  # Small margins to prevent content touching edges
	        "marginBottom": 0.4,
	        "marginLeft": 0.4,
	        "marginRight": 0.4,
	        "scale": 0.9,  # Slightly reduced scale to ensure content fits
	        "preferCSSPageSize": True
	    })

	    # An absent or empty payload is a failed capture; writing it out would
	    # leave a zero-byte "PDF" for the caller to trip over when merging.
	    encoded = pdf_data.get('data') if pdf_data else None
	    if not encoded:
	        return False

	    with open(output_path, 'wb') as f:
	        f.write(base64.b64decode(encoded))
	    return True

	def save_full_page_as_pdf(url, output_pdf_path):
	    """Load ``url`` in headless Chrome and save it as one merged PDF.

	    The element with class ``content`` is scrolled in steps of 95% of its
	    client height; after each step ``capture_current_view_as_pdf`` writes a
	    temporary PDF, and all temporary PDFs are merged into ``output_pdf_path``.

	    Args:
	        url: Address of the page to capture.
	        output_pdf_path: Path of the merged PDF to create.

	    Returns:
	        True when the merged PDF exists afterwards. False when any step
	        inside the guarded section fails (the error is printed, not raised).
	    """
	    # Local import: only this function needs it, and it keeps the temp
	    # files out of the current working directory.
	    import tempfile

	    print("Initializing Chrome setup...")

	    chrome_options = Options()
	    for flag in (
	        "--headless=new",
	        "--no-sandbox",
	        "--disable-dev-shm-usage",
	        "--disable-gpu",
	        "--window-size=1920,1080",  # Standard viewport
	    ):
	        chrome_options.add_argument(flag)

	    # Bound before the try so the finally clause can tell whether Chrome was
	    # ever started; otherwise a setup failure would surface as a NameError.
	    driver = None
	    try:
	        with tqdm(total=100, desc="Setting up Chrome") as pbar:
	            service = Service(ChromeDriverManager().install())
	            driver = webdriver.Chrome(service=service, options=chrome_options)
	            pbar.update(100)

	        print("\nLoading webpage...")
	        driver.get(url)
	        time.sleep(3)  # Initial load

	        # Get the main content element
	        content_element = driver.find_element(By.CLASS_NAME, "content")

	        # Calculate dimensions
	        total_height = int(driver.execute_script("return arguments[0].scrollHeight", content_element))
	        viewport_height = int(driver.execute_script("return arguments[0].clientHeight", content_element))

	        # Calculate optimal page height (slightly less than viewport to ensure clean breaks)
	        page_height = int(viewport_height * 0.95)  # 95% of viewport height
	        if page_height <= 0:
	            # A collapsed/hidden element would otherwise cause a division by
	            # zero below with a far less helpful message.
	            raise RuntimeError(
	                f"Content element has no usable height (clientHeight={viewport_height})"
	            )
	        total_pages = (total_height + page_height - 1) // page_height

	        print(f"\nTotal content height: {total_height}px")
	        print(f"Page height: {page_height}px")
	        print(f"Estimated total pages: {total_pages}")

	        temp_pdfs = []

	        # The temporary directory is removed on exit, so per-page files are
	        # cleaned up even when capturing or merging fails part-way.
	        with tempfile.TemporaryDirectory(prefix="web_to_pdf_") as temp_dir:
	            with tqdm(total=total_height, desc="Capturing pages") as pbar:
	                for current_scroll in range(0, total_height, page_height):
	                    # Pass the offset as a script argument instead of
	                    # formatting it into the JavaScript source.
	                    driver.execute_script(
	                        "arguments[0].scrollTop = arguments[1];",
	                        content_element,
	                        current_scroll,
	                    )
	                    time.sleep(0.5)  # Let content settle

	                    # Look for natural break points (headers, section breaks)
	                    driver.execute_script("""
	                        const viewport = arguments[0];
	                        const headers = viewport.querySelectorAll('h1, h2, h3, .section-break');
	                        headers.forEach(header => {
	                            const rect = header.getBoundingClientRect();
	                            if (rect.top > 0 && rect.top < window.innerHeight) {
	                                header.style.pageBreakBefore = 'always';
	                            }
	                        });
	                    """, content_element)

	                    # Pages are numbered by successful captures only.
	                    page_number = len(temp_pdfs) + 1
	                    temp_pdf = os.path.join(temp_dir, f"temp_page_{page_number}.pdf")
	                    if capture_current_view_as_pdf(driver, temp_pdf, page_number, total_pages):
	                        temp_pdfs.append(temp_pdf)

	                    # The last step may cover less than a full page.
	                    pbar.update(min(page_height, total_height - current_scroll))

	            if not temp_pdfs:
	                raise RuntimeError("No pages were captured")

	            print("\nMerging PDF pages...")
	            merger = PdfMerger()
	            try:
	                for pdf in temp_pdfs:
	                    merger.append(pdf)
	                merger.write(output_pdf_path)
	            finally:
	                # Close the merger before the temporary directory is removed.
	                merger.close()

	        if not os.path.exists(output_pdf_path):
	            raise RuntimeError("Failed to create final PDF")

	        print(f"\n✅ PDF saved successfully to: {output_pdf_path}")
	        print(f"Final PDF size: {os.path.getsize(output_pdf_path)} bytes")
	        print(f"Total pages captured: {len(temp_pdfs)}")
	        return True

	    except Exception as e:
	        print(f"\n❌ Error: {str(e)}")
	        return False

	    finally:
	        if driver is not None:
	            driver.quit()
	        print("\nCleaned up resources.")

	if __name__ == "__main__":
	    # Script entry point: capture a fixed URL into a PDF in the current
	    # working directory and exit with status 1 on failure.
	    url = "https://cdn.openai.com/spec/model-spec-2024-05-08.html"
	    output_pdf_path = os.path.join(os.getcwd(), "model_spec_clean.pdf")

	    print(f"Starting PDF capture from {url}")

	    # Dependencies (PyPDF2, selenium, webdriver-manager, tqdm) must be
	    # installed before running. The previous in-script `pip install` ran
	    # only after the imports at the top of the file had already succeeded,
	    # so it could never fix a missing package.

	    success = save_full_page_as_pdf(url, output_pdf_path)

	    if not success:
	        print("\nTroubleshooting tips:")
	        print("1. Try adjusting page height calculation")
	        print("2. Check content loading and visibility")
	        print("3. Verify all required packages are installed")
	        # `exit()` is a helper injected by the `site` module for interactive
	        # use; raising SystemExit works in every run mode.
	        raise SystemExit(1)