Created
January 12, 2025 08:01
-
-
Save ehzawad/012131cb391c0d01122f2e8758499dd4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import os
import subprocess
import sys
import time

from PyPDF2 import PdfMerger
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
# JS that installs the print stylesheet once per document. The element id
# makes the injection idempotent, so calling the capture function once per
# page does not pile up duplicate <style> nodes in <head>.
_PRINT_STYLE_JS = """
    // Add page break styles (only once per document)
    if (!document.getElementById('pdf-capture-print-style')) {
        var style = document.createElement('style');
        style.id = 'pdf-capture-print-style';
        style.textContent = `
            @media print {
                .page-break {
                    display: block;
                    page-break-before: always;
                }
                h1, h2 { page-break-before: always; }
                h1 + h2 { page-break-before: avoid; }
                table, figure { page-break-inside: avoid; }
                .header { display: none !important; }
            }
        `;
        document.head.appendChild(style);
    }
"""

# JS that un-fixes fixed-position elements and hides header/footer elements
# so they do not overlap the printed content.
_LAYOUT_CLEANUP_JS = """
    // Clean up any overlap
    document.querySelectorAll('*').forEach(el => {
        if (window.getComputedStyle(el).position === 'fixed') {
            el.style.position = 'relative';
        }
    });
    // Remove duplicate headers/footers
    document.querySelectorAll('.header, .footer').forEach(el => {
        el.style.display = 'none';
    });
"""

# Arguments for the DevTools "Page.printToPDF" command.
_PRINT_TO_PDF_OPTIONS = {
    "displayHeaderFooter": False,
    "printBackground": True,
    "paperWidth": 8.27,    # A4 width
    "paperHeight": 11.69,  # A4 height
    "marginTop": 0.4,      # Small margins to prevent content touching edges
    "marginBottom": 0.4,
    "marginLeft": 0.4,
    "marginRight": 0.4,
    "scale": 0.9,          # Slightly reduced scale to ensure content fits
    "preferCSSPageSize": True,
}


def capture_current_view_as_pdf(driver, output_path, page_num, total_pages,
                                *, settle_delay=0.5):
    """Print the page currently loaded in *driver* to a PDF file.

    Injects print-only CSS for page breaks, neutralises fixed-position
    elements and header/footer elements, waits for the layout to settle and
    then asks Chrome (via the DevTools ``Page.printToPDF`` command) for a PDF.

    NOTE(review): ``Page.printToPDF`` is expected to render the document the
    way Chrome would print it; confirm that this yields only the scrolled
    view the caller intends rather than the whole document.

    Args:
        driver: Object exposing ``execute_script`` and ``execute_cdp_cmd``
            (a Selenium Chrome driver in this file).
        output_path: Destination path of the PDF file.
        page_num: 1-based number of the page being captured (progress only).
        total_pages: Estimated number of pages (progress only).
        settle_delay: Seconds to wait after the DOM tweaks before printing.

    Returns:
        True if Chrome returned PDF data and it was written to
        *output_path*, False otherwise (no file is written in that case).
    """
    print(f"\nCapturing page {page_num}/{total_pages}")

    # Inject CSS for better page breaks, then tidy up the live layout.
    driver.execute_script(_PRINT_STYLE_JS)
    driver.execute_script(_LAYOUT_CLEANUP_JS)

    if settle_delay > 0:
        time.sleep(settle_delay)  # Let layout settle

    # Pass a copy so the shared options dict can never be mutated downstream.
    pdf_data = driver.execute_cdp_cmd(
        "Page.printToPDF", dict(_PRINT_TO_PDF_OPTIONS)
    )
    if not pdf_data or 'data' not in pdf_data:
        return False

    # The DevTools response carries the PDF as a base64 string.
    with open(output_path, 'wb') as f:
        f.write(base64.b64decode(pdf_data['data']))
    return True
def save_full_page_as_pdf(url, output_pdf_path):
    """Load *url* in headless Chrome and save it as a single merged PDF.

    The element with class ``content`` is scrolled in steps of 95% of its
    client height; after each step the current view is printed to a
    temporary ``temp_page_<n>.pdf`` in the working directory. The temporary
    files are then merged into *output_pdf_path*.

    Args:
        url: Address of the page to capture.
        output_pdf_path: Destination path of the merged PDF.

    Returns:
        True if the merged PDF was written, False if any step failed (the
        error is printed rather than raised).
    """
    print("Initializing Chrome setup...")
    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",  # Standard viewport
    ):
        chrome_options.add_argument(flag)

    # Bound before the try so the finally clause can run safely even when
    # driver installation or browser start-up fails.
    driver = None
    temp_pdfs = []
    try:
        with tqdm(total=100, desc="Setting up Chrome") as pbar:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            pbar.update(100)

        print("\nLoading webpage...")
        driver.get(url)
        time.sleep(3)  # Initial load

        # Get the main content element
        content_element = driver.find_element(By.CLASS_NAME, "content")

        # Calculate dimensions
        total_height = driver.execute_script(
            "return arguments[0].scrollHeight", content_element
        )
        viewport_height = driver.execute_script(
            "return arguments[0].clientHeight", content_element
        )

        # Slightly less than the viewport so consecutive captures overlap
        # a little instead of cutting content at the boundary.
        page_height = int(viewport_height * 0.95)
        if page_height <= 0:
            raise ValueError(
                f"Content element has no usable height "
                f"(clientHeight={viewport_height})"
            )
        total_pages = (total_height + page_height - 1) // page_height

        print(f"\nTotal content height: {total_height}px")
        print(f"Page height: {page_height}px")
        print(f"Estimated total pages: {total_pages}")

        current_scroll = 0
        page_number = 1
        with tqdm(total=total_height, desc="Capturing pages") as pbar:
            while current_scroll < total_height:
                # Scroll position is passed as a script argument rather than
                # formatted into the JavaScript source.
                driver.execute_script(
                    "arguments[0].scrollTop = arguments[1];",
                    content_element,
                    current_scroll,
                )
                time.sleep(0.5)  # Let content settle

                # Look for natural break points (headers, section breaks)
                driver.execute_script("""
                    const viewport = arguments[0];
                    const headers = viewport.querySelectorAll('h1, h2, h3, .section-break');
                    headers.forEach(header => {
                        const rect = header.getBoundingClientRect();
                        if (rect.top > 0 && rect.top < window.innerHeight) {
                            header.style.pageBreakBefore = 'always';
                        }
                    });
                """, content_element)

                # Capture current view
                temp_pdf = f"temp_page_{page_number}.pdf"
                if capture_current_view_as_pdf(
                    driver, temp_pdf, page_number, total_pages
                ):
                    temp_pdfs.append(temp_pdf)
                    page_number += 1

                # Advance by at most the height that is actually left.
                pbar.update(min(page_height, total_height - current_scroll))
                current_scroll += page_height

        if not temp_pdfs:
            # Without this an empty PDF would be written and reported as OK.
            raise RuntimeError("No pages were captured")

        print("\nMerging PDF pages...")
        merger = PdfMerger()
        try:
            for pdf in temp_pdfs:
                if os.path.exists(pdf):
                    merger.append(pdf)
            merger.write(output_pdf_path)
        finally:
            merger.close()

        if not os.path.exists(output_pdf_path):
            raise Exception("Failed to create final PDF")

        print(f"\n✅ PDF saved successfully to: {output_pdf_path}")
        print(f"Final PDF size: {os.path.getsize(output_pdf_path)} bytes")
        print(f"Total pages captured: {len(temp_pdfs)}")
        return True
    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        return False
    finally:
        # Clean up temp files on success and on failure alike.
        for pdf in temp_pdfs:
            try:
                if os.path.exists(pdf):
                    os.remove(pdf)
            except OSError as cleanup_error:
                print(f"\nCould not remove {pdf}: {cleanup_error}")
        if driver is not None:
            driver.quit()
        print("\nCleaned up resources.")
def main():
    """Capture the hard-coded model-spec page to ``model_spec_clean.pdf``.

    Returns:
        Process exit status: 0 on success, 1 if the capture failed.
    """
    url = "https://cdn.openai.com/spec/model-spec-2024-05-08.html"
    output_pdf_path = os.path.join(os.getcwd(), "model_spec_clean.pdf")
    print(f"Starting PDF capture from {url}")

    # Ensure required packages are installed. Run pip through the current
    # interpreter (no shell) so the packages land in the environment this
    # script is actually running in. check=False keeps this best-effort,
    # as before: a failed install does not abort the run.
    subprocess.run(
        [
            sys.executable, "-m", "pip", "install",
            "PyPDF2", "selenium", "webdriver-manager", "tqdm",
        ],
        check=False,
    )

    success = save_full_page_as_pdf(url, output_pdf_path)
    if not success:
        print("\nTroubleshooting tips:")
        print("1. Try adjusting page height calculation")
        print("2. Check content loading and visibility")
        print("3. Verify all required packages are installed")
        return 1
    return 0


if __name__ == "__main__":
    # SystemExit instead of the site-provided exit() helper, which is not
    # guaranteed to exist (e.g. under ``python -S``).
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment