ettorerizza · April 15, 2024 16:50
diff --git a/urls_to_pdf.py b/urls_to_pdf.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 import os
 import requests
 from bs4 import BeautifulSoup
 import glob
 from PyPDF2 import PdfFileMerger

 #Todo: debug this function
 def merger(output, input_path):
    """
    Merge the PDF files of a folder into a single PDF.

    output: path of the merged PDF to write
    input_path: folder holding the PDFs to merge; when it is empty or not
                an existing directory, the current working directory is
                searched instead

    The inputs are appended oldest first (by modification time). The
    output file itself is never used as an input.
    """
    pattern = '*.pdf'
    if input_path and os.path.isdir(input_path):
        pattern = os.path.join(input_path, pattern)

    # A previous run may have left the merged PDF next to its inputs;
    # merging it into itself would duplicate every page.
    output_abs = os.path.abspath(output)
    candidates = [
        path for path in glob.glob(pattern)
        if os.path.isfile(path) and os.path.abspath(path) != output_abs
    ]

    # sorted() returns a new list, so its result has to be kept.
    files = sorted(candidates, key=os.path.getmtime)
    print("files", files)

    pdf_merger = PdfFileMerger()
    try:
        for file in files:
            pdf_merger.append(file)
            #os.remove(file)

        with open(output, 'wb') as fileobj:
            pdf_merger.write(fileobj)
    finally:
        # Release the input file handles held by the merger.
        pdf_merger.close()


 def url_to_pdf(url,
               folder,
               filename,
               CHROME_PATH=r"/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome"):
    """
    Use headless chrome CLI tool to print url to pdf

    url: address of the page to print
    folder: directory that receives the PDF
    filename: name of the PDF, without the ".pdf" extension
    CHROME_PATH: Chrome executable; either a plain path or a shell-escaped
                 one (as in the default value)

    Raises ValueError if CHROME_PATH is empty,
    subprocess.CalledProcessError if Chrome exits with a non-zero status
    and OSError if the executable cannot be started.
    """
    import shlex
    import subprocess

    #Todo : add a default path also for Windows and Linux

    # A path that exists exactly as written is used verbatim; anything
    # else is parsed with shell rules so that backslash-escaped spaces
    # (the style of the default value) keep working.
    if os.path.isfile(CHROME_PATH):
        chrome_cmd = [CHROME_PATH]
    else:
        chrome_cmd = shlex.split(CHROME_PATH)
    if not chrome_cmd:
        raise ValueError("CHROME_PATH is empty")

    chrome_args = chrome_cmd + [
        '--headless',
        '--disable-gpu',
        '--print-to-pdf=' + os.path.join(folder, f'{filename}.pdf'),
        url,
    ]

    # Argument list and no shell: characters such as "&", ";" or "?" in
    # the URL or the file name reach Chrome untouched instead of being
    # interpreted by a shell. check=True surfaces a failed run to the
    # caller instead of silently dropping the exit status.
    subprocess.run(chrome_args, check=True)

 def links_to_pdf(project_name, base_url, css_selector, unique=False):
    """
    Print every page linked from a main page to PDF.

    project_name: folder that receives the PDFs (created if missing); its
                  last path component is also the name of the merged PDF
    base_url: page to scan for links; hrefs are resolved against it
    css_selector: to find links in main page
    unique: Do you want a single PDF ?

    The working directory of the process is the same after the call as
    before it. Raises a requests exception when the main page cannot be
    fetched; a failure on a single page is printed and skipped.
    """
    import posixpath
    from urllib.parse import urljoin, urlsplit

    res = requests.get(base_url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    links = soup.select(css_selector)
    # Resolve hrefs the way a browser would, so absolute and root-relative
    # links are not mangled by plain concatenation. Elements without an
    # href are skipped.
    urls = [urljoin(base_url, link['href'])
            for link in links if link.get('href')]

    # Create target Directory if don't exist
    target_dir = os.path.abspath(project_name)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        print("Directory ", project_name, " Created ")
    else:
        print("Directory ", project_name, " already exists")

    visited = set()
    for url in urls:
        url_clean = url.split("#")[0]
        if url_clean in visited:
            continue
        visited.add(url_clean)
        print(url_clean)

        # Last path segment without a trailing ".html". str.strip(".html")
        # removes any of the characters ".", "h", "t", "m", "l" from both
        # ends, which turns e.g. "transform.html" into "transfor".
        name = posixpath.basename(urlsplit(url_clean).path.rstrip("/"))
        if name.endswith(".html"):
            name = name[:-len(".html")]
        name = name or "index"
        print(name)

        try:
            # The absolute folder keeps the PDF inside the target
            # directory whatever the current working directory is.
            url_to_pdf(url_clean, target_dir, name)
        except Exception as e:
            print(e)

    if unique:
        output = os.path.basename(target_dir) + ".pdf"
        # Run merger() from inside the target directory so that relative
        # lookups and the relative output name land there, then restore
        # the caller's working directory.
        previous_cwd = os.getcwd()
        os.chdir(target_dir)
        try:
            merger(output, target_dir)
        finally:
            os.chdir(previous_cwd)

 if __name__ == '__main__':
    # Example run: feed the links matched by the selector on the base
    # page to links_to_pdf, using "r-data" as the project name.
    project_name, base_url, css_selector = (
        "r-data",
        "https://r4ds.had.co.nz/",
        ".part+ .chapter a",
    )

    links_to_pdf(
        project_name=project_name,
        base_url=base_url,
        css_selector=css_selector,
    )
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	import os
	import requests
	from bs4 import BeautifulSoup
	import glob
	from PyPDF2 import PdfFileMerger

	#Todo: debug this function
	def merger(output, input_path):
	    """
	    Merge the PDF files of a folder into a single PDF.

	    output: path of the merged PDF to write
	    input_path: folder holding the PDFs to merge; when it is empty or not
	                an existing directory, the current working directory is
	                searched instead

	    The inputs are appended oldest first (by modification time). The
	    output file itself is never used as an input.
	    """
	    pattern = '*.pdf'
	    if input_path and os.path.isdir(input_path):
	        pattern = os.path.join(input_path, pattern)

	    # A previous run may have left the merged PDF next to its inputs;
	    # merging it into itself would duplicate every page.
	    output_abs = os.path.abspath(output)
	    candidates = [
	        path for path in glob.glob(pattern)
	        if os.path.isfile(path) and os.path.abspath(path) != output_abs
	    ]

	    # sorted() returns a new list, so its result has to be kept.
	    files = sorted(candidates, key=os.path.getmtime)
	    print("files", files)

	    pdf_merger = PdfFileMerger()
	    try:
	        for file in files:
	            pdf_merger.append(file)
	            #os.remove(file)

	        with open(output, 'wb') as fileobj:
	            pdf_merger.write(fileobj)
	    finally:
	        # Release the input file handles held by the merger.
	        pdf_merger.close()


	def url_to_pdf(url,
	               folder,
	               filename,
	               CHROME_PATH=r"/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome"):
	    """
	    Use headless chrome CLI tool to print url to pdf

	    url: address of the page to print
	    folder: directory that receives the PDF
	    filename: name of the PDF, without the ".pdf" extension
	    CHROME_PATH: Chrome executable; either a plain path or a shell-escaped
	                 one (as in the default value)

	    Raises ValueError if CHROME_PATH is empty,
	    subprocess.CalledProcessError if Chrome exits with a non-zero status
	    and OSError if the executable cannot be started.
	    """
	    import shlex
	    import subprocess

	    #Todo : add a default path also for Windows and Linux

	    # A path that exists exactly as written is used verbatim; anything
	    # else is parsed with shell rules so that backslash-escaped spaces
	    # (the style of the default value) keep working.
	    if os.path.isfile(CHROME_PATH):
	        chrome_cmd = [CHROME_PATH]
	    else:
	        chrome_cmd = shlex.split(CHROME_PATH)
	    if not chrome_cmd:
	        raise ValueError("CHROME_PATH is empty")

	    chrome_args = chrome_cmd + [
	        '--headless',
	        '--disable-gpu',
	        '--print-to-pdf=' + os.path.join(folder, f'{filename}.pdf'),
	        url,
	    ]

	    # Argument list and no shell: characters such as "&", ";" or "?" in
	    # the URL or the file name reach Chrome untouched instead of being
	    # interpreted by a shell. check=True surfaces a failed run to the
	    # caller instead of silently dropping the exit status.
	    subprocess.run(chrome_args, check=True)

	def links_to_pdf(project_name, base_url, css_selector, unique=False):
	    """
	    Print every page linked from a main page to PDF.

	    project_name: folder that receives the PDFs (created if missing); its
	                  last path component is also the name of the merged PDF
	    base_url: page to scan for links; hrefs are resolved against it
	    css_selector: to find links in main page
	    unique: Do you want a single PDF ?

	    The working directory of the process is the same after the call as
	    before it. Raises a requests exception when the main page cannot be
	    fetched; a failure on a single page is printed and skipped.
	    """
	    import posixpath
	    from urllib.parse import urljoin, urlsplit

	    res = requests.get(base_url, timeout=30)
	    res.raise_for_status()
	    soup = BeautifulSoup(res.text, 'lxml')
	    links = soup.select(css_selector)
	    # Resolve hrefs the way a browser would, so absolute and root-relative
	    # links are not mangled by plain concatenation. Elements without an
	    # href are skipped.
	    urls = [urljoin(base_url, link['href'])
	            for link in links if link.get('href')]

	    # Create target Directory if don't exist
	    target_dir = os.path.abspath(project_name)
	    if not os.path.exists(target_dir):
	        os.makedirs(target_dir)
	        print("Directory ", project_name, " Created ")
	    else:
	        print("Directory ", project_name, " already exists")

	    visited = set()
	    for url in urls:
	        url_clean = url.split("#")[0]
	        if url_clean in visited:
	            continue
	        visited.add(url_clean)
	        print(url_clean)

	        # Last path segment without a trailing ".html". str.strip(".html")
	        # removes any of the characters ".", "h", "t", "m", "l" from both
	        # ends, which turns e.g. "transform.html" into "transfor".
	        name = posixpath.basename(urlsplit(url_clean).path.rstrip("/"))
	        if name.endswith(".html"):
	            name = name[:-len(".html")]
	        name = name or "index"
	        print(name)

	        try:
	            # The absolute folder keeps the PDF inside the target
	            # directory whatever the current working directory is.
	            url_to_pdf(url_clean, target_dir, name)
	        except Exception as e:
	            print(e)

	    if unique:
	        output = os.path.basename(target_dir) + ".pdf"
	        # Run merger() from inside the target directory so that relative
	        # lookups and the relative output name land there, then restore
	        # the caller's working directory.
	        previous_cwd = os.getcwd()
	        os.chdir(target_dir)
	        try:
	            merger(output, target_dir)
	        finally:
	            os.chdir(previous_cwd)

	if __name__ == '__main__':
	    # Example run: feed the links matched by the selector on the base
	    # page to links_to_pdf, using "r-data" as the project name.
	    project_name, base_url, css_selector = (
	        "r-data",
	        "https://r4ds.had.co.nz/",
	        ".part+ .chapter a",
	    )

	    links_to_pdf(
	        project_name=project_name,
	        base_url=base_url,
	        css_selector=css_selector,
	    )