yeiichi · October 27, 2020 05:52
diff --git a/pdf_downloader b/pdf_downloader
 # -*- coding: utf-8 -*-
 import requests
 from bs4 import BeautifulSoup
 import os
 from urllib.parse import urljoin
 from pathlib import Path
 from pprint import pprint
 import time


 # User agent definition:
 # You can check your User Agent at ifconfig.me
 UA_LIST = {
     # Maps a browser name to the User-Agent header value sent with requests.
     # Values are assembled with implicit string concatenation: a backslash
     # line continuation inside a literal would pull the next line's leading
     # whitespace into the header, and '\A' is not a valid escape sequence.
     'SAFARI': (
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
         'AppleWebKit/605.1.15 (KHTML, like Gecko) '
         'Version/13.1 Safari/605.1.15'
     ),
     'FIREFOX': (
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) '
         'Gecko/20100101 Firefox/76.0'
     ),
     'CHROME': (
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
         'AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/81.0.4044.138 Safari/537.36'
     ),
 }


 def fetch_url_pdf(target_url, user_agent='SAFARI'):
     """Collect the PDF links found on a web page.

     Downloads ``target_url``, parses it with BeautifulSoup (lxml parser) and
     keeps every ``<a>`` element whose ``href`` ends in ``.pdf``
     (case-insensitive).

     Args:
         target_url: URL of the page to scan.
         user_agent: Key of ``UA_LIST`` (upper-cased before lookup) selecting
             the User-Agent header sent with the request.

     Returns:
         A list of ``[link_text, absolute_pdf_url]`` pairs in document order.
         The list is empty when the page cannot be fetched (the error is
         printed) or when it contains no PDF links.

     Raises:
         KeyError: If ``user_agent`` is not a key of ``UA_LIST``.
     """
     my_headers = {'user-agent': UA_LIST[user_agent.upper()]}

     # Download and parse
     try:
         res = requests.get(target_url, headers=my_headers, timeout=(3.05, 27))
         res.raise_for_status()
     except requests.RequestException as exc:
         # Without a usable response there is nothing to parse: report the
         # problem and hand back an empty result instead of failing later on
         # an unbound ``res``.
         print(f'Error: {exc}')
         return []
     # Let requests guess the encoding from the body rather than the headers
     res.encoding = res.apparent_encoding

     soup = BeautifulSoup(res.text, 'lxml')

     url_array = []
     for anchor in soup.select('a'):
         pdf_url = anchor.get('href')
         if not pdf_url:
             # <a> without href (or with an empty one) cannot point to a PDF
             continue
         # Filter for PDF
         if Path(pdf_url).suffix.lower() == '.pdf':
             # Resolve relative links against the page URL
             url_array.append([anchor.text, urljoin(target_url, pdf_url)])

     return url_array


 def pdf_downloader(url_array, dl_dir, user_agent='SAFARI'):
     """Download every PDF listed in ``url_array`` into ``dl_dir``.

     Each entry is fetched with the selected User-Agent and written to
     ``<dl_dir>/<sanitized title>.pdf``. A failed download is printed and
     skipped so that one bad link does not abort the remaining ones. The
     function pauses 5 seconds between consecutive downloads.

     Args:
         url_array: Sequence of ``[title, url]`` pairs, e.g. the return value
             of ``fetch_url_pdf``.
         dl_dir: Directory receiving the files; created when it is missing.
         user_agent: Key of ``UA_LIST`` (upper-cased before lookup) selecting
             the User-Agent header sent with each request.

     Returns:
         None.

     Raises:
         KeyError: If ``user_agent`` is not a key of ``UA_LIST``.
     """
     tfn = len(url_array)  # Total pdf File Number
     dig = len(str(tfn))   # Number of digits of tfn (zero-padding width)

     my_headers = {'user-agent': UA_LIST[user_agent.upper()]}

     # An empty dl_dir means "current directory", which needs no creation
     if dl_dir:
         try:
             os.makedirs(dl_dir, exist_ok=True)
         except OSError as exc:
             print(f'Error: {exc}')
             return

     # Download
     for i, j in enumerate(url_array, start=1):
         base_name, file_url = j[0], j[1]
         # Link text is untrusted: never let it act as a path
         file_name = _safe_file_name(base_name, file_url) + '.pdf'

         try:
             res = requests.get(file_url, headers=my_headers, timeout=(3.05, 27))
             # Do not store an HTTP error page under a .pdf name
             res.raise_for_status()
             with open(os.path.join(dl_dir, file_name), 'wb') as f:
                 f.write(res.content)
             print(str(i).zfill(dig) + '/' + str(tfn).zfill(dig),
                   base_name, 'URL: ', file_url)
         except (requests.RequestException, OSError) as exc:
             print(f'Error: {exc}')

         # Be polite to the server, but do not wait after the last file
         if i < tfn:
             print('Waiting...')
             time.sleep(5)

     print('DONE!')


 def _safe_file_name(title, file_url):
     """Return a base file name (no extension) that is safe to join to a folder.

     Whitespace runs in ``title`` are collapsed, and path separators, control
     characters and other characters commonly rejected by file systems are
     replaced by ``_``. When nothing usable remains, the stem of the last
     component of ``file_url`` is used instead, then the literal
     ``'download'``.
     """
     forbidden = '\\/:*?"<>|'
     for candidate in (title, Path(file_url).stem):
         collapsed = ' '.join(str(candidate).split())
         cleaned = ''.join(
             '_' if ch in forbidden or ord(ch) < 32 else ch for ch in collapsed
         ).strip(' .')  # also rules out '.' and '..'
         if cleaned:
             return cleaned
     return 'download'

 if __name__ == '__main__':
     # Script entry point: ask for a page URL, then pretty-print the
     # [title, url] pairs of the PDF links found on that page.
     target_url = input('Target URL? >>  ')
     pdf_links = fetch_url_pdf(target_url)
     pprint(pdf_links)
	# -*- coding: utf-8 -*-
	import requests
	from bs4 import BeautifulSoup
	import os
	from urllib.parse import urljoin
	from pathlib import Path
	from pprint import pprint
	import time


	# User agent definition:
	# You can check your User Agent at ifconfig.me
	UA_LIST = {
	    # Maps a browser name to the User-Agent header value sent with requests.
	    # Values are assembled with implicit string concatenation: a backslash
	    # line continuation inside a literal would pull the next line's leading
	    # whitespace into the header, and '\A' is not a valid escape sequence.
	    'SAFARI': (
	        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
	        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
	        'Version/13.1 Safari/605.1.15'
	    ),
	    'FIREFOX': (
	        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) '
	        'Gecko/20100101 Firefox/76.0'
	    ),
	    'CHROME': (
	        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
	        'AppleWebKit/537.36 (KHTML, like Gecko) '
	        'Chrome/81.0.4044.138 Safari/537.36'
	    ),
	}


	def fetch_url_pdf(target_url, user_agent='SAFARI'):
	    """Collect the PDF links found on a web page.

	    Downloads ``target_url``, parses it with BeautifulSoup (lxml parser) and
	    keeps every ``<a>`` element whose ``href`` ends in ``.pdf``
	    (case-insensitive).

	    Args:
	        target_url: URL of the page to scan.
	        user_agent: Key of ``UA_LIST`` (upper-cased before lookup) selecting
	            the User-Agent header sent with the request.

	    Returns:
	        A list of ``[link_text, absolute_pdf_url]`` pairs in document order.
	        The list is empty when the page cannot be fetched (the error is
	        printed) or when it contains no PDF links.

	    Raises:
	        KeyError: If ``user_agent`` is not a key of ``UA_LIST``.
	    """
	    my_headers = {'user-agent': UA_LIST[user_agent.upper()]}

	    # Download and parse
	    try:
	        res = requests.get(target_url, headers=my_headers, timeout=(3.05, 27))
	        res.raise_for_status()
	    except requests.RequestException as exc:
	        # Without a usable response there is nothing to parse: report the
	        # problem and hand back an empty result instead of failing later on
	        # an unbound ``res``.
	        print(f'Error: {exc}')
	        return []
	    # Let requests guess the encoding from the body rather than the headers
	    res.encoding = res.apparent_encoding

	    soup = BeautifulSoup(res.text, 'lxml')

	    url_array = []
	    for anchor in soup.select('a'):
	        pdf_url = anchor.get('href')
	        if not pdf_url:
	            # <a> without href (or with an empty one) cannot point to a PDF
	            continue
	        # Filter for PDF
	        if Path(pdf_url).suffix.lower() == '.pdf':
	            # Resolve relative links against the page URL
	            url_array.append([anchor.text, urljoin(target_url, pdf_url)])

	    return url_array


	def pdf_downloader(url_array, dl_dir, user_agent='SAFARI'):
	    """Download every PDF listed in ``url_array`` into ``dl_dir``.

	    Each entry is fetched with the selected User-Agent and written to
	    ``<dl_dir>/<sanitized title>.pdf``. A failed download is printed and
	    skipped so that one bad link does not abort the remaining ones. The
	    function pauses 5 seconds between consecutive downloads.

	    Args:
	        url_array: Sequence of ``[title, url]`` pairs, e.g. the return value
	            of ``fetch_url_pdf``.
	        dl_dir: Directory receiving the files; created when it is missing.
	        user_agent: Key of ``UA_LIST`` (upper-cased before lookup) selecting
	            the User-Agent header sent with each request.

	    Returns:
	        None.

	    Raises:
	        KeyError: If ``user_agent`` is not a key of ``UA_LIST``.
	    """
	    tfn = len(url_array)  # Total pdf File Number
	    dig = len(str(tfn))   # Number of digits of tfn (zero-padding width)

	    my_headers = {'user-agent': UA_LIST[user_agent.upper()]}

	    # An empty dl_dir means "current directory", which needs no creation
	    if dl_dir:
	        try:
	            os.makedirs(dl_dir, exist_ok=True)
	        except OSError as exc:
	            print(f'Error: {exc}')
	            return

	    # Download
	    for i, j in enumerate(url_array, start=1):
	        base_name, file_url = j[0], j[1]
	        # Link text is untrusted: never let it act as a path
	        file_name = _safe_file_name(base_name, file_url) + '.pdf'

	        try:
	            res = requests.get(file_url, headers=my_headers, timeout=(3.05, 27))
	            # Do not store an HTTP error page under a .pdf name
	            res.raise_for_status()
	            with open(os.path.join(dl_dir, file_name), 'wb') as f:
	                f.write(res.content)
	            print(str(i).zfill(dig) + '/' + str(tfn).zfill(dig),
	                  base_name, 'URL: ', file_url)
	        except (requests.RequestException, OSError) as exc:
	            print(f'Error: {exc}')

	        # Be polite to the server, but do not wait after the last file
	        if i < tfn:
	            print('Waiting...')
	            time.sleep(5)

	    print('DONE!')


	def _safe_file_name(title, file_url):
	    """Return a base file name (no extension) that is safe to join to a folder.

	    Whitespace runs in ``title`` are collapsed, and path separators, control
	    characters and other characters commonly rejected by file systems are
	    replaced by ``_``. When nothing usable remains, the stem of the last
	    component of ``file_url`` is used instead, then the literal
	    ``'download'``.
	    """
	    forbidden = '\\/:*?"<>|'
	    for candidate in (title, Path(file_url).stem):
	        collapsed = ' '.join(str(candidate).split())
	        cleaned = ''.join(
	            '_' if ch in forbidden or ord(ch) < 32 else ch for ch in collapsed
	        ).strip(' .')  # also rules out '.' and '..'
	        if cleaned:
	            return cleaned
	    return 'download'

	if __name__ == '__main__':
	    # Script entry point: ask for a page URL, then pretty-print the
	    # [title, url] pairs of the PDF links found on that page.
	    target_url = input('Target URL? >> ')
	    pdf_links = fetch_url_pdf(target_url)
	    pprint(pdf_links)
No results found