xbns · June 6, 2019 09:49
diff --git a/download-pdfs.py b/download-pdfs.py
 # List the URL of every PDF linked from the AWS whitepapers index page,
 # one absolute URL per line on stdout.
 #
 # Usage:
 #   $ python download-pdfs.py > aws-whitepapers.txt
 #   then..
 #   $ parallel -j 20 --gnu -a aws-whitepapers.txt wget -nc
 #   -nc, --no-clobber: skip downloads that would download to existing files
 from urllib.parse import urljoin, urlsplit

 from bs4 import BeautifulSoup

 import requests

 PAGE_URL = "https://aws.amazon.com/whitepapers/"

 # A timeout keeps the script from hanging forever on a stalled connection;
 # raise_for_status() turns an HTTP error into a visible failure instead of
 # silently producing an empty URL list from an error page.
 r = requests.get(PAGE_URL, timeout=30)
 r.raise_for_status()
 data = r.text
 soup = BeautifulSoup(data, "lxml")

 # URLs already printed; the same whitepaper may be linked more than once and
 # duplicate lines would make parallel wget jobs race on the same file.
 seen = set()

 for link in soup.find_all('a', href=True):
     href = link['href'].strip()
     try:
         # Test only the path component, case-insensitively, so links such as
         # "paper.PDF" or "paper.pdf?did=wp_card" are not skipped.
         if not urlsplit(href).path.lower().endswith('pdf'):
             continue
         # Resolve relative and protocol-relative ("//host/x.pdf") hrefs
         # against the page URL so wget receives a complete URL.
         pdf_url = urljoin(PAGE_URL, href)
     except ValueError:
         # Malformed href that urllib cannot parse; nothing to download.
         continue
     if pdf_url in seen:
         continue
     seen.add(pdf_url)
     print(pdf_url)
	# List the URL of every PDF linked from the AWS whitepapers index page,
	# one absolute URL per line on stdout.
	#
	# Usage:
	#   $ python download-pdfs.py > aws-whitepapers.txt
	#   then..
	#   $ parallel -j 20 --gnu -a aws-whitepapers.txt wget -nc
	#   -nc, --no-clobber: skip downloads that would download to existing files
	from urllib.parse import urljoin, urlsplit

	from bs4 import BeautifulSoup

	import requests

	PAGE_URL = "https://aws.amazon.com/whitepapers/"

	# A timeout keeps the script from hanging forever on a stalled connection;
	# raise_for_status() turns an HTTP error into a visible failure instead of
	# silently producing an empty URL list from an error page.
	r = requests.get(PAGE_URL, timeout=30)
	r.raise_for_status()
	data = r.text
	soup = BeautifulSoup(data, "lxml")

	# URLs already printed; the same whitepaper may be linked more than once and
	# duplicate lines would make parallel wget jobs race on the same file.
	seen = set()

	for link in soup.find_all('a', href=True):
	    href = link['href'].strip()
	    try:
	        # Test only the path component, case-insensitively, so links such as
	        # "paper.PDF" or "paper.pdf?did=wp_card" are not skipped.
	        if not urlsplit(href).path.lower().endswith('pdf'):
	            continue
	        # Resolve relative and protocol-relative ("//host/x.pdf") hrefs
	        # against the page URL so wget receives a complete URL.
	        pdf_url = urljoin(PAGE_URL, href)
	    except ValueError:
	        # Malformed href that urllib cannot parse; nothing to download.
	        continue
	    if pdf_url in seen:
	        continue
	    seen.add(pdf_url)
	    print(pdf_url)