yszheda · December 24, 2018 10:08
diff --git a/download_files.py b/download_files.py
 import urllib2
 from bs4 import BeautifulSoup
 import requests
 import urlparse
 import os


 # Download every file linked from a user-supplied page whose URL ends in one
 # of the extensions below, saving each into a user-supplied directory.
 extensions = ( '.pdf', '.jpg', '.png' )

 url = raw_input('Input url:')
 dst_dir = raw_input('Input download dir:')
 if not os.path.exists(dst_dir):
     os.makedirs(dst_dir)

 # Relative hrefs found in the page are resolved against site.url below.
 site = requests.get(url)
 html = site.content
 soup = BeautifulSoup(html, 'lxml')

 for link in soup.find_all('a'):
     new_url = link.get('href')
     abs_url = urlparse.urljoin(site.url, new_url)
     print(abs_url)
     if not abs_url.endswith(extensions):
         continue
     # Name the local file after the last segment of the URL path, not the raw
     # href: an absolute href ('/a/b.pdf' or 'http://host/b.pdf') passed to
     # os.path.join would escape dst_dir or point into missing directories.
     file_name = os.path.basename(urlparse.urlparse(abs_url).path)
     if not file_name:
         # Nothing usable to name the file after (e.g. the path ends in '/').
         continue
     name = os.path.join(dst_dir, file_name)
     print(name)
     page = urllib2.urlopen(abs_url)
     try:
         data = page.read()
     finally:
         # Release the connection even if the read fails.
         page.close()
     with open(name, 'wb') as f:
         f.write(data)
	import urllib2
	from bs4 import BeautifulSoup
	import requests
	import urlparse
	import os


	# Download every file linked from a user-supplied page whose URL ends in one
	# of the extensions below, saving each into a user-supplied directory.
	extensions = ( '.pdf', '.jpg', '.png' )

	url = raw_input('Input url:')
	dst_dir = raw_input('Input download dir:')
	if not os.path.exists(dst_dir):
	    os.makedirs(dst_dir)

	# Relative hrefs found in the page are resolved against site.url below.
	site = requests.get(url)
	html = site.content
	soup = BeautifulSoup(html, 'lxml')

	for link in soup.find_all('a'):
	    new_url = link.get('href')
	    abs_url = urlparse.urljoin(site.url, new_url)
	    print(abs_url)
	    if not abs_url.endswith(extensions):
	        continue
	    # Name the local file after the last segment of the URL path, not the raw
	    # href: an absolute href ('/a/b.pdf' or 'http://host/b.pdf') passed to
	    # os.path.join would escape dst_dir or point into missing directories.
	    file_name = os.path.basename(urlparse.urlparse(abs_url).path)
	    if not file_name:
	        # Nothing usable to name the file after (e.g. the path ends in '/').
	        continue
	    name = os.path.join(dst_dir, file_name)
	    print(name)
	    page = urllib2.urlopen(abs_url)
	    try:
	        data = page.read()
	    finally:
	        # Release the connection even if the read fails.
	        page.close()
	    with open(name, 'wb') as f:
	        f.write(data)