Created
January 14, 2017 22:26
-
-
Save goweiting/4ce396c7d18d311af7927b258177bd2b to your computer and use it in GitHub Desktop.
Web scraper for downloading documents from a webpage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import urllib
import urlparse

from bs4 import BeautifulSoup
# Download every PDF linked from the course page (Python 2 script).
#
# Improvements over the original:
#   * relative hrefs are resolved with urlparse.urljoin instead of the
#     old try-download / fail / prepend-hardcoded-prefix / retry hack
#   * extension check uses endswith() case-insensitively, so '.PDF'
#     links are also caught
#   * the loop variable no longer shadows the urlopen response object

# Page to scrape; change the extension test below to grab other types.
mainLink = 'http://www.cs.cmu.edu/~aarti/Class/10701_Spring14/lecs.html'
page = urllib.urlopen(mainLink)

# Parse with the lxml parser and collect every anchor in the body.
soup = BeautifulSoup(page.read(), 'lxml')
anchors = soup.body.find_all('a')

for anchor in anchors:
    hreflink = anchor.get('href')
    # Skip anchors with no href and anchors that do not point at a PDF.
    if hreflink is None or not hreflink.lower().endswith('.pdf'):
        continue
    # Resolve relative links against the page URL; absolute links pass
    # through urljoin unchanged.
    absolute = urlparse.urljoin(mainLink, hreflink)
    name = absolute.split('/')[-1]  # file name = last path component
    print(absolute, name)
    try:
        urllib.urlretrieve(absolute, name)
        print('done')
    except IOError:
        # Best-effort download: report the failure and move on.
        print(name + ' cannot be downloaded')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment