olopsman · October 13, 2019 09:41
diff --git a/scraper-regex.py b/scraper-regex.py
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 import re

 def crawl_url(pageUrl):
     """Follow ``page-N.html`` links from ``pageUrl`` and save each page.

     Each page is fetched from ``http://books.toscrape.com/`` + ``pageUrl`` and
     parsed.  If it links to a higher-numbered ``page-N.html``, the parsed HTML
     is written to a local file and the crawl continues at that link; otherwise
     "Crawling finished" is printed and the crawl stops.

     Args:
         pageUrl: Path of the first page, relative to the site root ("" for
             the site's start page).

     Returns:
         None.

     Raises:
         OSError: If a page cannot be fetched or a file cannot be written
             (``urllib.error.URLError`` is a subclass).  Earlier versions
             discarded every error through ``finally: return None``.
     """
     import os  # function-scope import; only used to create the output folder

     main_url = "http://books.toscrape.com/"
     catalogue_str = "catalogue/"
     # Raw string: "\." in a plain literal is an invalid escape sequence.
     page_link = re.compile(r"page-([0-9]+)\.html")

     # Iterate instead of recursing so a long chain of pages cannot hit the
     # interpreter's recursion limit and no file stays open across fetches.
     while True:
         with urlopen(main_url + pageUrl) as response:
             soup = BeautifulSoup(response, "html.parser")

         # The start page carries no number, so any numbered link is "ahead".
         here = page_link.search(pageUrl)
         current_number = int(here.group(1)) if here else 0

         # Only follow links to a higher page number.  Taking the first match
         # unconditionally would follow a link back to an earlier page if one
         # appears first in the markup, and the crawl would never terminate.
         next_href = None
         for anchor in soup.find_all("a", {"href": page_link}):
             found = page_link.search(anchor["href"])
             if found and int(found.group(1)) > current_number:
                 next_href = anchor["href"]
                 break

         # A missing link does not raise AttributeError, so the end of the
         # crawl is tested explicitly instead of relying on an except clause.
         if next_href is None:
             print("Crawling finished")
             return None

         print(next_href)
         if catalogue_str in next_href:
             next_path = next_href
         else:
             next_path = catalogue_str + next_href

         # NOTE(review): as before, the page just fetched is stored under the
         # path of the link it points to, so the final page is never written.
         # Confirm that this naming is intended.
         os.makedirs(os.path.dirname(next_path), exist_ok=True)
         with open(next_path, "w") as html_file:
             html_file.write(str(soup))

         pageUrl = next_path

 # Kick off the crawl from the site's start page (empty relative path).
 crawl_url(pageUrl="")
	from urllib.request import urlopen
	from bs4 import BeautifulSoup
	import re

	def crawl_url(pageUrl):
	    """Follow ``page-N.html`` links from ``pageUrl`` and save each page.

	    Each page is fetched from ``http://books.toscrape.com/`` + ``pageUrl`` and
	    parsed.  If it links to a higher-numbered ``page-N.html``, the parsed HTML
	    is written to a local file and the crawl continues at that link; otherwise
	    "Crawling finished" is printed and the crawl stops.

	    Args:
	        pageUrl: Path of the first page, relative to the site root ("" for
	            the site's start page).

	    Returns:
	        None.

	    Raises:
	        OSError: If a page cannot be fetched or a file cannot be written
	            (``urllib.error.URLError`` is a subclass).  Earlier versions
	            discarded every error through ``finally: return None``.
	    """
	    import os  # function-scope import; only used to create the output folder

	    main_url = "http://books.toscrape.com/"
	    catalogue_str = "catalogue/"
	    # Raw string: "\." in a plain literal is an invalid escape sequence.
	    page_link = re.compile(r"page-([0-9]+)\.html")

	    # Iterate instead of recursing so a long chain of pages cannot hit the
	    # interpreter's recursion limit and no file stays open across fetches.
	    while True:
	        with urlopen(main_url + pageUrl) as response:
	            soup = BeautifulSoup(response, "html.parser")

	        # The start page carries no number, so any numbered link is "ahead".
	        here = page_link.search(pageUrl)
	        current_number = int(here.group(1)) if here else 0

	        # Only follow links to a higher page number.  Taking the first match
	        # unconditionally would follow a link back to an earlier page if one
	        # appears first in the markup, and the crawl would never terminate.
	        next_href = None
	        for anchor in soup.find_all("a", {"href": page_link}):
	            found = page_link.search(anchor["href"])
	            if found and int(found.group(1)) > current_number:
	                next_href = anchor["href"]
	                break

	        # A missing link does not raise AttributeError, so the end of the
	        # crawl is tested explicitly instead of relying on an except clause.
	        if next_href is None:
	            print("Crawling finished")
	            return None

	        print(next_href)
	        if catalogue_str in next_href:
	            next_path = next_href
	        else:
	            next_path = catalogue_str + next_href

	        # NOTE(review): as before, the page just fetched is stored under the
	        # path of the link it points to, so the final page is never written.
	        # Confirm that this naming is intended.
	        os.makedirs(os.path.dirname(next_path), exist_ok=True)
	        with open(next_path, "w") as html_file:
	            html_file.write(str(soup))

	        pageUrl = next_path

	# Kick off the crawl from the site's start page (empty relative path).
	crawl_url(pageUrl="")
No results found