jonathanoheix · December 11, 2018 14:55
diff --git a/scraping10.py b/scraping10.py
 # Collect the URL of every page reachable through the pagination links,
 # starting from the main page.
 pages_urls = [main_url]

 soup = getAndParseURL(pages_urls[0])

 # Compile the pagination-link pattern once instead of on every lookup.
 page_link_pattern = re.compile("page")

 # Two matching links mean the page has both a 'previous' and a 'next' button.
 # A single matching link means we are on the first page or on the last page;
 # the first page is recognised by pages_urls still holding a single entry.
 # The crawl stops when the last page is reached.
 while True:
     # Search the current page once and reuse the result below.
     page_links = soup.findAll("a", href=page_link_pattern)
     on_first_page = len(pages_urls) == 1

     # Stop on the last page, and also when the first page has no pagination
     # link at all (indexing [-1] on an empty result would raise IndexError).
     if len(page_links) != 2 and not (on_first_page and page_links):
         break

     # Build the next URL: drop the last path segment of the current URL and
     # append the href of the last matching link (presumably the 'next' button).
     base_url = "/".join(pages_urls[-1].split("/")[:-1])
     new_url = base_url + "/" + page_links[-1].get("href")

     # Add the URL to the list.
     pages_urls.append(new_url)

     # Parse the next page.
     soup = getAndParseURL(new_url)

 print(str(len(pages_urls)) + " fetched URLs")
 print("Some examples:")
 # Bare expression: displayed as cell output in a notebook, no effect in a script.
 pages_urls[:5]
	# Collect the URL of every page reachable through the pagination links,
	# starting from the main page.
	pages_urls = [main_url]

	soup = getAndParseURL(pages_urls[0])

	# Compile the pagination-link pattern once instead of on every lookup.
	page_link_pattern = re.compile("page")

	# Two matching links mean the page has both a 'previous' and a 'next' button.
	# A single matching link means we are on the first page or on the last page;
	# the first page is recognised by pages_urls still holding a single entry.
	# The crawl stops when the last page is reached.
	while True:
		# Search the current page once and reuse the result below.
		page_links = soup.findAll("a", href=page_link_pattern)
		on_first_page = len(pages_urls) == 1

		# Stop on the last page, and also when the first page has no pagination
		# link at all (indexing [-1] on an empty result would raise IndexError).
		if len(page_links) != 2 and not (on_first_page and page_links):
			break

		# Build the next URL: drop the last path segment of the current URL and
		# append the href of the last matching link (presumably the 'next' button).
		base_url = "/".join(pages_urls[-1].split("/")[:-1])
		new_url = base_url + "/" + page_links[-1].get("href")

		# Add the URL to the list.
		pages_urls.append(new_url)

		# Parse the next page.
		soup = getAndParseURL(new_url)

	print(str(len(pages_urls)) + " fetched URLs")
	print("Some examples:")
	# Bare expression: displayed as cell output in a notebook, no effect in a script.
	pages_urls[:5]