Skip to content

Instantly share code, notes, and snippets.

@ejmurray
Created August 4, 2015 14:37
Show Gist options
  • Save ejmurray/5cb946c4d07842dcb5dd to your computer and use it in GitHub Desktop.
Save ejmurray/5cb946c4d07842dcb5dd to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
__author__ = 'Ernest'
import re
import urllib2
from urlparse import urljoin

from bs4 import BeautifulSoup
# Site root used as the base for the next-page links extracted from listing pages.
packtpub_url = "http://www.packtpub.com/"
def get_bookurls(url):
    """Return the URL of the listing page that follows ``url``.

    Fetches ``url``, parses it with BeautifulSoup and looks for the pager's
    "next" item (``<li class="pager-next last">``).

    Args:
        url: Absolute URL (including scheme) of a book listing page.

    Returns:
        The absolute URL of the next listing page, or ``None`` when the page
        has no usable "next" link.
    """
    page = urllib2.urlopen(url)
    try:
        soup_packtpage = BeautifulSoup(page, "lxml")
    finally:
        # Release the connection even if parsing fails.
        page.close()

    next_page_li = soup_packtpage.find("li", class_="pager-next last")
    if next_page_li is None or next_page_li.a is None:
        return None

    href = next_page_li.a.get('href')
    if not href:
        return None

    # urljoin resolves the href against the site root without doubling the
    # slash (plain concatenation yields "http://host//path" for root-relative
    # links) and leaves already-absolute hrefs intact.
    return urljoin(packtpub_url, href)
# Walk the paginated book listing, collecting the URL of every listing page.
# The scheme is required: urllib2.urlopen rejects a bare "www..." address
# with "ValueError: unknown url type".
start_url = "http://www.packtpub.com/books"
continue_scrapping = True
books_url = [start_url]
while continue_scrapping:
    next_page_url = get_bookurls(start_url)
    # Stop on the last page, or when the pager points back at a page that was
    # already collected (which would otherwise loop forever).
    if next_page_url is None or next_page_url in books_url:
        continue_scrapping = False
    else:
        books_url.append(next_page_url)
        start_url = next_page_url
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment