@mickeypash
Last active August 29, 2015 14:25
Scraping practice: crawls the chapter and section outline of the Search User Interfaces book site (searchuserinterfaces.com/book).
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup

def make_soup(url):
    """Fetch a page and parse it with the lxml parser."""
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    return soup

def get_chapter_links(url):
    """Collect chapter URLs from the book's table of contents."""
    chapter_list = make_soup(url).find('div', class_='inside')
    ignored_chaps = ['Index', 'Errata', 'References']
    links = chapter_list.find_all('a')
    chapter_links = [a.get('href') for a in links
                     if a.get_text() not in ignored_chaps]
    return chapter_links

def get_sections(url):
    """Print a chapter's title followed by its section titles."""
    chapter = make_soup(url).find('div', class_='chapter')
    print(chapter.h1.get_text())
    section_list = chapter.find_all('div', class_='section')
    for section in section_list:
        print(" %s" % section.h2.get_text())

def get_paragraph():
    # Placeholder: paragraph extraction not implemented yet.
    pass


def save_to_file(filename='book_sections.txt'):
    # Placeholder: writing the outline to disk not implemented yet.
    with open(filename, 'w+') as bs:
        pass

if __name__ == '__main__':
    url = 'http://searchuserinterfaces.com/book/'
    chapter_links = get_chapter_links(url)
    # Crawl the first three chapters only.
    for link in chapter_links[:3]:
        get_sections(link)
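
The two placeholder functions are never filled in above. A minimal sketch of one way to complete them follows, assuming each section's body text lives in <p> tags inside the same div.section blocks the script already walks; the names get_paragraphs and save_sections_to_file, and the output layout, are illustrative rather than part of the original gist.

# Illustrative sketch, not part of the original gist.
# Assumes <p> markup inside each div.section block.
def get_paragraphs(url):
    """Return the text of every paragraph in a chapter's sections."""
    chapter = make_soup(url).find('div', class_='chapter')
    return [p.get_text()
            for section in chapter.find_all('div', class_='section')
            for p in section.find_all('p')]


def save_sections_to_file(url, filename='book_sections.txt'):
    """Write each crawled chapter's title and section titles to a file."""
    with open(filename, 'w') as bs:
        for link in get_chapter_links(url)[:3]:
            chapter = make_soup(link).find('div', class_='chapter')
            bs.write(chapter.h1.get_text() + '\n')
            for section in chapter.find_all('div', class_='section'):
                bs.write(' %s\n' % section.h2.get_text())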