@d6e
Created November 11, 2013 09:50
For scraping links. A Python 2 script (uses urllib2 and print statements, with beautifulsoup4) that crawls a blog index for dated post links and writes each post's title and text to a file.
#!/usr/bin/python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import re
import os

def getAllLinks(url):
    print "getting links..."
    soup = BeautifulSoup(urlopen(url).read())  # make some soup
    linkList = list()
    for link in soup.find_all('a'):  # for every link on the page
        trimmedLink = link.get('href')  # trim the link down to just the url
        if trimmedLink is None:  # skip anchors that have no href at all
            continue
        trimmedLink = trimmedLink.encode('utf-8')  # taking care of encoding
        if re.search(r'20[0-9]{2}', trimmedLink):  # keep links containing a year (2000-2099)
            linkList.append(trimmedLink)
            print "Found link: " + trimmedLink
        elif re.search(r'category', trimmedLink):  # page is wonky, so open the category link to reach the real page
            s = BeautifulSoup(urlopen(trimmedLink).read())  # open the next link
            cssLink = s.find("h1", {"class": "entry-title"})  # find the link based on its css class
            trimmedLink = cssLink.find('a').get('href')
            linkList.append(trimmedLink)
            print "Found link: " + trimmedLink
    return linkList

def getTitle(soup):
    entryTitle = soup.find("h1", {"class": "entry-title"})  # finds the title based on its css class
    return entryTitle.get_text()

def writeToFile(title, text):
    filename = "_".join(title.split())  # split on whitespace, join with underscores (str.replace was giving trouble)
    print "writing: " + filename
    f = open(filename, 'w')
    wholeText = title + text
    f.write(wholeText.encode('utf-8'))
    f.close()

def getText(soup):
    entryContent = soup.find("div", {"class": "entry-content"})
    text = entryContent.get_text()
    # Filter out unwanted strings
    text = re.sub(r'Share this:', '', text)
    text = re.sub(r'Related', '', text)
    return text

def pullOnePage(link):
    soup = BeautifulSoup(urlopen(link).read())
    title = getTitle(soup)
    text = getText(soup)
    print title
    print text
    writeToFile(title, text)

def pullAllThePages(url):  # renamed parameter so it no longer shadows the loop variable
    for link in getAllLinks(url):
        soup = BeautifulSoup(urlopen(link).read())
        title = getTitle(soup)
        text = getText(soup)
        writeToFile(title, text)

if __name__ == "__main__":
    directory = "dir"  # name of the output directory
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(directory)
    pullAllThePages("")  # pass a url into this
    # pullOnePage("")
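A minimal usage sketch, leaving the empty-string placeholder in __main__ as-is above. The URLs here are hypothetical; the script assumes a WordPress-style blog (entry-title / entry-content classes) whose post permalinks contain a year, since the 20[0-9]{2} regex is what identifies post links:

    pullAllThePages("http://example-blog.com")                 # hypothetical index url
    pullOnePage("http://example-blog.com/2013/11/some-post/")  # hypothetical post url

This must run under Python 2 (urllib2 and print statements) with beautifulsoup4 installed.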