For scraping links from a blog and saving each post's text to a file.
#!/usr/bin/python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import re
import os

def getAllLinks(url):
    print "getting links..."
    soup = BeautifulSoup(urlopen(url).read())  # make some soup
    linkList = list()
    for link in soup.find_all('a'):  # for every link in the page
        trimmedLink = link.get('href')  # trim the link to just the url
        if trimmedLink is None:  # skip anchors without an href
            continue
        trimmedLink = trimmedLink.encode('utf-8')  # taking care of encoding
        if re.search(r'20[0-9]{2}', trimmedLink):  # keep links whose url contains a year
            linkList.append(trimmedLink)
            print "Found link: " + trimmedLink
        elif re.search(r'category', trimmedLink):  # page is wonky, so open the link to get to the real page
            s = BeautifulSoup(urlopen(trimmedLink).read())  # open the next link
            cssLink = s.find("h1", {"class": "entry-title"})  # find the link based on its css class
            trimmedLink = cssLink.find('a').get('href')
            linkList.append(trimmedLink)
            print "Found link: " + trimmedLink
    return linkList

def getTitle(soup):
    entryTitle = soup.find("h1", {"class": "entry-title"})  # finds the title based on its css class
    return entryTitle.get_text()

def writeToFile(title, text):
    filename = "_".join(title.split())  # replace runs of whitespace with underscores
    print "writing: " + filename
    f = open(filename, 'w')
    wholeText = title + text
    f.write(wholeText.encode('utf-8'))
    f.close()

def getText(soup):
    entryContent = soup.find("div", {"class": "entry-content"})  # the post body lives in this div
    text = entryContent.get_text()
    # Filter out unwanted strings
    text = re.sub(r'Share this:', '', text)
    text = re.sub(r'Related', '', text)
    return text

def pullOnePage(link):
    soup = BeautifulSoup(urlopen(link).read())
    title = getTitle(soup)
    text = getText(soup)
    print title
    print text
    writeToFile(title, text)

def pullAllThePages(url):
    for link in getAllLinks(url):
        soup = BeautifulSoup(urlopen(link).read())
        title = getTitle(soup)
        text = getText(soup)
        writeToFile(title, text)

if __name__ == "__main__":
    directory = "dir"  # name of the directory to save scraped pages into
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(directory)
    pullAllThePages("")  # pass a url into this
    # pullOnePage("")
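A minimal usage sketch, assuming a hypothetical WordPress-style blog whose post URLs contain the year (the example URLs below are placeholders, not from the original gist):

    # Hypothetical URLs for illustration only:
    # pullAllThePages("http://someblog.wordpress.com/")  # scrape every post linked from the front page
    # pullOnePage("http://someblog.wordpress.com/2013/11/11/some-post/")  # scrape a single post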