For scraping links from a blog and saving each post's text to a file.
#!/usr/bin/python
from bs4 import BeautifulSoup
from urllib2 import urlopen
import re
import os

def getAllLinks(url):
    print "getting links..."
    soup = BeautifulSoup(urlopen(url).read())  # make some soup
    linkList = list()
    for link in soup.find_all('a'):  # for every link in the page
        trimmedLink = link.get('href')  # trim the link to just the url
        if trimmedLink is None:  # skip anchors without an href
            continue
        trimmedLink = trimmedLink.encode('utf-8')  # taking care of encoding
        if re.search(r'20[0-9]{2}', trimmedLink):  # keep links whose url contains a year
            linkList.append(trimmedLink)
            print "Found link: " + trimmedLink
        elif re.search(r'category', trimmedLink):  # page is wonky, so open the link to get to the real page
            s = BeautifulSoup(urlopen(trimmedLink).read())  # open the next link
            cssLink = s.find("h1", {"class": "entry-title"})  # find the link based on its css class
            trimmedLink = cssLink.find('a').get('href')
            linkList.append(trimmedLink)
            print "Found link: " + trimmedLink
    return linkList

def getTitle(soup):
    entryTitle = soup.find("h1", {"class": "entry-title"})  # finds the title based on its css class
    return entryTitle.get_text()

def writeToFile(title, text):
    filename = "_".join(title.split())  # replace runs of whitespace with underscores
    print "writing: " + filename
    f = open(filename, 'w')
    wholeText = title + text
    f.write(wholeText.encode('utf-8'))
    f.close()

def getText(soup):
    entryContent = soup.find("div", {"class": "entry-content"})  # the post body lives in this div
    text = entryContent.get_text()
    # Filter out unwanted strings
    text = re.sub(r'Share this:', '', text)
    text = re.sub(r'Related', '', text)
    return text

def pullOnePage(link):
    soup = BeautifulSoup(urlopen(link).read())
    title = getTitle(soup)
    text = getText(soup)
    print title
    print text
    writeToFile(title, text)

def pullAllThePages(url):
    for link in getAllLinks(url):
        soup = BeautifulSoup(urlopen(link).read())
        title = getTitle(soup)
        text = getText(soup)
        writeToFile(title, text)

if __name__ == "__main__":
    directory = "dir"  # name of the directory to save scraped pages into
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(directory)
    pullAllThePages("")  # pass a url into this
    # pullOnePage("")
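A minimal usage sketch, assuming a hypothetical WordPress-style blog whose post URLs contain the year (the example URLs below are placeholders, not from the original gist):

    # Hypothetical URLs for illustration only:
    # pullAllThePages("http://someblog.wordpress.com/")  # scrape every post linked from the front page
    # pullOnePage("http://someblog.wordpress.com/2013/11/11/some-post/")  # scrape a single post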