# Source: GitHub Gist by @fernandozamoraj (fernandozamoraj/0d69e44aea29953db797).
# Last active September 25, 2015.
import httplib2
import webbrowser
import random
from BeautifulSoup import BeautifulSoup, SoupStrainer
#########################################################
#
# Extract all the hyperlinks from the given URL
#
#########################################################
def extractHyperLinksFromWebsite(siteUrl, targetWord):
    """Fetch ``siteUrl`` and return the hyperlinks on it that look like permalinks.

    An anchor is kept when its ``href`` contains ``targetWord``, differs from
    the link kept immediately before it, and contains no ``?``. An ``href``
    starting with ``/`` is made absolute by prefixing it with ``siteUrl``
    (plain concatenation, so ``siteUrl`` should not end with ``/``).

    The contents of every candidate anchor and every kept URL are printed as
    progress output.

    Args:
        siteUrl: URL of the page to download; also the prefix used for
            site-relative links.
        targetWord: Substring an ``href`` must contain to be considered.

    Returns:
        List of the kept URLs, in the order they appear on the page.
    """
    links = []
    http = httplib2.Http()
    _status, response = http.request(siteUrl)
    previousLink = None
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        href = link.get('href')
        if not href or targetWord not in href:
            continue
        # Make site-relative links absolute (e.g. /foo -> theverge.com/foo)
        # *before* the duplicate check: previousLink always holds the absolute
        # form, so comparing the raw href would never match a repeated
        # relative link.
        if href[0] == '/':
            href = siteUrl + href
        # Pages often emit the same link twice in a row: once around the
        # image and once around the description.
        if href == previousLink:
            continue
        # Single-argument print(...) behaves identically as a Python 2 print
        # statement and as a Python 3 function call.
        print(link.contents)
        # Links carrying a query string are not permalinks.
        if '?' in href:
            continue
        links.append(href)
        print(href)
        previousLink = href
    return links
#########################################################
#
# Creates a website with the hyperlinks
#
#########################################################
def createWebPageFromLinks(filePath, links):
    """Write an HTML page to ``filePath`` showing ``links`` as coloured tiles.

    The links are laid out in a table, three per row. Each tile's caption is
    the part of the link after its first ``-`` with the remaining dashes
    turned into spaces and the first character upper-cased; a link without
    usable text after a dash is captioned with the link itself. Each tile
    gets a randomly chosen colour class that differs from the previous
    tile's.

    Args:
        filePath: Path of the HTML file to create (overwritten if present).
        links: Iterable of URL strings.
    """
    # html.escape only exists on Python 3; cgi.escape is the Python 2 spelling.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    classes = ['red', 'blue', 'green', 'violet', 'orange', 'yellow']
    columnsPerRow = 3
    template = getLinkTemplate()
    previousClass = None
    itemsInRow = 0

    # "with" guarantees the file is closed even if writing a link fails.
    with open(filePath, "w") as fileWriter:
        fileWriter.write("<html><head><title>mylinks</title>{CSS}</head><body><table>".replace("{CSS}", getCss()))
        for link in links:
            if itemsInRow == 0:
                fileWriter.write("<tr>")

            # Everything up to the first dash is treated as the site/path
            # prefix; the rest is the human-readable slug.
            slug = link.partition('-')[2]
            friendlyDescription = slug.replace('-', ' ').strip()
            if friendlyDescription:
                friendlyDescription = friendlyDescription[0].upper() + friendlyDescription[1:]
            else:
                # No slug to show (no dash, or nothing after it).
                friendlyDescription = link

            # Choose among the other classes so two neighbouring tiles never
            # share a colour.
            colorClass = random.choice([name for name in classes if name != previousClass])
            previousClass = colorClass

            # Links are scraped from external pages, so escape them before
            # embedding them in markup.
            fileWriter.write(template
                             .replace("{HREF}", escape(link, True))
                             .replace("{DESCRIPTION}", escape(friendlyDescription, True))
                             .replace("{COLORCLASS}", colorClass))
            fileWriter.write("\n")

            itemsInRow += 1
            if itemsInRow == columnsPerRow:
                fileWriter.write("</tr>")
                itemsInRow = 0

        # Close a partially filled final row.
        if itemsInRow != 0:
            fileWriter.write("</tr>")
        fileWriter.write("</table></body></html>")
def getCss():
    """Return the inline ``<style>`` element used by the generated page."""
    # One fragment per rule; the spaces inside the quotes are part of the
    # emitted markup and are kept exactly.
    fragments = (
        '<style type="text/css">',
        'table,th,td{ border: 0px solid black;} ',
        'a{ color: #ffffff;} ',
        'a:visited{color:#ffffff;} ',
        "a{font-size:1.5em; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif;} ",
        'a:link{ text-decoration: none;} ',
        'a:hover{color: #0000dd;} ',
        '.red{background-color: #aa5555;} ',
        '.green{background-color: #55aa55;} ',
        '.blue{background-color: #5555aa;} ',
        '.violet{background-color: #aa55aa;} ',
        '.orange{background-color: #aaaa55;} ',
        '.yellow{background-color: #55bbaa;} ',
        '.roundedcorner{ /*border-radius: 25px;*/ padding: 20px; width: 200px; height: 200px; } ',
        ' tr { min-height: 200px; }',
        '</style>',
    )
    return ''.join(fragments)
def getLinkTemplate():
    """Return the markup for one table cell holding a link.

    The template contains the placeholders ``{COLORCLASS}``, ``{HREF}`` and
    ``{DESCRIPTION}``, which the caller substitutes with ``str.replace``.
    """
    cellTemplate = '<td class="{COLORCLASS} roundedcorner"><a href="{HREF}" >{DESCRIPTION}</a></td>'
    return cellTemplate
############################################################
#
# Configuration of your favorite websites
#
############################################################
def getFavoriteWebsiteList():
    """Return the list of site URLs whose links should be collected."""
    return [
        'http://www.theverge.com',
        # Other sites, currently disabled:
        # 'http://www.techcrunch.com',
        # 'http://www.livescience.com',
    ]
# Script body: collect the links from every configured site, render them into
# a local HTML page, and open that page in the default browser.
import os
try:
    from urllib.request import pathname2url  # Python 3
except ImportError:
    from urllib import pathname2url  # Python 2

outputPath = 'links.html'

completeListOfHyperLinks = []
for site in getFavoriteWebsiteList():
    # '-' as the target word selects slug-style article links.
    links = extractHyperLinksFromWebsite(site, '-')
    completeListOfHyperLinks.extend(links)

createWebPageFromLinks(outputPath, completeListOfHyperLinks)

# webbrowser.open() expects a URL; handing it a bare relative filename is
# documented as unsupported and non-portable, so pass an absolute file: URL.
webbrowser.open('file:' + pathname2url(os.path.abspath(outputPath)), new=2)
# (GitHub page footer) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment