import httplib2
import webbrowser
import random
from BeautifulSoup import BeautifulSoup, SoupStrainer
#########################################################
#
# Extract all the hyperlinks from the given URL
#
#########################################################
def extractHyperLinksFromWebsite(siteUrl, targetWord):
    links = []
    http = httplib2.Http()
    status, response = http.request(siteUrl)
    previousLink = "PREVIOUS"

    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        href = link.get('href')
        if not href:
            continue

        #must contain the target keyword and also filter out dupe links;
        #dupe links occur because some pages re-use the same link
        #with a description and an image link
        if targetWord in href and previousLink != href:
            print link.contents

            #remember the raw href before it is rewritten below,
            #so the dupe check keeps working
            previousLink = href

            #prepend relative links with the domain name (e.g. theverge.com/...)
            if href[0] == '/':
                href = siteUrl + href

            #filter out links that are not perma links
            if '?' not in href:
                links.append(href)
                print href

    return links
#########################################################
#
# Creates a web page with the hyperlinks
#
#########################################################
def createWebPageFromLinks(filePath, links):
    fileWriter = open(filePath, "w")
    fileWriter.write("<html><head><title>mylinks</title>{CSS}</head><body><table>".replace("{CSS}", getCss()))

    items = 0
    classes = ['red', 'blue', 'green', 'violet', 'orange', 'yellow']
    previousColor = 0

    for link in links:
        if items == 0:
            fileWriter.write("<tr>")

        #turn the permalink slug into a readable description
        domain = link.split('-', 1)[0]
        friendlyDescription = link.replace(domain, '').replace('-', ' ')
        friendlyDescription = friendlyDescription[0].upper() + friendlyDescription[1:]

        #pick a cell color, re-rolling once to avoid repeating the previous one
        color = random.randint(0, 5)
        if color == previousColor:
            color = random.randint(0, 5)
        previousColor = color

        fileWriter.write(getLinkTemplate()
                         .replace("{HREF}", link)
                         .replace("{DESCRIPTION}", friendlyDescription)
                         .replace("{COLORCLASS}", classes[color]))
        fileWriter.write("\n")

        items += 1
        if items == 3:
            fileWriter.write("</tr>")
            items = 0

    #close a partially filled last row
    if items != 0:
        fileWriter.write("</tr>")

    fileWriter.write("</table></body></html>")
    fileWriter.close()
def getCss():
    return "<style type=\"text/css\">table,th,td{ border: 0px solid black;} a{ color: #ffffff;} " + \
           "a:visited{color:#ffffff;} a{font-size:1.5em; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif;} a:link{ text-decoration: none;} a:hover{color: #0000dd;} " + \
           ".red{background-color: #aa5555;} .green{background-color: #55aa55;} .blue{background-color: #5555aa;} " + \
           ".violet{background-color: #aa55aa;} .orange{background-color: #aaaa55;} .yellow{background-color: #55bbaa;} " + \
           ".roundedcorner{ /*border-radius: 25px;*/ padding: 20px; width: 200px; height: 200px; } " + \
           "tr { min-height: 200px; }" + \
           "</style>"
def getLinkTemplate():
    return "<td class=\"{COLORCLASS} roundedcorner\"><a href=\"{HREF}\">{DESCRIPTION}</a></td>"
############################################################
#
# Configuration of your favorite websites
#
############################################################
def getFavoriteWebsiteList():
    sites = []
    sites.append('http://www.theverge.com')
    #sites.append('http://www.techcrunch.com')
    #sites.append('http://www.livescience.com')
    return sites
completeListOfHyperLinks = []

for site in getFavoriteWebsiteList():
    links = extractHyperLinksFromWebsite(site, '-')
    if len(links) > 0:
        completeListOfHyperLinks.extend(links)

createWebPageFromLinks('links.html', completeListOfHyperLinks)
webbrowser.open('links.html', new=2)
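
Note: the script above targets Python 2 and the old BeautifulSoup 3 package (the `from BeautifulSoup import ...` form), which does not install on current interpreters. Purely as an illustrative sketch, and not part of the original gist, the link-extraction step could be reproduced on Python 3 roughly as follows, assuming the third-party `requests` and `beautifulsoup4` packages are installed; the function name `extract_hyperlinks` is invented here.

# Rough Python 3 sketch of extractHyperLinksFromWebsite
# (assumption: pip install requests beautifulsoup4)
import requests
from bs4 import BeautifulSoup, SoupStrainer

def extract_hyperlinks(site_url, target_word):
    links = []
    previous_href = None
    html = requests.get(site_url).text

    # parse only <a> tags, mirroring the SoupStrainer('a') trick above
    soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a'))
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or target_word not in href or href == previous_href:
            continue
        previous_href = href
        if href.startswith('/'):      # make relative links absolute
            href = site_url + href
        if '?' not in href:           # keep only permalink-style URLs
            links.append(href)
    return links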