Created
April 16, 2014 14:34
-
-
Save rharter/10885705 to your computer and use it in GitHub Desktop.
Web crawler to look for hidden short links in Google Developers site that (hopefully) lead to I/O tickets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
import sys | |
from lxml import html | |
from urlparse import urljoin | |
import urllib2 | |
import requests | |
import logging | |
# URLs already fetched, shared module-level state used by crawl() to
# ensure each page is visited at most once.
visited_links = []
def get_links(url):
    '''Fetch *url* and return every anchor href found on the page.

    While parsing, any text node that contains a goo.gl short link is
    written to the log prefixed with a BEL character (so a `tail -f`
    on the log file beeps).

    Returns an empty list when the page cannot be parsed, so callers
    can always iterate over the result (previously returned None on
    failure, which crashed the caller's for-loop).
    '''
    # verify=False skips TLS certificate checks -- acceptable for a
    # throwaway crawler, never for anything security-sensitive.
    page = requests.get(url, verify=False)
    try:
        tree = html.fromstring(page.text)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        logging.error("Couldn't parse this html.")
        return []
    # xpath() always returns a list (possibly empty), so no None check
    # is needed before iterating.
    for spec in tree.xpath("//text()[contains(.,'goo.gl')]"):
        logging.info('\afound shortlink: ' + spec)
    return tree.xpath('//a/@href')
def crawl(seed):
    '''Breadth-first crawl starting at *seed*.

    Follows only links that stay on developers.google.com or are
    site-relative (start with '/').  Visited URLs are recorded in the
    module-level ``visited_links`` list so no page is fetched twice.
    '''
    queue = [seed]
    while queue:
        link = queue.pop(0)  # FIFO pop -> breadth-first order
        if link.startswith('/'):
            # Site-relative path: resolve against the seed URL.
            link = urljoin(seed, link)
        logging.debug("Crawling " + link)
        if link in visited_links:
            continue
        visited_links.append(link)
        # get_links may return None on a parse failure; treat as no links
        # instead of raising TypeError on iteration.
        new_links = get_links(link) or []
        for l in new_links:
            # Parentheses fix an operator-precedence bug: previously the
            # trailing `or l.startswith('/')` re-queued any '/'-prefixed
            # link even when it was falsy or already visited.
            if l and l not in visited_links and (
                    l.startswith('https://developers.google.com')
                    or l.startswith('/')):
                queue.append(l)
# --- Script entry point -------------------------------------------------
# argv[1] (optional): seed URL, defaults to the Developers site root.
# argv[2] (optional): log file path; logs go to stderr when omitted
#                     (previously sys.argv[2] was read unconditionally
#                     and raised IndexError without a second argument).
start = 'https://developers.google.com'
if len(sys.argv) > 1:
    start = str(sys.argv[1])

# Set up logging: everything to the given file (or stderr), and quiet
# the chatty `requests` logger down to warnings.
log_file = sys.argv[2] if len(sys.argv) > 2 else None
logging.basicConfig(filename=log_file, level=logging.DEBUG)
logging.getLogger('requests').setLevel(logging.WARNING)

# working link
#crawl("https://developers.google.com/analytics/community/index?home")
crawl(start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment