Created
August 23, 2012 00:03
-
-
Save AaronGhent/3430741 to your computer and use it in GitHub Desktop.
LinkDump
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# >><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><< # | |
# linkdump - crawls a site and generates a list of links | |
# Author: Aaron Ghent | |
# >><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><< # | |
import os | |
import re | |
import sys | |
import urllib2 | |
# Request headers sent with every fetch so the crawler presents itself
# as a regular desktop browser.
HEADERS = [
    ('User-agent', 'Mozilla/5.0 Gecko/20100625 Firefox/3.6.6'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml'),
    ('Accept-Language', 'en-gb,en;q=0.5'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Keep-Alive', '300'),
    ('Connection', 'keep-alive'),
    ('Cache-Control', 'max-age=0'),
]
# IDIOM FIX: regex patterns as raw strings so backslashes are not subject
# to python string-escape interpretation; the redundant \/ and \% escapes
# are dropped (match behavior is identical).
# Matches href="..." / href='...' targets, capturing the url.
HREF_REGEX = r"""href=(?:"|')([-A-Z0-9+&@#/%=~_|$?!:,.]+)(?:"|')"""
# Matches bare absolute urls (http://, https:// or www. prefixed).
LINK_REGEX = r"((?:(?:https?)://|www\.)[-A-Z0-9+&@#/%=~_|$?!:,.]+)"
# Extensions of resources that are recorded but never parsed for links.
LINK_DONT_EXPLORE_EXT = ['.jpg', '.jpeg', '.gif', '.png', '.ico', '.bmp',
                         '.swf', '.mp3', '.ogg', '.pdf']
# Crawl state: pages already visited, and pages waiting to be crawled.
LINK_VISITED = []
LINK_QUEUE = []
def dump_links(root_url, crawl_url): | |
if crawl_url in LINK_VISITED: | |
return | |
for nope in LINK_DONT_EXPLORE_EXT: | |
url = crawl_url.lower() | |
if url.endswith(nope): | |
return | |
LINK_VISITED.append(crawl_url) | |
# content isnt a webpage | |
content = grab(crawl_url) | |
if not content: | |
return | |
print crawl_url | |
sys.stdout.flush() | |
links = get_links(content) | |
for link in links: | |
link = local_link(root_url, link) | |
if link or (link not in LINK_QUEUE): | |
LINK_QUEUE.append(link) | |
for link in LINK_QUEUE: | |
if not link: | |
return | |
current = link | |
LINK_QUEUE.remove(link) | |
dump_links(root_url, current.strip()) | |
def grab(url): | |
""" only return a url if its a valid page e.g. dont return images """ | |
snatcher = urllib2.build_opener() | |
snatcher.addheaders = HEADERS | |
response = None | |
try: | |
response = snatcher.open(url) | |
except urllib2.URLError, e: | |
if hasattr(e, 'reason'): | |
print '[[Error]] Reason: {0} Url {1}'.format(e.reason, url) | |
elif hasattr(e, 'code'): | |
print '[[Error]] code: {0} Url: {1}'.format(e.code, url) | |
sys.stdout.flush() | |
return None | |
type = response.info().getheader("Content-Type") | |
if type.find('html') != -1 or type.find('application') != -1: | |
return response.read() | |
return None | |
def get_links(content):
    """Extract candidate link strings from an html document.

    Collects both href="..." targets and bare http(s)/www urls that
    appear anywhere in the text, href matches first.
    """
    flags = re.IGNORECASE | re.MULTILINE
    href_matches = re.findall(HREF_REGEX, content, flags)
    bare_matches = re.findall(LINK_REGEX, content, flags)
    return href_matches + bare_matches
def local_link(host, url):
    """Normalise url to an absolute link rooted at host, or None.

    Returns None for mailto: links and for absolute links pointing at a
    foreign host. Relative urls are joined onto host; scheme-less local
    urls are forced to http.
    """
    if url.startswith('mailto:'):
        return None
    if url.startswith('http://') or url.startswith('https://'):
        # BUG FIX: the original prepended the scheme a second time,
        # producing urls like 'http://http://host/...'; the url already
        # carries its scheme, so return it untouched.
        if url.startswith('http://' + host) or url.startswith('https://' + host):
            return url
        elif url.startswith(host):
            return url
        else:
            return None  # absolute link to a foreign host
    elif url.startswith(host):
        return 'http://' + url  # force to http
    elif url.startswith('/'):
        # BUG FIX: original used host[-1] (just the last character)
        # where host[:-1] (host minus its trailing slash) was intended.
        if host.endswith('/'):
            return host[:-1] + url
        return host + url
    else:
        if host.endswith('/'):
            return host + url
        return host + '/' + url
def usage():
    """Print command-line usage information for this script."""
    prog = os.path.basename(sys.argv[0])
    separator = '>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<'
    for line in ('',
                 separator,
                 '{0} usage'.format(prog),
                 separator,
                 '',
                 '{0} <domain>'.format(prog),
                 '',
                 'example: {0} google.com'.format(prog),
                 '',
                 separator):
        print(line)
def main(): | |
if len(sys.argv) == 2: | |
link = local_link(sys.argv[1], sys.argv[1]) | |
print 'Grabbing links for {0}'.format(link) | |
dump_links(link, link) | |
else: | |
usage() | |
# Run the crawler only when executed as a script (not on import).
if __name__ == '__main__':
    main()
# >><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><< # |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment