LinkDump
#! /usr/bin/env python
# >><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><< #
# linkdump - crawls a site and generates a list of links
# Author: Aaron Ghent
# >><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><< #
import os
import re
import sys
import urllib2
HEADERS = [
    ('User-agent', 'Mozilla/5.0 Gecko/20100625 Firefox/3.6.6'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml'),
    ('Accept-Language', 'en-gb,en;q=0.5'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Keep-Alive', '300'),
    ('Connection', 'keep-alive'),
    ('Cache-Control', 'max-age=0'),
]
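# HREF_REGEX pulls the target out of href="..." / href='...' attributes;
# LINK_REGEX also catches bare http(s):// and www. URLs in the page text.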
HREF_REGEX = r"href=(?:\"|')([-A-Z0-9+&@#/%=~_|$?!:,.]+)(?:\"|')"
LINK_REGEX = r"((?:https?://|www\.)[-A-Z0-9+&@#/%=~_|$?!:,.]+)"
LINK_DONT_EXPLORE_EXT = ['.jpg', '.jpeg', '.gif', '.png', '.ico', '.bmp',
                         '.swf', '.mp3', '.ogg', '.pdf']
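# Module-level crawl state: URLs already fetched, and URLs waiting to be fetched.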
LINK_VISITED = []
LINK_QUEUE = []
def dump_links(root_url, crawl_url):
    """Recursively crawl crawl_url, printing every local link found."""
    if crawl_url in LINK_VISITED:
        return
    url = crawl_url.lower()
    for ext in LINK_DONT_EXPLORE_EXT:
        if url.endswith(ext):
            return
    LINK_VISITED.append(crawl_url)
    # grab() returns None when the content isn't a web page
    content = grab(crawl_url)
    if not content:
        return
    print crawl_url
    sys.stdout.flush()
    for link in get_links(content):
        link = local_link(root_url, link)
        if link and link not in LINK_QUEUE:
            LINK_QUEUE.append(link)
    # drain the queue rather than removing items while iterating over it
    while LINK_QUEUE:
        current = LINK_QUEUE.pop(0)
        dump_links(root_url, current.strip())
def grab(url):
    """Fetch url and return its body, but only for page-like content
    (HTML and application/* types); e.g. don't return images."""
    snatcher = urllib2.build_opener()
    snatcher.addheaders = HEADERS
    try:
        response = snatcher.open(url)
    except urllib2.URLError, e:
        if hasattr(e, 'reason'):
            print '[[Error]] Reason: {0} Url: {1}'.format(e.reason, url)
        elif hasattr(e, 'code'):
            print '[[Error]] Code: {0} Url: {1}'.format(e.code, url)
        sys.stdout.flush()
        return None
    # avoid shadowing the builtin `type`; the header may be missing entirely
    content_type = response.info().getheader("Content-Type") or ''
    if 'html' in content_type or 'application' in content_type:
        return response.read()
    return None
def get_links(content):
    """Collect every href target and bare URL found in content."""
    links = []
    regex = re.compile(HREF_REGEX, re.IGNORECASE | re.MULTILINE)
    links.extend(regex.findall(content))
    regex = re.compile(LINK_REGEX, re.IGNORECASE | re.MULTILINE)
    links.extend(regex.findall(content))
    return links
def local_link(host, url):
    """Normalise url against host; return None for mailto and external links."""
    if url.startswith('mailto:'):
        return None
    if url.startswith('http://') or url.startswith('https://'):
        if url.startswith('http://' + host) or url.startswith('https://' + host):
            return url  # already an absolute local URL
        elif url.startswith(host):
            return url  # host was given with its scheme
        else:
            return None
    elif url.startswith(host):
        return 'http://' + url  # force to http
    elif url.startswith('/'):
        if host.endswith('/'):
            return host[:-1] + url
        return host + url
    else:
        if host.endswith('/'):
            return host + url
        return host + '/' + url
def usage():
    name = os.path.basename(sys.argv[0])
    print('')
    print('>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<')
    print('{0} usage'.format(name))
    print('>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<')
    print('')
    print('{0} <domain>'.format(name))
    print('')
    print('example: {0} google.com'.format(name))
    print('')
    print('>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<')
def main():
    if len(sys.argv) == 2:
        link = local_link(sys.argv[1], sys.argv[1])
        print 'Grabbing links for {0}'.format(link)
        dump_links(link, link)
    else:
        usage()

if __name__ == '__main__':
    main()
# >><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><<>><< #
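# Example invocation (a minimal sketch; assumes the script is saved as
# linkdump.py and run under Python 2, which urllib2 requires):
#
#   python linkdump.py google.com
#
# Each crawled URL is printed to stdout as it is visited, so the output
# can be redirected to a file to produce the link dump.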