Skip to content

Instantly share code, notes, and snippets.

@informationsea
Last active October 13, 2015 21:38
Show Gist options
  • Save informationsea/4259423 to your computer and use it in GitHub Desktop.
Make a link graph of a website by crawling it and recording the links in SQLite.
#!/usr/bin/env python
# -*- python -*-
__author__ = 'Yasunobu OKAMURA'
__copyright__ = 'Copyright (c) 2014 Y.Okamura'
__license__ = 'GPLv3+'
import HTMLParser
import urllib
import argparse
import sqlite3
import urlparse
import re
import csv
import os.path
import os
import sys
class LinkFinder(HTMLParser.HTMLParser):
"""
"""
def __init__(self):
"""
"""
HTMLParser.HTMLParser.__init__(self)
self.links = list()
def handle_starttag(self, tag, attrs):
"""
Arguments:
- `self`:
- `tag`:
- `attrs`:
"""
if tag == 'a' or tag == 'link':
for k, v in [x for x in attrs if x[0] == 'href']:
self.links.append((tag, v))
elif tag == 'img' or tag == 'script':
for k, v in [x for x in attrs if x[0] == 'src']:
self.links.append((tag, v))
def _main():
parser = argparse.ArgumentParser(description='check links')
parser.add_argument('start_page', help='Start page')
parser.add_argument('report_db', default='report.sqlite3', nargs='?', help='Report Database (default:%(default)s)')
parser.add_argument('-d', '--domain', help='Check this domain only (default: domain of start_page)')
parser.add_argument('-p', '--restrict-prefix', help='Check URLs that are started with this prefix', default=None)
parser.add_argument('-c', '--check-outside', action='store_true', help='Check external link')
options = parser.parse_args()
if options.domain:
domain = options.domain
else:
domain = urlparse.urlparse(options.start_page).netloc
queue = set([options.start_page])
checked = set()
if os.path.exists(options.report_db):
print >>sys.stderr, 'This program will delete previous reports. Are you sure?'
yes = raw_input('yes or no> ')
if yes != 'yes':
sys.exit('Abort by user')
if os.path.exists(options.report_db):
os.unlink(options.report_db)
db = sqlite3.connect(options.report_db)
db.execute('CREATE TABLE links(source, tag, target)')
db.execute('CREATE TABLE urls(url, code)')
remove_anchor = re.compile(r'#.+')
while queue:
target_url = queue.pop()
checked.add(target_url)
print target_url
parsed_target_url = urlparse.urlparse(target_url)
try:
f = urllib.urlopen(target_url)
db.execute('INSERT INTO urls VALUES(?, ?)', (target_url, f.code))
if f.info().gettype() == 'text/html' and parsed_target_url.netloc == domain:
finder = LinkFinder()
finder.feed(unicode(f.read(), 'utf-8'))
for tag, url in finder.links:
parsed_url = urlparse.urlparse(url)
if not parsed_url.netloc:
url = urlparse.urljoin(target_url, url)
parsed_url = urlparse.urlparse(url)
url = remove_anchor.sub('', url)
db.execute('INSERT INTO links VALUES(?, ?, ?)', (target_url, tag, url))
if url not in checked and parsed_url.scheme in ('http', 'https') \
and (options.check_outside or parsed_url.netloc == domain) and \
(not options.restrict_prefix or url.startswith(options.restrict_prefix)):
queue.add(url)
except Exception as e:
db.execute('INSERT INTO urls VALUES(?, ?)', (target_url, 0))
db.commit()
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment