Last active
October 13, 2015 21:38
-
-
Save informationsea/4259423 to your computer and use it in GitHub Desktop.
Make link graph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- python -*- | |
__author__ = 'Yasunobu OKAMURA' | |
__copyright__ = 'Copyright (c) 2014 Y.Okamura' | |
__license__ = 'GPLv3+' | |
import HTMLParser | |
import urllib | |
import argparse | |
import sqlite3 | |
import urlparse | |
import re | |
import csv | |
import os.path | |
import os | |
import sys | |
class LinkFinder(HTMLParser.HTMLParser):
    """Collect link targets from an HTML document.

    After ``feed()``, ``self.links`` holds ``(tag, url)`` tuples for every
    ``<a>``/``<link>`` href and every ``<img>``/``<script>`` src seen.
    """

    # Which attribute carries the link target for each tag we care about.
    _LINK_ATTR = {'a': 'href', 'link': 'href', 'img': 'src', 'script': 'src'}

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the link-carrying attribute value(s) of *tag*, if any."""
        wanted = self._LINK_ATTR.get(tag)
        if wanted is None:
            return
        for name, value in attrs:
            if name == wanted:
                self.links.append((tag, value))
def _main(): | |
parser = argparse.ArgumentParser(description='check links') | |
parser.add_argument('start_page', help='Start page') | |
parser.add_argument('report_db', default='report.sqlite3', nargs='?', help='Report Database (default:%(default)s)') | |
parser.add_argument('-d', '--domain', help='Check this domain only (default: domain of start_page)') | |
parser.add_argument('-p', '--restrict-prefix', help='Check URLs that are started with this prefix', default=None) | |
parser.add_argument('-c', '--check-outside', action='store_true', help='Check external link') | |
options = parser.parse_args() | |
if options.domain: | |
domain = options.domain | |
else: | |
domain = urlparse.urlparse(options.start_page).netloc | |
queue = set([options.start_page]) | |
checked = set() | |
if os.path.exists(options.report_db): | |
print >>sys.stderr, 'This program will delete previous reports. Are you sure?' | |
yes = raw_input('yes or no> ') | |
if yes != 'yes': | |
sys.exit('Abort by user') | |
if os.path.exists(options.report_db): | |
os.unlink(options.report_db) | |
db = sqlite3.connect(options.report_db) | |
db.execute('CREATE TABLE links(source, tag, target)') | |
db.execute('CREATE TABLE urls(url, code)') | |
remove_anchor = re.compile(r'#.+') | |
while queue: | |
target_url = queue.pop() | |
checked.add(target_url) | |
print target_url | |
parsed_target_url = urlparse.urlparse(target_url) | |
try: | |
f = urllib.urlopen(target_url) | |
db.execute('INSERT INTO urls VALUES(?, ?)', (target_url, f.code)) | |
if f.info().gettype() == 'text/html' and parsed_target_url.netloc == domain: | |
finder = LinkFinder() | |
finder.feed(unicode(f.read(), 'utf-8')) | |
for tag, url in finder.links: | |
parsed_url = urlparse.urlparse(url) | |
if not parsed_url.netloc: | |
url = urlparse.urljoin(target_url, url) | |
parsed_url = urlparse.urlparse(url) | |
url = remove_anchor.sub('', url) | |
db.execute('INSERT INTO links VALUES(?, ?, ?)', (target_url, tag, url)) | |
if url not in checked and parsed_url.scheme in ('http', 'https') \ | |
and (options.check_outside or parsed_url.netloc == domain) and \ | |
(not options.restrict_prefix or url.startswith(options.restrict_prefix)): | |
queue.add(url) | |
except Exception as e: | |
db.execute('INSERT INTO urls VALUES(?, ?)', (target_url, 0)) | |
db.commit() | |
if __name__ == '__main__': | |
_main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment