Created
March 5, 2014 14:58
-
-
Save nst/9368839 to your computer and use it in GitHub Desktop.
Test dead links from a web page list. Written to help someone on a newsgroup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""
Test dead links from a web page list.

Written to help someone on a newsgroup.

$ cat links.txt
http://seriot.ch/index.php
http://apple.com

$ python checkurl.py links.txt
http://seriot.ch/index.php
http://www.hjkbkhjdsf.com/ <- dead link
http://apple.com
"""

# The docstring above must be the first statement of the module to be bound
# to __doc__, and must be delimited by exactly three quotes: a fourth quote
# on the closing delimiter starts a new, unterminated string literal.

__version__ = "$Revision: 0.2 $"
__author__ = "Nicolas Seriot"
__date__ = "2005-07-20"
import sys | |
from sets import * | |
from urllib2 import * | |
from re import * | |
def urls(base_url, html):
    """Return the set of link targets found in *html*, made absolute.

    Every ``<a href="...">`` value that is non-empty and does not start
    with ``#`` (a same-page anchor) is collected. Relative targets are
    resolved against *base_url* with the standard library's ``urljoin``,
    so ``foo.html``, ``./foo.html`` and ``/foo.html`` are each resolved the
    way a browser would. Targets that already carry their own scheme
    (``http:``, ``mailto:``, ...) are returned unchanged.

    Args:
        base_url: URL of the page *html* was fetched from.
        html: Markup of that page, as a string.

    Returns:
        A ``set`` of URL strings; empty when no link is found.
    """
    import re

    # Imported locally so the function does not depend on which Python
    # major version the surrounding module targets.
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    # [^#"] rejects same-page anchors and empty href="" attributes;
    # [^"]* then runs up to the closing quote (newlines included).
    anchor = re.compile(r'<a href="([^#"][^"]*)"')

    return set(urljoin(base_url, href) for href in anchor.findall(html))
def main(argv=None):
    """Report the dead links of every page listed in a text file.

    The file named by ``argv[1]`` holds one page URL per line. Each page
    URL is printed, the page is downloaded, and every link extracted from
    it by ``urls()`` is requested with the page as ``Referer``. Links whose
    request fails are printed, indented, below the page URL.

    Args:
        argv: Command-line argument list; defaults to ``sys.argv``.

    Returns:
        Process exit status: 1 when the file argument is missing,
        0 otherwise.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        print("USAGE: %s file.txt" % argv[0])
        return 1

    # The context manager closes the list file even if a request raises.
    with open(argv[1]) as listing:
        for line in listing:
            # Strip first: raw lines still carry their newline, so testing
            # the unstripped line would never detect a blank one.
            base_url = line.strip()
            if not base_url:
                continue

            print(base_url)

            try:
                page = urlopen(base_url)
                try:
                    html = page.read()
                finally:
                    page.close()
            except Exception as err:
                # Catch Exception rather than everything so Ctrl-C still
                # stops the run; report on stderr to keep stdout unchanged.
                sys.stderr.write("cannot fetch %s: %s\n" % (base_url, err))
                continue

            for url in urls(base_url, html):
                try:
                    req = Request(url)
                    req.add_header("Referer", base_url)
                    # Only reachability matters; release the connection.
                    urlopen(req).close()
                except Exception:
                    # Same text as the former `print " ", url`.
                    print("  %s" % url)

    return 0


# Argument checking lives in main() so importing this module (for example
# to reuse urls()) no longer terminates the importing process.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment