Created January 29, 2015 16:12
Simple, focused link checker that outputs to csv
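A typical invocation (the output file, site directory, and filter list shown here are only placeholders) looks like:

python.exe urlChecker.py UrlResults.csv C:/my-website/html ".html,.erb"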
#-------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
# See the license for details, but you know the general guidance: this is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#-------------------------------------------------------------------------------
# Name: urlChecker.py
# Usage: python.exe urlChecker.py [Output File(.csv)] [Directory To Check] [File Filters(default:".html,.erb")]
# Requirements/Constraints: Must run with Python 2.x (because it uses urllib2)
#-------------------------------------------------------------------------------
# Description: a URL-checking script
# 1. Crawl the specified local directory
# 2. Open each text file matching the file filter/extension
# 3. Check each URL found in the file (verify that the URL returns a valid response)
# 4. Output bad URLs to a csv file
#-------------------------------------------------------------------------------
# Adapted/reviewed from: https://github.com/ACueva/Avi-Playground/blob/master/urlChecker.py
#-------------------------------------------------------------------------------
import csv
import datetime
import os
import re
import sys
import traceback
import urllib2  # Python3 Note: will need to be changed (urllib.request / urllib.error)

def usage():
    print('Usage: python.exe urlChecker.py [Output File(.csv)] [Directory To Check] [File Filters(ex/default:".html,.erb")]')

def main():
    try:
        # Set this path to the local folder containing the web site repo you want to crawl (or pass it in as a parameter)
        # Currently defaults to the current path
        DEFAULT_PATH = '.'
        # Ex: DEFAULT_PATH = 'C:/my-website/html'

        # if this verbose flag is set, all URLs found are listed (not just the failing ones)
        verbose = False

        # a list of all the errors found
        urlErrorsList = []

        if len(sys.argv) < 2:
            outputFile = 'UrlResults.csv'
        else:
            outputFile = sys.argv[1]

        if len(sys.argv) < 3:
            path = DEFAULT_PATH
        else:
            path = sys.argv[2]

        if len(sys.argv) < 4:
            patterns = ['.html', '.erb']
            # Show usage() if not all args were supplied:
            usage()
        else:
            stringFilter = sys.argv[3]
            patterns = stringFilter.split(',')

        print('[Output File]: ' + outputFile)
        print('[Directory To Check]: ' + path)
        print('[File Filters]: ' + ', '.join(patterns))

        if not os.path.exists(path):
            raise Exception("Selected path does not exist: " + path)

        # Walk through the directory structure looking for files matching the patterns
        matchingFileList = [os.path.join(dp, f)
                            for dp, dn, filenames in os.walk(path)
                            for f in filenames
                            if os.path.splitext(f)[1] in patterns]
        # Search each file for references to http and https. If a URL is found, it is tested.
        fileCount, urlFileCount, urlCount = 0, 0, 0
        for currentFile in matchingFileList:
            fileCount += 1
            print('Checking File #' + str(fileCount) + ', file: ' + currentFile)
            searchfile = open(currentFile, 'r')
            lineNumber = 0
            newUrlFile = True
            for line in searchfile:
                lineNumber += 1
                if 'href="http' in line:
                    # Get a list of items between quotes " "
                    quotedItems = re.findall('"([^"]*)"', line)
                    for quotedItem in quotedItems:
                        # If Web Link (matches both http and https):
                        if 'http' in quotedItem:
                            badUrl = False
                            url, code, message = '', '', ''
                            try:
                                url = quotedItem
                                response = urllib2.urlopen(url)
                                code = str(response.getcode())
                                checkUrl = response.geturl()  # final URL after redirects, in case we want to compare the two
                                message = 'no error'
                            except urllib2.HTTPError as httpErr:
                                badUrl = True
                                code = str(httpErr.code)
                                # message = TODO
                                # if more info is needed, it could potentially be stripped from the response:
                                # error_message = httpErr.read()
                                # print(error_message)
                            except urllib2.URLError as urlErr:
                                badUrl = True
                                code = '0'
                                message = urlErr.args
                            if badUrl or verbose:
                                urlCount += 1
                                if newUrlFile:
                                    urlFileCount += 1
                                    newUrlFile = False
                                urlError = [str(urlCount), str(urlFileCount), currentFile, str(lineNumber), url, code, message]
                                urlErrorsList.append(urlError)
            searchfile.close()
        if urlCount == 0:
            print("No URLs found in files.")
            return

        # Now output URL errors to file
        with open(outputFile, 'wb') as csvfile:  # Python3 Note: will need to be changed (to 'w', newline='')
            errorWriter = csv.writer(csvfile)
            # Header Row
            errorWriter.writerow(['URL Count', 'File Count', 'File Name', 'Line Number', 'URL', 'Return Code', 'Message'])
            for row in urlErrorsList:
                errorWriter.writerow(row)
    except Exception as err:
        print(traceback.format_exception_only(type(err), err)[0].rstrip())

if __name__ == '__main__':
    print('Start Time: ' + str(datetime.datetime.now()))
    main()
    print('End Time: ' + str(datetime.datetime.now()))
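The script requires Python 2 because it uses urllib2 (see the "Python3 Note" comments above). A minimal sketch of how the per-URL check could be ported to Python 3, assuming the standard-library urllib.request and urllib.error modules (the check_url helper name is illustrative, not part of the gist):

import urllib.request
import urllib.error

def check_url(url):
    # Mirrors the Python 2 logic above: returns (badUrl, code, message)
    try:
        response = urllib.request.urlopen(url)
        return False, str(response.getcode()), 'no error'
    except urllib.error.HTTPError as httpErr:
        # HTTP-level failure (e.g. 404 or 500): report the status code
        return True, str(httpErr.code), ''
    except urllib.error.URLError as urlErr:
        # Network-level failure (DNS, refused connection, etc.): no status code available
        return True, '0', str(urlErr.reason)

Under Python 3 the csv output would also change, as the code notes, from open(outputFile, 'wb') to open(outputFile, 'w', newline='').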