Created January 29, 2015 16:12
Simple, focused link checker that outputs to csv
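A typical invocation (the output file, site directory, and filter list shown here are only placeholders) looks like:

python.exe urlChecker.py UrlResults.csv C:/my-website/html ".html,.erb"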
#-------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
# See the license for details, but you know the general guidance: this is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#-------------------------------------------------------------------------------
# Name: urlChecker.py
# Usage: python.exe urlChecker.py [Output File(.csv)] [Directory To Check] [File Filters(default:".html,.erb")]
# Requirements/Constraints: Must run with Python 2.x (because it uses urllib2)
#-------------------------------------------------------------------------------
# Description: a URL-checking script
# 1. Crawl the specified local directory
# 2. Open each text file matching the file filter/extension
# 3. Check each URL found in the file (verify that the URL returns a valid response)
# 4. Output bad URLs to a csv file
#-------------------------------------------------------------------------------
# Adapted/reviewed from: https://github.com/ACueva/Avi-Playground/blob/master/urlChecker.py
#-------------------------------------------------------------------------------
import csv
import datetime
import os
import re
import sys
import traceback
import urllib2  # Python3 Note: will need to be changed (urllib.request / urllib.error)

def usage():
    print('Usage: python.exe urlChecker.py [Output File(.csv)] [Directory To Check] [File Filters(ex/default:".html,.erb")]')

def main():
    try:
        # Set this path to the local folder containing the web site repo you want to crawl (or pass it in as a parameter)
        # Currently defaults to the current path
        DEFAULT_PATH = '.'
        # Ex: DEFAULT_PATH = 'C:/my-website/html'

        # if this verbose flag is set, all URLs found are listed (not just the failing ones)
        verbose = False

        # a list of all the errors found
        urlErrorsList = []

        if len(sys.argv) < 2:
            outputFile = 'UrlResults.csv'
        else:
            outputFile = sys.argv[1]

        if len(sys.argv) < 3:
            path = DEFAULT_PATH
        else:
            path = sys.argv[2]

        if len(sys.argv) < 4:
            patterns = ['.html', '.erb']
            # Show usage() if not all args were supplied:
            usage()
        else:
            stringFilter = sys.argv[3]
            patterns = stringFilter.split(',')

        print('[Output File]: ' + outputFile)
        print('[Directory To Check]: ' + path)
        print('[File Filters]: ' + ', '.join(patterns))

        if not os.path.exists(path):
            raise Exception("Selected path does not exist: " + path)

        # Walk through the directory structure looking for files matching the patterns
        matchingFileList = [os.path.join(dp, f)
                            for dp, dn, filenames in os.walk(path)
                            for f in filenames
                            if os.path.splitext(f)[1] in patterns]
        # Search each file for references to http and https. If a URL is found, it is tested.
        fileCount, urlFileCount, urlCount = 0, 0, 0
        for currentFile in matchingFileList:
            fileCount += 1
            print('Checking File #' + str(fileCount) + ', file: ' + currentFile)
            searchfile = open(currentFile, 'r')
            lineNumber = 0
            newUrlFile = True
            for line in searchfile:
                lineNumber += 1
                if 'href="http' in line:
                    # Get a list of items between quotes " "
                    quotedItems = re.findall('"([^"]*)"', line)
                    for quotedItem in quotedItems:
                        # If Web Link (matches both http and https):
                        if 'http' in quotedItem:
                            badUrl = False
                            url, code, message = '', '', ''
                            try:
                                url = quotedItem
                                response = urllib2.urlopen(url)
                                code = str(response.getcode())
                                checkUrl = response.geturl()  # final URL after redirects, in case we want to compare the two
                                message = 'no error'
                            except urllib2.HTTPError as httpErr:
                                badUrl = True
                                code = str(httpErr.code)
                                # message = TODO
                                # if more info is needed, it could potentially be stripped from the response:
                                # error_message = httpErr.read()
                                # print(error_message)
                            except urllib2.URLError as urlErr:
                                badUrl = True
                                code = '0'
                                message = urlErr.args
                            if badUrl or verbose:
                                urlCount += 1
                                if newUrlFile:
                                    urlFileCount += 1
                                    newUrlFile = False
                                urlError = [str(urlCount), str(urlFileCount), currentFile, str(lineNumber), url, code, message]
                                urlErrorsList.append(urlError)
            searchfile.close()
        if urlCount == 0:
            print("No URLs found in files.")
            return

        # Now output URL errors to file
        with open(outputFile, 'wb') as csvfile:  # Python3 Note: will need to be changed (to 'w', newline='')
            errorWriter = csv.writer(csvfile)
            # Header Row
            errorWriter.writerow(['URL Count', 'File Count', 'File Name', 'Line Number', 'URL', 'Return Code', 'Message'])
            for row in urlErrorsList:
                errorWriter.writerow(row)
    except Exception as err:
        print(traceback.format_exception_only(type(err), err)[0].rstrip())

if __name__ == '__main__':
    print('Start Time: ' + str(datetime.datetime.now()))
    main()
    print('End Time: ' + str(datetime.datetime.now()))
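The script requires Python 2 because it uses urllib2 (see the "Python3 Note" comments above). A minimal sketch of how the per-URL check could be ported to Python 3, assuming the standard-library urllib.request and urllib.error modules (the check_url helper name is illustrative, not part of the gist):

import urllib.request
import urllib.error

def check_url(url):
    # Mirrors the Python 2 logic above: returns (badUrl, code, message)
    try:
        response = urllib.request.urlopen(url)
        return False, str(response.getcode()), 'no error'
    except urllib.error.HTTPError as httpErr:
        # HTTP-level failure (e.g. 404 or 500): report the status code
        return True, str(httpErr.code), ''
    except urllib.error.URLError as urlErr:
        # Network-level failure (DNS, refused connection, etc.): no status code available
        return True, '0', str(urlErr.reason)

Under Python 3 the csv output would also change, as the code notes, from open(outputFile, 'wb') to open(outputFile, 'w', newline='').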