Skip to content

Instantly share code, notes, and snippets.

@justjkk
Created December 30, 2010 08:10
Show Gist options
  • Save justjkk/759582 to your computer and use it in GitHub Desktop.
Save justjkk/759582 to your computer and use it in GitHub Desktop.
Accepts a list of URIs and displays the count of HTML errors and warnings for each, scraped from validator.w3.org
from BeautifulSoup import BeautifulSoup
import urllib2
import urllib
import sys
import csv
def _leading_count(text, marker):
    """Return the integer found in ``text`` immediately before ``marker``.

    For example ``_leading_count(" 2 warning(s)", "warning")`` gives ``2``.
    Raises ValueError when the text before the marker is not an integer.
    """
    return int(text.split(marker)[0].strip())


def _count_issues(status_td):
    """Return ``(error_count, warning_count)`` read from the result cell.

    ``status_td`` is the BeautifulSoup ``<td>`` tag taken from the validator
    page.  Its ``class`` attribute selects how the counts are read:

    * ``invalid`` -- the cell's first child is a comma-separated string;
      the number before "Error" in the first part is the error count and,
      if a second part exists, the number before "warning" in it is the
      warning count.
    * ``valid`` -- there are no errors; warnings come from an optional
      ``<strong class="has_warnings">`` element inside the cell.

    Raises Exception for any other class value.
    """
    status = status_td['class']
    if status == 'invalid':
        parts = status_td.contents[0].split(",")
        error_count = _leading_count(parts[0], "Error")
        # The warnings part is only present when there is at least one.
        warning_count = _leading_count(parts[1], "warning") if len(parts) > 1 else 0
        return error_count, warning_count
    if status == 'valid':
        warnings = status_td.find("strong", {"class": "has_warnings"})
        if warnings is None:
            return 0, 0
        return 0, _leading_count(warnings.contents[0], "warning")
    raise Exception("<td> tag that neither belongs to class 'valid' nor 'invalid'")


def validator_scraper(uris):
    """Check each URI with validator.w3.org and collect error/warning counts.

    For every entry of ``uris`` the validator's ``/check`` page is fetched
    and the counts are scraped from the ``<td>`` reached through
    ``soup.table.tr.td``.

    Processing is best-effort: any exception raised while handling one URI
    (network failure, unexpected page layout, unparsable count) is reported
    on stderr and that URI is left out of the result; the remaining URIs are
    still processed.

    Returns a list of dicts with the keys ``"uri"``, ``"errors"`` and
    ``"warnings"``, in the order the URIs were successfully processed.
    """
    headers = {'User-Agent': 'Mozilla/5 (Ubuntu 10.04) Gecko'}
    results = []
    for uri in uris:
        try:
            params = urllib.urlencode({'uri': uri})
            url = "http://validator.w3.org/check?%s" % params
            request = urllib2.Request(url, None, headers)
            response = urllib2.urlopen(request)
            try:
                html = response.read()
            finally:
                # Release the connection even when reading fails.
                response.close()
            soup = BeautifulSoup(html)
            error_count, warning_count = _count_issues(soup.table.tr.td)
            results.append({"uri": uri, "errors": error_count, "warnings": warning_count})
        except Exception as e:
            # Per-URI boundary: report and carry on with the next URI.  The
            # trailing newline keeps consecutive reports from running together.
            sys.stderr.write("Exception during processing uri: '%s'\nDetails: %s\n" % (uri, e))
    return results
def print_csv(results, out=None):
    """Write ``results`` as CSV with a ``uri,errors,warnings`` header row.

    ``results`` is an iterable of dicts.  Only the ``uri``, ``errors`` and
    ``warnings`` keys are written, in that order; any other keys are ignored
    and a missing key is written as an empty field.

    ``out`` is the file-like object to write to.  It defaults to
    ``sys.stdout``, looked up at call time so that a redirected
    ``sys.stdout`` is honoured.
    """
    if out is None:
        out = sys.stdout
    header = ['uri', 'errors', 'warnings']
    # A single plain writer emits both the header and the data rows.
    writer = csv.writer(out)
    writer.writerow(header)
    for result in results:
        writer.writerow([result.get(column, '') for column in header])
def read_uris(path):
    """Return the non-blank lines of the file at ``path``, one URI per line.

    Surrounding whitespace (including a trailing carriage return from
    CRLF-terminated files) is stripped from every line, and lines that are
    empty after stripping are skipped.
    """
    with open(path, "r") as uri_file:
        stripped = [line.strip() for line in uri_file]
    return [uri for uri in stripped if uri]


def main(argv):
    """Run the scraper for the URI file named in ``argv[1]``.

    ``argv`` has the layout of ``sys.argv``: the program name followed by
    exactly one argument, the path of a newline-separated list of URIs.

    Returns the process exit status: 0 after the CSV report has been
    printed, 2 when the argument count is wrong.
    """
    if len(argv) != 2:
        # Usage goes to stderr because stdout is reserved for the CSV report.
        sys.stderr.write(
            "Usage: python %s file-containing-uris-separated-by-newline\n" % argv[0]
        )
        return 2
    print_csv(validator_scraper(read_uris(argv[1])))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment