Created
December 30, 2010 08:10
-
-
Save justjkk/759582 to your computer and use it in GitHub Desktop.
Accepts a list of URIs and prints the count of HTML errors and warnings for each one, scraped from validator.w3.org
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from BeautifulSoup import BeautifulSoup | |
import urllib2 | |
import urllib | |
import sys | |
import csv | |
def validator_scraper(uris):
    """Scrape validator.w3.org for each URI in *uris*.

    Returns a list of dicts of the form
    ``{"uri": <uri>, "errors": <int>, "warnings": <int>}``.
    A URI whose page cannot be fetched or parsed is reported on stderr
    and omitted from the result list.
    """
    # Spoof a browser UA so the validator serves the normal HTML page.
    headers = {'User-Agent': 'Mozilla/5 (Ubuntu 10.04) Gecko'}
    scraped = []
    for uri in uris:
        try:
            check_url = "http://validator.w3.org/check?%s" % urllib.urlencode({'uri': uri})
            page = urllib2.urlopen(urllib2.Request(check_url, None, headers)).read()
            # The first <td> of the first table row holds the verdict summary.
            summary_td = BeautifulSoup(page).table.tr.td
            errors = 0
            warnings = 0
            verdict = summary_td['class']
            if verdict == 'invalid':
                # Summary text looks like "N Errors, M warning(s)".
                pieces = summary_td.contents[0].split(",")
                errors = int(pieces[0].split("Error")[0].strip())
                if len(pieces) > 1:
                    warnings = int(pieces[1].split("warning")[0].strip())
            elif verdict == 'valid':
                # A valid page may still carry warnings in a <strong> tag.
                tag = summary_td.find("strong", {"class": "has_warnings"})
                if tag is not None:
                    warnings = int(tag.contents[0].split("warning")[0].strip())
            else:
                raise Exception("<td> tag that neither belongs to class 'valid' nor 'invalid'")
            scraped.append({"uri": uri, "errors": errors, "warnings": warnings})
        except Exception as e:
            sys.stderr.write("Exception during processing uri: '%s'\nDetails: %s" % (uri, e))
    return scraped
def print_csv(results):
    """Write *results* (dicts with uri/errors/warnings keys) to stdout as CSV."""
    fieldnames = ['uri', 'errors', 'warnings']
    # A plain writer emits the header row; DictWriter.writeheader() is not
    # available on the older Pythons this script targets.
    csv.writer(sys.stdout).writerow(fieldnames)
    row_writer = csv.DictWriter(sys.stdout, fieldnames, extrasaction='ignore')
    row_writer.writerows(results)
if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Bug fix: the original printed the literal "%s" because it never
        # interpolated the script name into the usage string.
        print("Usage: python %s file-containing-uris-separated-by-newline" % sys.argv[0])
        # Exit non-zero so shells/scripts can detect the misuse.
        sys.exit(1)
    # 'with' guarantees the file is closed (the original leaked the handle).
    with open(sys.argv[1], "r") as fptr:
        # One URI per line; skip blank lines (including a trailing newline).
        uris = [uri for uri in fptr.read().split('\n') if uri != ""]
    results = validator_scraper(uris)
    print_csv(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment