Created
January 19, 2012 07:36
-
-
Save usahg/1638605 to your computer and use it in GitHub Desktop.
validate html of a login protected page, gems used :: mechanize and w3c_validator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#dependencies : | |
#gem install mechanize | |
# ^ if this doesn't work and you are on Ubuntu, you'll have to install nokogiri first
# `gem install __name__` now installs all dependencies by default (equivalent to the old `gem install __name__ -y`)
#gem install w3c_validators
# I had to write this in order to validate pages against html5 compliance for our in house product | |
# my first repo. | |
require "rubygems" | |
require "w3c_validators" | |
require 'mechanize' | |
include W3CValidators | |
# Validates an HTML string with the W3C markup validator using the HTML5
# doctype, prints the outcome to stdout and appends an entry to the
# 'errorlog_html1' file in the current working directory.
#
# To validate against a different HTML version, change :html5 to another
# doctype symbol (for example :html32).
#
# @param text_frag [String] HTML source to validate
# @return [nil]
def validate(text_frag)
  # A local is enough here; an instance variable would be set on the
  # top-level main object and linger between calls.
  validator = MarkupValidator.new
  validator.set_doctype!(:html5)
  results = validator.validate_text(text_frag)

  messages = results.errors.map(&:to_s)
  if messages.empty?
    puts 'Valid!'
  else
    puts messages
  end

  # The log name used to be built from a counter that was reset to 1 on every
  # call, so the suffix was always 1; the fixed name keeps the same file.
  File.open('errorlog_html1', 'a') do |f|
    f.puts 'generating error log for url '
    # Interpolation instead of String#concat avoids mutating a string literal.
    f.puts "\nat #{Time.now}", ' '
    f.puts messages
    f.puts "\n"
  end
end
# Logs in through the first form found on the login page and collects the
# HTML source of the post-login page followed by the source of every link.
#
# The same Mechanize agent is used for the login and for every later request.
# Fill in the userName / pwd values below with real credentials before use.
#
# @param links [Array<String>] absolute URLs to fetch after logging in
# @param login_url [String] absolute URL of the page holding the login form
# @return [Array<String>] page bodies: the post-login page first, then one
#   entry per link, in the order given
# @raise [RuntimeError] if the login page contains no form
def get_list_of_html_fragments(links, login_url = 'https://www.twitter.com')
  agent = Mechanize.new

  # Mechanize needs an absolute URL (scheme included) for the first request,
  # because there is no current page to resolve a relative one against.
  login_form = agent.get(login_url).forms.first
  raise "no login form found at #{login_url}" unless login_form

  login_form.userName = ' '
  login_form.pwd = ' '
  landing_page = agent.submit(login_form)

  [landing_page.body] + links.map { |link| agent.get(link).body }
end
# you can add your own implementation here, import links from a file etc | |
# Pages to validate after logging in. The URLs carry an explicit scheme:
# Mechanize resolves a scheme-less string such as 'www.twitter.com/railstroll'
# as a path relative to the current page rather than as a host name.
links = %w[
  https://www.twitter.com/railstroll
  https://www.twitter.com/someone_else
]

bodies = get_list_of_html_fragments(links)
bodies.each do |frag|
  puts 'validating this frag ', frag
  validate(frag)
  puts "\n\n\n successfully validated.....\n\n\n"
end
puts 'done done done'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment