Created
November 6, 2012 08:38
-
-
Save scottaj/4023512 to your computer and use it in GitHub Desktop.
HTML Tag Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requires the nokogiri gem; install it first with 'gem install nokogiri'.
require 'nokogiri' | |
require 'set' | |
# Element names and input types that this script reports as "HTML5".
#
# Plain entries are element names. Entries of the form "input:<type>" name
# an <input> element by its type attribute, which is the same label format
# that scrape_tags_recursively produces for input elements.
#
# Frozen so the list cannot be modified by accident at runtime.
HTML5_TAGS = %w[
  section nav article
  aside hgroup header
  footer figure figcaption
  data time mark
  ruby rt rp
  bdi wbr embed
  video audio source
  track canvas svg
  math datalist keygen
  output progress meter
  details summary command
  menu input:color input:date
  input:datetime input:datetime-local
  input:email input:month input:number
  input:range input:search input:tel
  input:time input:url input:week
].freeze
# Collects the distinct tag labels of every element nested beneath +html+.
#
# Non-input elements are recorded by their tag name. Input elements are
# recorded as "input:<type>" so that individual input types can be told
# apart. The type value is stripped and lower-cased; an input whose type
# attribute is missing or blank is recorded as "input:text", since text is
# the default input type in HTML.
#
# @param html [#elements] a node (e.g. a Nokogiri document or element)
#   whose +elements+ respond to +name+, +attributes+ and +elements+
# @return [Set<String>] the unique tag labels found at any depth below +html+
def scrape_tags_recursively(html)
  html.elements.each_with_object(Set.new) do |element, tags|
    if element.name == "input"
      # attributes["type"] is nil when the attribute is absent, so guard
      # before asking for its value.
      type_attr = element.attributes["type"]
      type = type_attr ? type_attr.value.to_s.strip.downcase : ""
      type = "text" if type.empty?
      tags << "input:#{type}"
    else
      tags << element.name
    end
    # Descend into the element; a leaf simply contributes an empty set.
    tags.merge(scrape_tags_recursively(element))
  end
end
# Parses the HTML file at +path+ with Nokogiri and returns the document.
#
# File.open is used instead of Kernel#open so that a path beginning with
# "|" is never run as a shell command, and the block form closes the file
# handle as soon as parsing is done.
def parse_html_file(path)
  File.open(path) { |file| Nokogiri::HTML(file) }
end

# Script entry point.
#
# Takes the HTML file to scan from the first command-line argument, or
# prompts for one on standard input when no argument was given. If the
# resulting name is empty, every "*.html" file in the current directory is
# scanned instead. Prints every tag found, then the subset listed in
# HTML5_TAGS, each in sorted order.
#
# Prints "Could not load file" and returns early when the named file cannot
# be read. In directory mode an unreadable file is reported on standard
# error and skipped.
def main
  target = ARGV[0]
  if target.nil?
    # Type in a file if none was entered on the command line.
    print "Enter HTML file to parse: "
    $stdout.flush
    # gets returns nil at end of input; treat that like an empty answer.
    target = $stdin.gets.to_s.chomp
  end

  tags = Set.new
  if target.empty?
    # If no file is entered, scrape every HTML file in the current directory.
    target = "all files"
    Dir.foreach(".") do |file|
      next unless file =~ /\A.+\.html\z/i && File.file?(file)

      begin
        tags |= scrape_tags_recursively(parse_html_file(file))
      rescue SystemCallError, IOError => e
        warn "Skipping #{file}: #{e.message}"
      end
    end
  else
    begin
      html = parse_html_file(target)
    rescue SystemCallError, IOError
      puts "Could not load file"
      return
    end
    tags |= scrape_tags_recursively(html)
  end

  sorted_tags = tags.sort
  puts "\nHTML tags in #{target}:"
  sorted_tags.each { |tag| puts tag }
  puts "\nHTML5 Tags in #{target}:"
  sorted_tags.each { |tag| puts tag if HTML5_TAGS.include?(tag) }
end
# Run the scraper only when this file is executed directly as a script.
main if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment