scottaj · November 6, 2012 08:38
diff --git a/tag_scraper.rb b/tag_scraper.rb
 ## Make sure you install nokogiri: 'gem install nokogiri'
 require 'nokogiri'
 require 'set'

 # Tag names this script reports under its "HTML5 Tags" heading. Input
 # elements are listed per type as "input:<type>", matching the labels that
 # scrape_tags_recursively produces. Frozen so the shared list cannot be
 # modified by accident at runtime.
 HTML5_TAGS = [
               "section", "nav", "article",
               "aside", "hgroup", "header",
               "footer", "figure", "figcaption",
               "data", "time", "mark",
               "ruby", "rt", "rp",
               "bdi", "wbr", "embed",
               "video", "audio", "source",
               "track", "canvas", "svg",
               "math", "datalist", "keygen",
               "output", "progress", "meter",
               "details", "summary", "command",
               "menu", "input:color", "input:date",
               "input:datetime", "input:datetime-local",
               "input:email", "input:month", "input:number",
               "input:range", "input:search", "input:tel",
               "input:time", "input:url", "input:week"
              ].freeze

 # Collects the names of all elements nested anywhere beneath +html+.
 #
 # +html+ is any node that responds to +elements+ (its child element nodes);
 # each of those must respond to +name+, +attributes+ and +elements+.
 # The name of +html+ itself is not included.
 #
 # Input elements are recorded as "input:<type>" so the different input
 # types can be told apart. The type is lower-cased because HTML compares
 # it case-insensitively, and an input whose type attribute is missing or
 # blank is recorded as "input:text", the HTML default.
 #
 # Returns a Set of strings.
 def scrape_tags_recursively(html)
   tags = Set.new

   html.elements.each do |element|
     if element.name == "input"
       # The attribute node is nil when the tag carries no type attribute,
       # so it must be checked before asking for its value.
       type_attr = element.attributes["type"]
       type = type_attr ? type_attr.value.to_s.strip.downcase : ""
       type = "text" if type.empty?
       tags << "input:#{type}"
     else
       tags << element.name
     end

     # Descend unconditionally: a childless element simply yields nothing.
     # merge updates the set in place instead of building a new one per child.
     tags.merge(scrape_tags_recursively(element))
   end

   tags
 end

 # Command-line entry point.
 #
 # Takes the HTML file to scan from the first command-line argument, or
 # prompts for one on standard input when no argument was given. A blank
 # answer (or end of input) means "scan every .html file in the current
 # directory". Prints the sorted list of all tags found, followed by the
 # subset that appears in HTML5_TAGS. ARGV is left untouched.
 #
 # Prints "Could not load file" and returns early when the single named
 # file cannot be opened or parsed.
 def main
   path = ARGV[0]
   if path.nil?
     print "Enter HTML file to parse: "
     # gets returns nil at end of input; to_s turns that into a blank answer
     # instead of raising on nil.chomp.
     path = $stdin.gets.to_s.chomp
   end

   # File.open (unlike Kernel#open) never interprets a leading "|" as a
   # command to run, and the block form closes the handle after parsing.
   parse_file = lambda { |name| File.open(name) { |io| Nokogiri::HTML(io) } }
   tags = Set.new

   if path.empty?
     # No file was entered: scrape every HTML file in the current directory.
     label = "all files"
     Dir.foreach(".") do |file|
       next unless file =~ /^.+\.html$/i
       next unless File.file?(file) # skip directories that happen to end in .html

       tags.merge(scrape_tags_recursively(parse_file.call(file)))
     end
   else
     label = path
     begin
       html = parse_file.call(path)
     rescue StandardError
       puts "Could not load file"
       return
     end
     tags.merge(scrape_tags_recursively(html))
   end

   sorted = tags.sort
   puts "\nHTML tags in #{label}:"
   sorted.each { |tag| puts tag }
   puts "\nHTML5 Tags in #{label}:"
   sorted.each { |tag| puts tag if HTML5_TAGS.include?(tag) }
 end

 # Run the scraper only when this file is the program being executed.
 main if $PROGRAM_NAME == __FILE__
	## Make sure you install nokogiri: 'gem install nokogiri'
	require 'nokogiri'
	require 'set'

	# Tag names this script reports under its "HTML5 Tags" heading. Input
	# elements are listed per type as "input:<type>", matching the labels that
	# scrape_tags_recursively produces. Frozen so the shared list cannot be
	# modified by accident at runtime.
	HTML5_TAGS = [
	  "section", "nav", "article",
	  "aside", "hgroup", "header",
	  "footer", "figure", "figcaption",
	  "data", "time", "mark",
	  "ruby", "rt", "rp",
	  "bdi", "wbr", "embed",
	  "video", "audio", "source",
	  "track", "canvas", "svg",
	  "math", "datalist", "keygen",
	  "output", "progress", "meter",
	  "details", "summary", "command",
	  "menu", "input:color", "input:date",
	  "input:datetime", "input:datetime-local",
	  "input:email", "input:month", "input:number",
	  "input:range", "input:search", "input:tel",
	  "input:time", "input:url", "input:week"
	].freeze

	# Collects the names of all elements nested anywhere beneath +html+.
	#
	# +html+ is any node that responds to +elements+ (its child element nodes);
	# each of those must respond to +name+, +attributes+ and +elements+.
	# The name of +html+ itself is not included.
	#
	# Input elements are recorded as "input:<type>" so the different input
	# types can be told apart. The type is lower-cased because HTML compares
	# it case-insensitively, and an input whose type attribute is missing or
	# blank is recorded as "input:text", the HTML default.
	#
	# Returns a Set of strings.
	def scrape_tags_recursively(html)
	  tags = Set.new

	  html.elements.each do |element|
	    if element.name == "input"
	      # The attribute node is nil when the tag carries no type attribute,
	      # so it must be checked before asking for its value.
	      type_attr = element.attributes["type"]
	      type = type_attr ? type_attr.value.to_s.strip.downcase : ""
	      type = "text" if type.empty?
	      tags << "input:#{type}"
	    else
	      tags << element.name
	    end

	    # Descend unconditionally: a childless element simply yields nothing.
	    # merge updates the set in place instead of building a new one per child.
	    tags.merge(scrape_tags_recursively(element))
	  end

	  tags
	end

	# Command-line entry point.
	#
	# Takes the HTML file to scan from the first command-line argument, or
	# prompts for one on standard input when no argument was given. A blank
	# answer (or end of input) means "scan every .html file in the current
	# directory". Prints the sorted list of all tags found, followed by the
	# subset that appears in HTML5_TAGS. ARGV is left untouched.
	#
	# Prints "Could not load file" and returns early when the single named
	# file cannot be opened or parsed.
	def main
	  path = ARGV[0]
	  if path.nil?
	    print "Enter HTML file to parse: "
	    # gets returns nil at end of input; to_s turns that into a blank answer
	    # instead of raising on nil.chomp.
	    path = $stdin.gets.to_s.chomp
	  end

	  # File.open (unlike Kernel#open) never interprets a leading "|" as a
	  # command to run, and the block form closes the handle after parsing.
	  parse_file = lambda { |name| File.open(name) { |io| Nokogiri::HTML(io) } }
	  tags = Set.new

	  if path.empty?
	    # No file was entered: scrape every HTML file in the current directory.
	    label = "all files"
	    Dir.foreach(".") do |file|
	      next unless file =~ /^.+\.html$/i
	      next unless File.file?(file) # skip directories that happen to end in .html

	      tags.merge(scrape_tags_recursively(parse_file.call(file)))
	    end
	  else
	    label = path
	    begin
	      html = parse_file.call(path)
	    rescue StandardError
	      puts "Could not load file"
	      return
	    end
	    tags.merge(scrape_tags_recursively(html))
	  end

	  sorted = tags.sort
	  puts "\nHTML tags in #{label}:"
	  sorted.each { |tag| puts tag }
	  puts "\nHTML5 Tags in #{label}:"
	  sorted.each { |tag| puts tag if HTML5_TAGS.include?(tag) }
	end

	# Run the scraper only when this file is the program being executed.
	main if $PROGRAM_NAME == __FILE__
No results found