Ruby script using Wgit to extract the meaningful content from a webpage (without the crap, e.g. cookie banners)
require "wgit" | |
# Remove the default extractors since we won't be using them. | |
Wgit::Document.remove_extractors | |
# The default name of the output file containing the clean HTML. | |
def default_file_name | |
"webpage.html" | |
end | |
# The HTML elements containing the content that we're interested in viewing. | |
def content_elements | |
Set.new(%i[ | |
a abbr address aside b bdi bdo blockquote caption cite | |
code data del details dfn div dl em figcaption figure footer h1 h2 | |
h3 h4 h5 h6 header hr i img ins kbd legend main mark meter ol | |
option output p pre q rb rt ruby s samp section small span strong sub | |
summary sup textarea time u ul var wbr | |
]) | |
end | |
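
# Note what's absent from the list above: elements like nav, form, button,
# script and iframe typically hold page chrome rather than meaningful content.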

# Returns an xpath query (String) to extract the meaningful content on a page.
def content_xpath
  content_elements
    .map { |el| "//#{el}" }
    .join(" | ")
end
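
# For reference, the query built above looks like:
#   "//a | //abbr | //address | ... | //wbr"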

# Extracts the meaningful content on a webpage to be viewed without annoyances like popups etc.
class CleanCrawler < Wgit::Base
  start ARGV.first
  mode :crawl_url
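
  # Each extract below defines a reader of the same name on the crawled
  # Wgit::Document, so doc.content and doc.article are available in #parse.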
  extract :content, content_xpath, singleton: false, text_content_only: false
  extract :article, "//article", singleton: true, text_content_only: false

  attr_reader :file_name

  # Parse/process the crawled web document. We want to extract and write only the meaningful
  # content, not the crap around it, e.g. cookie banners.
  def parse(doc)
    raise "doc.content should be an Enumerable" unless doc.content.is_a?(Enumerable)

    @file_name = ARGV[1] || default_file_name
    File.open(@file_name, "w+") do |f|
      write_html(f) do
        write_content(f, doc)
      end
    end
  end

  private

  # Write the opening HTML tags with a CSS link to Simple.css.
  def write_html(file)
    html_opening_tags = <<~HTML
      <html>
      <head>
        <link rel="stylesheet" href="https://cdn.simplecss.org/simple.min.css">
      </head>
    HTML

    file.write(html_opening_tags)
    yield
    file.write("</html>")
  end

  # If there's an article element, write it to file, otherwise write all of the page content.
  def write_content(file, doc)
    if doc.article
      file.write(doc.article)
    else
      write_article(file) do
        doc.content.each { |el| file.write(el) }
      end
    end
  end

  # Wrap the yielded content in an <article> element.
  def write_article(file)
    file.write("<article>")
    yield
    file.write("</article>")
  end
end

if __FILE__ == $0
  if ARGV.empty?
    raise "missing URL parameter, use like: ruby extract.rb http://example.com [example.html]"
  end

  crawler = CleanCrawler.run
  file_name = crawler.file_name
  file_size = File.size(file_name)
  absolute_file_path = File.expand_path(file_name)

  puts "Wrote #{file_size} bytes to:\n#{absolute_file_path}"
end

Use like:

$ ruby extract.rb http://example.com example.html
Wrote 751 bytes to:
${PWD}/example.html

The final argument is a file name and is optional, defaulting to `webpage.html`. You can then open the resulting webpage in any browser to see a clean version of the webpage you've crawled, without annoyances like popups, subscription requests, cookie banners etc.
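
For reference, given the write_html and write_article methods above, the saved file has roughly this shape, with the extracted page elements in place of the ellipsis:

    <html>
    <head>
      <link rel="stylesheet" href="https://cdn.simplecss.org/simple.min.css">
    </head>
    <article>
      ...
    </article>
    </html>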