Created
August 28, 2022 22:32
-
-
Save edmangimelli/6c800facb76870d45b26d2c300f4ce0b to your computer and use it in GitHub Desktop.
Super simple script to grab an offline copy of Hacker News.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true

# offline_hn.rb
# super-simple-grab-offline-copy-of-hacker-news
# description:
#   - grabs the links on the front page of hacker news
#   - grabs each linked page, and each page of comments
#   - combines it all into one offline-able html
# usage:
#   ruby offline_hn.rb > hn.html

require 'nokogiri'
require 'open-uri' # NOTE(review): not referenced below (Net::HTTP is used) — kept for compatibility
require 'net/http'

# Hard cap on fetched pages so a run stays bounded.
MAX_PAGE_LOADS = 70
# Politeness delay between consecutive HTTP requests.
SECONDS_TO_SLEEP_BETWEEN_REQUESTS = 0.5
# Emit one self-contained HTML document to stdout: an <h1> banner, then a
# collapsible <details> section for every front-page submission link and
# every "discuss"/"N comments" link, each wrapping the fetched page body.
# Stops after MAX_PAGE_LOADS pages.
def main
  pages_loaded = 0
  puts "<h1>Hacker News</h1>"
  get_html("https://news.ycombinator.com").get_these_html_elements("a").each do |a|
    next unless its_a_submission_link_or_a_link_to_a_submissions_comments?(a)
    # A <hr> visually separates each new submission from the previous one.
    puts "<hr>" if its_a_submission_link?(a)
    make_relative_link_into_an_absolute_hackernews_link(a)
    sleep SECONDS_TO_SLEEP_BETWEEN_REQUESTS
    webpage_at_link = get_html_body_and_remove_javascript(a[:href])
    puts <<~HEREDOC
      <details>
      <summary>#{a}</summary>
      <p>#{webpage_at_link}</p>
      </details>
    HEREDOC
    pages_loaded += 1
    # `return` rather than Kernel#exit: exit raises SystemExit from inside
    # the iteration block and kills the interpreter; returning ends the
    # method (and thus the script) cleanly and keeps main reusable.
    return if pages_loaded >= MAX_PAGE_LOADS
  end
end
# Fetch +url+ over HTTP(S) and parse the response body.
# Returns a Nokogiri::HTML::Document.
def get_html url
  response = Net::HTTP.get_response(URI.parse(url))
  Nokogiri::HTML.parse(response.body)
end
# Convenience monkey-patch: query a document by bare tag name.
class Nokogiri::HTML::Document
  # Returns every <element> node anywhere in the document.
  # (On a Document, `document` returns self, so the receiver is implicit.)
  def get_these_html_elements(element)
    xpath("//#{element}")
  end
end
# True for anchors that are either a front-page submission link or the
# link to that submission's comment thread ("discuss" / "N comments").
def its_a_submission_link_or_a_link_to_a_submissions_comments? a_tag
  return true if its_a_submission_link?(a_tag)
  first_child_text = a_tag.children&.first.to_s
  first_child_text == "discuss" || first_child_text.include?(" comment")
end
# HN marks front-page story anchors with class="titlelink".
# NOTE(review): HN's markup has changed over the years — confirm this
# class name still matches the live front page.
def its_a_submission_link? a_tag
  'titlelink'.eql?(a_tag[:class])
end
# Mutates a_tag in place: prefixes a relative href (e.g. "item?id=123")
# with the HN origin so the link still works offline. Absolute links are
# left untouched.
def make_relative_link_into_an_absolute_hackernews_link a_tag
  href = a_tag[:href]
  # Guard against anchors with no href at all — calling start_with? on
  # nil raised NoMethodError in the original.
  return if href.nil? || href.start_with?("http")
  a_tag[:href] = "https://news.ycombinator.com/" + href
end
# Fetch +url+, strip every <script> tag, absolutize relative anchors when
# the page is itself hosted on HN, and return the <body> node set.
def get_html_body_and_remove_javascript url
  page = get_html(url)
  page.get_these_html_elements("script").each(&:remove)
  if its_a_hackernews_url?(url)
    page.get_these_html_elements("a").each do |anchor|
      make_relative_link_into_an_absolute_hackernews_link(anchor)
    end
  end
  page.get_these_html_elements("body")
end
# True when +url+ points at news.ycombinator.com.
# Bug fix: the original built the pattern from a double-quoted string
# ("...\.ycombinator\.com..."), where the backslashes are consumed by the
# string literal, leaving bare dots that match ANY character — e.g.
# "newsXycombinatorXcom" matched. A %r{} literal keeps the escapes.
def its_a_hackernews_url? url
  url.match? %r{https?://news\.ycombinator\.com/}
end
# Script entry point.
main
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Super rough, but functional. It's pretty easy to extend if you want to make it fancier.
There's some fun styling madness because I'm literally just smooshing webpages together. When I was debugging it, I had a nice white background, but the last few times I've run it the background has been yellow. Note: all script tags get removed, but I don't do anything with the CSS.