Created
August 28, 2022 22:32
-
-
Save edmangimelli/6c800facb76870d45b26d2c300f4ce0b to your computer and use it in GitHub Desktop.
Super simple script to grab an offline copy of Hacker News.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true

# offline_hn.rb
# super-simple-grab-offline-copy-of-hacker-news
# description:
#   - grabs the links on the front page of hacker news
#   - grabs each linked page, and each page of comments
#   - combines it all into one offline-able html
# usage:
#   ruby offline_hn.rb > hn.html

require 'nokogiri'
require 'open-uri' # NOTE(review): not referenced below (Net::HTTP is used) — kept for compatibility
require 'net/http'

# Hard cap on fetched pages so a run stays bounded.
MAX_PAGE_LOADS = 70
# Politeness delay between consecutive HTTP requests.
SECONDS_TO_SLEEP_BETWEEN_REQUESTS = 0.5
# Emit one self-contained HTML document to stdout: an <h1> banner, then a
# collapsible <details> section for every front-page submission link and
# every "discuss"/"N comments" link, each wrapping the fetched page body.
# Stops after MAX_PAGE_LOADS pages.
def main
  pages_loaded = 0
  puts "<h1>Hacker News</h1>"
  get_html("https://news.ycombinator.com").get_these_html_elements("a").each do |a|
    next unless its_a_submission_link_or_a_link_to_a_submissions_comments?(a)
    # A <hr> visually separates each new submission from the previous one.
    puts "<hr>" if its_a_submission_link?(a)
    make_relative_link_into_an_absolute_hackernews_link(a)
    sleep SECONDS_TO_SLEEP_BETWEEN_REQUESTS
    webpage_at_link = get_html_body_and_remove_javascript(a[:href])
    puts <<~HEREDOC
      <details>
      <summary>#{a}</summary>
      <p>#{webpage_at_link}</p>
      </details>
    HEREDOC
    pages_loaded += 1
    # `return` rather than Kernel#exit: exit raises SystemExit from inside
    # the iteration block and kills the interpreter; returning ends the
    # method (and thus the script) cleanly and keeps main reusable.
    return if pages_loaded >= MAX_PAGE_LOADS
  end
end
# Fetch +url+ over HTTP(S) and parse the response body.
# Returns a Nokogiri::HTML::Document.
def get_html url
  response = Net::HTTP.get_response(URI.parse(url))
  Nokogiri::HTML.parse(response.body)
end
# Convenience monkey-patch: query a document by bare tag name.
class Nokogiri::HTML::Document
  # Returns every <element> node anywhere in the document.
  # (On a Document, `document` returns self, so the receiver is implicit.)
  def get_these_html_elements(element)
    xpath("//#{element}")
  end
end
# True for anchors that are either a front-page submission link or the
# link to that submission's comment thread ("discuss" / "N comments").
def its_a_submission_link_or_a_link_to_a_submissions_comments? a_tag
  return true if its_a_submission_link?(a_tag)
  first_child_text = a_tag.children&.first.to_s
  first_child_text == "discuss" || first_child_text.include?(" comment")
end
# HN marks front-page story anchors with class="titlelink".
# NOTE(review): HN's markup has changed over the years — confirm this
# class name still matches the live front page.
def its_a_submission_link? a_tag
  'titlelink'.eql?(a_tag[:class])
end
# Mutates a_tag in place: prefixes a relative href (e.g. "item?id=123")
# with the HN origin so the link still works offline. Absolute links are
# left untouched.
def make_relative_link_into_an_absolute_hackernews_link a_tag
  href = a_tag[:href]
  # Guard against anchors with no href at all — calling start_with? on
  # nil raised NoMethodError in the original.
  return if href.nil? || href.start_with?("http")
  a_tag[:href] = "https://news.ycombinator.com/" + href
end
# Fetch +url+, strip every <script> tag, absolutize relative anchors when
# the page is itself hosted on HN, and return the <body> node set.
def get_html_body_and_remove_javascript url
  page = get_html(url)
  page.get_these_html_elements("script").each(&:remove)
  if its_a_hackernews_url?(url)
    page.get_these_html_elements("a").each do |anchor|
      make_relative_link_into_an_absolute_hackernews_link(anchor)
    end
  end
  page.get_these_html_elements("body")
end
# True when +url+ points at news.ycombinator.com.
# Bug fix: the original built the pattern from a double-quoted string
# ("...\.ycombinator\.com..."), where the backslashes are consumed by the
# string literal, leaving bare dots that match ANY character — e.g.
# "newsXycombinatorXcom" matched. A %r{} literal keeps the escapes.
def its_a_hackernews_url? url
  url.match? %r{https?://news\.ycombinator\.com/}
end
# Script entry point.
main
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Super rough, but functional. It's pretty easy to extend if you want to make it fancier.
There's some fun styling madness because I'm literally just smooshing webpages together. When I was debugging it, I had a nice white background, but the last few times I've run it the background has been yellow. Note: all script tags get removed, but I don't do anything with the CSS.