|
# wikiweb.rb |
|
# Matthew Daly 2023 |
|
# @matthewd673 |
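
# Crawls a Wikipedia article and prints the tree of /wiki/ links it reaches.
# Example (a full URL or a bare /wiki/ path both work):
#   ruby wikiweb.rb /wiki/Ruby_(programming_language) --depth 2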
|
|
|
require "net/http" |
|
require "nokogiri" |
|
|
|
# parse command line args |
|
if ARGV.length < 1 |
|
puts "ruby wikiweb.rb <article-url>" |
|
puts "\t -q --quiet: omit article links" |
|
puts "\t -d --depth <number>: crawl to a given depth (default 1)" |
|
exit 1
|
end |
|
|
|
# kinda ugly but also kinda clever imo |
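# a flag like -d sets `mode` so the next token is consumed as its value;
# the first bare token is the article URL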
|
url = "" |
|
depth = 1 |
|
@quiet = false |
|
mode = "" |
|
for i in 0...ARGV.length
|
if ARGV[i].eql?("-d") || ARGV[i].eql?("--depth") |
|
mode = "depth" |
|
next |
|
elsif ARGV[i].eql?("-q") || ARGV[i].eql?("--quiet") |
|
@quiet = true |
|
next |
|
end |
|
|
|
if mode.eql?("") && url.eql?("") |
|
url = ARGV[i] |
|
elsif mode.eql?("depth") |
|
depth = ARGV[i].to_i
|
end |
|
|
|
mode = "" |
|
end |
|
|
|
@urls = [] |
|
@dup_count = 0
|
# GET and parse a page for /wiki/ links |
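# depth counts the levels left to crawl; prefix indents output to show the link tree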
|
def scan_page(page_url, depth, prefix) |
|
return if depth == 0
|
|
|
# http GET request |
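# bare /wiki/ paths get the en.wikipedia.org domain prefixed, so relative hrefs work too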
|
page_url = "https://en.wikipedia.org" + page_url unless page_url.include?("://") |
|
res = Net::HTTP.get_response(URI(page_url)) |
|
|
|
unless res.is_a?(Net::HTTPSuccess)

puts "#{prefix}Failed to load URL (#{res.code})"
|
return |
|
end |
|
|
|
# pull links out of page |
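# Nokogiri's HTML5 parser follows the WHATWG spec (like a browser); the XPath grabs every anchor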
|
page = Nokogiri::HTML5.parse(res.body) |
|
links = page.xpath("//a") |
|
|
|
puts prefix + page_url unless @quiet |
|
|
|
# add each link to list and make recursive call |
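# @urls doubles as a visited set: already-seen articles are skipped and counted as duplicates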
|
count = 0
|
links.each do |l|
|
if l["href"] == nil then next end |
|
|
|
# filter non-article links |
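# a ":" marks namespace links (File:, Category:, Special:, ...) rather than articles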
|
if !l["href"].start_with?("/wiki/") || |
|
l["href"].include?(":") || |
|
l["href"].eql?("/wiki/Main_Page") |
|
next |
|
end |
|
|
|
# drop any #fragment so different sections of the same article count once
href = l["href"].split("#")[0]
|
|
|
if !@urls.include?(href)

@urls.push(href)

count += 1

scan_page(href, depth - 1, prefix + " ")

else

@dup_count += 1
|
end |
|
end
|
|
|
puts prefix + " [" + count.to_s() + " articles]" unless @quiet |
|
end |
|
|
|
# run scanner and print result |
|
abort "no article URL given" if url.empty?

scan_page(url, depth, " ")
|
puts "\n" + @urls.length.to_s() + " articles linked at depth " + depth.to_s() + " (excluding " + @dupCount.to_s() + " duplicates)" |