|
# wikiweb.rb |
|
# Matthew Daly 2023 |
|
# @matthewd673 |
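
# Crawls a Wikipedia article and prints the tree of /wiki/ links it reaches.
# Example (a full URL or a bare /wiki/ path both work):
#   ruby wikiweb.rb /wiki/Ruby_(programming_language) --depth 2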
|
|
|
require "net/http" |
|
require "nokogiri" |
|
|
|
# parse command line args |
|
if ARGV.length < 1 |
|
puts "ruby wikiweb.rb <article-url>" |
|
puts "\t -q --quiet: omit article links" |
|
puts "\t -d --depth <number>: crawl to a given depth (default 1)" |
|
exit 1
|
end |
|
|
|
# kinda ugly but also kinda clever imo |
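# a flag like -d sets `mode` so the next token is consumed as its value;
# the first bare token is the article URL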
|
url = "" |
|
depth = 1 |
|
@quiet = false |
|
mode = "" |
|
for i in 0...ARGV.length
|
if ARGV[i].eql?("-d") || ARGV[i].eql?("--depth") |
|
mode = "depth" |
|
next |
|
elsif ARGV[i].eql?("-q") || ARGV[i].eql?("--quiet") |
|
@quiet = true |
|
next |
|
end |
|
|
|
if mode.eql?("") && url.eql?("") |
|
url = ARGV[i] |
|
elsif mode.eql?("depth") |
|
depth = ARGV[i].to_i
|
end |
|
|
|
mode = "" |
|
end |
|
|
|
@urls = [] |
|
@dup_count = 0
|
# GET and parse a page for /wiki/ links |
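# depth counts the levels left to crawl; prefix indents output to show the link tree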
|
def scan_page(page_url, depth, prefix) |
|
return if depth == 0
|
|
|
# http GET request |
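# bare /wiki/ paths get the en.wikipedia.org domain prefixed, so relative hrefs work too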
|
page_url = "https://en.wikipedia.org" + page_url unless page_url.include?("://") |
|
res = Net::HTTP.get_response(URI(page_url)) |
|
|
|
unless res.is_a?(Net::HTTPSuccess)

puts "#{prefix}Failed to load URL (#{res.code})"
|
return |
|
end |
|
|
|
# pull links out of page |
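# Nokogiri's HTML5 parser follows the WHATWG spec (like a browser); the XPath grabs every anchor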
|
page = Nokogiri::HTML5.parse(res.body) |
|
links = page.xpath("//a") |
|
|
|
puts prefix + page_url unless @quiet |
|
|
|
# add each link to list and make recursive call |
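# @urls doubles as a visited set: already-seen articles are skipped and counted as duplicates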
|
count = 0
|
links.each do |l|
|
if l["href"] == nil then next end |
|
|
|
# filter non-article links |
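# a ":" marks namespace links (File:, Category:, Special:, ...) rather than articles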
|
if !l["href"].start_with?("/wiki/") || |
|
l["href"].include?(":") || |
|
l["href"].eql?("/wiki/Main_Page") |
|
next |
|
end |
|
|
|
# drop any #fragment so different sections of the same article count once
href = l["href"].split("#")[0]
|
|
|
if !@urls.include?(href)

@urls.push(href)

count += 1

scan_page(href, depth - 1, prefix + " ")

else

@dup_count += 1
|
end |
|
end
|
|
|
puts prefix + " [" + count.to_s() + " articles]" unless @quiet |
|
end |
|
|
|
# run scanner and print result |
|
abort "no article URL given" if url.empty?

scan_page(url, depth, " ")
|
puts "\n" + @urls.length.to_s() + " articles linked at depth " + depth.to_s() + " (excluding " + @dupCount.to_s() + " duplicates)" |