Skip to content

Instantly share code, notes, and snippets.

@yosukehasumi
Created June 15, 2017 15:57
Show Gist options
  • Save yosukehasumi/963ba0328bbf0f368a8013363284548d to your computer and use it in GitHub Desktop.
Save yosukehasumi/963ba0328bbf0f368a8013363284548d to your computer and use it in GitHub Desktop.
linkmap
#!/usr/bin/env ruby
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'uri'
require 'net/http'
require 'rainbow'
require 'awesome_print'
# Crawls a website by following same-host <a href> links, starting from
# +site_url+, and records every page URL it reaches.
#
#   Crawl.new('http://example.com').pages # => array of URL strings
class Crawl
  # Lower-case file extensions (without the dot) that are never followed.
  SKIPPED_EXTENSIONS = %w[jpeg jpg png pdf gif].freeze

  # Only links that resolve to one of these schemes are followed; this is
  # what rejects mailto:, javascript:, tel: and similar hrefs.
  CRAWLABLE_SCHEMES = %w[http https].freeze

  # URI.encode / URI.escape were removed in Ruby 3.0; the RFC 2396 parser
  # still provides the same escaping.
  URL_ESCAPER = URI::RFC2396_Parser.new

  attr_reader :site_url
  attr_accessor :collected_pages

  # @param site_url [String] absolute URL the crawl starts from
  def initialize(site_url)
    @site_url = site_url
    @collected_pages = []
  end

  # Runs the crawl (printing one "." per visited page, then a newline) and
  # returns the collected URLs in the order they were first visited.
  #
  # @return [Array<String>]
  def pages
    collect_pages(@site_url)
    puts ""
    @collected_pages
  end

  private

  # Depth-first traversal starting at +url+. An explicit stack is used instead
  # of recursion so that long link chains cannot exhaust the Ruby call stack.
  def collect_pages(url)
    seen = {}
    @collected_pages.each { |page| seen[page] = true }
    pending = [url]

    until pending.empty?
      current = pending.pop
      next if seen.key?(current)

      seen[current] = true
      add_to_collected_pages(current)
      # Pushed in reverse so that popping yields the links in document order,
      # which keeps the visiting order of the former recursive version.
      page_links(current).reverse_each do |link|
        pending.push(link) unless seen.key?(link)
      end
    end
  end

  # Records +url+ as visited and prints a progress dot.
  def add_to_collected_pages(url)
    @collected_pages << url
    print "."
  end

  # Fetches +url+ and returns the crawlable links found on that page.
  # Best effort: any failure to fetch or parse the page yields an empty list.
  #
  # @return [Array<String>]
  def page_links(url)
    page = Nokogiri::HTML(Net::HTTP.get(clean_url(url)))
    page.css('a').map { |link| link_filter(link['href'], url) }.compact.uniq
  rescue StandardError
    []
  end

  # Turns a raw href into an absolute URL worth crawling, or nil when the
  # link should be skipped (missing/empty href, unparsable URL, non-HTTP
  # scheme, different host, or a skipped file extension).
  #
  # @param url [String, nil] raw href attribute value
  # @param base [String] URL of the page the href was found on; relative
  #   hrefs are resolved against it
  # @return [String, nil] absolute URL without fragment, or nil
  def link_filter(url, base = @site_url)
    return nil if url.nil?

    # NOTE(review): kept from the original, which re-parses the href as HTML
    # and takes its text; attribute values are presumably already
    # entity-decoded by Nokogiri - confirm whether this pass is still needed.
    href = Nokogiri::HTML(url).text.strip
    return nil if href.empty?

    target = clean_url(base).merge(clean_url(href))
    return nil unless CRAWLABLE_SCHEMES.include?(target.scheme.to_s.downcase)
    return nil if target.host != site_host

    # Look at the path only, so query strings and upper-case extensions do
    # not hide an image/PDF link.
    extension = File.extname(target.path.to_s).delete('.').downcase
    return nil if SKIPPED_EXTENSIONS.include?(extension)

    # "#section" links point at the same document; drop the fragment so the
    # page is not visited once per anchor.
    target.fragment = nil
    target.to_s
  rescue URI::Error, ArgumentError
    # A single malformed href must not discard the other links of the page.
    nil
  end

  # Host of the site being crawled; links to any other host are skipped.
  def site_host
    @site_host ||= clean_url(@site_url).host
  end

  # Parses +url+ into a URI. The string is percent-escaped only when it is
  # not parsable as-is, so hrefs that are already encoded are not escaped a
  # second time.
  #
  # @return [URI::Generic]
  # @raise [URI::InvalidURIError] when the URL cannot be parsed even after escaping
  def clean_url(url)
    URI.parse(url)
  rescue URI::InvalidURIError
    URI.parse(URL_ESCAPER.escape(url))
  end
end
# Command-line entry point: crawl the site given as the first argument and
# print every page URL that was found.
url = ARGV[0]

puts "################################################".green
# The banner reads the local +url+; at top level @site_url is always nil.
puts "Mapping links for #{url}".green
puts "################################################".green

if url.nil?
  puts "[ERROR] You must enter a url".red
  exit 1
end

# Accept only absolute http(s) URLs with a host. URI::HTTPS is a subclass of
# URI::HTTP, so both schemes pass the is_a? check.
valid_url =
  begin
    target = URI.parse(url)
    target.is_a?(URI::HTTP) && !target.host.nil?
  rescue URI::InvalidURIError
    false
  end

unless valid_url
  puts "[ERROR] Not a valid url".red
  exit 1
end

crawler = Crawl.new(url)
pages = crawler.pages

# Only the start URL was collected, i.e. no same-host links were found on it.
if pages.size == 1
  puts "[WARNING] This seems to be a redirected page".yellow
  exit
end

puts pages
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment