Created
June 15, 2017 15:57
-
-
Save yosukehasumi/963ba0328bbf0f368a8013363284548d to your computer and use it in GitHub Desktop.
linkmap
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'uri' | |
require 'net/http' | |
require 'rainbow' | |
require 'awesome_print' | |
# Crawls a single website and collects the URL of every same-host page that
# can be reached by following <a href> links from the starting URL.
#
# Usage:
#   Crawl.new('http://example.com').pages # => ["http://example.com", ...]
class Crawl
  # Hrefs whose extension is listed here are treated as assets and skipped.
  SKIPPED_EXTENSIONS = %w[jpeg jpg png pdf gif].freeze

  # URI.encode was removed in Ruby 3.0; the RFC 2396 parser still provides
  # the escaping routine that URI.encode used to delegate to.
  URL_ESCAPER = URI::RFC2396_Parser.new

  attr_reader :site_url
  attr_accessor :collected_pages

  # @param site_url [String] absolute URL the crawl starts from
  def initialize(site_url)
    @site_url = site_url
    @collected_pages = []
  end

  # Runs the crawl, printing one "." per visited page to stdout.
  #
  # @return [Array<String>] every collected URL, in visiting order
  def pages
    collect_pages(@site_url)
    puts ""
    @collected_pages
  end

  private

  # Depth-first walk over the link graph starting at +url+.
  #
  # Uses an explicit stack instead of recursion so that a site with many
  # pages cannot exhaust the Ruby call stack. Links are pushed in reverse so
  # they are popped (and therefore visited) in document order.
  #
  # @param url [String] URL to start from
  # @return [void]
  def collect_pages(url)
    pending = [url]
    until pending.empty?
      current = pending.pop
      next if @collected_pages.include?(current)

      add_to_collected_pages(current)
      pending.concat(page_links(current).reverse)
    end
  end

  # Records +url+ as visited and prints a progress dot.
  def add_to_collected_pages(url)
    @collected_pages << url
    print "."
  end

  # Downloads +url+ and returns the crawlable links found in its anchors.
  #
  # A page that cannot be fetched or parsed contributes no links; the failure
  # is reported on stderr instead of being dropped silently.
  #
  # @param url [String] page to download
  # @return [Array<String>] links accepted by #link_filter
  def page_links(url)
    page = Nokogiri::HTML(Net::HTTP.get(clean_url(url)))
    page.css('a').map { |link| link_filter(link['href']) }.compact
  rescue StandardError => e
    warn "[WARNING] Could not read #{url}: #{e.message}"
    []
  end

  # Normalises an href and decides whether it should be crawled.
  #
  # @param url [String, nil] raw href attribute value
  # @return [String, nil] absolute URL on the crawled host, or nil when the
  #   link is missing, malformed, an asset, a mailto link or on another host
  def link_filter(url)
    return nil if url.nil?

    # Round-trip through Nokogiri to get the plain-text form of the href.
    url = Nokogiri::HTML(url).text
    home = URI.parse(@site_url)
    href = clean_url(url)
    if href.host.nil?
      # Host-less link: anchor it at the site root. The non-bang `sub` is
      # required here; `sub!` returns nil when there is no leading slash,
      # which would erase the path of links such as "about.html".
      url = "#{site_root(home)}/#{url.sub(%r{\A/}, '')}"
      href = clean_url(url)
    end
    return nil if url.empty?
    return nil if SKIPPED_EXTENSIONS.include?(File.extname(url).delete('.'))
    return nil if url.include?('mailto')
    return nil if home.host != href.host

    url
  rescue URI::InvalidURIError
    # A single malformed href must not discard the other links on the page.
    nil
  end

  # Builds "scheme://host[:port]" for +home+, keeping a non-default port.
  #
  # @param home [URI::Generic] parsed site URL
  # @return [String]
  def site_root(home)
    root = "#{home.scheme}://#{home.host}"
    home.port.nil? || home.port == home.default_port ? root : "#{root}:#{home.port}"
  end

  # Escapes characters that are not allowed in a URI and parses the result.
  #
  # @param url [String]
  # @return [URI::Generic]
  # @raise [URI::InvalidURIError] when the escaped string is still not a URI
  def clean_url(url)
    URI.parse(URL_ESCAPER.escape(url))
  end
end
# Command-line entry point: maps every same-host link reachable from the URL
# given as the first argument and prints one collected URL per line.
url = ARGV[0]

# Fatal input errors go to stderr with a non-zero exit status (Kernel#abort).
abort "[ERROR] You must enter a url".red if url.nil?

# Parse instead of pattern-matching: the unanchored URI regexp accepted any
# string that merely contained something scheme-like. Only http(s) URLs with a
# host are accepted because the crawler downloads pages through Net::HTTP.
parsed_url = begin
  URI.parse(url)
rescue URI::InvalidURIError
  nil
end
abort "[ERROR] Not a valid url".red unless parsed_url.is_a?(URI::HTTP) && !parsed_url.host.to_s.empty?

# The banner is printed after validation so it can name the target; the
# previous version interpolated @site_url, which is nil at top level.
puts "################################################".green
puts "Mapping links for #{url}".green
puts "################################################".green

crawler = Crawl.new(url)
pages = crawler.pages

# Only the start URL was collected, i.e. no crawlable links were found on it.
if pages.size == 1
  puts "[WARNING] This seems to be a redirected page".yellow
  exit
end

puts pages
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment