Basic blind Ruby web crawler: follows every discovered link and searches each page for a given term.
# webcrawler 1.0 created 2021.05.04 ## follow the term across every connected link blindly
require 'uri'
require 'open-uri'

class Web_Crawler
  def initialize
    @case_sensitivity = false ## when false, pages and the search term are compared in lowercase
  end

  def crawl(base_page, term)
    @counter = 0              ## number of loops executed / pages crawled
    @search_term = term.to_s  ## a string to search pages for
    @already_searched = []    ## links not to crawl again
    @search_que = [base_page] ## a list of links to crawl
    @matching_pages = []      ## a list of pages that matched the search term
    @base_page = base_page    ## starting page
    @invalid_counter = 0      ## running count of invalid urls skipped over the whole crawl
    @crawling = true
    while @crawling
      # sleep 1 ## sleep one second between page loads to throttle network usage and keep your ISP and the sites you crawl happy; it's just being polite really, but you can reduce this value
      @counter = @counter + 1     ## counts pages checked
      current = @search_que.shift ## take the next page to check off the front of the que
      puts "Searching: " + current.to_s
      puts @counter.to_s + "/" + @search_que.length.to_s ## pages checked / links still queued
      p = get_page(current)
      if p == ''
        @invalid_counter += 1
        puts "Invalid link checked.(" + @invalid_counter.to_s + ")" ## page couldn't be loaded or was blank
      else
        ## maintain a file list of the links checked
        begin; File.open(Dir.getwd + "/dat/a_traveled_links.txt", "a") { |f| f.write(current.to_s + "\n") }; rescue; end
        l = get_links(p) ## extract links from the page so they can be added to the search que
        l = l.select { |i| i.to_s[0..3].to_s.downcase == "http" } ## keep only hypertext links
        if l.length > 0
          puts "Found " + l.length.to_s + " links on this page."
          ## add links to the search que unless they have already been searched or queued
          l.each { |i| @search_que << i.to_s unless @already_searched.include?(i.to_s) || @search_que.include?(i.to_s) }
          ## maintain a file list of all links discovered
          begin; File.open(Dir.getwd + "/dat/a_found_links.txt", "a") { |f| f.write(l.join("\n") + "\n") }; rescue; end
        else
          puts "No links found on this page."
        end
      end
      ## handle the case sensitivity setting (downcase both page and term when case-insensitive)
      cp = @case_sensitivity ? p : p.downcase
      ct = @case_sensitivity ? @search_term : @search_term.downcase
      if cp.include?(ct) ## check the page for the search term
        puts "PAGE MATCH:" + @matching_pages.length.to_s + ": " + current.to_s
        @matching_pages << p
        ## save the page because it matched the search term
        begin; File.open(Dir.getwd + "/dat/match" + @counter.to_s + ".txt", "w") { |f| f.write(p.to_s) }; rescue; end
      end
      if @search_que.length == 0 ## if out of links to crawl, stop (like maybe that's possible)
        @crawling = false
        puts "SEARCH CONCLUDED"
      end
      @already_searched << current.to_s ## record this link so we won't search it again
    end
  end

  def get_page(url) ## returns a string of a webpage's source, or '' if it can't be loaded
    return '' if url.to_s[0].to_s.downcase == "x" ## urls beginning with "x" are treated as invalid and skipped
    begin ## just in case the string contains invalid or illegal characters
      page = []
      URI.open(url) { |f| f.each_line { |line| page << line.to_s } } ## collect the page line by line
      return page.join ## return the page in its original form
    rescue
      return ''
    end
  end

  def get_links(str) ## extracts links from a string
    begin ## protection from invalid and illegal characters
      return URI.extract(str.to_s) ## returns an array of urls parsed from the given string
    rescue
      return []
    end
  end
end
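
## Example usage: a minimal sketch, assuming the script is run directly, a
## placeholder start URL and search term, and a ./dat directory next to the
## script (the crawler writes its traveled/found link lists and matched pages there).
if __FILE__ == $0
  require 'fileutils'
  FileUtils.mkdir_p(Dir.getwd + "/dat")          ## make sure the output directory exists
  crawler = Web_Crawler.new
  crawler.crawl("https://example.com", "ruby")   ## crawls until the search que empties
end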