# webcrawler 1.1 created 2021.05.04 last updated 2021.05.05 j.thomas
# starts with a base page and grabs links as it finds them in each page, noting instances of search term appearances in pages and links
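# matching links are inserted at the front of @search_que, so the crawl steers itself toward addresses that contain the term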
require 'uri';require 'open-uri'
class Web_Crawler
def initialize
@crawling=false ## loop control flag; crawl_loop runs while this is true
@case_sensitivity=false ## when false, case is ignored when matching the search term
@counter=0 ## number of loops executed / pages crawled
@invalid_counter=0 ## number of invalid pages checked
@already_searched=[] ## links not to crawl again
@search_term=''
@search_que=[] ## a list of links to crawl
@matching_pages=[] ## a list of pages that matched the search term
@matching_links=[] ## links that directly contain the search term
@base_page='' ## starting page
Dir.mkdir(Dir.getwd+"/dat") unless Dir.exist?(Dir.getwd+"/dat") ## crawl_loop writes its logs into ./dat
end
def start_crawling(base_page,term)
if @case_sensitivity==false;@search_term=term.to_s.downcase;else;@search_term=term.to_s;end
@search_que=[base_page]
unless @crawling ; crawl_loop ; end
end
def stop_crawling
@crawling=false
end
def save_crawl
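## serialises the crawl state as a single Ruby array literal string; resume_saved_crawl parses it back with eval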
dat=[@counter.to_s,@invalid_counter.to_s,@case_sensitivity.to_s,@already_searched.to_s,@search_que.to_s,@matching_pages.to_s,@matching_links.to_s,@search_term.to_s].to_s
f=File.open(Dir.getwd+"/crawl.dat","w");f.write(dat);f.close
puts "Saved this point in the crawl to file."
end
def resume_saved_crawl ## to resume a crawl right after calling stop_crawling, just call crawl_loop again
if File.exist?(Dir.getwd+"/crawl.dat")
f=File.open(Dir.getwd+"/crawl.dat","r");dat=f.read.to_s;f.close
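## crawl.dat holds a Ruby array literal; eval parses it back into objects, so only load files you trust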
dat=eval(dat)
@counter=dat[0].to_i ; @invalid_counter=dat[1].to_i ; @case_sensitivity=(dat[2]=="true")
@already_searched=eval(dat[3].to_s)
@search_que=eval(dat[4].to_s)
@matching_pages = eval(dat[5].to_s)
@matching_links = eval(dat[6].to_s)
if @case_sensitivity==false;@search_term = dat[7].to_s.downcase;else;@search_term=dat[7].to_s;end
puts "Preparing to resume crawl..."
crawl_loop
else;puts "No saved crawl.dat file was found.";return []
end
end
def crawl_loop ## works through @search_que until stop_crawling is called; seeded by start_crawling with base_page=String 'http://url.page.com', term=String 'gold prices in germany'
@crawling=true ## a variable to keep the loop running
while @crawling
#sleep 1 # can be used to prevent annoyance to servers being crawled
@counter=@counter+1 ## counts pages checked
## set next page to check ## STEERING GOES HERE kinda
current=@search_que.shift ## take the next link off the front of the que
## print info to console
puts "Searching: " + current.to_s
puts "checked: "+@counter.to_s+" | queued: "+@search_que.length.to_s
## get page contents
p=get_page(current)
## check page
if p == '' ;@invalid_counter+=1; puts "Invalid link checked.("+@invalid_counter.to_s+")" ## if invalid skip
else ## if valid page check for search term and save page if found
## write page address to file
begin;f=File.open(Dir.getwd+"/dat/a_traveled_links.txt","a");f.write(current.to_s+"\n");f.close;rescue;;end
## extract links from page
l=get_links(p)
## filter out links that do not have an http/https header
l=l.select{|i| (i.to_s[0..3].downcase=="http") rescue false}
## if links are found add them to the search que if they haven't been checked before
if l.length>0;puts "Found "+l.length.to_s+" links on this page." ## print found links to screen
## filter links searched before
l=l.map(&:to_s).reject{|i| @already_searched.include?(i)}
## write links to file
begin;f=File.open(Dir.getwd+"/dat/a_found_links.txt","a");f.write(l.join("\n")+"\n");f.close;rescue;;end
## check links for the search term: matches go to the front of the que, the rest to the back
l.each do |i|
cand = @case_sensitivity ? i.to_s : i.to_s.downcase ## compare a downcased copy when case is ignored, but keep the original link intact
if cand.include?(@search_term)
begin;f=File.open(Dir.getwd+"/dat/a_matching_links.txt","a");f.write(i.to_s+"\n");f.close;rescue;;end
@matching_links<<i.to_s ## record the match so save_crawl keeps it
@search_que.insert(0,i)
else
@search_que<<i
end
end
else;puts "No links found on this page." ## print found links to screen
end
end ## end of link check
## begin search term check
begin;if @case_sensitivity==false;cp=p.to_s.downcase;else;cp=p.to_s;end;rescue;cp=p.to_s;end ## handle case sensitivity setting
if cp.include?(@search_term) ## the term was found
puts "PAGE MATCH:" + @matching_pages.length.to_s + ": "+current.to_s ## print match alert to consol
@matching_pages<<p ##add page to list of matches
fname=Dir.getwd+"/dat/"+current.to_s.gsub(/[^a-zA-Z0-9]/,"_")+".txt"
unless File.exist?(fname)
f=File.open(fname,"w");f.write("url: "+current.to_s+"\n"+p.to_s);f.close ## save entire page to file
end
end ## end of page check
## list this link so we won't search it again
@already_searched<<current.to_s
## stop if we run out of links to crawl (in practice the search que has never reached zero)
if @search_que.length==0;@crawling=false;puts "SEARCH CONCLUDED";end
save_crawl
end ## end of crawl loop
end ## end of crawl method
def get_page(url) ## get page source code from a link; returns '' on failure so invalid pages can be counted
begin;page=URI.open(url);cont=page.read.to_s;page.close;return cont;rescue;puts "A page load resulted in an exception.";return '';end
end
def get_links(str) ## use Ruby's URI class to extract links, returns an array
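## e.g. URI.extract("see http://example.com and https://foo.bar/baz") #=> ["http://example.com", "https://foo.bar/baz"]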
begin;return URI.extract(str.to_s);rescue;return [];end ## rescue in case of illegal characters
end
end
$w=Web_Crawler.new ## so you won't have to type it every time
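## Minimal usage sketch (the URL and term below are placeholders, not suggested targets):
##   t = Thread.new { $w.start_crawling("http://example.com", "gold prices") } ## crawl_loop blocks, so run it in a thread to keep the console free
##   $w.stop_crawling      ## the flag is checked at the top of each loop pass
##   $w.resume_saved_crawl ## reloads crawl.dat and continues where it left off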