@thomasjslone
Created May 5, 2021 03:53
basic blind Ruby web crawler with search terms
# webcrawler 1.0 created 2021.05.04
## follows the search term across every connected link, blindly
require 'uri'
require 'open-uri'

class Web_Crawler
  def initialize
    @case_sensitivity = false ## when false, matching ignores case
  end
  ## crawl outward from base_page, saving every page whose text contains term
  def crawl(base_page, term)
    @counter = 0              ## number of pages crawled
    @invalid_counter = 0      ## number of invalid or blank urls skipped
    @search_term = term.to_s  ## string to search pages for
    @already_searched = []    ## links we will not crawl again
    @search_que = [base_page] ## queue of links waiting to be crawled
    @matching_pages = []      ## pages that matched the search term
    @base_page = base_page    ## starting page
    Dir.mkdir(Dir.getwd + "/dat") unless Dir.exist?(Dir.getwd + "/dat") ## output directory for link lists and matched pages
    @crawling = true
    while @crawling
      # sleep 1 ## throttle between page loads to keep your isp and the sites you crawl happy; it's just being polite, but you can reduce this value
      @counter += 1
      current = @search_que.shift ## take the next link off the queue
      puts "Searching: " + current.to_s
      puts @counter.to_s + " crawled / " + @search_que.length.to_s + " queued"
      p = get_page(current)
      if p == ''
        @invalid_counter += 1
        puts "Invalid link skipped. (" + @invalid_counter.to_s + ")" ## page could not be loaded or was blank
      else
        begin ## maintain a file list of every link actually checked
          File.open(Dir.getwd + "/dat/a_traveled_links.txt", "a") { |f| f.write(current.to_s + "\n") }
        rescue
        end
        l = get_links(p) ## extract urls from the page
        l = l.select { |i| i.to_s.downcase.start_with?("http") } ## keep only http(s) links
        if l.length > 0
          puts "Found " + l.length.to_s + " links on this page."
          l.each { |i| @search_que << i.to_s unless @already_searched.include?(i.to_s) } ## queue links not yet searched
          begin ## maintain a file list of every link discovered
            File.open(Dir.getwd + "/dat/a_found_links.txt", "a") { |f| f.write(l.join("\n") + "\n") }
          rescue
          end
        else
          puts "No links found on this page."
        end
      end
      if @case_sensitivity == false ## honour the case-sensitivity setting for page and term alike
        cp = p.downcase
        ct = @search_term.downcase
      else
        cp = p
        ct = @search_term
      end
      if cp.include?(ct) ## page matched the search term
        puts "PAGE MATCH:" + @matching_pages.length.to_s + ": " + current.to_s
        @matching_pages << p
        File.open(Dir.getwd + "/dat/match" + @counter.to_s + ".txt", "w") { |f| f.write(p.to_s) } ## save the matching page
      end
      if @search_que.length == 0 ## out of links to crawl, so stop
        @crawling = false
        puts "SEARCH CONCLUDED"
      end
      @already_searched << current.to_s ## never search this link again
    end
  end
  ## fetch a url and return its source as a string ('' on any failure)
  def get_page(url)
    begin ## guard against invalid urls, illegal characters and network errors
      page = []
      URI.open(url) { |f|
        f.each_line { |line| page << line.to_s.chomp } ## collect the page line by line
      }
      return page.join("\n") ## reassemble the page
    rescue
      return ''
    end
  end
  ## extract urls from a string; returns an array (empty on failure)
  def get_links(str)
    begin ## protection from invalid and illegal characters
      return URI.extract(str.to_s)
    rescue
      return []
    end
  end
end
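A minimal usage sketch (not part of the original gist): the starting url and search term below are placeholders. Link lists and matched pages are written into a dat/ subdirectory of the current working directory.

crawler = Web_Crawler.new
crawler.crawl("https://example.com", "ruby") ## placeholder url and term; runs until the link queue is empty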