Basic blind Ruby web crawler: follows every discovered link and searches each page for a given term.
# webcrawler 1.0 created 2021.05.04 ## follow the term across every connected link blindly
require 'uri'
require 'open-uri'

class Web_Crawler
  def initialize
    @case_sensitivity = false ## when false, pages and the search term are compared in lowercase
  end

  def crawl(base_page, term)
    @counter = 0              ## number of loops executed / pages crawled
    @search_term = term.to_s  ## a string to search pages for
    @already_searched = []    ## links not to crawl again
    @search_que = [base_page] ## a list of links to crawl
    @matching_pages = []      ## a list of pages that matched the search term
    @base_page = base_page    ## starting page
    @invalid_counter = 0      ## running count of invalid urls skipped over the whole crawl
    @crawling = true
    while @crawling
      # sleep 1 ## sleep one second between page loads to throttle network usage and keep your ISP and the sites you crawl happy; it's just being polite really, but you can reduce this value
      @counter = @counter + 1     ## counts pages checked
      current = @search_que.shift ## take the next page to check off the front of the que
      puts "Searching: " + current.to_s
      puts @counter.to_s + "/" + @search_que.length.to_s ## pages checked / links still queued
      p = get_page(current)
      if p == ''
        @invalid_counter += 1
        puts "Invalid link checked.(" + @invalid_counter.to_s + ")" ## page couldn't be loaded or was blank
      else
        ## maintain a file list of the links checked
        begin; File.open(Dir.getwd + "/dat/a_traveled_links.txt", "a") { |f| f.write(current.to_s + "\n") }; rescue; end
        l = get_links(p) ## extract links from the page so they can be added to the search que
        l = l.select { |i| i.to_s[0..3].to_s.downcase == "http" } ## keep only hypertext links
        if l.length > 0
          puts "Found " + l.length.to_s + " links on this page."
          ## add links to the search que unless they have already been searched or queued
          l.each { |i| @search_que << i.to_s unless @already_searched.include?(i.to_s) || @search_que.include?(i.to_s) }
          ## maintain a file list of all links discovered
          begin; File.open(Dir.getwd + "/dat/a_found_links.txt", "a") { |f| f.write(l.join("\n") + "\n") }; rescue; end
        else
          puts "No links found on this page."
        end
      end
      ## handle the case sensitivity setting (downcase both page and term when case-insensitive)
      cp = @case_sensitivity ? p : p.downcase
      ct = @case_sensitivity ? @search_term : @search_term.downcase
      if cp.include?(ct) ## check the page for the search term
        puts "PAGE MATCH:" + @matching_pages.length.to_s + ": " + current.to_s
        @matching_pages << p
        ## save the page because it matched the search term
        begin; File.open(Dir.getwd + "/dat/match" + @counter.to_s + ".txt", "w") { |f| f.write(p.to_s) }; rescue; end
      end
      if @search_que.length == 0 ## if out of links to crawl, stop (like maybe that's possible)
        @crawling = false
        puts "SEARCH CONCLUDED"
      end
      @already_searched << current.to_s ## record this link so we won't search it again
    end
  end

  def get_page(url) ## returns a string of a webpage's source, or '' if it can't be loaded
    return '' if url.to_s[0].to_s.downcase == "x" ## urls beginning with "x" are treated as invalid and skipped
    begin ## just in case the string contains invalid or illegal characters
      page = []
      URI.open(url) { |f| f.each_line { |line| page << line.to_s } } ## collect the page line by line
      return page.join ## return the page in its original form
    rescue
      return ''
    end
  end

  def get_links(str) ## extracts links from a string
    begin ## protection from invalid and illegal characters
      return URI.extract(str.to_s) ## returns an array of urls parsed from the given string
    rescue
      return []
    end
  end
end
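
## Example usage: a minimal sketch, assuming the script is run directly, a
## placeholder start URL and search term, and a ./dat directory next to the
## script (the crawler writes its traveled/found link lists and matched pages there).
if __FILE__ == $0
  require 'fileutils'
  FileUtils.mkdir_p(Dir.getwd + "/dat")          ## make sure the output directory exists
  crawler = Web_Crawler.new
  crawler.crawl("https://example.com", "ruby")   ## crawls until the search que empties
end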