Created
May 15, 2017 02:12
-
-
Save chand1012/50381ed9dd885ec2428a94fbc2e747a6 to your computer and use it in GitHub Desktop.
Web crawler written in Ruby, designed for Wikipedia.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'uri'
require 'rest-client'
require 'nokogiri'
require 'random_methods'
require 'os'
#v1.1: Added help section | |
#v1.2: Fixed bug with Ruby 2.2.3 | |
#v1.3: Added change log | |
# Shared script state. These are top-level locals on purpose: the blocks
# further down assign to them, so they must exist before those blocks run.
wikiLink = "wikipedia.org" # substring/prefix used to recognise and build Wikipedia links
urlList = []               # links collected for the output file
url = css_selector = mainUrl = nil
# init: ask for a start URL, download the page and pick the nodes to scan.
# Assigns url, page, css_selector and paras for the rest of the script.
# On any failure it reports the error, pauses, and exits.
begin
  # get the Url from the user
  puts "Enter '/help' for help\nEnter '/changes' for change log"
  loop do
    print "Enter Url:"
    # to_s: gets returns nil on a closed stdin; the resulting empty URL then
    # fails in the download step below instead of raising NoMethodError here.
    url = gets.to_s.strip
    case url
    when %r{\Ahttps?://}i
      # https:// counts as a scheme too, so it is not prefixed a second time.
      puts "Already contains http://, Skipping..."
      break
    when '/help' # help section
      puts "This program is made to assist people in the quick finding of webpages"
      puts "relevant to a specific topic."
      puts "At the moment, the program is optimized for wikipedia and more basic sites, \nas it may not find links on more modern websites."
      puts "Start by entering a url on your topic ie: \nhttp://en.wikipedia.org/wiki/Ruby_(programming_language)"
      puts "Enter the name of the file that you want the links to be in,\nie:'ruby'"
      puts "Then enter the number of topics you want to search for, for examle, 3"
      puts "Enter each topic in their line, use underscores(_) instead of spaces, \nand do not capitalize the first letter."
      puts "The program will automatically search the webpage for your topics and then output them to the file name you chose."
    when '/changes' # changes section
      puts 'v1.0: Program released'
      puts 'v1.1: Added help section'
      puts 'v1.2: Fixed compatibitly issues with Ruby 2.2.2'
      puts 'v1.3: Added change log'
      puts 'Planned changes:'
      puts "Fix bug making it print the link with only 2 letters ie: '.or' instead of '.org'"
      puts 'Make user interface'
    else
      puts "Adding http://..."
      url = "http://#{url}"
      break
    end
  end
  # download and init the webpage
  page = Nokogiri::HTML(RestClient.get(url))
  # Decide from the URL itself: mainUrl is not assigned until after this
  # block, so it cannot be used to recognise a Wikipedia page here.
  css_selector = url[wikiLink] ? 'p' : 'body'
  paras = page.css(css_selector)
rescue StandardError => e
  puts "404: Page not found."
  puts "Please restart client with valid url!"
  # Not every failure is a 404 (bad host, timeout, ...), so show the cause.
  puts "(#{e.class}: #{e.message})"
  if OS.windows?
    puts `pause`
  else
    sleep(5)
  end
  exit
end
# Ask where the collected links should be written.
print 'Enter Output file name and extenstion:'
# to_s guards against gets returning nil when stdin is closed.
outputFile = gets.to_s.strip
# Default to .txt when the file name itself has no extension. File.extname
# looks only at the last path component, so a dot elsewhere in the path
# (e.g. "./links") is not mistaken for an extension.
outputFile += ".txt" if File.extname(outputFile).empty?
# get search items
# Reads how many topics to look for, then one topic per line into searchItems.
print "Enter number of items to search for:"
# to_s guards against nil on a closed stdin; to_i ignores the trailing newline
# and yields 0 for non-numeric input, in which case no items are requested.
searchTimes = gets.to_s.to_i
searchItems = []
searchTimes.times do |index|
  print "Enter item #{index + 1}:"
  entry = gets.to_s.strip
  # A blank topic would match every link on the page (String#[] with ""
  # always succeeds), so empty entries are dropped.
  searchItems << entry unless entry.empty?
end
# Domain of the start page; links that mention it are treated as internal.
# NOTE(review): String#domain is not core Ruby - presumably provided by the
# random_methods gem; confirm what it returns for the entered URL.
mainUrl = url.domain
# Substrings that mark links this script does not want to keep.
skip_markers = ['Citation_needed', 'Help', 'File:', 'Category:', 'Talk:', 'Special:', 'Template']
# process the page for each item
paras.each do |node|
  # Distinct, non-nil href values of the anchors inside this node.
  hrefs = node.css('a').map { |a| a['href'] }.compact.uniq
  searchItems.each do |item|
    hrefs.each do |href|
      # Keep only links that mention the topic, as typed or capitalized.
      next unless href[item] || href[item.capitalize]

      if href == url
        puts "Duplicate link. Skipping..."
      elsif skip_markers.any? { |marker| href[marker] }
        puts "Contains Unacceptable character(s). Skipping...."
      elsif href =~ %r{https?://} && !href[mainUrl]
        # https:// links are external links too, not only http:// ones.
        puts "External Link Found!"
        urlList << href
      elsif href['/wiki/'] && !href['#']
        puts 'Link Found!'
        # Resolve the site-relative href against the page URL so the stored
        # link keeps its scheme and host; fall back to the bare-domain form
        # if either part cannot be parsed as a URI.
        remoteUrl = begin
          URI.join(url, href).to_s
        rescue URI::Error
          wikiLink + href
        end
        urlList << remoteUrl
      end
    end
  end
end
# Drop nils and repeated links, report each one, then save the list.
urlList = urlList.compact.uniq
urlList.each { |site| puts "Adding link #{site} to file..." }
# NOTE(review): Array#write_all is not core Ruby - presumably provided by the
# random_methods gem; confirm the format it writes to outputFile.
urlList.write_all(outputFile)
puts "Search complete!"
# Pause before the script ends (presumably so the output can be read before
# a console window closes): run `pause` on Windows, wait five seconds elsewhere.
if OS.windows?
  puts `pause`
else
  sleep(5)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment