Created
April 2, 2012 22:00
-
-
Save carlcrott/2287518 to your computer and use it in GitHub Desktop.
scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'nokogiri' | |
require 'mechanize' | |
require 'json' | |
# Script path with everything after the first "." dropped; used to name the output file.
REPO_NAME = __FILE__.split('.').first
# Monkey-patch (kept for compatibility — callers use `str.valid_json?`).
class String
  # Returns true when the string parses as JSON, false otherwise.
  # Rescues only JSON::ParserError — the original rescued Exception,
  # which would also swallow SignalException/SystemExit/NoMemoryError.
  def valid_json?
    JSON.parse(self)
    true
  rescue JSON::ParserError
    false
  end
end
# Builds a journal-entry hash {"name", "url", "rss", "index"} from a
# [name, link] pair scraped off the alphabetical index page.
# Internal ".shtml" links get derived rss/index URLs; absolute "http://"
# links get "idk" placeholders; anything else yields nil url/rss/index
# (plus a complaint on stdout).
def build_json(arr)
  name, link = arr
  # Local hash instead of the original @temp instance variable: @temp kept
  # its value across calls, so an unrecognized link silently reused the
  # previous entry's urls. An empty default makes those fields nil instead.
  info = {}
  if link.end_with?('.shtml') # regular internal link
    # NOTE: abbreviation is taken before the leading "/" is patched in,
    # matching the original's order of operations.
    abb = link.split('/')[1]
    link.insert(0, '/') unless link.start_with?('/') # some internal links are missing a leading /
    info = {
      "url"   => "http://www.worldscinet.com#{link}",
      "rss"   => "http://www.worldscinet.com/#{abb}/#{abb}.rss",
      "index" => "http://www.worldscinet.com/#{abb}/mkt/archive.shtml"
    }
  elsif link.start_with?('http://')
    info = { "url" => link, "rss" => "idk", "index" => "idk" }
  else
    puts "BLEEP! BLOOP! I dont know how to build this entry: #{arr}"
  end
  { "name" => name, "url" => info['url'], "rss" => info['rss'], "index" => info['index'] }
end
# Sanity-checks one journal entry's url, rss and index links over the
# network, printing an ERROR line per failed check. Returns the entry
# (its 'rss' field is downgraded to "idk" when the feed fails to fetch).
# Pass v = false to suppress the VERIFIED line.
def verify_data(entry, v = true)
  begin ###### Verify url
    open(entry['url']).is_a? Tempfile
  rescue
    # External entries (index == 'idk') are expected to fail this check.
    puts "ERROR: Expecting '#{entry['url']}' to parse open-uri" unless entry['index'] == 'idk'
  end
  begin ###### Verify rss
    Mechanize.new.get(entry['rss']).content.class.is_a? Nokogiri::XML::Document
  rescue
    if entry['rss'] != 'idk'
      puts "ERROR: Expecting '#{entry['rss']}' to parse as Mechanize::File class"
      entry['rss'] = 'idk' # was '==' — a no-op comparison; mark the feed unknown
    end
  end
  begin ###### Verify index
    page = Mechanize.new.get(entry['index'])
    year_hits = (2008..2012).map { |year| page.search("[text()*='#{year}']").count }
    # Original tested `url_tests.any? != 0`, which compares a boolean to 0
    # and is always true, so the raise never fired. Require at least one hit.
    raise "" unless year_hits.any? { |c| c > 0 }
  rescue
    puts "ERROR: Expecting '#{entry['index']}' to contain strings '2008..2012'" unless entry['index'] == 'idk'
  end
  puts "VERIFIED: #{entry}" if v
  entry
end
# Scrapes the World Scientific alphabetical journal list, builds an entry
# per journal, verifies each entry's links, writes the set as JSON, then
# re-verifies quietly as a final pass.
def main
  page = Mechanize.new.get('http://www.worldscinet.com/alphabetical.shtml')
  journals = page.search('table')[15].search('a')
  topics_list = []
  journals.each do |journal|
    link = journal.attributes["href"].text()
    name = journal.text()
    topics_list << [name, link] unless name.empty?
  end
  # Build each entry once — the original called build_json(t) twice per
  # topic and discarded the first result.
  final = topics_list.map { |t| verify_data(build_json(t)) }
  puts "VALID JSON? #{final.to_json.valid_json?}"
  output_file = "#{REPO_NAME}_output.json"
  puts "Writing output to file: #{output_file}"
  # Block form closes the handle (original File.open(...).write leaked it).
  # NOTE(review): 'a' appends across runs — confirm intended, else use 'w'.
  File.open(output_file, 'a') { |f| f.write(final.to_json) }
  puts "VERIFYING... All outputs should be quiet"
  final.each { |entry| verify_data(entry, false) }
end
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment