Last active
August 29, 2015 14:13
-
-
Save ptrin/f1fdcfedcdafa8648195 to your computer and use it in GitHub Desktop.
Small script to scrape plane data and download images from the Canadian Warplane Heritage Museum website.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "fileutils" | |
require "httparty" | |
require "json" | |
require "nokogiri" | |
require "open-uri" | |
# Base URL of the museum site. It is stripped from scraped URLs to obtain
# site-relative paths and prepended again when downloading. Frozen so that no
# helper can mutate the shared string by accident.
DOMAIN = "http://www.warplane.com/".freeze
# Strip the site origin (DOMAIN) from the front of a URL.
#
# Only a leading DOMAIN is removed. An occurrence of DOMAIN further inside the
# string (for example inside a query parameter) is left untouched, and URLs
# that do not start with DOMAIN are returned unchanged.
#
# @param url [String] absolute or site-relative URL
# @return [String] a new string without the leading DOMAIN
def strip_domain(url)
  url.sub(/\A#{Regexp.escape(DOMAIN)}/, "")
end
# Build the list of planes from the collection index page.
#
# Every anchor under ".div-button" whose href is not the "#" placeholder
# becomes one hash with the keys "link" (the anchor's href), "thumbnail"
# (src of its first <img>, with DOMAIN stripped) and "name" (text of its
# first <b>).
def get_planes_array(page)
  plane_links = page.css(".div-button a").select { |anchor| anchor["href"] != "#" }

  plane_links.map do |anchor|
    thumbnail_src = anchor.css("img").first["src"]
    plane_name = anchor.css("b").first.text

    {
      "link" => anchor["href"],
      "thumbnail" => strip_domain(thumbnail_src),
      "name" => plane_name
    }
  end
end
# Fetch a plane's detail page and parse it into a Nokogiri document.
#
# The link comes from scraped HTML, so it is opened through URI (open-uri)
# rather than Kernel#open: Kernel#open no longer opens URLs on Ruby 3.0+ and
# would run a subprocess for a string starting with "|". Relative links are
# resolved against DOMAIN, and the block form closes the response IO once the
# document has been parsed.
#
# @param plane [Hash] plane entry whose "link" key holds the detail page URL
# @return [Nokogiri::HTML::Document]
def get_detail_page(plane)
  URI.join(DOMAIN, plane["link"]).open { |io| Nokogiri::HTML(io) }
end
# Collect the gallery photo descriptions from a plane's detail page.
#
# Each anchor inside the table whose id contains "PhotosDataList" becomes a
# hash with "src" (href with DOMAIN stripped) and "description" (the title
# attribute). A "thumbnail" key is added when the anchor's first <div> has an
# inline style containing a parenthesised URL, such as a background image.
#
# Anchors with no <div> or no style attribute simply get no thumbnail. If
# scraping fails, the error message is printed and an empty array is returned,
# so callers can always iterate over the result.
#
# @param detail_page [#css] parsed detail page
# @return [Array<Hash>]
def get_gallery_photos(detail_page)
  detail_page.css("table[id*=PhotosDataList] a").map do |link|
    photo = {
      "src" => strip_domain(link["href"]),
      "description" => link["title"]
    }

    # Some images have no thumbnail, so only capture the contents of the
    # inline background image when the style is actually present.
    preview = link.css("div").first
    style = preview && preview["style"]
    thumbnail = style && style.match(/\((.*)\)/)
    photo["thumbnail"] = strip_domain(thumbnail[1]) if thumbnail

    photo
  end
rescue => e
  puts e.message
  []
end
# Extract a plane's specification table from its detail page.
#
# The sidebar is taken to be the last <td> carrying a style attribute. Its
# HTML is split on "<br>", and every chunk shaped like "<b>Label:</b> value"
# yields one entry: the label is downcased with spaces turned into
# underscores, and the value is stripped of surrounding whitespace.
#
# Chunks that do not match that shape are skipped, and a page without a
# sidebar gives an empty hash.
#
# @param detail_page [#css] parsed detail page
# @return [Hash{String => String}]
def get_plane_specs(detail_page)
  sidebar = detail_page.css("td[style]").last

  sidebar.to_s.split("<br>").each_with_object({}) do |chunk, specs|
    spec = chunk.match(/<b>(.+):<\/b>(.*)/)
    next unless spec

    specs[spec[1].downcase.tr(" ", "_")] = spec[2].strip
  end
end
# Download DOMAIN + path and store it at the same relative path under the
# current working directory.
#
# The request is made before anything is written, so a failed download does
# not leave an empty file behind. Non-2xx responses are reported and skipped
# instead of being saved as if they were images. The raw response body is
# written rather than HTTParty's parsed form. Blank paths and paths that
# resolve outside the working directory are skipped; path comes from scraped
# HTML and must not be able to write elsewhere.
#
# @param path [String, nil] site-relative path of the image
# @return [Integer, nil] number of bytes written, or nil when skipped
def download_plane_image(path)
  return if path.nil? || path.empty?

  localpath = File.expand_path(path)
  unless localpath.start_with?(File.join(Dir.pwd, ""))
    puts "Skipping #{path}: resolves outside the working directory"
    return
  end

  puts "Downloading #{DOMAIN}#{path}"
  response = HTTParty.get(DOMAIN + path)
  unless (200..299).cover?(response.code.to_i)
    puts "Failed to download #{DOMAIN}#{path} (HTTP #{response.code})"
    return
  end

  FileUtils.mkdir_p(File.dirname(localpath))
  File.binwrite(localpath, response.body)
end
# Serialise the planes array as pretty-printed JSON into "planes.json" in the
# current working directory (written in binary mode), then report on stdout.
def write_json_file(planes)
  json = JSON.pretty_generate(planes)
  File.binwrite("planes.json", json)
  puts "Wrote planes to file"
end
# "main" | |
# Fetch the collection index through URI (open-uri). Kernel#open no longer
# opens URLs on Ruby 3.0+, and the block form closes the response IO once the
# page has been parsed.
index_url = "http://www.warplane.com/warplane-vintage-aircraft-collection.aspx"
page = URI.parse(index_url).open { |io| Nokogiri::HTML(io) }
planes = get_planes_array(page)

planes.each do |plane|
  detail_page = get_detail_page(plane)
  # Fall back to an empty list when no gallery could be scraped (nil result).
  plane["images"] = get_gallery_photos(detail_page) || []
  plane["specs"] = get_plane_specs(detail_page)

  # Download the plane's own thumbnail once per plane. Doing it inside every
  # image thread would write the same file concurrently and would skip it
  # entirely for planes with an empty gallery.
  download_plane_image(plane["thumbnail"])

  # One thread per gallery image; each fetches the full image and, when
  # present, its thumbnail.
  downloads = plane["images"].map do |image|
    Thread.new do
      download_plane_image(image["src"])
      download_plane_image(image["thumbnail"]) if image["thumbnail"]
    end
  end
  downloads.each(&:join)
end

write_json_file(planes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment