Created
October 6, 2017 11:35
-
-
Save LitvinenkoD89/798d2796aed003541514c9799aaee12a to your computer and use it in GitHub Desktop.
findprivateclinics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'

# NOTE: Typhoeus, Nokogiri and URI are used below but are not required in this
# file, so they must already be loaded by whatever runs the script.

# Site root; relative links taken from the pages are joined against it.
main_url = "https://www.findprivateclinics.ca/"
# Listings index page, fetched once to discover the category options.
url = "https://www.findprivateclinics.ca/listings.html"

response = Typhoeus.get(url)
page = Nokogiri::HTML(response.body)

# Accumulator for scraped records; it is passed to and returned by parse_page.
data = {}
# Returns the stripped text of the first node under +node+ that matches the
# CSS +selector+, or "" when nothing matches.
def clinic_node_text(node, selector)
  match = node.at(selector)
  match.nil? ? "" : match.text.strip
end

# Collects every ".listing" element found on +page+ into +data+.
#
# For each listing the linked detail page is downloaded to read its services
# and province/state. Records are stored as
#   data[name][category][subcategory] => [record, ...]
# where each record is a Hash with the keys :name, :address, :city,
# :provstate, :zip, :location_url and :services (comma-joined String).
#
# page        - parsed document (responds to #search) containing the listings.
# category    - category label the listings were found under.
# subcategory - subcategory label the listings were found under.
# data        - Hash accumulator; it is mutated in place.
#
# Returns the same +data+ Hash.
def parse_page(page, category, subcategory, data)
  page.search(".listing").each do |el|
    link = el.at(".name").at("a")
    name = link.text.strip
    location_url = URI.join("https://www.findprivateclinics.ca/", link.attr("href")).to_s

    address = clinic_node_text(el, ".address")
    city = clinic_node_text(el, ".city")
    zip = clinic_node_text(el, ".zip")

    # One extra request per listing: services only appear on the detail page.
    loc_page = Nokogiri::HTML(Typhoeus.get(location_url).body)

    services_node = loc_page.at(".services")
    services = services_node.nil? ? [] : services_node.search(".service").map { |serv_el| serv_el.text.strip }

    # Prefer the region from the detail page; fall back to the value shown in
    # the listing instead of discarding it when the detail page has none.
    provstate = clinic_node_text(loc_page, '[itemprop="addressRegion"]')
    provstate = clinic_node_text(el, ".provstate") if provstate.empty?

    record = {
      name: name,
      address: address,
      city: city,
      provstate: provstate,
      zip: zip,
      location_url: location_url,
      services: services.join(",")
    }

    # Create only the missing levels so categories and subcategories stored
    # by earlier calls for the same clinic name are kept, not overwritten.
    data[name] ||= {}
    data[name][category] ||= {}
    (data[name][category][subcategory] ||= []).push(record)
  end

  data
end
# Downloads the document at the given address and parses it as HTML.
fetch_document = ->(address) { Nokogiri::HTML(Typhoeus.get(address).body) }

# Walk every option of the last <select> on the index page, skipping the
# "All Categories" placeholder.
page.search("select").last.search("option").each do |category_option|
  category = category_option.text
  next if category == "All Categories"

  category_page = fetch_document.call(URI.join(main_url, category_option.attr("data-url")).to_s)
  last_select = category_page.search("select").last

  # When the last <select> has this id the category page is parsed directly,
  # with "-" as the subcategory (presumably there is no subcategory dropdown
  # in that case; verify against the site markup).
  if last_select.attr("id") == "FilterCategoryId[0]"
    data = parse_page(category_page, category, "-", data)
    next
  end

  # Otherwise each option of the last <select> is a subcategory with its own page.
  last_select.search("option").each do |subcategory_option|
    subcategory = subcategory_option.text
    next if subcategory == "All Subcategories"

    subcategory_page = fetch_document.call(URI.join(main_url, subcategory_option.attr("data-url")).to_s)
    data = parse_page(subcategory_page, category, subcategory, data)
  end
end
# Order the clinics by name before exporting.
data = data.sort_by { |clinic_name, _categories| clinic_name }.to_h

# Write one semicolon-separated row per stored record, preceded by a header row.
CSV.open('findprivateclinics_ca.csv', 'w',
         write_headers: true,
         headers: ["Name", "Category", "Subcategory", "Civic Address", "City", "Province", "Postal Code", "Services"],
         col_sep: ";") do |csv|
  data.each do |name, categories|
    categories.each do |cat_name, subcategories|
      subcategories.each do |subcat_name, records|
        records.each do |record|
          fields = [:address, :city, :provstate, :zip, :services].map { |key| record[key] }
          csv << [name, cat_name, subcat_name, *fields]
        end
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment