Skip to content

Instantly share code, notes, and snippets.

@jdjkelly
Created September 10, 2013 15:49
Show Gist options
  • Save jdjkelly/6511411 to your computer and use it in GitHub Desktop.
Save jdjkelly/6511411 to your computer and use it in GitHub Desktop.
# Scrapes blogTO's "The Best ... in Toronto" list pages and records the venues
# they mention.
#
# For every list path below, the task:
#   1. downloads the page and reads the category out of its first <h1>;
#   2. maps that category to a noun name and makes sure a Noun with that name
#      exists (pages whose heading is missing or unmapped are skipped);
#   3. searches Foursquare (near Toronto, limit 1) for each listed venue name;
#   4. when the top hit's name contains the expected name, finds or creates a
#      Location by the Foursquare name and updates its attributes; otherwise
#      prints a diagnostic line.
#
# Usage: rake scrape_blog_to
task :scrape_blog_to => :environment do
  blog_to = "http://www.blogto.com"
  foursquare_search_url = "https://api.foursquare.com/v2/venues/search"

  # FIXME(security): API credentials do not belong in source control. Set
  # FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET in the environment; the
  # literals below are only a backward-compatible fallback and should be
  # rotated and removed.
  foursquare_client_id = ENV.fetch("FOURSQUARE_CLIENT_ID", "O3TYYRX0Q1XKSL10XKY2L4YXOJEBETNVE2ZQO4WJ1YTQFIBF")
  foursquare_client_secret = ENV.fetch("FOURSQUARE_CLIENT_SECRET", "AOB1CBYH2OBVUPSAC2VEGD3YP0SHZD1BM0QG0V5EY02RIXP4")

  # Paths (relative to blog_to) of the list pages to scrape.
  lists = %w[
    /toronto/the_best_antique_stores_in_toronto/
    /toronto/the_best_art_supply_stores_in_toronto/
    /toronto/the_best_new_art_galleries_in_toronto_2008/
    /toronto/the_best_new_art_galleries_in_toronto_2009/
    /toronto/best_place_to_buy_a_suit_in_toronto/
    /toronto/the_best_baby_stores_in_toronto/
    /toronto/the_best_backyard_patios_in_toronto/
    /toronto/the_best_barber_shops_in_toronto/
    /toronto/the_best_bespoke_tailors_in_toronto/
    /toronto/the_best_bike_repair_shops_in_toronto/
    /toronto/the_best_bike_stores_in_toronto/
    /toronto/the_best_blues_bars_in_toronto/
    /toronto/the_best_bookstores_in_toronto/
    /toronto/the_best_boxing_gym_in_toronto/
    /toronto/the_best_bridal_stores_in_toronto/
    /toronto/the_best_bridesmaid_dresses_in_toronto/
    /toronto/the_best_card_paper_and_stationery_shops_in_toronto/
    /toronto/the_best_catering_companies_in_toronto/
    /toronto/the_best_comic_shops_in_toronto/
    /toronto/the_best_consignment_stores_in_toronto/
    /toronto/the_best_contemporary_art_galleries_for_emerging_artists_in_toronto/
    /toronto/the_best_contemporary_art_galleries_in_toronto/
    /toronto/the_best_crossfit_gyms_in_toronto/
    /toronto/the_best_custom_t-shirts_in_toronto/
    /toronto/the_best_new_clothing_stores_in_toronto_2008/
    /toronto/the_best_diy_spots_in_toronto/
    /toronto/the_best_dog_parks_in_toronto/
    /toronto/the_best_dry_cleaners_in_toronto/
    /toronto/the_best_new_design_stores_in_toronto_2009/
    /toronto/the_best_new_design_stores_in_toronto_2010/
    /toronto/the_best_eyeglasses_in_toronto/
    /toronto/the_best_fabric_stores_in_toronto/
    /toronto/the_best_fashion_designers_in_toronto/
    /toronto/the_best_film_festivals_in_toronto/
    /toronto/the_best_fitness_clubs_in_toronto/
    /toronto/the_best_florists_in_toronto/
    /toronto/the_best_furniture_stores_in_toronto/
    /toronto/the_best_new_fashion_stores_in_toronto_2009/
    /toronto/the_best_green_retailers_in_toronto/
    /toronto/the_best_green_services_in_toronto/
    /toronto/the_best_green_wedding_services_in_toronto/
    /toronto/the_best_hair_salons_in_toronto/
    /toronto/the_best_halloween_costume_stores_in_toronto/
    /toronto/the_best_hostels_in_toronto/
    /toronto/the_best_hot_yoga_in_toronto/
    /toronto/the_best_hotels_in_toronto/
    /toronto/the_best_indoor_sports_fields_in_toronto/
    /toronto/the_best_jazz_bars_in_toronto/
    /toronto/the_best_jewellery_stores_in_toronto/
    /toronto/the_best_kitchen_supply_stores_in_toronto/
    /toronto/the_best_laundromats_in_toronto/
    /toronto/the_best_live_music_venues_in_toronto/
    /toronto/the_best_martial_arts_in_toronto/
    /toronto/the_best_menswear_stores_in_toronto/
    /toronto/the_best_movers_in_toronto/
    /toronto/the_best_musical_instrument_stores_in_toronto/
    /toronto/the_best_outdoor_sports_fields_in_toronto/
    /toronto/the_best_personal_trainers_in_toronto/
    /toronto/the_best_pet_grooming_and_daycare_in_toronto/
    /toronto/the_best_pet_stores_in_toronto/
    /toronto/the_best_photography_galleries_in_toronto/
    /toronto/the_best_pilates_in_toronto/
    /toronto/the_best_place_to_watch_a_film_in_toronto/
    /toronto/the_best_places_to_find_stuff_made_by_local_designers/
    /toronto/the_best_public_swimming_pools_in_toronto/
    /toronto/the_best_salvage_and_reclaimed_furniture_in_toronto/
    /toronto/the_best_self_storage_in_toronto/
    /toronto/the_best_sex_shops_in_toronto/
    /toronto/the_best_shoe_stores_in_toronto/
    /toronto/the_best_skateboard_shops_in_toronto/
    /toronto/the_best_sneaker_shops_in_toronto/
    /toronto/the_best_spas_in_toronto/
    /toronto/the_best_sunglasses_in_toronto/
    /toronto/the_best_tattoo_parlours_in_toronto/
    /toronto/the_best_tennis_clubs_in_toronto/
    /toronto/the_best_theatre_production_companies_in_toronto/
    /toronto/the_best_used_bookstores_in_toronto/
    /toronto/the_best_used_cd_stores_in_toronto/
    /toronto/the_best_video_stores_in_toronto/
    /toronto/the_best_vintage_clothing_stores_in_toronto/
    /toronto/the_best_vintage_furniture_stores_in_toronto/
    /toronto/the_best_vinyl_record_stores_in_toronto/
    /toronto/the_best_waxing_salons_in_toronto/
    /toronto/the_best_yoga_studios_in_toronto/
  ].freeze

  # Category captured from the page heading => name of the Noun to record.
  # Lists whose category is not in this table are skipped entirely.
  noun_names = {
    "Antique Stores" => "antiques",
    "Art Supply Stores" => "art supplies",
    "New Art Galleries" => "art",
    "Baby Stores" => "baby supplies",
    "Bespoke Tailors" => "bespoke suits",
    "Bike Repair Shops" => "bikes",
    "Bike Stores" => "bikes",
    "Bookstores" => "books",
    "Bridal Stores" => "bridal gowns",
    "Bridesmaid Dresses" => "bridesmaid dresses",
    "Card, Paper and Stationery Shops" => "stationery",
    "Comic Shops" => "comics",
    "Contemporary Art Galleries for Emerging Artists" => "art",
    "Contemporary Art Galleries" => "art",
    "Custom T-Shirts" => "t-shirts",
    "Eyeglasses" => "glasses",
    "Fabric Stores" => "fabric",
    "Fashion Designers" => "designer fashion",
    "Florists" => "flowers",
    "Furniture Stores" => "furniture",
    "New Fashion Stores" => "designer fashion",
    "Halloween Costume Stores" => "halloween costumes",
    "Jewellery Stores" => "jewellery",
    "Kitchen Supply Stores" => "kitchen supplies",
    "Menswear Stores" => "menswear",
    "Musical Instrument Stores" => "musical instruments",
    "Pet Stores" => "pet supplies",
    "Salvage and Reclaimed Furniture" => "vintage furniture",
    "Sex Shops" => "sex toys",
    "Shoe Stores" => "shoes",
    "Skateboard Shops" => "skateboards",
    "Sneaker Shops" => "sneakers",
    "Sunglasses" => "sunglasses",
    "Used Bookstores" => "books",
    "Used CD Stores" => "CDs",
    "Video Stores" => "DVDs",
    "Vintage Clothing Stores" => "vintage clothing",
    "Vintage Furniture Stores" => "vintage furniture",
    "Vinyl Record Stores" => "vinyl records"
  }.freeze

  # Venue name as written on blogTO => name expected in the Foursquare result.
  # Only used when comparing against the search hit; the search itself is
  # still issued with the blogTO spelling.
  expected_venue_names = {
    "Kantelberg + Co." => "Kantelberg & Co.",
    "Gwartzman's Art Supplies" => "Gwartzmans Art Supplies",
    "Curry's Artists' Materials" => "Curry's Artist's Materials",
    "Georgia Sherman Projects" => "Georgia Scherman projects",
    "107Shaw Gallery" => "107 Shaw Gallery",
    "Macklem's" => "Macklems",
    "Moms to be and More" => "Moms to be... and More",
    "Hello Sunshine" => "hello sunshine creative baby shop",
    "Aquarius Menswear" => "Aquarius Men's Wear",
    "Walter Beauchamp Tailors" => "Walter Beauchamp",
    "Harry Rosen (Bloor)" => "Harry Rosen Menswear",
    "Dave Fix My Bike" => "Dave...Fix My Bike",
    "Bakka Phoenix" => "Bakka-Phoenix Books",
    "Becker's Bridal" => "Beckers Bridal",
    "Jealous Bridesmaids Bridal Studio" => "Jealous Bridesmaids",
    "Your White Dress Bridal Outlet" => "Your White Dress",
    "Lea Ann Belter" => "Lea-Ann Belter Bridal",
    "Pink Tartan/Seventy Seven" => "Pink Tartan",
    "Atomic Age Comix" => "Atomic Age",
    "Don't Tell Mama Gallery" => "Don't Tell Mama",
    "Stylegarage/Gus" => "Gus* Studio@STYLEGARAGE",
    "Linda Penwarden Jewellery" => "Linda Penwarden",
    "Jonathan + Olivia" => "Jonathan & Olivia",
    "The Academy of Music" => "Academy of Music",
    "Helmutt's Pet Supply" => "Helmutt's",
    "David's" => "Davids",
    "Vortex Records" => "Vortex Used Records",
    "Around Again Records" => "Around Again",
    "Penny Arcade Vintage" => "Penny Arcade",
    "Phil'z 20th Century" => "Phil'z"
  }.freeze

  lists.each do |list_url|
    html = Nokogiri::HTML.parse(RestClient.get("#{blog_to}#{list_url}"))

    # A page without an <h1>, or with a heading that is not of the form
    # "The Best <category> in Toronto", cannot be categorised: skip it.
    heading = html.css('h1').first
    category = heading && /The Best (.+) in Toronto/.match(heading.content)
    next unless category

    noun_name = noun_names[category[1]]
    next unless noun_name

    # Called for its side effect only: make sure the noun exists.
    Noun.find_or_create_by(name: noun_name)

    html.css(".torontolists-item h2 .name").each do |name_node|
      listed_name = name_node.content

      # encode_www_form escapes "+", "&" and friends, so names such as
      # "Kantelberg + Co." reach Foursquare intact.
      search_params = URI.encode_www_form(
        near: "Toronto",
        query: listed_name,
        limit: 1,
        client_id: foursquare_client_id,
        client_secret: foursquare_client_secret,
        v: "20121006"
      )
      foursquare_response = JSON.parse(RestClient.get("#{foursquare_search_url}?#{search_params}"))

      # A response without the expected keys is treated like "no venues".
      venue = ((foursquare_response["response"] || {})["venues"] || []).first
      expected_name = expected_venue_names.fetch(listed_name, listed_name)

      if venue.nil?
        puts "#{expected_name} failed"
      elsif venue["name"].downcase.include?(expected_name.downcase)
        # Literal, case-insensitive substring test. Interpreting the name as
        # a regular expression would break on "+", "*", "(" and similar.
        venue_location = venue["location"]
        record = Location.find_or_create_by(name: venue["name"])
        record.update_attributes(
          foursquare_venue_object: venue,
          coordinates: [venue_location["lng"], venue_location["lat"]], # [lng, lat] order
          city: venue_location["city"],
          country: venue_location["country"],
          state: venue_location["state"],
          street_address: venue_location["address"]
        )
      else
        puts "#{expected_name} did not match #{venue["name"]}"
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment