Last active
August 1, 2017 14:37
-
-
Save LitvinenkoD89/470ed4352f8745bbbb69ffefb8f631ab to your computer and use it in GitHub Desktop.
pappas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PAPPAS BRANDS {source_name: 'pappas', batch_number: 7, request_id: 7000000048, request_name: 'PAPPAS BRANDS'}
#
# Two-pass scraper for the Pappas locations list:
#   1. The listing page is parsed into one document per location, and each
#      location's profile URL is queued with page_type 'profile'.
#   2. Each profile page fills in brand, type and zipcode on the document
#      created in pass 1.
scraper_service.scrape do |_browser, scraper, init_vars|
  easy_seeder = Library.lib('EasySeeder')
  easy_extractor = Library.lib('EasyExtractor')
  # The result is not used below; the call is kept in case loading the
  # library has side effects -- TODO confirm and remove if it does not.
  Library.lib('FetcherAgent')

  easy_seeder.seed(source_name: init_vars[:source_name]) do
    queue_url "http://www.pappas.com/locations-list/?msg=noaddy"
  end

  easy_extractor.extract(
    source_name: init_vars[:source_name],
    scraper: scraper,
    batch_number: init_vars[:batch_number],
    request_id: init_vars[:request_id],
    request_name: init_vars[:request_name]
  ) do
    # Listing page: one '.locListState' node per location.
    find_pages page_format: :html do |_url, _parser_page, page|
      page.search('.locListState').each do |store|
        # Strip layout whitespace, then split the entry into its <br>-separated
        # fragments. Read from the end they are: a trailing fragment (ignored),
        # the link markup, and the phone number.
        parts = store.at('.locListLoc').inner_html.delete("\t\r\n").split('<br>')
        parts.pop
        link = parts.pop
        location_url = link.split('"')[1] # first double-quoted value in the link markup
        id = location_url.split('id=')[1]
        parts.pop # phone number; not stored by this pass

        # The first fragment is dropped (presumably the location name -- verify
        # against the live markup); the rest is the address.
        address_lines = parts.drop(1)
        address_container = address_lines.join('<br>')
        street1, city_and_st = address_lines
        # Fixed: the original read the undefined `city_and_st_zip` here and
        # later read an undefined `state`, raising NameError on the first entry.
        city, state = city_and_st.split(', ')
        city = city.strip

        doc_id = store_doc({
          store_id: id,
          # brand: "",
          # type: "",
          property_id: id,
          name: 'PAPPAS BRANDS', # required
          address_1: street1,
          address_2: '',
          city: city,
          state: state,
          # zipcode: zip_code,
          country: '',
          # lat: lat,
          # long: long,
          address_container_html: address_container,
          # map_link: store.parent.parent.at('.googlemap').attr('name'),
          location_url: location_url,
          flags: {
          }
        })

        # NOTE(review): `seeder` is not defined in this file; presumably it is
        # supplied by the extractor DSL -- confirm.
        seeder.queue_url location_url, {
          page_type: 'profile',
          doc_id: doc_id
        }
      end
    end

    # Find profile pages
    find_pages page_format: :html, page_type: 'profile' do |_url, parser_page, page|
      doc = find_location(parser_page[:doc_id])
      if doc.present?
        # The brand is taken from the second ' - '-separated segment of <title>.
        title = page.at('title').text.split(' - ')[1]
        # Order matters: the more specific 'PAPPAS BROS STEAKHOUSE' must be
        # tested before the generic 'PAPPAS BROS'. No match leaves brand nil.
        brand =
          case title
          when /PAPPADEAUX SEAFOOD KITCHEN/i then 'PAPPADEAUX SEAFOOD KITCHEN'
          when /PAPPASITOS CANTINA/i then 'PAPPASITOS CANTINA'
          when /PAPPAS BROS STEAKHOUSE/i then 'PAPPAS BROS STEAKHOUSE'
          when /PAPPAS SEAFOOD HOUSE/i then 'PAPPAS SEAFOOD HOUSE'
          when /PAPPAS BAR B Q/i then 'PAPPAS BAR B Q'
          when /PAPPAS BURGER/i then 'PAPPAS BURGER'
          when /PAPPAS BROS/i then 'PAPPAS BROS'
          # NOTE(review): mapping YIA YIA MARYS to 'PAPPAS BROS' looks like a
          # copy-paste slip; kept as in the original until confirmed.
          when /YIA YIA MARYS/i then 'PAPPAS BROS'
          when /DOT/i then 'DOT'
          end

        # First <br>-separated fragment is the address line; the zip code is the
        # second whitespace-separated token of its last comma-separated part.
        address = page.at('.profile_location').at('p').inner_html.split('<br>').first
        _state, zip_code = address.split(',').map(&:strip).last.split(' ')

        doc[:brand] = brand
        doc[:type] = brand
        doc[:zipcode] = zip_code
        store_doc doc
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment