@Watemlifts
Forked from johndavidback/googlebiz.py
Created November 30, 2019 09:26
Google Places for Business scraper
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import GoogleItem

# This is the class that does work.
class LoginSpider(BaseSpider):
    name = 'google-login'
    start_urls = ['https://accounts.google.com/ServiceLogin?service=lbc&passive=1209600&continue=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&followup=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&hl=en-US']

    def parse(self, response):
        """
        This overrides the built-in parse() and forces us to log in
        to the Google service.
        """
        return [FormRequest.from_response(response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)]

    def after_login(self, response):
        """
        This is the callback from the login request and does the
        actual parsing.
        """
        # Display the body in the console
        print response.body
        # Create an XPath selector
        hxs = HtmlXPathSelector(response)
        # Container for item objects
        items = []
        # Get all of the links to the per-store analytics pages
        links = hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]')
        # Go through all of the links on the page
        for l in links:
            # Create a new GoogleItem and assign the values
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            items.append(item)
        # This plucks the 'Next >>' pagination links from the page
        next_links = hxs.select('//a[contains(text(), "Next")]')
        for n in next_links:
            item = GoogleItem()
            item['link'] = n.select('@href').extract()
            item['value'] = n.select('text()').extract()
            item['next_link'] = n.select('@href').extract()
            items.append(item)
        return items
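
    # --- Editor's sketch, not part of the original gist ----------------
    # LoginSpider records the 'Next >>' links as items but never follows
    # them. One possible way to paginate from this same spider is to also
    # yield follow-up Requests; the method name follow_pages is
    # hypothetical, and it only takes effect if parse() points its
    # FormRequest callback here instead of at after_login. Imports are
    # done locally so the original import block above stays untouched.
    def follow_pages(self, response):
        import urlparse
        from scrapy.http import Request
        hxs = HtmlXPathSelector(response)
        # Same per-store analytics links as after_login()
        for l in hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]'):
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            yield item
        # Follow each 'Next' link so the following results pages are parsed too
        for href in hxs.select('//a[contains(text(), "Next")]/@href').extract():
            yield Request(urlparse.urljoin(response.url, href), callback=self.follow_pages)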


# This is the class that does not work.
class GoogleSpider(CrawlSpider):
    name = 'google-spider'
    allowed_domains = ['google.com']
    start_urls = [
        'https://accounts.google.com/ServiceLogin?service=lbc&passive=1209600&continue=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&followup=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&hl=en-US'
    ]
    rules = (
        # allow patterns are regular expressions, so the literal '?' needs
        # escaping, and parse_items has to be named as the callback or it
        # is never called; follow=True keeps the pagination links crawled.
        Rule(SgmlLinkExtractor(allow=(r'/local/add/businessCenter\?page=', )),
             callback='parse_items', follow=True),
    )

    def init_request(self):
        """
        This is called initially.
        """
        return self.login()

    def login(self):
        """
        This is where I am stuck. Obviously response is not defined.
        (A possible fix is sketched after this class.)
        """
        return [FormRequest.from_response(response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)]

    def after_login(self):
        """
        Required for the crawler to start crawling
        """
        self.initialized()

    def parse_items(self, response):
        print response.body
        hxs = HtmlXPathSelector(response)
        items = []
        # Get all of the links to the next pages
        links = hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]')
        for l in links:
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            items.append(item)
        next_links = hxs.select('//a[contains(text(), "Next")]')
        for n in next_links:
            item = GoogleItem()
            item['link'] = n.select('@href').extract()
            item['value'] = n.select('text()').extract()
            item['next_link'] = n.select('@href').extract()
            items.append(item)
        return items
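
The reason login() above has nothing to work with is that the crawl never requests the login page, so there is no response to hand to FormRequest.from_response(). A minimal sketch of one way to wire this up with the same generation of Scrapy follows: override start_requests() so the crawl begins at the login page, submit the form from that response, and only then feed the real start URLs to CrawlSpider's rule machinery. The class name GoogleLoginCrawlSpider, the bare businessCenter start URL, and the credential placeholders are illustrative assumptions, not taken from the original gist; real code should also check that the login actually succeeded before crawling.

from scrapy.http import Request, FormRequest
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from tutorial.items import GoogleItem


class GoogleLoginCrawlSpider(CrawlSpider):
    name = 'google-login-crawl'
    allowed_domains = ['google.com']
    login_page = 'https://accounts.google.com/ServiceLogin?service=lbc&passive=1209600&continue=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&followup=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&hl=en-US'
    start_urls = ['http://www.google.com/local/add/businessCenter']

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/local/add/businessCenter\?page=', )),
             callback='parse_items', follow=True),
    )

    def start_requests(self):
        # Start with the login page instead of start_urls, so login()
        # receives the response that FormRequest.from_response() needs.
        return [Request(url=self.login_page, callback=self.login)]

    def login(self, response):
        # Fill in and submit the sign-in form found on the login page.
        return FormRequest.from_response(response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)

    def after_login(self, response):
        # The session cookie now rides along automatically; hand the real
        # start pages to CrawlSpider's default parse(), which applies the
        # rules above and routes matching pages to parse_items().
        for url in self.start_urls:
            yield Request(url)

    def parse_items(self, response):
        # Same extraction as the original parse_items(), yielded instead
        # of collected into a list.
        hxs = HtmlXPathSelector(response)
        for l in hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]'):
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            yield item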

# tutorial/items.py: the item definitions imported by the spiders above.
#
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field


class GoogleItem(Item):
    link = Field()
    value = Field()
    next_link = Field()