Google Places for Business scraper
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import GoogleItem

# This is the class that does work.
class LoginSpider(BaseSpider):
    name = 'google-login'
    start_urls = ['https://accounts.google.com/ServiceLogin?service=lbc&passive=1209600&continue=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&followup=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&hl=en-US']

    def parse(self, response):
        """
        This overrides the built-in parse() and forces us to log in
        to the Google service before scraping.
        """
        return [FormRequest.from_response(response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)]

    def after_login(self, response):
        """
        This is the callback from the login request and does the
        actual parsing.
        """
        # Display the body in the console
        print response.body
        # Create an XPath selector
        hxs = HtmlXPathSelector(response)
        # Container for item objects
        items = []
        # Get all of the links to the per-store analytics pages
        links = hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]')
        # Go through all of the analytics links on the page
        for l in links:
            # Create a new Google item and assign the values
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            items.append(item)
        # This plucks the 'Next >>' pagination links from the page
        next_links = hxs.select('//a[contains(text(), "Next")]')
        for n in next_links:
            item = GoogleItem()
            item['link'] = n.select('@href').extract()
            item['value'] = n.select('text()').extract()
            item['next_link'] = n.select('@href').extract()
            items.append(item)
        return items
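# A sketch of a possible guard (the 'Sign out' marker is an assumption
# about Google's logged-in pages, not something this file verifies):
# after_login above trusts that the login succeeded, but the callback
# also fires on Google's error page when the credentials are rejected.
# Checking for a logged-in marker lets the spider bail out early, e.g.:
#
#     def after_login(self, response):
#         if 'Sign out' not in response.body:
#             self.log('Login failed')
#             return []
#         # ... continue with the parsing shown above ...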
# This is the class that does not work.
class GoogleSpider(CrawlSpider):
    name = 'google-spider'
    allowed_domains = ['google.com']
    start_urls = [
        'https://accounts.google.com/ServiceLogin?service=lbc&passive=1209600&continue=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&followup=http://www.google.com/local/add/businessCenter?hl%3Den-US%26gl%3DUS&hl=en-US'
    ]
    rules = (
        # allow takes regular expressions, so the literal '?' is escaped
        Rule(SgmlLinkExtractor(allow=('/local/add/businessCenter\?page=', ))),
    )

    def init_request(self):
        """
        This is called initially.
        """
        return self.login()

    def login(self):
        """
        This is where I am stuck. Obviously response is not defined.
        """
        return [FormRequest.from_response(response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)]

    def after_login(self):
        """
        Required for the crawler to start crawling.
        """
        self.initialized()

    def parse_items(self, response):
        print response.body
        hxs = HtmlXPathSelector(response)
        items = []
        # Get all of the links to the per-store analytics pages
        links = hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]')
        for l in links:
            item = GoogleItem()
            item['link'] = l.select('@href').extract()
            item['value'] = l.select('text()').extract()
            item['next_link'] = ''
            items.append(item)
        next_links = hxs.select('//a[contains(text(), "Next")]')
        for n in next_links:
            item = GoogleItem()
            item['link'] = n.select('@href').extract()
            item['value'] = n.select('text()').extract()
            item['next_link'] = n.select('@href').extract()
            items.append(item)
        return items
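Why GoogleSpider fails: the init_request()/initialized() hooks it references belong to Scrapy's InitSpider, which a plain CrawlSpider never calls, so login() is never driven by a real response (hence the undefined name). Even if it were, the Rule has no callback, so parse_items would never run. One possible fix, a sketch under those assumptions rather than anything confirmed by this gist, is to mix InitSpider in so the login round-trip completes before the rules fire:

from scrapy.http import Request
from scrapy.contrib.spiders.init import InitSpider

class GoogleSpiderWithLogin(InitSpider, CrawlSpider):
    name = 'google-spider-login'
    allowed_domains = ['google.com']
    # Reuse the ServiceLogin URL from the spiders above
    login_page = LoginSpider.start_urls[0]
    # Assumed post-login landing page for the crawl itself
    start_urls = ['http://www.google.com/local/add/businessCenter']

    rules = (
        # callback is needed here or parse_items is never invoked;
        # follow=True keeps the pagination links being crawled as well
        Rule(SgmlLinkExtractor(allow=('/local/add/businessCenter\?page=', )),
             callback='parse_items', follow=True),
    )

    def init_request(self):
        # InitSpider calls this before releasing start_urls: fetch the
        # login page so login() receives an actual response to work from.
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        return FormRequest.from_response(response,
            formdata={'Email': 'not telling', 'Passwd': 'also not telling'},
            callback=self.after_login)

    def after_login(self, response):
        # Hand control back to InitSpider; the CrawlSpider rules take
        # over from here.
        return self.initialized()

    # parse_items would be identical to GoogleSpider.parse_items above.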
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field

class GoogleItem(Item):
    link = Field()
    value = Field()
    next_link = Field()
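One caveat when consuming these fields: HtmlXPathSelector's extract() returns a list of matches, so each field above ends up holding a list rather than a single string. A small normalization sketch (the first-match handling is the only addition; the field names come from this file):

# Inside the for-l-in-links loop shown in the spider above:
href = l.select('@href').extract()
item = GoogleItem()
# extract() returns a list of matching strings; keep the first,
# falling back to '' when nothing matched
item['link'] = href[0] if href else ''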