Created
February 13, 2013 14:10
-
-
Save cheekybastard/4944871 to your computer and use it in GitHub Desktop.
Scrapy + selenium to scrape a page and a subpage from a list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Using Scrapy to scrape a page and a subpage from a list
| from scrapy.contrib.spiders.init import InitSpider | |
| from scrapy.http import Request, FormRequest | |
| from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
| from scrapy.contrib.spiders import CrawlSpider, Rule | |
| from scrapy.spider import BaseSpider | |
| from scrapy.selector import HtmlXPathSelector | |
| from selenium import selenium | |
| from linkedpy.items import LinkedPyItem | |
class LinkedPySpider(InitSpider):
    """Spider that logs into LinkedIn, scrapes each company overview page,
    then follows the company's /insights subpage to collect its skills list.

    Flow: init_request -> login -> check_login_response -> (start_urls)
    parse -> parse_insights, with the partially-filled item carried between
    the two parse steps via request.meta.
    """

    name = 'LinkedPy'
    allowed_domains = ['linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    start_urls = [
        "http://www.linkedin.com/company/onward-search",
        "http://www.linkedin.com/company/circle-one-marketing",
        "http://www.linkedin.com/company/ryan-partnership",
    ]

    def init_request(self):
        """Called before crawling starts: fetch the login page first."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request by filling the login page's form."""
        return FormRequest.from_response(
            response,
            formdata={'session_key': 'some@email.com',
                      'session_password': 'somepassword'},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by the login request to see if we are
        successfully logged in; only then start the real crawl."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            return self.initialized()
        self.log("\n\n\nFailed, Bad times :(\n\n\n")
        # Login failed: return nothing, so the crawl never starts.

    def parse(self, response):
        """Scrape a company's overview page, then schedule its /insights
        subpage so parse_insights can finish filling the same item."""
        self.log("\n\n\n We got data! \n\n\n")
        hxs = HtmlXPathSelector(response)
        item = LinkedPyItem()
        # XPath literals use double-quoted Python strings so the embedded
        # single-quoted class names don't terminate the string (the original
        # single-quoted versions were syntax errors).
        item['type'] = hxs.select("//div[@class='basic-info']/div/dl/dd[1]/text()").extract()
        item['size'] = hxs.select("//div[@class='basic-info']/div/dl/dd[2]/text()").extract()
        item['website'] = hxs.select("//div[@class='basic-info']/div/dl/dd[3]/a/text()").extract()
        item['industry'] = hxs.select("//div[@class='basic-info']/div/dl/dd[4]/text()").extract()
        item['founded'] = hxs.select("//div[@class='basic-info']/div/dl/dd[5]/text()").extract()
        item['specialties'] = hxs.select("//div[@class='text-logo']/p[2]/text()").extract()
        # Initialise the list before parse_insights appends to it
        # (the original appended to a key that was never set -> KeyError).
        item['skills'] = []
        # BUG FIX ("Where the Problem Lives"): a selector cannot be built
        # from a Request object — the subpage must actually be downloaded.
        # Issue a new Request whose callback receives the Response, and let
        # the partially-filled item ride along in request.meta.
        request = Request(url="%s/insights" % response.url,
                          callback=self.parse_insights)
        request.meta['item'] = item
        return request

    def parse_insights(self, response):
        """Scrape the /insights subpage and complete the item begun in parse()."""
        item = response.meta['item']
        hxs = HtmlXPathSelector(response)
        for site in hxs.select("//div[@class='common-skills']/div/ul/li"):
            skills = site.select('a/text()').extract()
            item['skills'].append("%s, " % skills)
        return item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment