Using Scrapy with Selenium to scrape a rendered page
# Using Scrapy with Selenium to scrape a rendered page [Updated]
import time

from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from selenium import selenium

from linkedpy.items import LinkedPyItem

class LinkedPySpider(InitSpider):
    name = 'LinkedPy'
    allowed_domains = ['linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    start_urls = ["http://www.linkedin.com/csearch/results?type=companies&keywords=&pplSearchOrigin=GLHD&pageKey=member-home&search=Search#facets=pplSearchOrigin%3DFCTD%26keywords%3D%26search%3DSubmit%26facet_CS%3DC%26facet_I%3D80%26openFacets%3DJO%252CN%252CCS%252CNFR%252CF%252CCCR%252CI"]

    def __init__(self):
        InitSpider.__init__(self)
        self.verificationErrors = []
        # Selenium RC client; assumes a Selenium server is listening on localhost:4444
        self.selenium = selenium("localhost", 4444, "*firefox", "http://www.linkedin.com")
        self.log("\n\n\n Starting the Selenium Server! \n\n\n")
        self.selenium.start()
        self.log("\n\n\n Successfully started the Selenium Server! \n\n\n")

    def __del__(self):
        # InitSpider defines no __del__ of its own, so no super call is needed.
        self.selenium.stop()
        print self.verificationErrors
    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(response,
            formdata={'session_key': '[email protected]', 'session_password': 'password'},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            return self.initialized()
        else:
            self.log("\n\n\nLogin failed, bad times :(\n\n\n")
            # Something went wrong; we couldn't log in, so nothing happens.
    def parse(self, response):
        sel = self.selenium
        sel.open(response.url)
        time.sleep(2.5)  # crude wait for the JavaScript-rendered content to load

        # Parse the Selenium-rendered page source with Scrapy's selector,
        # rather than the unrendered response body.
        hxs = HtmlXPathSelector(text=sel.get_html_source())
        sites = hxs.select('//ol[@id="result-set"]/li')
        items = []
        for site in sites:
            item = LinkedPyItem()
            item['title'] = site.select('h2/a/text()').extract()
            item['link'] = site.select('h2/a/@href').extract()
            items.append(item)
        return items
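
The spider assumes two pieces that the gist itself doesn't show. A Selenium RC server must already be listening on localhost:4444 (for this generation of Selenium, typically started with java -jar selenium-server-standalone.jar), and linkedpy.items must define LinkedPyItem; the ImportError reported in the comments below comes from running the spider without it. A minimal sketch of that items module, assuming only the two fields parse() fills:

# linkedpy/items.py (hypothetical minimal version; the field names are
# taken from the item keys used in parse() above)
from scrapy.item import Item, Field

class LinkedPyItem(Item):
    title = Field()
    link = Field()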
@subramaniank

Why do you need the HtmlXPathSelector in parse() ?

@athul2 commented Jun 8, 2018

ImportError: No module named linkedpy.items
