Last active
November 9, 2020 14:57
-
-
Save nl5887/b981b217338494682bf7 to your computer and use it in GitHub Desktop.
Scrapy spider with a V8 JavaScript parser. More info at http://dutchcoders.ghost.io/using-scrapy-and-pyv8-to-scrape-inline-javascript/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy.http.request import Request | |
from scrapy.selector import Selector | |
import urllib2 | |
import re | |
import PyV8 | |
import json | |
from pdc.items import Product | |
class V8Spider(scrapy.Spider):
    """Spider that runs a page's scripts in PyV8 and reads ``productsData``.

    Every ``<script>`` element of the response is evaluated inside a
    ``PyV8.JSContext`` whose global object is ``Global()``.  Afterwards the
    JavaScript variable ``productsData`` is converted to Python data and one
    item is yielded per colour/size combination it describes.
    """

    def parse_page(self, response):
        """Yield one item per product variant found in ``productsData``.

        ``response.meta['item']`` is used as the base item; its ``link`` field
        is set to ``response.url``.  When the page exposes no usable
        ``productsData`` the base item is yielded unchanged.

        NOTE(review): ``Global`` and ``js_escape_unicode`` are not defined in
        this block; they must be importable in this module.
        """
        # Function-scope imports: done once per call instead of once per
        # <script> element / per exception as before.
        import traceback

        import requests

        try:
            from urlparse import urljoin  # Python 2 (the file imports urllib2)
        except ImportError:
            from urllib.parse import urljoin  # Python 3

        item = response.meta['item']
        item['link'] = response.url

        exts = []
        productsData = None

        with PyV8.JSContext(Global(), extensions=exts) as ctxt:
            for script in response.xpath("//script"):
                # Best effort: a script that fails to download or evaluate is
                # reported and skipped so the remaining scripts still run.
                try:
                    sources = script.xpath("@src").extract()
                    if sources:
                        # Resolve relative / protocol-relative URLs against
                        # the page; absolute URLs pass through unchanged.
                        src = urljoin(response.url, sources[0])
                        r = requests.get(src)
                        print("loading script source ", src)
                        code = js_escape_unicode(r.text)
                        # NOTE(review): the extension object is never used
                        # afterwards; presumably constructing it registers the
                        # source with PyV8 -- confirm before removing.
                        ext = PyV8.JSExtension(str(src), code)
                        ctxt.eval(code)

                    inline = script.xpath("text()").extract()
                    if inline:
                        ctxt.eval(str(js_escape_unicode(inline[0])))
                except Exception:
                    traceback.print_exc()

            # The previous check tested the truthiness of the one-element
            # array itself, which is always true.  Evaluate once, guard
            # against the variable not existing at all, and test the
            # converted element instead.
            wrapped = ctxt.eval(
                "[typeof productsData === 'undefined' ? null : productsData]"
            )
            # Conversion has to happen while the JS context is still entered.
            productsData = PyV8.convert(wrapped[0])

        if not productsData:
            yield item
            return

        for color_key, color in productsData['colors'].items():
            for size_key in color['sizes']:
                size = productsData['sizes'][size_key]
                product_key = "{0}_{1}".format(color_key, size_key)
                product = productsData['products'][product_key]

                subitem = item.copy()
                subitem['productid'] = product['id']
                subitem['title'] = "{0} {1} {2}".format(
                    product['name'], size['label'], color['label'])
                subitem['img'] = color['media']['images'][0]['page']

                # The price is split over <strong> (main text) and a nested
                # <sup>; both parts are concatenated.
                price = Selector(text=product['price_html'])
                subitem['price'] = (
                    price.xpath('//span[@class="new_price"]/strong/text()').extract()[0].strip()
                    + price.xpath('//span[@class="new_price"]/strong/sup/text()').extract()[0].strip()
                )
                yield subitem
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PyV8 | |
class js_dom_stylesheet(PyV8.JSClass):
    """Stub stylesheet object exposing fixed ``cssRules`` / ``cssText`` values."""

    def __init__(self, document, *args, **kwargs):
        # ``document`` and the extra arguments are accepted but not stored.
        placeholder_rule = {'cssText': "test"}
        self.cssRules = [placeholder_rule]
        self.cssText = "test"

    # The three attribute hooks below only forward to the parent class.

    def __getattr__(self, name):
        parent = super(js_dom_stylesheet, self)
        return parent.__getattr__(name)

    def __setattr__(self, name, value):
        parent = super(js_dom_stylesheet, self)
        parent.__setattr__(name, value)

    def __delattr__(self, name):
        parent = super(js_dom_stylesheet, self)
        parent.__delattr__(name)
class js_dom_element(PyV8.JSClass):
    """Stub DOM node handed to page scripts in place of a real element.

    Most methods accept any arguments and return a fixed value; only
    ``appendChild`` / ``insertBefore`` record the receiver as ``parentNode``
    of their first argument.
    """

    def __init__(self, document, *args, **kwargs):
        print("js_dom_element", args, kwargs)
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36",
            'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"',
        }
        self.tagName = "HTML"
        self.nodeType = 9
        self.style = {'background': None}

        # One stylesheet stub shared by both attribute names.
        stylesheet = js_dom_stylesheet(document)
        self.sheet = stylesheet
        self.styleSheet = stylesheet

        self.innerHTML = ""
        self.className = ""
        self.id = ""

        self.offsetLeft = 0
        self.offsetHeight = 0

        self.document = document
        self.ownerDocument = document

    def __str__(self):
        return "".join([str(self.__properties__), str(self.__dict__)])

    def appendChild(self, *args, **kwargs):
        child = args[0]
        child.parentNode = self
        return child

    def getBoundingClientRect(self, *args, **kwargs):
        return {}

    def removeChild(self, *args, **kwargs):
        return None

    # Disabled in the original (kept there as a bare string literal):
    #
    # def parentNode(self, *args, **kwargs):
    #     print ("parentNode")
    #     return js_dom_element(self.document)

    def insertBefore(self, *args, **kwargs):
        node = args[0]
        node.parentNode = self
        return node

    def offsetTop(self, *args, **kwargs):
        return 0

    def getAttribute(self, *args, **kwargs):
        return None

    def ondrop(self, *args, **kwargs):
        return None

    def ondragstart(self, *args, **kwargs):
        return None

    def setAttribute(self, *args, **kwargs):
        return None

    # Disabled in the original (kept there as a bare string literal):
    #
    # def __getitem__(self, key):
    #     print ("__getitem__", key)
    #     return super(js_dom_element, self).__getitem__(key)
    # def __setitem__(self, key, value):
    #     print ("__setitem__", key)
    #     super(js_dom_element, self).__setitem__(key, value)
    #     pass
    # def __delitem__(self, key):
    #     print ("__delitem__", key)
    #     super(js_dom_element, self).__delitem__(key)
    #     pass

    # Attribute hooks: plain pass-through to the parent class.

    def __getattr__(self, name):
        parent = super(js_dom_element, self)
        return parent.__getattr__(name)

    def __setattr__(self, name, value):
        parent = super(js_dom_element, self)
        parent.__setattr__(name, value)

    def __delattr__(self, name):
        parent = super(js_dom_element, self)
        parent.__delattr__(name)

    def attachEvent(self, *args, **kwargs):
        return None

    def getComputedStyle(self, *args, **kwargs):
        return {}

    def getElementsByTagName(self, *args, **kwargs):
        # Always a single fresh stub element, whatever tag was asked for.
        element = js_dom_element(self.document)
        return [element]

    def addEventListener(self, *args, **kwargs):
        return None
class js_window(PyV8.JSClass):
    """Stub ``window`` object given to page scripts.

    Holds fixed ``location`` / ``navigator`` data and a reference to the
    owning document; the browser API methods accept any arguments and do
    nothing.  Attribute access is traced with ``print`` and unknown
    attributes resolve to ``None`` instead of raising.
    """

    def __init__(self, document):
        # Each assignment goes through the tracing __setattr__ below.
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.Event = {}
        self.document = document
        self.navigator = {'userAgent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"'}

    def top(self):
        return self

    def self(self):
        return self

    def WebSocket(self, *args, **kwargs):
        pass

    def ontouchstart(self, *args, **kwargs):
        pass

    def setTimeout(self, *args, **kwargs):
        pass

    def postMessage(self, *args, **kwargs):
        pass

    def pushState(self, *args, **kwargs):
        pass

    def history(self, *args, **kwargs):
        pass

    def __setattr__(self, name, value):
        """Trace the assignment and store it; a failing store is only reported."""
        print("js_window.__setattr__", name, value)
        try:
            super(js_window, self).__setattr__(name, value)
        except AttributeError:
            print("__setattr__,AttributeError")

    def __getattr__(self, name):
        """Return the parent's value for ``name``, or ``None`` when it has none.

        The lookup used to be repeated inside a debug ``print`` placed before
        the ``try``; that call raised ``AttributeError`` for every unknown
        name, so the ``None`` fallback could never be reached.  The value is
        now resolved once, inside the ``try``.
        """
        print("js_window.___getattr__", name)
        try:
            value = super(js_window, self).__getattr__(name)
        except AttributeError:
            return None
        print("js_window.___getattr__", name, value)
        return value

    def __delattr__(self, name):
        print("js_window.__delattr__", name)
        super(js_window, self).__delattr__(name)

    def addEventListener(self, *args, **kwargs):
        pass

    def attachEvent(self, *args, **kwargs):
        pass
class js_event(PyV8.JSClass):
    """Stub event object; carries only an empty ``__proto__`` mapping."""

    def __init__(self):
        empty_proto = {}
        self.__proto__ = empty_proto
class js_document(PyV8.JSClass):
    """Stub ``document`` object for page scripts.

    Owns a ``js_window`` and two ``js_dom_element`` stubs (``body`` and
    ``documentElement``).  Element factories and lookups always hand back a
    fresh ``js_dom_element``; attribute access is traced with ``print`` and
    unknown attributes resolve to ``None``.
    """

    def __init__(self):
        # Assignment order is kept: each one is traced by __setattr__.
        self.window = js_window(self)
        self.body = js_dom_element(self)
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.documentElement = js_dom_element(self)
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36",
            'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"',
        }

    def appendChild(self, *args, **kwargs):
        return None

    def removeChild(self, *args, **kwargs):
        return None

    def getElementById(self, *args, **kwargs):
        print("getElementById", args, kwargs)
        element = js_dom_element(self)
        return element

    def attachEvent(self, *args, **kwargs):
        return None

    def createEvent(self, *args, **kwargs):
        event = js_event()
        return event

    def getElementsByTagName(self, *args, **kwargs):
        element = js_dom_element(self)
        return [element]

    # The four factories below all build a js_dom_element and forward the
    # caller's arguments to its constructor.

    def createTextNode(self, *args, **kwargs):
        node = js_dom_element(self, *args, **kwargs)
        return node

    def createComment(self, *args, **kwargs):
        node = js_dom_element(self, *args, **kwargs)
        return node

    def createDocumentFragment(self, *args, **kwargs):
        node = js_dom_element(self, *args, **kwargs)
        return node

    def createElement(self, *args, **kwargs):
        node = js_dom_element(self, *args, **kwargs)
        return node

    def querySelector(self, *args, **kwargs):
        return None

    def evaluate(self, *args, **kwargs):
        return None

    def observe(self, *args, **kwargs):
        return None

    def __setattr__(self, name, value):
        print("__setattr__", name)
        parent = super(js_document, self)
        parent.__setattr__(name, value)

    def __getattr__(self, name):
        print("js_document.___getattr__", name)
        parent = super(js_document, self)
        try:
            found = parent.__getattr__(name)
        except AttributeError:
            # Unknown attributes read as None rather than raising.
            return None
        return found

    def addEventListener(self, *args, **kwargs):
        return None

    def __delattr__(self, name):
        print("__delattr__", name)
        parent = super(js_document, self)
        parent.__delattr__(name)
class Global(PyV8.JSClass):  # define a compatible javascript class
    """Global object for the JS context: a document, its window and a navigator.

    Names that are not found on this object are looked up on ``self.window``
    before the lookup is reported as failed.
    """

    def __init__(self):
        self.document = js_document()
        self.window = self.document.window
        self.navigator = {'userAgent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"'}
        self.HTMLElement = js_dom_element(self.document)

    def Image(self):
        """Return a fresh stub element bound to this object's document."""
        return js_dom_element(self.document)

    def __setattr__(self, name, value):
        # NOTE(review): super(PyV8.JSClass, self) starts the lookup *after*
        # JSClass in the MRO, so JSClass's own __setattr__ is skipped.  Left
        # as in the original -- confirm whether that is intentional.
        super(PyV8.JSClass, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Fall back to the window for names this object does not define.

        Only called after normal lookup on ``self`` has failed.  Fixes over
        the previous version:

        * the window is queried with ``getattr`` so attributes and methods
          actually stored on it are found (calling ``window.__getattr__``
          directly bypassed normal attribute lookup);
        * the window is queried once instead of twice, and falsy values such
          as ``0`` or ``""`` are returned rather than discarded;
        * ``window`` is read from ``__dict__`` so a lookup made before
          ``__init__`` has set it cannot recurse into this method;
        * the final fallback goes through the parent class, matching the
          other stub classes (``super(PyV8.JSClass, self)`` resolved past
          ``JSClass`` and therefore had no ``__getattr__`` to call).
        """
        window = self.__dict__.get('window')
        if window is not None:
            # A window that answers None (or raises AttributeError) for the
            # name is treated as "not found", as before.
            value = getattr(window, name, None)
            if value is not None:
                return value
        return super(Global, self).__getattr__(name)

    def __delattr__(self, name):
        # NOTE(review): same MRO skip as in __setattr__ above.
        super(PyV8.JSClass, self).__delattr__(name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment