Skip to content

Instantly share code, notes, and snippets.

@rafikahmed
Created May 26, 2020 12:48
Show Gist options
  • Save rafikahmed/70a38deb6ff11c2ad02173db6689d1a0 to your computer and use it in GitHub Desktop.
Save rafikahmed/70a38deb6ff11c2ad02173db6689d1a0 to your computer and use it in GitHub Desktop.
class MySpider(scrapy.Spider):
name = 'example'
current_page = 1
script = '''
function main(splash, args)
splash.private_mode_enabled = false
url = args.url
assert(splash:go(url))
assert(splash:wait(1))
assert(splash:run_js(args.js_source))
assert(splash:wait(1))
return splash:html()
end
'''
def start_requests(self):
# initial request
def parse(self, response):
# parse your data
#pagination
if self.current_page <= 5:
self.current_page += 1
js_source = f"javascript:__doPostBack('ctl00$MainContent$gvReport','Page${self.current_page}')"
yield SplashRequest(url='YOUR_URL', callback=self.parse, endpoint='execute', args={'lua_source': self.script, 'js_source':js_source}, dont_filter=True)
function main(splash, args)
splash.private_mode_enabled = false
url = args.url
assert(splash:go(url))
assert(splash:wait(1))
assert(splash:run_js(args.js_source))
assert(splash:wait(1))
return splash:html()
end
@kazekage92
Copy link

`

This one is the complete spider I want to perform, with parse and parse_qr

-- coding: utf-8 --

import scrapy
from scrapy_splash import SplashRequest

class StockQrSpider(scrapy.Spider):
name = 'stock_qr'
allowed_domains = ['www.malaysiastock.biz']
start_urls = ['https://www.malaysiastock.biz/Listed-Companies.aspx?type=S&s1=9']

current_page = 1

script = '''
    function main(splash, args)
        splash.private_mode_enabled = true
        assert(splash:go(args.url))
        assert(splash:wait(3))
        assert(splash:runjs(args.js_source))
        assert(splash:wait(5))
        return splash:html()
    end
'''

def parse(self, response):
    stocks = response.xpath("//table[contains(@id,'MainContent_tStock')]/tbody/tr")
    for stock in stocks:
        name = stock.xpath(".//td[1]/h3/a/text()").get()
        link = stock.xpath(".//td[1]/h3/a/@href").get()

        yield SplashRequest(url=link, callback=self.parse_qr, endpoint='execute', meta={'stock_name':name}, args={'lua_source': self.script})

def parse_qr(self, response):
    name = response.request.meta['stock_name']
    rows = response.xpath("//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()<last()]")
    for row in rows:
        date = row.xpath(".//td[1]/text()").get()
        year = row.xpath(".//td[2]//text()").get()
        number = row.xpath(".//td[3]//text()").get()
        quarter = row.xpath(".//td[4]//text()").get()
        revenue = row.xpath(".//td[5]//text()").get()
        pbt = row.xpath(".//td[6]//text()").get()
        profit = row.xpath(".//td[7]//text()").get()
        eps = row.xpath(".//td[8]//text()").get()
        dividend = row.xpath(".//td[9]//text()").get()
        nta = row.xpath(".//td[10]//text()").get()

        yield{
            'date': date,
            'financial_year': year,
            'number': number,
            'financial_quarter': quarter,
            'revenue': revenue,
            'profit_before_tax': pbt,
            'net_profit': profit,
            'earnings_per_share': eps,
            'dividend': dividend,
            'net_tangible_asset': nta,
            'User-Agent': response.request.headers['User-Agent']
        }

    last_page = response.xpath("//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()=last()]/td/table/tbody/tr/td[position()=last()]/a/text()") #extract_last_page_number
    
    if self.current_page <= last_page:
        self.current_page += 1
        js_source = f"javascript:__doPostBack('ctl00$MainContent$gvReport','Page${self.current_page}')"
        yield SplashRequest(url='https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=1155', callback=self.parse, endpoint='execute', args={'lua_source': self.script, 'js_source':js_source}, dont_filter=True)

`

@rafikahmed
Copy link
Author

rafikahmed commented May 27, 2020

import scrapy
from scrapy_splash import SplashRequest


class StockQrSpider(scrapy.Spider):

    name = 'stock_qr'
    allowed_domains = ['www.malaysiastock.biz']
    # start_urls = ['https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=1155']

    current_page = 1

    inital_script = '''
        function main(splash, args)
            splash.private_mode_enabled = true
            assert(splash:go(args.url))
            assert(splash:wait(3))
            return splash:html()
        end
    '''

    script = '''
        function main(splash, args)
            splash.private_mode_enabled = true
            assert(splash:go(args.url))
            assert(splash:wait(3))
            assert(splash:runjs(args.js_source))
            assert(splash:wait(5))
            return splash:html()
        end
    '''

    def start_requests(self):
        # Initial request doen't require the js_source script
        yield SplashRequest(url='https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=1155', callback=self.parse_qr, endpoint='execute', args={
            'lua_source': self.inital_script
        })

    def parse_qr(self, response):
        # name = response.request.meta['dividend_name']
        rows = response.xpath(
            "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()<last()]")
        for row in rows:
            date = row.xpath(".//td[1]/text()").get()
            year = row.xpath(".//td[2]//text()").get()
            number = row.xpath(".//td[3]//text()").get()
            quarter = row.xpath(".//td[4]//text()").get()
            revenue = row.xpath(".//td[5]//text()").get()
            pbt = row.xpath(".//td[6]//text()").get()
            profit = row.xpath(".//td[7]//text()").get()
            eps = row.xpath(".//td[8]//text()").get()
            dividend = row.xpath(".//td[9]//text()").get()
            nta = row.xpath(".//td[10]//text()").get()

            yield{
                'date': date,
                'financial_year': year,
                'number': number,
                'financial_quarter': quarter,
                'revenue': revenue,
                'profit_before_tax': pbt,
                'net_profit': profit,
                'earnings_per_share': eps,
                'dividend': dividend,
                'net_tangible_asset': nta,
                'User-Agent': response.request.headers['User-Agent']
            }

        # extract_last_page_number
        last_page = response.xpath(
            "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()=last()]/td/table/tbody/tr/td[position()=last()]/a/text()").get() #you forgot to use get() here
        

        #last_page should be converted to an integer.
        if last_page:
            if self.current_page <= int(last_page):
                self.current_page += 1
                js_source = f"javascript:__doPostBack('ctl00$MainContent$gvReport','Page${self.current_page}')"
                # here you used the parse method which doesn't exist (it should be parse_qr)
                yield SplashRequest(url='https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=1155', callback=self.parse_qr, endpoint='execute', args={'lua_source': self.script, 'js_source': js_source}, dont_filter=True)

@kazekage92
Copy link

Trying to loop through multiple stock but they get mixed together

`
import scrapy
from scrapy_splash import SplashRequest

class StockQrSpider(scrapy.Spider):

name = 'stock_qr'
allowed_domains = ['www.malaysiastock.biz']
# start_urls = ['https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=1155']
# stock_code = ['1155','6012']

current_page = 1
stock_url = ""

inital_script = '''
    function main(splash, args)
        splash.private_mode_enabled = true
        assert(splash:go(args.url))
        assert(splash:wait(3))
        return splash:html()
    end
'''

script = '''
    function main(splash, args)
        splash.private_mode_enabled = true
        assert(splash:go(args.url))
        assert(splash:wait(3))
        assert(splash:runjs(args.js_source))
        assert(splash:wait(5))
        return splash:html()
    end
'''

def __init__(self):
    self.stock_code = ['1155','6888','7029']

def start_requests(self):
    # Initial request doen't require the js_source script
    for i in self.stock_code:
        StockQrSpider.stock_url = 'https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode='+ i
        yield SplashRequest(url=self.stock_url , callback=self.parse_qr, endpoint='execute', args={
            'lua_source': self.inital_script
        })

def parse_qr(self, response):
    # name = response.request.meta['dividend_name']
    
    # for i in self.stock_code:
    #     stock_url = 'https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode='+ i



    rows = response.xpath(
        "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()<last()]")
    for row in rows:
        stockname = response.xpath("//div[contains(@id, 'MainContent_Panel')][1]/div/h1/label/text()").get()
        date = row.xpath(".//td[1]/text()").get()
        year = row.xpath(".//td[2]//text()").get()
        number = row.xpath(".//td[3]//text()").get()
        quarter = row.xpath(".//td[4]//text()").get()
        revenue = row.xpath(".//td[5]//text()").get()
        pbt = row.xpath(".//td[6]//text()").get()
        profit = row.xpath(".//td[7]//text()").get()
        eps = row.xpath(".//td[8]//text()").get()
        dividend = row.xpath(".//td[9]//text()").get()
        nta = row.xpath(".//td[10]//text()").get()

        yield{
            'stock_name': stockname,
            'date': date,
            'financial_year': year,
            'number': number,
            'financial_quarter': quarter,
            'revenue': revenue,
            'profit_before_tax': pbt,
            'net_profit': profit,
            'earnings_per_share': eps,
            'dividend': dividend,
            'net_tangible_asset': nta,
            'User-Agent': response.request.headers['User-Agent']
        }

# extract_last_page_number
    last_page = response.xpath(
        "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()=last()]/td/table/tbody/tr/td[position()=last()]/a/text()").get() #you forgot to use get() here
    

    #last_page should be converted to an integer.
    if last_page:
        if self.current_page <= int(last_page):
            self.current_page += 1
            js_source = f"javascript:__doPostBack('ctl00$MainContent$gvReport','Page${self.current_page}')"
            # here you used the parse method which doesn't exist (it should be parse_qr)
            print(self.stock_url)
            yield SplashRequest(url= self.stock_url , callback=self.parse_qr, endpoint='execute', args={'lua_source': self.script, 'js_source': js_source}, dont_filter=True)

`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment