Created May 26, 2020 12:48
import scrapy
from scrapy_splash import SplashRequest


class MySpider(scrapy.Spider):
    name = 'example'
    current_page = 1

    # Lua script for Splash's 'execute' endpoint: load the page, run the JavaScript
    # passed in as js_source, wait for the postback to render, then return the HTML.
    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(1))
        assert(splash:runjs(args.js_source))
        assert(splash:wait(1))
        return splash:html()
    end
    '''

    def start_requests(self):
        # initial request (no JavaScript is needed to load the first page)
        yield SplashRequest(url='YOUR_URL', callback=self.parse, endpoint='execute',
                            args={'lua_source': self.script, 'js_source': 'void(0);'})

    def parse(self, response):
        # parse your data

        # pagination: trigger the ASP.NET __doPostBack for the next page
        if self.current_page <= 5:
            self.current_page += 1
            js_source = f"javascript:__doPostBack('ctl00$MainContent$gvReport','Page${self.current_page}')"
            yield SplashRequest(url='YOUR_URL', callback=self.parse, endpoint='execute',
                                args={'lua_source': self.script, 'js_source': js_source},
                                dont_filter=True)
function main(splash, args)
    splash.private_mode_enabled = false
    url = args.url
    assert(splash:go(url))
    assert(splash:wait(1))
    assert(splash:runjs(args.js_source))
    assert(splash:wait(1))
    return splash:html()
end
rafikahmed (Author) commented on May 27, 2020
Trying to loop through multiple stocks, but the scraped results get mixed together.
```python
import scrapy
from scrapy_splash import SplashRequest


class StockQrSpider(scrapy.Spider):
    name = 'stock_qr'
    allowed_domains = ['www.malaysiastock.biz']
    # start_urls = ['https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=1155']
    # stock_code = ['1155', '6012']
    current_page = 1
    # NOTE: stock_url is a single class attribute shared by every request; by the time
    # responses arrive it only holds the URL of the last stock in the loop.
    stock_url = ""

    inital_script = '''
    function main(splash, args)
        splash.private_mode_enabled = true
        assert(splash:go(args.url))
        assert(splash:wait(3))
        return splash:html()
    end
    '''

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = true
        assert(splash:go(args.url))
        assert(splash:wait(3))
        assert(splash:runjs(args.js_source))
        assert(splash:wait(5))
        return splash:html()
    end
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stock_code = ['1155', '6888', '7029']

    def start_requests(self):
        # The initial request doesn't require the js_source script
        for i in self.stock_code:
            StockQrSpider.stock_url = 'https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=' + i
            yield SplashRequest(url=self.stock_url, callback=self.parse_qr, endpoint='execute', args={
                'lua_source': self.inital_script
            })

    def parse_qr(self, response):
        # name = response.request.meta['dividend_name']
        # for i in self.stock_code:
        #     stock_url = 'https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=' + i
        rows = response.xpath(
            "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()<last()]")
        for row in rows:
            stockname = response.xpath("//div[contains(@id, 'MainContent_Panel')][1]/div/h1/label/text()").get()
            date = row.xpath(".//td[1]/text()").get()
            year = row.xpath(".//td[2]//text()").get()
            number = row.xpath(".//td[3]//text()").get()
            quarter = row.xpath(".//td[4]//text()").get()
            revenue = row.xpath(".//td[5]//text()").get()
            pbt = row.xpath(".//td[6]//text()").get()
            profit = row.xpath(".//td[7]//text()").get()
            eps = row.xpath(".//td[8]//text()").get()
            dividend = row.xpath(".//td[9]//text()").get()
            nta = row.xpath(".//td[10]//text()").get()
            yield {
                'stock_name': stockname,
                'date': date,
                'financial_year': year,
                'number': number,
                'financial_quarter': quarter,
                'revenue': revenue,
                'profit_before_tax': pbt,
                'net_profit': profit,
                'earnings_per_share': eps,
                'dividend': dividend,
                'net_tangible_asset': nta,
                'User-Agent': response.request.headers['User-Agent']
            }

        # extract the last page number
        last_page = response.xpath(
            "//table[contains(@id, 'MainContent_gvReport')]/tbody/tr[position()=last()]/td/table/tbody/tr/td[position()=last()]/a/text()").get()  # you forgot to use .get() here
        # last_page should be converted to an integer
        if last_page:
            if self.current_page <= int(last_page):
                self.current_page += 1
                js_source = f"javascript:__doPostBack('ctl00$MainContent$gvReport','Page${self.current_page}')"
                # here you used the parse method, which doesn't exist (it should be parse_qr)
                print(self.stock_url)
                yield SplashRequest(url=self.stock_url, callback=self.parse_qr, endpoint='execute',
                                    args={'lua_source': self.script, 'js_source': js_source},
                                    dont_filter=True)
```
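One way to stop the results from getting mixed together is to carry the per-stock state with each request instead of storing it in shared class attributes such as `stock_url` and `current_page`: Scrapy schedules requests concurrently, so every in-flight request sees whatever value was written last. The sketch below (not part of the original gist) passes the stock URL and page counter through the request's `meta` dict; the spider name and the trimmed Lua script are placeholders for illustration, and the row parsing from the code above is elided.

```python
import scrapy
from scrapy_splash import SplashRequest


class StockQrStateSketch(scrapy.Spider):
    # hypothetical spider name, for illustration only
    name = 'stock_qr_state_sketch'
    allowed_domains = ['www.malaysiastock.biz']
    stock_codes = ['1155', '6888', '7029']

    # trimmed version of the Lua script above: runs js_source only when one is given
    script = '''
    function main(splash, args)
        splash.private_mode_enabled = true
        assert(splash:go(args.url))
        assert(splash:wait(3))
        if args.js_source ~= '' then
            assert(splash:runjs(args.js_source))
            assert(splash:wait(5))
        end
        return splash:html()
    end
    '''

    def start_requests(self):
        for code in self.stock_codes:
            url = 'https://www.malaysiastock.biz/Corporate-Infomation.aspx?securityCode=' + code
            # the stock URL and page counter travel with the request itself
            yield SplashRequest(url=url, callback=self.parse_qr, endpoint='execute',
                                args={'lua_source': self.script, 'js_source': ''},
                                meta={'stock_url': url, 'page': 1})

    def parse_qr(self, response):
        stock_url = response.meta['stock_url']
        page = response.meta['page']

        # ... parse the rows exactly as in the code above ...

        # paginate this stock only, using its own URL and its own page counter
        if page < 5:  # or compare against the extracted last_page number
            next_page = page + 1
            js_source = f"javascript:__doPostBack('ctl00$MainContent$gvReport','Page${next_page}')"
            yield SplashRequest(url=stock_url, callback=self.parse_qr, endpoint='execute',
                                args={'lua_source': self.script, 'js_source': js_source},
                                meta={'stock_url': stock_url, 'page': next_page},
                                dont_filter=True)
```

Request `meta` is used here because it is threaded through to the callback via `response.meta`; on recent Scrapy versions `cb_kwargs` would serve the same purpose.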