Leverage multiple Crawlera sessions using a Lua script
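The Splash script below routes page requests through the Crawlera proxy and spreads them across up to 100 sessions, the limit of the C10 package. Session ids returned by Crawlera in the X-Crawlera-Session response header are collected, and one of them is reused at random on each subsequent request; requests for ad and tracking domains are blocked, and static subresources bypass the proxy entirely.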
function get_session_id(session_table)
    -- Pick a random session id from the ones collected so far.
    -- The Crawlera C10 package supports only 100 sessions, so the table
    -- is capped at 100 entries (see add_session_id below).
    if #session_table > 0 then
        return session_table[math.random(1, #session_table)]
    else
        -- The special value "create" asks Crawlera to start a new session.
        return "create"
    end
end
function add_session_id(session_table, session_id)
    -- Remember a session id for reuse, skipping duplicates.
    for _, id in ipairs(session_table) do
        if id == session_id then return end
    end
    if #session_table < 100 then
        session_table[#session_table + 1] = session_id
    end
end
function use_crawlera(splash)
    -- Put your Crawlera username and password here. These are different
    -- from your Scrapinghub account credentials. Find your Crawlera
    -- username and password at https://app.scrapinghub.com
    local user = "API_KEY"
    local password = ""
    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = "X-Crawlera-Session"
    local session_table = {}
    -- Seed the table so that the first requests create new sessions.
    add_session_id(session_table, "create")
    splash:on_request(function (request)
        -- Requests to Google domains are not allowed by Crawlera, but pages
        -- frequently include tracking code or ads served by Google. Block
        -- those requests.
        --
        -- Note that Lua patterns follow a different syntax from normal
        -- regular expressions.
        if string.find(request.url, 'google%.[a-z][a-z][a-z]?') or
                string.find(request.url, 'google') or
                string.find(request.url, 'doubleclick%.net') or
                string.find(request.url, 'googleapis%.com') or
                string.find(request.url, 'facebook%.[a-z][a-z][a-z]?') or
                string.find(request.url, 'twitter%.[a-z][a-z][a-z]?') or
                string.find(request.url, 'syndication%.twitter%.com') then
            request.abort()
            return
        end
        -- If possible, avoid using Crawlera for subresource requests that
        -- are not monitored. This will increase speed a lot. Here are some
        -- example rules that you can use to match subresources:

        -- Don't use Crawlera for domains starting with static.
        if string.find(request.url, '://static%.') ~= nil then
            return
        end

        -- Don't use Crawlera for urls ending in .png, .jpg or .jpeg
        if (string.find(request.url, '%.png$') ~= nil) or
                (string.find(request.url, '%.jpg$') ~= nil) or
                (string.find(request.url, '%.jpeg$') ~= nil) then
            return
        end
        request:set_header("X-Crawlera-UA", "desktop")
        request:set_header('X-Crawlera-Cookies', 'disable')
        local session_id = get_session_id(session_table)
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=password}
    end)
    splash:on_response_headers(function (response)
        -- Crawlera reports the session id it assigned in the response
        -- headers; remember it for reuse by later requests.
        local session_id = response.headers[session_header]
        if session_id ~= nil then
            add_session_id(session_table, session_id)
        end
    end)
end
function main(splash)
    use_crawlera(splash)
    splash:go(splash.args.url)
    return splash:html()
end
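To run the script, submit it to a Splash instance through the /execute HTTP endpoint; main() is the entry point, and the url argument is exposed to it as splash.args.url. Below is a minimal sketch in Python, assuming Splash listens on localhost:8050 and the script is saved as crawlera_sessions.lua (both the address and the file name are illustrative):

import requests

# Read the Lua script above; the file name is hypothetical.
with open('crawlera_sessions.lua') as f:
    lua_source = f.read()

resp = requests.post(
    'http://localhost:8050/execute',  # assumes a local Splash instance
    json={
        'lua_source': lua_source,      # script to run; main() is called
        'url': 'http://example.com/',  # available as splash.args.url
    },
)
resp.raise_for_status()
# main() returns splash:html(), so the response body is the rendered page.
print(resp.text[:200])

The same script can also be used from a Scrapy spider with scrapy-splash by passing it as the lua_source argument of a SplashRequest against the execute endpoint.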