Skip to content

Instantly share code, notes, and snippets.

@pradeepbn
Created July 31, 2016 03:16
Show Gist options
  • Save pradeepbn/ad09c9b72fbb80f4a63d9239eba74c28 to your computer and use it in GitHub Desktop.
Leverage multiple Crawlera sessions using a Lua (Splash) script
--- Pick a session id at random from the session pool.
-- The pool is capped at 100 entries by add_session_id, because the
-- Crawlera C10 package supports only 100 sessions.
-- @param session_table array of session id strings (may be empty)
-- @return a randomly chosen session id, or nil when the pool is empty
function get_session_id(session_table)
    local count = #session_table
    if count == 0 then
        -- Empty pool: nothing to hand out yet.
        return nil
    end
    -- BUG FIX: the original rolled math.random(1, 100) regardless of how
    -- many sessions actually existed, so any index past #session_table
    -- yielded nil; and its else-branch returned session_table[0], which is
    -- always nil because Lua arrays are 1-based. Roll only over the
    -- entries that exist.
    return session_table[math.random(1, count)]
end
--- Append a Crawlera session id to the pool, respecting the 100-session cap.
-- Ids offered once the pool is full are silently dropped (the Crawlera C10
-- package supports only 100 sessions).
-- @param session_table array of session id strings (mutated in place)
-- @param session_id session id string to register
function add_session_id(session_table, session_id)
    local size = #session_table
    if size >= 100 then
        return -- pool is full; drop the id
    end
    session_table[size + 1] = session_id
end
--- Configure a Splash instance to route page requests through Crawlera,
-- rotating among up to 100 Crawlera sessions harvested from response
-- headers, and skipping the proxy for blocked or static subresources.
-- @param splash the Splash scripting object for the current request
function use_crawlera(splash)
    -- Put your Crawlera username and password here. This is different from your
    -- Scrapinghub account. Find your Crawlera username and password in
    -- https://app.scrapinghub.com
    local user = "API_KEY"
    local password = ""
    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = "X-Crawlera-Session"
    -- Pool of session ids collected from response headers; seeded with the
    -- special "create" value so the first request asks Crawlera to open a
    -- new session.
    local session_table = {}
    add_session_id(session_table, "create")

    splash:on_request(function (request)
        -- Requests to Google domains are not allowed by Crawlera, but pages
        -- frequently include tracking code or ads served by google. Block
        -- those requests.
        --
        -- lua patterns follow a different syntax to normal regular expressions
        if string.find(request.url, 'google%.[a-z][a-z][a-z]?') or
           string.find(request.url, 'google') or
           string.find(request.url, 'doubleclick%.net') or
           string.find(request.url, 'googleapis%.com') or
           string.find(request.url, 'facebook%.[a-z][a-z][a-z]?') or
           string.find(request.url, 'twitter%.[a-z][a-z][a-z]?') or
           string.find(request.url, 'syndication.twitter.com?') then
            request.abort()
            return
        end
        -- If possible, avoid using Crawlera for subresource requests that
        -- are not monitored. This will increase speed a lot. Here are some
        -- example rules that you can use to match subresources:
        -- Don't use Crawlera for domains starting with static.
        if string.find(request.url, '://static%.') ~= nil then
            return
        end
        -- Don't use Crawlera for urls ending in .png, .jpg, .jpeg
        if (string.find(request.url, '%.png$') ~= nil) or
           (string.find(request.url, '%.jpg$') ~= nil) or
           (string.find(request.url, '%.jpeg$') ~= nil) then
            return
        end
        request:set_header("X-Crawlera-UA", "desktop")
        request:set_header('X-Crawlera-Cookies', 'disable')
        -- BUG FIX: session_id was assigned without `local`, leaking an
        -- accidental global; keep it scoped to this callback.
        local session_id = get_session_id(session_table)
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=password}
    end)

    splash:on_response_headers(function (response)
        -- BUG FIX: the original tested `type(...) ~= nil`, which is always
        -- true because type() always returns a string; compare the header
        -- value itself against nil. Also scope session_id with `local`
        -- (it was another accidental global).
        local session_id = response.headers[session_header]
        if session_id ~= nil then
            add_session_id(session_table, session_id)
        end
    end)
end
--- Splash entry point: route traffic through Crawlera, render the page
-- given in splash.args.url, and return its HTML.
-- @param splash the Splash scripting object for the current request
-- @return rendered HTML of the loaded page
function main(splash)
    use_crawlera(splash)
    local target_url = splash.args.url
    splash:go(target_url)
    return splash:html()
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment