Leverage multiple Crawlera sessions using a Lua script
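The Splash script below routes page requests through the Crawlera proxy and spreads them across up to 100 sessions, the limit of the C10 package. Session ids returned by Crawlera in the X-Crawlera-Session response header are collected, and one of them is reused at random on each subsequent request; requests for ad and tracking domains are blocked, and static subresources bypass the proxy entirely.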
function get_session_id(session_table)
    -- Pick a random session id from the ones collected so far.
    -- The Crawlera C10 package supports only 100 sessions, so the table
    -- is capped at 100 entries (see add_session_id below).
    if #session_table > 0 then
        return session_table[math.random(1, #session_table)]
    else
        -- The special value "create" asks Crawlera to start a new session.
        return "create"
    end
end
function add_session_id(session_table, session_id)
    -- Remember a session id for reuse, skipping duplicates.
    for _, id in ipairs(session_table) do
        if id == session_id then return end
    end
    if #session_table < 100 then
        session_table[#session_table + 1] = session_id
    end
end
function use_crawlera(splash)
    -- Put your Crawlera username and password here. These are different
    -- from your Scrapinghub account credentials. Find your Crawlera
    -- username and password at https://app.scrapinghub.com
    local user = "API_KEY"
    local password = ""
    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = "X-Crawlera-Session"
    local session_table = {}
    -- Seed the table so that the first requests create new sessions.
    add_session_id(session_table, "create")
    splash:on_request(function (request)
        -- Requests to Google domains are not allowed by Crawlera, but pages
        -- frequently include tracking code or ads served by Google. Block
        -- those requests.
        --
        -- Note that Lua patterns follow a different syntax from normal
        -- regular expressions.
        if string.find(request.url, 'google%.[a-z][a-z][a-z]?') or
                string.find(request.url, 'google') or
                string.find(request.url, 'doubleclick%.net') or
                string.find(request.url, 'googleapis%.com') or
                string.find(request.url, 'facebook%.[a-z][a-z][a-z]?') or
                string.find(request.url, 'twitter%.[a-z][a-z][a-z]?') or
                string.find(request.url, 'syndication%.twitter%.com') then
            request.abort()
            return
        end
        -- If possible, avoid using Crawlera for subresource requests that
        -- are not monitored. This will increase speed a lot. Here are some
        -- example rules that you can use to match subresources:

        -- Don't use Crawlera for domains starting with static.
        if string.find(request.url, '://static%.') ~= nil then
            return
        end

        -- Don't use Crawlera for urls ending in .png, .jpg or .jpeg
        if (string.find(request.url, '%.png$') ~= nil) or
                (string.find(request.url, '%.jpg$') ~= nil) or
                (string.find(request.url, '%.jpeg$') ~= nil) then
            return
        end
        request:set_header("X-Crawlera-UA", "desktop")
        request:set_header('X-Crawlera-Cookies', 'disable')
        local session_id = get_session_id(session_table)
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=password}
    end)
    splash:on_response_headers(function (response)
        -- Crawlera reports the session id it assigned in the response
        -- headers; remember it for reuse by later requests.
        local session_id = response.headers[session_header]
        if session_id ~= nil then
            add_session_id(session_table, session_id)
        end
    end)
end
function main(splash)
    use_crawlera(splash)
    splash:go(splash.args.url)
    return splash:html()
end
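To run the script, submit it to a Splash instance through the /execute HTTP endpoint; main() is the entry point, and the url argument is exposed to it as splash.args.url. Below is a minimal sketch in Python, assuming Splash listens on localhost:8050 and the script is saved as crawlera_sessions.lua (both the address and the file name are illustrative):

import requests

# Read the Lua script above; the file name is hypothetical.
with open('crawlera_sessions.lua') as f:
    lua_source = f.read()

resp = requests.post(
    'http://localhost:8050/execute',  # assumes a local Splash instance
    json={
        'lua_source': lua_source,      # script to run; main() is called
        'url': 'http://example.com/',  # available as splash.args.url
    },
)
resp.raise_for_status()
# main() returns splash:html(), so the response body is the rendered page.
print(resp.text[:200])

The same script can also be used from a Scrapy spider with scrapy-splash by passing it as the lua_source argument of a SplashRequest against the execute endpoint.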