Created
November 13, 2016 23:31
-
-
Save toxatoor/58febad8add868d0a664c8887732e045 to your computer and use it in GitHub Desktop.
HTTP Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HTTP crawler implemented inside nginx (uses the *_by_lua directives of lua-nginx-module).
#
# Start crawling with:  curl 'http://127.0.0.1:18080/?host=<START URL>'
#
# Let nginx pick the number of worker processes automatically.
worker_processes auto;

# Per-worker connection limit.
events {
    worker_connections 16384;
}
http {
    include           mime.types;
    default_type      application/octet-stream;
    sendfile          on;
    keepalive_timeout 65;

    # Pending URLs. The key is the URL, the value is unused.
    lua_shared_dict queue 1M;
    # URLs that were already queued or requested. Without it, pages that link
    # to each other would be re-queued forever.
    lua_shared_dict visited 10M;

    # proxy_pass is given a variable target, so names are resolved at run time.
    resolver 8.8.8.8;

    server {
        # NOTE(review): this listens on every interface and proxies to any
        # URL passed in ?host=, which makes it an open proxy. Consider
        # "listen 127.0.0.1:18080;" if only local use is intended.
        listen      18080;
        server_name localhost;
        access_log  off;

        # Split ?host=<scheme>://<host[:port]>/<path> into its parts.
        # Defaults keep the variables defined when ?host= is missing or malformed.
        set $crawlscheme "http";
        set $crawlhost   "";
        set $crawlfile   "";
        if ($arg_host ~* '^(https?)://([\w.-]+(?::\d+)?)/(.*)$') {
            set $crawlscheme $1;
            set $crawlhost   $2;
            set $crawlfile   $3;
        }
        # Directory-like URLs are stored as <dir>/index.html ...
        if ($crawlfile ~* '^(.+)/$') { set $crawlfile "$1/index.html"; }
        # ... and so is the site root, whose path part is empty.
        if ($crawlfile = "") { set $crawlfile "index.html"; }

        # Fetch the requested page, store it on disk and collect its links.
        location / {
            proxy_pass       $arg_host;
            proxy_set_header Host $crawlhost;

            # Reject anything that is not a textual document.
            header_filter_by_lua '
                local allowed_types = {
                    ["text/html"]  = true,
                    ["text/plain"] = true,
                }

                -- Remember the requested URL so that links back to it are not queued.
                local url = ngx.var.arg_host
                if url and url ~= "" then
                    ngx.shared.visited:add(url, true)
                end

                -- The header may be absent, or a table when it was sent twice.
                local ctype = ngx.resp.get_headers()["Content-Type"]
                if type(ctype) == "table" then
                    ctype = ctype[1]
                end
                if type(ctype) ~= "string" then
                    return ngx.exit(ngx.HTTP_NOT_FOUND)
                end

                -- Drop parameters such as "; charset=utf-8" and normalise case.
                ctype = string.lower(string.match(ctype, "^%s*([^;%s]*)") or "")
                if not allowed_types[ctype] then
                    return ngx.exit(ngx.HTTP_NOT_FOUND)
                end
            ';

            # Scan every body chunk for <a href="..."> links on the same host
            # and put the new ones into the queue. The body itself is not modified.
            body_filter_by_lua '
                local host  = ngx.var.crawlhost
                local chunk = ngx.arg[1]
                if not host or host == "" or not chunk or chunk == "" then
                    return
                end

                local queue   = ngx.shared.queue
                local visited = ngx.shared.visited
                local scheme  = ngx.var.crawlscheme or "http"
                local prefix  = string.lower(scheme .. "://" .. host)
                local size    = string.len(prefix)

                -- A tag may be split across two chunks: prepend the unfinished
                -- tag kept from the previous chunk and hold back the one that
                -- is still open at the end of this chunk (bounded to 4 KB).
                local text = (ngx.ctx.carry or "") .. chunk
                ngx.ctx.carry = nil
                if not ngx.arg[2] then
                    local open = string.match(text, "^.*()<")
                    if open and string.len(text) - open < 4096
                            and not string.find(text, ">", open, true) then
                        ngx.ctx.carry = string.sub(text, open)
                        text = string.sub(text, 1, open - 1)
                    end
                end

                -- Tag and attribute names are matched case-insensitively;
                -- the URL itself keeps its original case.
                for link in string.gmatch(text, \'[Aa]%s+[Hh][Rr][Ee][Ff]%s*=%s*"([^"]*)"\') do
                    -- The fragment is never sent to the server.
                    link = (string.gsub(link, "#.*", ""))

                    -- Site-relative link (but not a protocol-relative "//host/..." one).
                    if string.sub(link, 1, 1) == "/" and string.sub(link, 2, 2) ~= "/" then
                        link = prefix .. link
                    end

                    -- Keep links on the crawled host only. The "/" after the host
                    -- stops "host.example.org" from matching the prefix "host".
                    local same_host = string.lower(string.sub(link, 1, size + 1)) == prefix .. "/"
                    if same_host and not string.find(link, "%s") then
                        -- add() succeeds only for URLs that were never seen before.
                        if visited:add(link, true) then
                            queue:set(link, "1")
                        end
                    end
                end
            ';

            # NOTE(review): the path comes from the untrusted ?host= argument and
            # may contain "../" segments - confirm this cannot escape /tmp/store.
            proxy_store /tmp/store/$crawlhost/$crawlfile;

            # After the response has been sent, fetch the next queued URL.
            post_action @queue;
        }

        # Loop back into "location /" with the same query string.
        location /proxy {
            proxy_pass http://127.0.0.1:18080/?$args;
        }

        # Take one URL off the queue and request it through /proxy.
        location @queue {
            content_by_lua '
                local queue = ngx.shared.queue
                local url = queue:get_keys(1)[1]
                if not url then
                    -- The queue is empty: the crawl is finished.
                    return
                end
                queue:delete(url)
                -- NOTE(review): the URL is passed unescaped, so a link that contains
                -- "&" is cut short when it is read back through $arg_host.
                ngx.location.capture("/proxy/?host=" .. url)
            ';
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment