# Created November 13, 2016 23:31
# HTTP Crawler
# HTTP crawler implemented inside nginx (requires the Lua module).
# Start crawling with: curl "http://localhost:18080/?host=<START URL>"
# Let nginx choose the number of worker processes automatically.
worker_processes auto;

events {
    # Maximum number of simultaneous connections per worker process.
    worker_connections 16384;
}
http {
    include mime.types;
    default_type application/octet-stream;
    sendfile on;
    keepalive_timeout 65;

    # Shared-memory dictionary accessed from the Lua snippets as
    # ngx.shared.queue; it holds the URLs discovered while crawling.
    lua_shared_dict queue 1M;

    # proxy_pass is given a variable ($arg_host), so upstream host names are
    # resolved at request time and a resolver must be configured. The original
    # directive had no address, which nginx rejects at startup.
    # NOTE(review): 8.8.8.8 is a stand-in; point this at your own DNS server.
    resolver 8.8.8.8;
server {
    listen 18080;
    server_name localhost;
    access_log off;

    # Split the ?host=<url> argument into the target host ($crawlhost) and the
    # path below it ($crawlfile). Both http and https URLs are accepted, and
    # host names may contain letters, digits, "_", "." and "-".
    if ($arg_host ~* '^https?://([\w.-]+)/(.*)$') {
        set $crawlfile $2;
        set $crawlhost $1;
    }

    # A path ending in "/" names a directory; store it as <dir>/index.html.
    if ($crawlfile ~* '^(.+)/$') {
        set $crawlfile "$1/index.html";
    }

    # A root URL such as http://host/ produces an empty path, which the rule
    # above does not match; without this the store path would be a directory.
    if ($crawlfile = '') {
        set $crawlfile "index.html";
    }
# Fetches the URL given in ?host=<url>, stores the response on disk, queues
# the same-host links found in it and then triggers the next crawl step.
location / {
    proxy_pass $arg_host;
    proxy_set_header Host $crawlhost;

    header_filter_by_lua '
        -- Only HTML and plain-text responses are kept; anything else is
        -- answered with 404.
        local allowed_types = {
            ["text/html"] = true,
            ["text/plain"] = true,
        }

        -- The header may be absent, or a table when it was sent twice.
        local header = ngx.resp.get_headers()["Content-Type"]
        if type(header) == "table" then
            header = header[1]
        end

        local ctype
        if header then
            -- Drop parameters such as "; charset=utf-8". The extra
            -- parentheses keep only the first result of gsub.
            ctype = string.lower((string.gsub(header, "%s*;.*", "")))
        end

        if not ctype or not allowed_types[ctype] then
            return ngx.exit(ngx.HTTP_NOT_FOUND)
        end
    ';

    body_filter_by_lua '
        -- Runs once per response body chunk. A link that is split across two
        -- chunks is not detected.
        local chunk = ngx.arg[1]
        local host = ngx.var.crawlhost
        if not chunk or chunk == "" or not host or host == "" then
            return
        end

        local queue = ngx.shared.queue
        local prefix = "http://" .. string.lower(host)
        local size = string.len(prefix)

        -- The tag and attribute names are matched case-insensitively while
        -- the URL keeps its original case, because URL paths are
        -- case-sensitive.
        for link in string.gmatch(chunk, [[[Aa] [Hh][Rr][Ee][Ff]="([^"]*)"]]) do
            -- Turn host-relative links into absolute ones.
            if string.sub(link, 1, 1) == "/" then
                link = prefix .. link
            end
            -- Queue only links that stay on the host being crawled.
            if string.lower(string.sub(link, 1, size)) == prefix then
                local ok, err = queue:set(link, "1")
                if not ok then
                    ngx.log(ngx.ERR, "failed to queue ", link, ": ", err)
                end
            end
        end
    ';

    # Save the fetched response under /tmp/store/<host>/<path>.
    proxy_store /tmp/store/$crawlhost/$crawlfile;

    # After this request completes, run @queue to start the next fetch.
    post_action @queue;
}
# Internal hop used by @queue: forwards "/proxy/?host=<url>" to this server
# again so that the request is processed by "location /".
location /proxy {
    # NOTE(review): the upstream URL was missing from the original line
    # ("proxy_pass$args ;"). It is reconstructed here as this server's own
    # listener (port 18080), the only place that reads ?host=. Confirm.
    proxy_pass http://127.0.0.1:18080/?$args;
}
# Post-action target of "location /": takes one URL from the shared queue and
# requests it through /proxy.
location @queue {
    content_by_lua '
        local keys = ngx.shared.queue:get_keys(1)
        local url = keys[1]
        -- Nothing queued: stop instead of failing on a nil concatenation.
        if not url then
            return
        end

        -- NOTE(review): the key is never removed from the dictionary, so the
        -- same URL can be returned again on a later call. Confirm the
        -- intended dequeue and visited-set behaviour.
        local res = ngx.location.capture("/proxy/?host=" .. url)
        if res.status >= 400 then
            ngx.log(ngx.WARN, "crawl of ", url, " returned status ", res.status)
        end
    ';
}
}  # end of server
}  # end of http
