Last active
December 16, 2019 09:27
-
-
Save libo1106/0eab50b12bc111800708 to your computer and use it in GitHub Desktop.
PhantomJS spider.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ExpressJS integration: serve a PhantomJS pre-rendered snapshot of the
// requested page (intended for crawlers that do not execute JavaScript).
var express = require('express');
var app = express();

// Node.js child process module, used to run the PhantomJS script.
var child_process = require('child_process');

app.get('/', function(req, res){

    // Full URL of the incoming request. Note that req.hostname carries no port.
    var url = req.protocol + '://'+ req.hostname + req.originalUrl;

    // Accumulates the pre-rendered page markup read from the child's stdout.
    var content = '';

    // Start a phantomjs child process. Arguments are passed as an array,
    // so the URL is never interpreted by a shell.
    var phantom = child_process.spawn('phantomjs', ['spider.js', url]);

    // Decode stdout as UTF-8 text.
    phantom.stdout.setEncoding('utf8');

    // Collect everything phantomjs writes to stdout.
    phantom.stdout.on('data', function(data){
        content += data.toString();
    });

    // The child could not be spawned (e.g. phantomjs is not installed).
    // Without this listener the 'error' event would be thrown and crash the server.
    phantom.on('error', function(err){
        console.log('加载失败', err);
        if (!res.headersSent){
            res.send('加载失败');
        }
    });

    // Use 'close' rather than 'exit': 'close' fires only after the stdio streams
    // have ended, so `content` is guaranteed to be complete at this point.
    phantom.on('close', function(code){

        // A spawn failure has already been answered by the 'error' handler.
        if (res.headersSent){
            return;
        }

        switch (code){
            case 1:
                // spider.js could not open the page.
                console.log('加载失败');
                res.send('加载失败');
                break;
            case 2:
                // spider.js hit its maximum wait; send whatever was rendered so far.
                console.log('加载超时: '+ url);
                res.send(content);
                break;
            default:
                res.send(content);
                break;
        }
    });

});

// Listen on the port the nginx upstream `spider_server` points at (localhost:3000).
app.listen(3000);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define an nginx upstream named spider_server (the pre-render service).
upstream spider_server {
    server localhost:3000;
}

# Scope of the rule; "/" matches every request.
location / {
    proxy_set_header Host $host:$proxy_port;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

    # When the User-Agent contains "Baiduspider" (case-insensitive match),
    # reverse-proxy the request to spider_server.
    if ($http_user_agent ~* "Baiduspider") {
        proxy_pass http://spider_server;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*global phantom*/

"use strict";

// PhantomJS script: open the URL given on the command line, wait for the
// page's resources to settle, then print the rendered HTML to stdout.
// Exit codes: 1 = page failed to open, 2 = maximum wait reached,
// otherwise the default exit status (capture is called without a code).

// Quiet period after the last outstanding resource finishes, so that a
// finished resource gets a chance to trigger further requests.
var resourceWait = 500;
var resourceWaitTimer;

// Maximum total wait time.
var maxWait = 5000;
var maxWaitTimer;

// Number of resources currently in flight.
var resourceCount = 0;

// PhantomJS WebPage module.
var page = require('webpage').create();

// PhantomJS system module (gives access to CLI arguments).
var system = require('system');

// Target URL: the first argument after the script name.
var url = system.args[1];

// PhantomJS viewport size.
page.viewportSize = {
    width: 1280,
    height: 1014
};

/**
 * Dump the current page markup to stdout and terminate PhantomJS.
 * @param {number} [errCode] exit code handed to phantom.exit().
 */
var capture = function(errCode){

    // The parent process reads the page content from stdout.
    console.log(page.content);

    // Cancel both timers so a still-pending one cannot trigger a second dump.
    clearTimeout(maxWaitTimer);
    clearTimeout(resourceWaitTimer);

    // Done; exit with the given code.
    phantom.exit(errCode);

};

// A resource was requested: count it and cancel any pending capture.
page.onResourceRequested = function(req){
    resourceCount++;
    clearTimeout(resourceWaitTimer);
};

// A resource (or a chunk of it) was received.
page.onResourceReceived = function (res) {

    // Chunked HTTP responses fire this event several times;
    // only the final 'end' stage means the resource is complete.
    if (res.stage !== 'end'){
        return;
    }

    resourceCount--;

    if (resourceCount === 0){

        // Every known resource has finished. This callback runs as soon as
        // loading ends, so give page scripts some time to run before taking
        // the snapshot (resourceWait, 500 ms by default).
        resourceWaitTimer = setTimeout(capture, resourceWait);

    }
};

// A resource timed out.
page.onResourceTimeout = function(req){
    // Fixed typo: this was `resouceCount`, a ReferenceError in strict mode.
    resourceCount--;
};

// A resource failed to load.
page.onResourceError = function(err){
    resourceCount--;
};

// Open the page.
page.open(url, function (status) {

    if (status !== 'success') {

        phantom.exit(1);

    } else {

        // Once the page has opened successfully, start the max-wait timer.
        // When it fires (5 s by default), capture whatever has been rendered
        // by then and exit with code 2.
        maxWaitTimer = setTimeout(function(){

            capture(2);

        }, maxWait);

    }

});
@nomospace 是的
麻烦问下,页面分步加载,你有什么好的处理方案吗?
当我使用 phantomjs spider.js http://localhost
是可以看到返回的页面结构的,但是当我通过 express_spider.js
来进行访问就会超时,请问这可能是什么原因?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
感谢分享!我有个疑问,nginx.conf 里注明 3000 端口,但 express_spider.js 里并没有监听 3000,是不是要在 express_spider.js 中注明 app.listen(3000) ?