Skip to content

Instantly share code, notes, and snippets.

@gasp
Created March 16, 2013 12:49
Show Gist options
  • Save gasp/5176238 to your computer and use it in GitHub Desktop.
Save gasp/5176238 to your computer and use it in GitHub Desktop.
node.js httpClient to test and build cache
var http = require('http');
// Crawl state shared by crawl(), follow() and storelink().
var cache = {
// Maps each crawled path to the title extracted from its response body.
pages : {},
// Paths waiting to be crawled: storelink() appends, follow() pops from the end.
urls : []
};
// Request options used by crawl() for every http.request call. The `path`
// given here is only a default: crawl() supplies the actual path per request.
var site = {
// Alternative hosts, kept commented out; exactly one hostname must be active.
// hostname: 'oldlaps.freelancis.net',
// hostname: 'www.ryogasp.com',
hostname: 'laps.site',
// hostname: 'groupe-laps.org',
port: 80,
path: '/',
method: 'GET'
};
/**
 * Start the crawl: request the root page right away, then begin draining
 * the URL queue five seconds later.
 */
var init = function(){
  // Wrapper keeps the lookup of `follow` deferred until the timer fires.
  var startFollowing = function(){
    follow();
  };

  crawl('/');
  setTimeout(startFollowing, 5000);
};
/**
 * Fetch one page from the configured site, record its title and queue the
 * links it contains.
 *
 * The request uses a copy of the `site` options with `path` replaced. Once
 * the response has ended, the extracted title is stored in
 * `cache.pages[path]` and every href found in the body is passed to
 * `storelink`. For a 301 response the Location target is queued as well,
 * provided it is relative or points at the crawled host.
 *
 * @param {string} path - Path to request on `site.hostname`, e.g. '/'.
 */
var crawl = function(path){
  // Work on a copy so the shared `site` object is never mutated.
  var options = {};
  Object.keys(site).forEach(function(key){
    options[key] = site[key];
  });
  options.path = path;

  var buffer = '';
  console.log('crawling ',path);

  var req = http.request(options, function(res) {
    var location = res.headers.location;
    if (res.statusCode === 301 && location) {
      // Location is normally an absolute URL, which storelink() would reject.
      // Reduce same-host targets to their path; ignore other hosts.
      var absolute = location.match(/^https?:\/\/([^\/:?#]+)(?::\d+)?([^#]*)/i);
      if (!absolute) {
        storelink(location);
      } else if (absolute[1].toLowerCase() === String(options.hostname).toLowerCase()) {
        storelink(absolute[2] || '/');
      }
    }

    res.setEncoding('utf8');
    res.on('data', function (chunk) {
      buffer += chunk;
    });
    res.on('end', function () {
      // Storing the title also marks this path as visited for storelink().
      cache.pages[path] = extract.title(buffer);
      var links = extract.href(buffer);
      // Reverse iteration keeps the original queueing order: follow() pops
      // from the end, so the first link on the page is crawled first.
      for (var i = links.length - 1; i >= 0; i--) {
        storelink(links[i]);
      }
    });
  });

  // Without a listener, a socket or DNS failure is emitted as an unhandled
  // 'error' event and terminates the whole process.
  req.on('error', function(e) {
    console.log('problem with request: ' + e.message);
  });

  req.end();
};
/**
 * Drain the URL queue one entry per tick.
 *
 * While `cache.urls` is non-empty, the last queued path is popped and
 * crawled and another check is scheduled one second later. If the queue is
 * empty at the time of the check, the whole cache is printed and polling
 * stops.
 */
var follow = function(){
  var pending = cache.urls.length;

  if (!pending) {
    console.log(cache);
    return;
  }

  console.log('pages to crawl: ', pending);
  crawl(cache.urls.pop());

  var tick = function(){
    follow();
  };
  setTimeout(tick, 1000);
};
/**
 * Queue a link for crawling if it is a local path that has neither been
 * crawled nor queued yet.
 *
 * Fragment links, protocol-relative URLs and anything carrying a URI scheme
 * (http:, https:, ftp:, mailto:, javascript:, tel:, ...) are skipped. Links
 * without a leading slash get one prepended.
 *
 * @param {string} link - Raw href value taken from a page.
 * @returns {boolean} true when the link was appended to `cache.urls`,
 *   false when it was skipped.
 */
var storelink = function(link){
  if (/^(#|\/\/|[a-z][a-z0-9+.\-]*:)/i.test(link)) {
    return false;
  }

  var path = link.charAt(0) === '/' ? link : '/' + link;

  var alreadyCrawled = typeof cache.pages[path] !== 'undefined';
  // indexOf returns -1 when absent, which is truthy, so it must be compared
  // explicitly rather than used as a boolean.
  var alreadyQueued = cache.urls.indexOf(path) !== -1;
  if (alreadyCrawled || alreadyQueued) {
    return false;
  }

  cache.urls.push(path);
  return true;
};
/**
 * Regex-based helpers that pull information out of a raw HTML string.
 */
var extract = {
  /**
   * Return the text of the first <title> element.
   *
   * The title may span several lines and the tag may carry attributes;
   * surrounding whitespace is trimmed.
   *
   * @param {string} raw - HTML source.
   * @returns {string} The title text, or '' when no title element is found.
   */
  title: function(raw){
    // [\s\S] instead of '.' so that a title broken across lines still matches.
    var m = raw.match(/<title(?:\s[^>]*)?>([\s\S]*?)<\/title>/i);
    return m === null ? '' : m[1].trim();
  },

  /**
   * Collect the values of all quoted href attributes, in document order.
   *
   * Both double- and single-quoted values are recognised, with optional
   * whitespace around the equals sign.
   *
   * @param {string} raw - HTML source.
   * @returns {string[]} The href values; empty when none are found.
   */
  href: function(raw){
    // A fresh regex per call keeps the stateful lastIndex of the /g flag local.
    var pattern = /href\s*=\s*(?:"([^"]*)"|'([^']*)')/ig;
    var found = [];
    var m;
    while ((m = pattern.exec(raw)) !== null) {
      found.push(m[1] !== undefined ? m[1] : m[2]);
    }
    return found;
  },

  /**
   * Placeholder with no implementation yet; always returns undefined.
   *
   * @param {string} raw - HTML source (currently unused).
   */
  useful: function (raw) {
  }
};
// Entry point: the crawl starts as soon as the script is loaded.
init();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment