Skip to content

Instantly share code, notes, and snippets.

@sawantuday
Created April 25, 2016 16:40
Show Gist options
  • Select an option

  • Save sawantuday/1b9b17906b7b2222bff7624da2f2e035 to your computer and use it in GitHub Desktop.

Select an option

Save sawantuday/1b9b17906b7b2222bff7624da2f2e035 to your computer and use it in GitHub Desktop.
PhantomJS-based web server that accepts a URL and returns the rendered HTML document. Use the query parameter href=url to pass in the URL.
var system = require("system")
var webserver = require("webserver")
var webpage = require("webpage")
// Port the server listens on. The first command-line argument wins
// (system.args[0] is the script name), e.g. `phantomjs server.js 9000`;
// without one the server falls back to 8001.
var port = system.args[1] || '8001';
// Resources that are never worth fetching when pre-rendering a page:
// analytics, ad networks, web fonts, chat widgets and social embeds.
// Each entry is the SOURCE of a regular expression that is tested
// case-insensitively against the full request URL, so "\\." is a literal
// dot while a bare "." matches any character.
var blockedRes = [
    "google-analytics.com",
    "api.mixpanel.com",
    "fonts.googleapis.com",
    "stats.g.doubleclick.net",
    "mc.yandex.ru",
    "use.typekit.net",
    "beacon.tapfiliate.com",
    "js-agent.newrelic.com",
    "api.segment.io",
    "woopra.com",
    "\\.ttf",
    "static.olark.com",
    "static.getclicky.com",
    "fast.fonts.com",
    "youtube.com\/embed",
    "cdn.heapanalytics.com",
    "googleads.g.doubleclick.net",
    "pagead2.googlesyndication.com",
    "fullstory.com/rec",
    "navilytics.com/nls_ajax.php",
    "\\.eot",
    "log.optimizely.com/event",
    "\\.otf",
    "hn.inspectlet.com",
    "tpc.googlesyndication.com",
    "partner.googleadservices.com",
    "facebook.com",
    "twitter.com",
    "plus.google.com",
    "instagram.com", // was misspelled "instragram.com" and therefore never matched
    "chat.racoon.in"
];
// --- Server start-up -------------------------------------------------------

// Refuse to start without a port to listen on.
if (!port) {
    console.error("No port specified");
    phantom.exit(1);
}

// Bind the embedded web server; every incoming request goes to onRequest.
var server = webserver.create();
var listening = server.listen(port, onRequest);

if (listening) {
    console.log("Connection success");
} else {
    console.error("Could not bind to port " + port);
    phantom.exit(1);
}

// Any uncaught error in the controller script is logged together with its
// stack trace and terminates the process with a non-zero exit code.
phantom.onError = function (msg, trace) {
    var serializedTrace = JSON.stringify(trace);
    console.log('PHANTOM ERROR ' + msg + ' ' + serializedTrace);
    phantom.exit(1);
};
/**
 * Handles one HTTP request to the prerender server.
 *
 * Only `GET /?href=<url>` is accepted. The target URL is opened in a fresh
 * PhantomJS page (stylesheets and everything matching `blockedRes` are
 * aborted), and once the document is ready its HTML is written back with
 * all <script> elements removed except JSON-LD blocks.
 *
 * Responses: 405 for non-GET methods, 404 for paths other than "/",
 * 400 when `href` is missing, otherwise 200 with the rendered HTML
 * (also sent when the readiness wait times out after 24 s).
 *
 * @param {Object} req - PhantomJS webserver request (`method`, `url` are read).
 * @param {Object} res - PhantomJS webserver response the result is written to.
 */
function onRequest(req, res) {
    var page = webpage.create();

    // Set from page.onCallback once the page reports DOMContentLoaded. It has
    // to live in this closure: code passed to page.evaluate() runs inside the
    // page and cannot see a property stored on the controller's own
    // `document`, which is why the previous `document.contentLoaded` flag was
    // never observed and every request waited for the full timeout.
    var contentLoaded = false;

    // Ensures the response is written and the page closed exactly once.
    var responded = false;

    page.settings.localToRemoteUrlAccessEnabled = true;
    page.settings.userAgent = 'Mozilla/5.0 (compatible; sweetcouch/1.0; +http://www.sweetcouch.com)';
    page.viewportSize = {width: 800, height: 600};
    page.settings.resourceTimeout = 10000; // Avoid freeze!!!

    if (req.method !== "GET") {
        return send(405, toHTML("Method not accepted."));
    }

    var url = parse(req.url);
    if (url.pathname !== "/") {
        return send(404, toHTML("Not found."));
    }

    console.log("request received");

    var href = url.query.href;
    if (!href) {
        return send(400, toHTML("`href` parameter is missing."));
    }

    // Installed before any page script runs: ask the page to call back into
    // the controller as soon as its DOM has been parsed.
    page.onInitialized = function() {
        page.evaluate(function() {
            window.addEventListener('DOMContentLoaded', function() {
                setTimeout(window.callPhantom, 0);
            }, false);
        });
    };

    page.onCallback = function() {
        contentLoaded = true;
    };

    page.onError = function(msg, trace) {
        console.log('PAGE ERROR ' + msg + ' ' + JSON.stringify(trace));
    };

    page.onConsoleMessage = function(msg) {
        console.log('CONSOLE: ' + msg);
    };

    page.onLoadFinished = function(status) {
        console.log('CONSOLE: ' + status);
    };

    page.onResourceRequested = function(requestData, request) {
        try {
            var resourceUrl = requestData.url;

            // Discard any CSS resources.
            if (isStylesheetRequest(requestData)) {
                console.log('Aborting: ' + resourceUrl);
                request.abort();
                return;
            }

            // Discard unrequired scripts such as Google Analytics.
            for (var i = 0, l = blockedRes.length; i < l; i++) {
                if (new RegExp(blockedRes[i], 'i').test(resourceUrl)) {
                    console.log('Aborting: ' + resourceUrl);
                    request.abort();
                    return;
                }
            }

            console.log('Allowed: ' + resourceUrl);
        } catch (ex) {
            console.log("ERROR: " + ex.message);
        }
    };

    page.open(href);

    try {
        console.log('Starting wait for loop');
        waitFor(function() {
            if (!contentLoaded) {
                return false;
            }
            // readyState has to be read inside the page itself.
            return page.evaluate(function() {
                return document.readyState === 'complete';
            });
        }, function() {
            // Give late scripts one more second to finish rendering.
            setTimeout(function() {
                console.log('Wait for done. Waiting 1000 ms');
                send(200, page.content);
            }, 1000);
        }, 24000);
    } catch (ex) {
        console.log("ERROR: " + ex.message);
        send(200, page.content);
    }

    /**
     * Tells whether a resource request is for a stylesheet: either the URL
     * names a .css file (http or https) or a request header declares
     * Content-Type text/css. `requestData.headers` is a list of
     * {name, value} entries, so it is scanned rather than indexed by name.
     */
    function isStylesheetRequest(requestData) {
        if (/https?:\/\/.+?\.css/i.test(requestData.url)) {
            return true;
        }
        var headers = requestData.headers || [];
        for (var i = 0; i < headers.length; i++) {
            if (String(headers[i].name).toLowerCase() === 'content-type'
                    && headers[i].value === 'text/css') {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the response and releases the page. <script> elements are
     * stripped from `data` first, except those containing
     * 'application/ld+json'. Calls after the first one are ignored.
     */
    function send(statusCode, data) {
        if (responded) {
            return;
        }
        responded = true;

        try {
            data = data.replace(/<script(?:.*?)>(?:[\S\s]*?)<\/script>/gi, function(scriptTag) {
                return scriptTag.indexOf('application/ld+json') === -1 ? '' : scriptTag;
            });
        } catch (ex) {
            console.log("ERROR: " + ex.message);
        }

        res.statusCode = statusCode;
        res.setHeader("Content-Type", "text/html");
        res.setHeader("Content-Length", byteLength(data));
        res.write(data);
        res.close();
        page.close();
    }

    /**
     * Polls `testFx` every 100 ms until it returns a truthy value or
     * `timeOutMillis` (default 3000) has elapsed, then calls `onReady`
     * exactly once - on success and on timeout alike. Both arguments must be
     * functions; string arguments are no longer eval'ed.
     */
    function waitFor(testFx, onReady, timeOutMillis) {
        var maxWaitMillis = timeOutMillis ? timeOutMillis : 3000;
        var start = new Date().getTime();
        var condition = false;
        var interval = setInterval(function() {
            var elapsed = new Date().getTime() - start;
            if (elapsed < maxWaitMillis && !condition) {
                condition = testFx();
                return;
            }
            if (condition) {
                console.log("WaitFor() finished in " + elapsed + "ms.");
            } else {
                console.log("WaitFor() timeout");
            }
            clearInterval(interval); // Stop this interval
            onReady();
        }, 100); // repeat check every 100ms
    }
}
/**
 * Returns the number of bytes `str` occupies when encoded as UTF-8, for use
 * as a Content-Length value.
 *
 * Counts code units directly instead of going through encodeURIComponent,
 * so an empty string yields 0 (instead of throwing on a null match) and a
 * lone surrogate is counted as the 3-byte replacement character (instead of
 * throwing URIError).
 *
 * @param {string} str - Text to measure; other values are converted with String().
 * @returns {number} UTF-8 byte length.
 */
function byteLength(str) {
    var text = String(str);
    var bytes = 0;
    for (var i = 0; i < text.length; i++) {
        var code = text.charCodeAt(i);
        if (code < 0x80) {
            bytes += 1;
        } else if (code < 0x800) {
            bytes += 2;
        } else if (code >= 0xD800 && code <= 0xDBFF && i + 1 < text.length) {
            var next = text.charCodeAt(i + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // Valid surrogate pair: one astral code point, 4 bytes.
                bytes += 4;
                i++;
            } else {
                bytes += 3;
            }
        } else {
            bytes += 3;
        }
    }
    return bytes;
}
/**
 * Wraps a short message in a minimal HTML document.
 *
 * @param {string} message - Text placed inside <body>; inserted as-is, without escaping.
 * @returns {string} The HTML document, terminated by a newline.
 */
function toHTML(message) {
    var opening = "<!DOCTYPE html><body>";
    var closing = "</body>\n";
    return opening + message + closing;
}
/**
 * Parses a request URL by letting an <a> element split it, then attaches
 * the decoded query-string parameters as `anchor.query`.
 *
 * Each pair is split at its FIRST "=" only, so values that themselves
 * contain "=" (for example an unencoded `href=http://host/?a=b`) are kept
 * whole. A key without "=" maps to undefined. Components whose percent
 * encoding is malformed are kept as raw text instead of throwing.
 *
 * @param {string} url - URL or path as received from the web server.
 * @returns {HTMLAnchorElement} The anchor (with `pathname`, `search`, ...)
 *     carrying an extra `query` object of decoded parameters.
 */
function parse(url) {
    var anchor = document.createElement("a");
    anchor.href = url;
    anchor.query = {};

    anchor.search.slice(1).split("&").forEach(function(pair) {
        if (!pair) {
            return; // empty query string or stray "&"
        }
        var separator = pair.indexOf("=");
        if (separator === -1) {
            anchor.query[decodeComponent(pair)] = undefined;
            return;
        }
        var key = decodeComponent(pair.slice(0, separator));
        anchor.query[key] = decodeComponent(pair.slice(separator + 1));
    });

    return anchor;

    // decodeURIComponent throws URIError on malformed escapes; fall back to
    // the raw text so a bad request cannot raise an uncaught error.
    function decodeComponent(text) {
        try {
            return decodeURIComponent(text);
        } catch (ex) {
            return text;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment