Created
April 25, 2016 16:40
-
-
Save sawantuday/1b9b17906b7b2222bff7624da2f2e035 to your computer and use it in GitHub Desktop.
PhantomJS-based web server that accepts a URL and returns the parsed HTML document. Pass the target URL in the `href` query parameter (`href=<url>`).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| var system = require("system") | |
| var webserver = require("webserver") | |
| var webpage = require("webpage") | |
// TCP port the rendering server listens on.
var port = '8001';

// Regular-expression sources for resources that should not be fetched while
// rendering a page (analytics, ads, web fonts, chat and social widgets).
// Each entry is compiled with `new RegExp(entry, 'gi')` and tested against the
// URL of every resource the page requests; a match aborts that request.
var blockedRes = [
  "google-analytics.com",
  "api.mixpanel.com",
  "fonts.googleapis.com",
  "stats.g.doubleclick.net",
  "mc.yandex.ru",
  "use.typekit.net",
  "beacon.tapfiliate.com",
  "js-agent.newrelic.com",
  "api.segment.io",
  "woopra.com",
  "\\.ttf",
  "static.olark.com",
  "static.getclicky.com",
  "fast.fonts.com",
  "youtube.com\/embed",
  "cdn.heapanalytics.com",
  "googleads.g.doubleclick.net",
  "pagead2.googlesyndication.com",
  "fullstory.com/rec",
  "navilytics.com/nls_ajax.php",
  "\\.eot",
  "log.optimizely.com/event",
  "\\.otf",
  "hn.inspectlet.com",
  "tpc.googlesyndication.com",
  "partner.googleadservices.com",
  "facebook.com",
  "twitter.com",
  "plus.google.com",
  // Was misspelled "instragram.com", which never matched the real host.
  "instagram.com",
  "chat.racoon.in"
];
// Refuse to start when no port has been configured.
if (!port) {
  console.error("No port specified");
  phantom.exit(1);
}

// Create the embedded web server and hand every incoming request to
// `onRequest`. `listen` reports whether the port could be bound.
var server = webserver.create();
var listening = server.listen(port, onRequest);

if (listening) {
  console.log("Connection success");
} else {
  console.error("Could not bind to port " + port);
  phantom.exit(1);
}

// Any error that reaches PhantomJS itself is logged with its stack trace
// and then terminates the process with a non-zero exit code.
phantom.onError = function (msg, trace) {
  var details = JSON.stringify(trace);
  console.log('PHANTOM ERROR ' + msg + ' ' + details);
  phantom.exit(1);
};
/**
 * Handles one HTTP request to the rendering server.
 *
 * Only `GET /?href=<url>` is accepted. The target URL is opened in a fresh
 * PhantomJS page; requests for stylesheets and for anything matching an entry
 * of `blockedRes` are aborted. Once the document is considered loaded (or the
 * wait times out) the page's HTML is written back with every <script> element
 * removed except those mentioning `application/ld+json`.
 *
 * @param {Object} req  Incoming request; `method` and `url` are read.
 * @param {Object} res  Response object the reply is written to.
 */
function onRequest(req, res) {
  var page = webpage.create();
  // Set by page.onCallback once the page reports DOMContentLoaded. Kept per
  // request on the PhantomJS side and passed into page.evaluate() below; the
  // page's own `document` is a different object from the `document` visible
  // in this script, so a flag stored on the latter is never seen by the page.
  var contentLoaded = false;
  // Ensures the response is written (and the page closed) only once.
  var responded = false;

  page.settings.localToRemoteUrlAccessEnabled = true;
  page.settings.userAgent = 'Mozilla/5.0 (compatible; sweetcouch/1.0; +http://www.sweetcouch.com)';
  page.viewportSize = {width: 800, height: 600};
  page.settings.resourceTimeout = 10000; // Avoid freeze!!!

  if (req.method != "GET") {
    return send(405, toHTML("Method not accepted."));
  }

  var url = parse(req.url);
  if (url.pathname != "/") {
    return send(404, toHTML("Not found."));
  }

  console.log("request received");

  var query = url.query;
  var href = query.href;
  if (!href) {
    return send(400, toHTML("`href` parameter is missing."));
  }
  // NOTE(review): `href` comes straight from the client and is opened as-is.
  // Presumably non-HTTP schemes (e.g. file://) are accepted by page.open too;
  // consider restricting to http/https if this server is reachable by
  // untrusted callers.

  // Inside the page: notify PhantomJS (via callPhantom) right after
  // DOMContentLoaded fires.
  page.onInitialized = function () {
    page.evaluate(function () {
      window.addEventListener('DOMContentLoaded', function () {
        setTimeout(window.callPhantom, 0);
      }, false);
    });
  };

  page.onCallback = function () {
    contentLoaded = true;
  };

  page.onError = function (msg, trace) {
    console.log('PAGE ERROR ' + msg + ' ' + JSON.stringify(trace));
  };

  page.onConsoleMessage = function (msg) {
    console.log('CONSOLE: ' + msg);
  };

  page.onLoadFinished = function (status) {
    console.log('CONSOLE: ' + status);
  };

  page.onResourceRequested = function (requestData, request) {
    try {
      var resourceUrl = requestData['url'];

      // Discard any CSS resources (both http and https).
      // NOTE(review): the Content-Type lookup treats `headers` as a map;
      // confirm its actual shape, otherwise this half of the check is inert.
      if ((/https?:\/\/.+?\.css/i).test(resourceUrl)
          || requestData.headers['Content-Type'] == 'text/css') {
        console.log('Aborting: ' + resourceUrl);
        request.abort();
        return;
      }

      // Discard unrequired resources such as Google Analytics.
      for (var i = 0, l = blockedRes.length; i < l; i++) {
        var regex = new RegExp(blockedRes[i], 'i');
        if (regex.test(resourceUrl)) {
          console.log('Aborting: ' + resourceUrl);
          request.abort();
          return;
        }
      }

      console.log('Allowed: ' + resourceUrl);
    } catch (ex) {
      console.log("ERROR: " + ex.message);
    }
  };

  page.open(href);

  try {
    console.log('Starting wait for loop');
    waitFor(function () {
      // The readiness check runs inside the page; the PhantomJS-side
      // DOMContentLoaded flag is handed in as an argument.
      return page.evaluate(function (domContentLoaded) {
        if (domContentLoaded && document.readyState === 'complete') {
          console.log('Document content loaded');
          return true;
        }
        if (document.loaded) {
          console.log('Document loaded');
          return true;
        }
        return false;
      }, contentLoaded);
    }, function () {
      // Give the page one more second to finish rendering before replying.
      setTimeout(function () {
        console.log('Wait for done. Waiting 1000 ms');
        send(200, page.content);
      }, 1000);
    }, 24000);
  } catch (ex) {
    console.log("ERROR: " + ex.message);
    send(200, page.content);
  }

  /**
   * Writes the response and closes the page. Subsequent calls are ignored.
   *
   * @param {number} statusCode  HTTP status code to send.
   * @param {string} data        HTML body; <script> elements other than
   *                             JSON-LD blocks are stripped before sending.
   */
  function send(statusCode, data) {
    if (responded) {
      return;
    }
    responded = true;

    try {
      var matches = data.match(/<script(?:.*?)>(?:[\S\s]*?)<\/script>/gi);
      for (var i = 0; matches && i < matches.length; i++) {
        if (matches[i].indexOf('application/ld+json') === -1) {
          data = data.replace(matches[i], '');
        }
      }
    } catch (ex) {
      console.log("ERROR: " + ex.message);
    }

    res.statusCode = statusCode;
    res.setHeader("Content-Type", "text/html");
    res.setHeader("Content-Length", byteLength(data));
    res.write(data);
    res.close();
    page.close();
  }

  /**
   * Polls `testFx` every 100 ms until it returns a truthy value or the
   * timeout elapses, then calls `onReady` exactly once (in both cases).
   *
   * @param {Function} testFx         Condition to poll.
   * @param {Function} onReady        Called when polling stops.
   * @param {number}  [timeOutMillis] Maximum wait in ms; defaults to 3000.
   */
  function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000;
    var start = new Date().getTime();
    var condition = false;
    var interval = setInterval(function () {
      var elapsed = new Date().getTime() - start;
      if (elapsed < maxtimeOutMillis && !condition) {
        condition = testFx();
        return;
      }

      if (!condition) {
        console.log("WaitFor() timeout");
      } else {
        console.log("WaitFor() finished in " + elapsed + "ms.");
      }
      clearInterval(interval); // Stop polling before handing over.
      onReady();
    }, 100);
  }
}
/**
 * Returns the number of bytes `str` occupies when encoded as UTF-8.
 *
 * The string is percent-encoded; every `%XX` escape stands for one byte and
 * every character left unescaped is a single-byte ASCII character.
 *
 * @param {string} str  Text to measure.
 * @returns {number} UTF-8 byte count (0 for an empty string).
 */
function byteLength(str) {
  var units = encodeURIComponent(str).match(/%..|./g);
  // match() yields null when there is nothing to match (empty string).
  return units ? units.length : 0;
}
/**
 * Wraps a message in a minimal HTML document.
 *
 * @param {string} message  Text placed inside the <body> element (not escaped).
 * @returns {string} The HTML document, terminated by a newline.
 */
function toHTML(message) {
  var opening = "<!DOCTYPE html><body>";
  var closing = "</body>\n";
  return opening + message + closing;
}
/**
 * Parses a URL using an anchor element and attaches its decoded query
 * parameters as a `query` object.
 *
 * @param {string} url  URL (absolute or relative) to parse.
 * @returns {Object} The anchor element, so its URL properties such as
 *     `pathname` and `search` are available, with an extra `query` map of
 *     parameter name to value. A parameter without `=` maps to `undefined`.
 */
function parse(url) {
  var anchor = document.createElement("a");
  anchor.href = url;
  anchor.query = {};

  // Malformed percent-escapes (e.g. a lone "%") make decodeURIComponent
  // throw; fall back to the raw text so a bad request cannot raise an
  // uncaught error.
  function decode(text) {
    try {
      return decodeURIComponent(text);
    } catch (ex) {
      return text;
    }
  }

  var search = anchor.search || "";
  search.slice(1).split("&").forEach(function (pair) {
    if (!pair) {
      return; // empty query string or stray "&"
    }
    // Split on the first "=" only, so values containing "=" stay intact.
    var eq = pair.indexOf("=");
    var key = eq === -1 ? pair : pair.slice(0, eq);
    var value = eq === -1 ? undefined : decode(pair.slice(eq + 1));
    anchor.query[decode(key)] = value;
  });

  return anchor;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment