Created
September 22, 2016 19:20
-
-
Save twoixter/56f1d67f89e22d1b5b03888d5927c655 to your computer and use it in GitHub Desktop.
Proof of concept of a filter node.js script to convert PhantomJSCloud JSON events to HAR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
/** | |
* Script to convert PhantomJSCloud JSON output to HAR. | |
* See: https://phantomjscloud.com | |
* | |
* Usage: pipe in the JSON from PhantomJSCloud and pipe out to a file. | |
* Be sure to use the following options when using PhantomJSCloud: | |
* | |
* + outputAsJson = true To force JSON output | |
* + suppressJson = [] To not filter out any JSON data | |
* | |
* Example: | |
* | |
* curl -X POST https://phantomjscloud.com/api/browser/v2/a-demo-key-with-low-quota-per-ip-address/ \ | |
* -d '{url:"http://www.etsy.com/",renderType:"text",outputAsJson:true,suppressJson:[]}' \ | |
* | node pjsc2har.js > etsy.har | |
* | |
* TODO: | |
* **1** Means to know the HTTP version used. | |
* **2** Means to know cookies of each request. | |
* **3** Possible duplicated "resourceReceived" events without data. (E.g. the Etsy page)
* | |
*/ | |
var fs = require("fs"); | |
var path = require("path"); | |
var url = require("url"); | |
var qs = require("querystring"); | |
/**
 * Read all of stdin as UTF-8 text, parse it as JSON and hand the resulting
 * value to `cb`.
 *
 * Empty or malformed input is reported on stderr and the process exit code
 * is set to 1; `cb` is not invoked in that case.
 *
 * @param {function(*)} cb  Called once with the parsed JSON value.
 */
function fetchJSON(cb)
{
    var chunks = [];

    process.stdin.setEncoding("utf8");
    process.stdin.on("data", function(chunk) { chunks.push(chunk); });
    process.stdin.on("end", function() {
        var parsed;
        try {
            parsed = JSON.parse(chunks.join(""));
        } catch (err) {
            // Written to stderr so that stdout stays reserved for the HAR output.
            process.stderr.write("Invalid JSON on stdin: " + err.message + "\n");
            process.exitCode = 1;
            return;
        }
        // Called outside the try block so that an exception thrown by `cb`
        // is not misreported as a JSON parse error.
        cb(parsed);
    });
}
/**
 * Write a green "[DEBUG]"-tagged line to stderr.
 *
 * stdout is deliberately left untouched so that the HAR output can be saved
 * by piping node's stdout to a file.
 *
 * @param {string} message  Text to print after the tag.
 */
function _debug(message)
{
    // "\x1b" is the ESC character. Unlike the legacy octal escape it is also
    // accepted in strict mode and ES modules.
    process.stderr.write("\x1b[32m[DEBUG]\x1b[0m " + message + "\n");
}
/**
 * Write a bright red "[ERROR]"-tagged line to stderr.
 *
 * stdout is deliberately left untouched so that the HAR output can be saved
 * by piping node's stdout to a file.
 *
 * @param {string} message  Text to print after the tag.
 */
function _error(message)
{
    // "\x1b" is the ESC character. Unlike the legacy octal escape it is also
    // accepted in strict mode and ES modules.
    process.stderr.write("\x1b[91m[ERROR]\x1b[0m " + message + "\n");
}
/**
 * Convert a parsed query-string / form object into a HAR compatible list of
 * `{name, value}` pairs.
 *
 * Only the object's own enumerable keys are used. A key whose value is an
 * array (how Node's parsers represent a repeated key such as "a=1&a=2")
 * yields one pair per array element.
 *
 * @param {Object} params  Map of parameter name to value or array of values.
 *                         A null/undefined map yields an empty list.
 * @returns {Array<{name: string, value: *}>} The name/value pairs.
 */
function params_to_array(params)
{
    var pairs = [];
    Object.keys(params || {}).forEach(function(name) {
        var values = params[name];
        if (!Array.isArray(values)) {
            values = [values];
        }
        values.forEach(function(value) {
            pairs.push({name: name, value: value});
        });
    });
    return pairs;
}
/**
 * Module-level conversion state shared by the event handlers and
 * processHAR(). Globals are not great, but this is a proof of concept
 * anyway. :-)
 */

// True while events are being collected for the main page navigation.
var _activeNavigation = false;

// The HAR "pages" entry currently being built.
var _currentPage = {};

// HAR entries in progress, keyed by resource id.
var _resources = {};

// Cookies of the page response; attached to every request entry.
var _cookies = {};

// The HAR document that is finally printed.
var _theHAR = {};
// HAR "httpVersion" reported for every request and response.
// There is no means to know the real HTTP version. See **1**
var _HTTP_VERSION = "HTTP/1.1";

/**
 * Build the HAR "response" object for a PhantomJSCloud resource response.
 * Missing size / content-type information falls back to 0 / "".
 */
function _buildResponse(res)
{
    return {
        status: res.status,
        statusText: res.statusText,
        httpVersion: _HTTP_VERSION,
        cookies: [],
        headers: res.headers,
        headersSize: -1,
        redirectURL: res.redirectUrl || "",
        bodySize: res.bodySize || 0,
        content: {
            size: res.bodySize || 0,
            mimeType: res.contentType || ""
        }
    };
}

/**
 * Milliseconds elapsed between the start of the current page and `time`.
 */
function _msSincePageStart(time)
{
    return new Date(time) - new Date(_currentPage.startedDateTime);
}

/**
 * Event handlers that transform PhantomJSCloud events into individual HAR
 * entries.
 *
 * Keys are PhantomJSCloud event names. Every handler is invoked as
 * `handler(data, time, idx)` with the event value, the event timestamp and
 * the index of the event in the event list. The handlers read and update the
 * module-level state (_currentPage, _resources, _activeNavigation, _cookies).
 */
var eventHandlers = {

    // A navigation event has been requested. Start over any previous session
    // we had, since this establishes a new load, like loading a new "HTTPS"
    // page while redirecting from an "HTTP" 301 redirect.
    navigationRequested: function(data, time) {
        // If this is not a main navigation event, do nothing...
        if (!data.main) return;

        // Else, we got a new navigation event, perhaps as response to a
        // redirection like 301, etc.
        _debug("New navigation event to '" + data.url + "'. Starting over...");
        _currentPage = {
            "id": "page_1",
            "title": data.url,
            "startedDateTime": time,
            "pageTimings": {
                onContentLoad: 0,   // Will be filled by "domReady"
                onLoad: 0           // ...and "loadFinished" events
            }
        };
        _activeNavigation = true;
        _resources = {};
    },

    // Dummy event. Do nothing as all is done in the "navigationRequested"
    loadStarted: function(data, time) { /** noop handler **/ },

    // Dummy event. URL is already changed by "loadStarted"
    urlChanged: function(data, time) { /** noop handler **/ },

    // Dummy event. Do nothing on console messages.
    consoleMessage: function(data, time) { /** noop handler **/ },

    // Dummy event. Don't know what to do with these??
    targetUrlReceived: function(data, time) { /** noop handler **/ },

    // Throw this to the debug console...
    browserError: function(data, time) { _error(data.message); },

    // A new resource has been requested. Log this request in "_resources"
    // only if a current _activeNavigation is enabled. This prevents adding
    // resources to a different session (E.g. resources requested by a non
    // "main" navigation request)
    resourceRequested: function(data, time) {
        // Do nothing if not actively navigating the current page.
        if (!_activeNavigation) return;

        var req = data.resourceRequest;

        // Unexpected: a resource request without an ID cannot be tracked.
        if (!req.id) {
            _error("WAAAAAT: A resource without ID!!");
            return;
        }

        // Unexpected: a resource with the same ID is already being tracked.
        if (_resources[req.id]) {
            _error("WAAAAAT: A resource with ID:'" + req.id + "' is already loading?");
            // Continue anyway...
        }

        var resource = _resources[req.id] = {
            startedDateTime: req.time,
            cache: {},
            time: -1,
            pageref: "page_1"
        };

        resource.request = {
            method: req.method,
            url: req.url,
            queryString: params_to_array(url.parse(req.url, true).query),
            httpVersion: _HTTP_VERSION,
            headers: req.headers,
            headersSize: -1,
            bodySize: -1,
            cookies: _cookies           // Reuse cookies. See **2**
        };

        if (req.method === "POST") {
            // [[TODO]]
            // OK, we assume an application/x-www-form-urlencoded body.
            // We would need to search in headers for the Content-Type
            // and fill this appropriately.
            // Currently this is a mess. But at least we fill these fields
            // so that HAR schema checkers are pleased.
            resource.request.postData = {
                mimeType: "application/x-www-form-urlencoded; charset=UTF-8",
                params: params_to_array(qs.parse(req.postData))
            };
        }
    },

    // A resource has been received. Calculate timings, etc
    resourceReceived: function (data, time, idx) {
        // Do nothing if not actively navigating the main session.
        if (!_activeNavigation) return;

        // Some "resourceReceived" events contain no "resourceResponse",
        // might be duplicated events from PhantomJSCloud since the immediate
        // previous event is for the same URL.
        if (!data.resourceResponse) return;

        var res = data.resourceResponse;

        // Unexpected: a response without an ID cannot be matched to a request.
        if (!res.id) {
            _error("WAAAAAT: A resource received without ID!!");
            return;
        }

        // Unexpected: no request was recorded for this response.
        if (!_resources[res.id]) {
            _error("WAAAAAT: A resource with ID:'" + res.id + "' does not exists!");
            return;
        }

        var resource = _resources[res.id];

        // Just bail out if this request was in error
        if (resource.errorCode) return;

        // Based on the current stage, we can do different things.
        if (res.stage === "start") {
            // Create the start of the response. We can fill in some data and
            // start creating timings.
            resource.responseStartedDateTime = res.time;   // NOTE: Not part of HAR 1.2
            resource.response = _buildResponse(res);
        } else if (res.stage === "end") {
            // Special case for responses for entities for "data:*" Urls...
            if (!res.status && !res.statusText) {
                _debug("Deleted empty resource with Content-Type: " + res.contentType);
                delete _resources[res.id];
                return;
            }

            // Maybe we don't have a previous "start" stage for some events.
            // I've seen a bunch of these for 301 redirects or 204 No content
            if (!resource.response) {
                resource.responseStartedDateTime = res.time;   // NOTE: Not part of HAR 1.2
                resource.response = _buildResponse(res);
            }

            // Time to calculate timings...
            var startTime = new Date(resource.startedDateTime);
            var startReceivingTime = new Date(resource.responseStartedDateTime);
            var endTime = new Date(res.time);

            resource.time = endTime - startTime;
            resource.timings = {
                blocked: 0,
                dns: -1,
                connect: -1,
                send: 0,
                wait: startReceivingTime - startTime,
                receive: endTime - startReceivingTime,
                ssl: -1
            };

            // Remove "responseStartedDateTime" since it is not part of HAR 1.2
            delete resource.responseStartedDateTime;
        } else {
            _error("Unkown response stage '" + res.stage + "'");
        }
    },

    // Some events report as "resourceError", with a message like saying:
    // "errorCode 5: Operation canceled". Don't know what is causing this,
    // might be the QTWebKit cache inside PhantomJS cancelling parallel
    // requests or something. In any case, we need to mark those.
    resourceError: function(data, time) {
        // Do nothing if not actively navigating the main session.
        if (!_activeNavigation) return;

        var err = data.resourceError;

        // Mark the entry as error. This replaces whatever was recorded for
        // the resource so far.
        _resources[err.id] = {
            errorCode: err.errorCode,
            errorString: err.errorString,
            url: err.url
        };
        _error("Error " + err.errorCode + ": " + err.errorString + " for " + err.url);
    },

    // Mark domLoaded timings on the current page.
    domReady: function(data, time) {
        // Without a previous main "navigationRequested" there is no page
        // (and no pageTimings object) to fill in.
        if (!_currentPage.pageTimings) return;
        _currentPage.pageTimings.onContentLoad = _msSincePageStart(time);
    },

    // Stops loading more events... is that right?
    loadFinished: function(data, time) {
        if (_currentPage.pageTimings) {
            _currentPage.pageTimings.onLoad = _msSincePageStart(time);
        }

        // Stop active navigation. I think this should filter out events for
        // requests after loadFinished?? Maybe this is silly, test.
        _activeNavigation = false;
    }
};
/**
 * Process the PhantomJSCloud JSON object, build a HAR 1.2 document from it
 * and print that document on stdout.
 *
 * Side effects:
 *   - When `obj.content` carries base64 data it is written to a file named
 *     after the content URL's hostname plus the extension of `content.name`.
 *   - The module-level `_theHAR` and `_cookies` are replaced, and every page
 *     event is dispatched to the matching handler in `eventHandlers`.
 *   - The process exits with status 1 unless exactly one page response is
 *     present.
 *
 * @param {Object} obj  Parsed PhantomJSCloud response; `content`,
 *                      `pageResponses` and `meta` are read from it.
 */
function processHAR(obj)
{
    // Extract the PNG to its own file
    var content = obj.content;
    if (content && content.data && content.encoding === "base64") {
        var filename = url.parse(content.url, true).hostname + path.extname(content.name || "");
        // Written synchronously: fs.writeFile() without a callback throws on
        // current Node.js, and a pending asynchronous write would be lost if
        // process.exit() is reached below.
        try {
            fs.writeFileSync(filename, content.data, content.encoding);
        } catch (err) {
            _error("Could not write '" + filename + "': " + err.message);
        }
    }

    // We can only process one page request currently.
    if (!Array.isArray(obj.pageResponses) || obj.pageResponses.length !== 1) {
        _error("Multiple (or none) pages found.");
        _error("I can only process one page at a time!");
        process.exit(1);
    }

    var page = obj.pageResponses[0];
    // platformVersion is parsed as JSON and read for major/minor/patch.
    var version = JSON.parse(obj.meta.backend.platformVersion);

    _theHAR = {
        "log": {
            "version": "1.2",
            "creator": {
                "name": "PhantomJSCloud",
                "version": obj.meta.backend.id,
                "comment": obj.meta.about
            },
            "browser": {
                "name": obj.meta.backend.platform + " " + obj.meta.backend.os,
                "version": version.major + "." + version.minor + "." + version.patch
            },
            "pages": [],
            "entries": []
        }
    };

    _cookies = page.cookies;

    (page.events || []).forEach(function(event, idx) {
        if (eventHandlers[event.key]) {
            eventHandlers[event.key](event.value, event.time, idx);
        } else {
            _error("No handler for event '" + event.key + "' (idx:" + idx + ")");
        }
    });

    // After all events had been processed, time to finish the HAR
    _theHAR.log.pages.push(_currentPage);
    Object.keys(_resources).forEach(function(id) {
        // Filter out cancelled requests/responses
        if (!_resources[id].errorCode) {
            _theHAR.log.entries.push(_resources[id]);
        }
    });

    // Outputs the final HAR. This will go to stdout so we can pipe it out
    console.log(JSON.stringify(_theHAR, null, 4));

    // Some statistics to stderr. Uncomment to test differences in timing.
    // _debug("Metrics as reported by PhantomJSCloud:");
    // _debug(" * Elapsed time ms: " + page.metrics.elapsedMs);
    // _debug(" * Starting time: " + page.metrics.startTime);
    // _debug(" * Ending time: " + page.metrics.endTime);
    //
    // var startHARTime = new Date(_theHAR.log.pages[0].startedDateTime);
    // var endHARTime = new Date(startHARTime.getTime() + _theHAR.log.pages[0].pageTimings.onLoad);
    // _debug("Calculated HAR metrics");
    // _debug(" * Elapsed time ms: " + _theHAR.log.pages[0].pageTimings.onLoad);
    // _debug(" * Starting time: " + _theHAR.log.pages[0].startedDateTime);
    // _debug(" * Ending time: " + endHARTime.toISOString());
}
// Entry point: read the PhantomJSCloud JSON from stdin and hand the parsed
// object to processHAR.
fetchJSON(processHAR);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment