We run multiple server processes in two data centers. Each process listens on two ports, one for HTTP and one for HTTPS. HTTPS is terminated by Apache before reaching node.js, while HTTP goes directly from the client to node.js (through a master load balancer). We do not use node's cluster module. Instead, we slice our physical servers into thin virtual machines running SmartOS, each with about 3GB of memory and sized for a single node.js process.
Our node.js servers are hapi.js servers using the composer functionality and plugin architecture. We have three sets of plugins loaded: the mobile web front-end experience (a single page app), the legacy API reverse proxy, and monitoring.
We also serve original node.js services from another server zone, which runs closed-source plugins built on hapi.
Monitoring uses the good plugin. Each process reports its status to another set of node.js servers for analytics collection at a 30-second sample interval. We also collect the full execution log of each incoming request. Those are received by a super simple node.js process called elmer. Each elmer process is a "dumb" HTTP server that takes each incoming request, bundles the raw unparsed payload buffer and headers, and sticks them into a RabbitMQ server cluster queue.
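Elmer itself is trivial by design. The sketch below captures the idea only; the queue name, broker URL, port, and the amqplib client are illustrative assumptions, not the actual implementation:

// Minimal sketch of the "elmer" idea: a dumb HTTP endpoint that buffers the
// raw request and drops it onto a RabbitMQ queue. Queue name, connection URL,
// port, and the amqplib client are assumptions for illustration.

var http = require('http');
var amqp = require('amqplib/callback_api');

amqp.connect('amqp://rabbit.example.internal', function (err, conn) {

    if (err) {
        throw err;
    }

    conn.createChannel(function (err, channel) {

        if (err) {
            throw err;
        }

        channel.assertQueue('analytics.mobile-web', { durable: true });

        http.createServer(function (req, res) {

            var chunks = [];
            req.on('data', function (chunk) { chunks.push(chunk); });
            req.on('end', function () {

                // Bundle the unparsed payload buffer and the original headers and enqueue them

                channel.sendToQueue('analytics.mobile-web', Buffer.concat(chunks), {
                    persistent: true,
                    headers: req.headers
                });

                res.writeHead(204);
                res.end();
            });
        }).listen(8100);
    });
});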
On the other side, we have a bunch of node.js server processes called prospector which subscribe to the various queues (one for each analytics source: native apps, mobile web, servers, etc.). prospector does light validation and processing on the raw data and forwards it to one or more destinations based on the data: MongoDB, Splunk, statsd, and Omniture.
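Conceptually, each prospector worker is just a queue consumer. A minimal sketch under the same assumptions as above (the queue name, broker URL, and the stubbed-out forwarder are not the real implementation):

// Sketch of a "prospector" worker: subscribe to one analytics queue, do light
// validation, then fan out. The forward() stub stands in for the real
// MongoDB, Splunk, statsd, and Omniture clients.

var amqp = require('amqplib/callback_api');

var forward = function (record) {

    // In production this routes to MongoDB, Splunk, statsd, or Omniture
    // depending on the record; the actual clients are omitted here.

    console.log('would forward %s event', record.event);
};

amqp.connect('amqp://rabbit.example.internal', function (err, conn) {

    if (err) {
        throw err;
    }

    conn.createChannel(function (err, channel) {

        if (err) {
            throw err;
        }

        channel.assertQueue('analytics.mobile-web', { durable: true });
        channel.consume('analytics.mobile-web', function (message) {

            var record;
            try {
                record = JSON.parse(message.content.toString());    // light validation
            }
            catch (e) {
                return channel.nack(message, false, false);          // discard malformed payloads
            }

            forward(record);
            return channel.ack(message);
        }, { noAck: false });
    });
});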
On Splunk, we use a wide set of searches and dashboards to chart all the data (both operations and business) and monitor the system. The engineering team is glued to live operations metrics which include RSS, V8 heap (max and used), CPU, requests per second, concurrent requests, socket disconnects, garbage collection cycles, HTTP status code counters, response times, error rates, etc.
We log everything from good into files and run the good broadcast process at fixed intervals to send the data to the elmer processes. This gives us a disk-based log to fall back on if something fails, and it keeps socket-based analytics traffic off the active production processes (so analytics doesn't add to the load when something goes wrong).
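The broadcast pattern itself amounts to tailing the log files and shipping new entries over HTTP. The rough illustration below is not the actual good broadcast tool; the elmer host, port, and interval are assumptions, and the log path is taken from the configuration shown further down:

// Rough illustration of the broadcast pattern: check the ops log at a fixed
// interval and POST only the new bytes to an elmer endpoint.

var fs = require('fs');
var http = require('http');

var logPath = '/var/log/mobile/ops_services-us.log';
var position = 0;

setInterval(function () {

    fs.stat(logPath, function (err, stat) {

        if (err || stat.size <= position) {
            return;                                          // nothing new (or file missing)
        }

        var stream = fs.createReadStream(logPath, { start: position });
        position = stat.size;

        var req = http.request({ host: 'elmer.example.internal', port: 8100, method: 'POST', path: '/' });
        req.on('error', function () { });                    // drop on failure; the disk log remains
        stream.pipe(req);                                    // ship only the new bytes
    });
}, 30000);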
The majority of business logic is still served from Java. All incoming traffic is proxied through node.js. We use a hapi plugin called services-us-legacy, which looks like this:
exports.register = function (plugin, options, next) {

    // Set server-specific configuration

    plugin.select('secure').api({ base: 'https://' + options.https.host + ':' + options.https.port });
    plugin.select('http').api({ base: 'http://' + options.http.host + ':' + options.http.port });

    // Define upstream route mapping

    var mapUri = function (request, callback) {

        var uri = request.server.plugins['services-us-legacy'].base + request.raw.req.url;
        return callback(null, uri, { Host: options.header });
    };

    // Configure catch-all proxy

    plugin.select('api').route({
        method: '*',
        path: '/{p*}',
        handler: {
            proxy: {
                mapUri: mapUri,
                passThrough: true,
                rejectUnauthorized: false,
                timeout: options.timeout
            }
        }
    });

    // Override route proxy settings

    if (options.routes) {
        var paths = Object.keys(options.routes);
        for (var i = 0, il = paths.length; i < il; ++i) {
            var path = paths[i];
            var route = options.routes[path];

            var endpoint = {
                method: 'GET',
                path: path,
                config: {
                    handler: {
                        proxy: {
                            mapUri: mapUri,
                            rejectUnauthorized: false,
                            timeout: route.timeout || options.timeout,
                            ttl: (!route.expiresAt && !route.expiresIn ? 'upstream' : null)
                        }
                    },
                    cache: {
                        mode: route.mode || 'client+server',
                        cache: options.cache || route.cache,
                        expiresAt: route.expiresAt,
                        expiresIn: route.expiresIn,
                        staleIn: route.staleIn,
                        staleTimeout: route.staleTimeout
                    }
                }
            };

            plugin.select('api').route(endpoint);
        }
    }

    return next();
};
This plugin is loaded via the hapi composer functionality, using the following (slightly modified) configuration file:
{
    "pack": {
        "cache": [{
            "name": "shared",
            "engine": "redis",
            "partition": "http",
            "shared": true
        }]
    },
    "servers": [
        {
            "port": 8080,
            "options": {
                "labels": ["api", "http"],
                "timeout": {
                    "server": 60000
                },
                "maxSockets": 300,
                "cors": {
                    "origin": [
                        "https://mobile.walmart.com"
                    ],
                    "credentials": true,
                    "matchOrigin": false
                },
                "state": {
                    "cookies": {
                        "failAction": "log",
                        "strictHeader": false
                    }
                },
                "load": {
                    "maxHeapUsedBytes": 1073741824,
                    "maxRssBytes": 1610612736,
                    "sampleInterval": 1000,
                    "maxEventLoopDelay": 5000
                }
            }
        },
        {
            "port": 8443,
            "options": {
                "labels": ["api", "secure"],
                "timeout": {
                    "server": 60000
                },
                "maxSockets": 300,
                "cors": {
                    "origin": [
                        "http://mobile.walmart.com"
                    ],
                    "credentials": true,
                    "matchOrigin": false
                },
                "state": {
                    "cookies": {
                        "failAction": "log",
                        "strictHeader": false
                    }
                },
                "load": {
                    "maxHeapUsedBytes": 1073741824,
                    "maxRssBytes": 1610612736,
                    "sampleInterval": 1000,
                    "maxEventLoopDelay": 5000
                }
            }
        },
        {
            "port": 8999,
            "options": {
                "labels": ["admin"],
                "timeout": {
                    "server": 60000
                }
            }
        }
    ],
    "plugins": {
        "furball": null,
        "good": {
            "subscribers": {
                "/var/log/mobile/request_services-us.log": ["request"],
                "/var/log/mobile/ops_services-us.log": ["ops"],
                "/var/log/mobile/log_services-us.log": ["log"],
                "/var/log/mobile/internal_error_services-us.log": ["error"]
            },
            "gcDetection": true,
            "extendedRequests": true,
            "opsInterval": 30000
        },
        "reptile": {
            "port": 8039
        },
        "services-us-legacy": {
            "header": "mobile.walmart.com",
            "timeout": 60000,
            "cache": "shared",
            "http": {
                "host": "10.x.x.x",
                "port": 80
            },
            "https": {
                "host": "10.x.x.x",
                "port": 443
            },
            "routes": {
                "/legacy/suggestions/{p}": { "expiresAt": "10:00", "timeout": 1000 },
                "/legacy/reviews/{p}": { "expiresAt": "10:00" },
                "/legacy/item/get": {},
                "/legacy/itemInventory/get": {}
            }
        }
    }
}
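For reference, a manifest like this was composed and started with the Composer API from the hapi version in use at the time (roughly hapi 1.x); the manifest file name and log line are assumptions:

// Sketch of composing and starting the pack from the manifest above.
// The file name is an assumption for illustration.

var Hapi = require('hapi');
var manifest = require('./services-us.json');

var composer = new Hapi.Composer(manifest);

composer.compose(function (err) {

    if (err) {
        throw err;
    }

    composer.start(function () {

        console.log('All servers started');
    });
});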