Last active
August 11, 2024 13:00
-
Star
(138)
You must be signed in to star a gist -
Fork
(41)
You must be signed in to fork a gist
-
-
Save jdx/0f535be1ada0ea964cae to your computer and use it in GitHub Desktop.
zero-downtime node.js app runner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This script will boot app.js with the number of workers | |
// specified in WORKER_COUNT. | |
// | |
// The master will respond to SIGHUP, which will trigger | |
// restarting all the workers and reloading the app. | |
var cluster = require('cluster'); | |
// Turns the raw WORKER_COUNT environment value into a usable pool size.
// Environment variables are always strings, so the value is parsed as a
// base-10 integer; anything missing, non-numeric or not positive falls back
// to the default of 2 workers instead of silently forking none.
function parseWorkerCount(raw) {
  var count = parseInt(raw, 10);
  return count > 0 ? count : 2;
}
// Number of workers the master keeps running
var workerCount = parseWorkerCount(process.env.WORKER_COUNT);
// Defines what each worker needs to run
// In this case, it's app.js a simple node http app
// cluster.setupMaster is deprecated in newer Node.js releases in favour of
// cluster.setupPrimary, so prefer the new name and fall back to the old one
// on runtimes that do not have it yet.
var setupPrimary = cluster.setupPrimary || cluster.setupMaster;
setupPrimary.call(cluster, { exec: 'app.js' });
// Counts the workers currently registered in cluster.workers
function numWorkers() {
  var workerIds = Object.keys(cluster.workers);
  return workerIds.length;
}
// Flag checked before forking; while it is true no new workers are started
var stopping = false;
// Tops the pool back up to workerCount, unless the server is stopping
function forkNewWorkers() {
  if (stopping) return;
  // How many workers are missing compared to the configured pool size
  var missing = workerCount - numWorkers();
  while (missing > 0) {
    cluster.fork();
    missing--;
  }
}
// A list of worker ids queued for a restart
var workersToStop = [];
// Stops a single worker
// Disconnects it so it can finish in-flight work, and gives it 60 seconds
// after the disconnect before worker.kill() is called (SIGTERM by default)
function stopWorker(worker) {
  console.log('stopping', worker.process.pid);
  worker.disconnect();
  var killTimer = setTimeout(function() {
    worker.kill();
  }, 60000);
  // Ensure we don't stay up just for this setTimeout
  killTimer.unref();
  // Once the worker has exited there is nothing left to kill, so cancel the
  // pending timer instead of signalling a dead process a minute later
  worker.once('exit', function() {
    clearTimeout(killTimer);
  });
}
// Takes the last worker id off the restart queue and, if that worker is
// still registered with the cluster, hands it to stopWorker so it can
// finish its work before being shut down
function stopNextWorker() {
  var nextId = workersToStop.pop();
  var candidate = cluster.workers[nextId];
  // Nothing queued, or the worker is already gone
  if (!candidate) return;
  stopWorker(candidate);
}
// Stops all the workers at once and raises the stopping flag so that
// forkNewWorkers() does not start replacements
function stopAllWorkers() {
  console.log('stopping all workers');
  stopping = true;
  Object.keys(cluster.workers).forEach(function(id) {
    stopWorker(cluster.workers[id]);
  });
}
// Worker is now listening on a port
// Once it is ready, we can signal the next worker to restart
cluster.on('listening', stopNextWorker);
// A worker has gone away, either because the process was killed or because
// we are working through the workersToStop array restarting each process.
// In either case, we will fork any workers needed.
// Node only removes a worker from cluster.workers before the LAST of its
// 'disconnect' / 'exit' events, and their order is not guaranteed, so both
// are handled. forkNewWorkers only forks the workers that are missing, so
// running it for both events of the same worker is harmless.
cluster.on('disconnect', forkNewWorkers);
cluster.on('exit', forkNewWorkers);
// Queues every current worker id for a restart and stops the first one;
// the rest follow one at a time via stopNextWorker
function restartAllWorkers() {
  console.log('restarting all workers');
  workersToStop = Object.keys(cluster.workers);
  stopNextWorker();
}
// A HUP signal sent to the master process starts the sequential restart
process.on('SIGHUP', restartAllWorkers);
// A TERM signal sent to the master stops every worker in one go
process.on('SIGTERM', stopAllWorkers);
// Bring up the initial pool of workers
forkNewWorkers();
// Announce the master's pid
var masterPid = process.pid;
console.log('app master', masterPid, 'booted');
Workers are not receiving a message on disconnect. What could be causing this?
Starting and stopping about 3–5 times causes the error below when connecting to a database:
events.js:72
throw er; // Unhandled 'error' event
^
Error: write ENOTSUP - cannot write to IPC channel.
at errnoException (child_process.js:1001:11)
at ChildProcess.target.send (child_process.js:465:16)
at Worker.send (cluster.js:406:21)
at sendInternalMessage (cluster.js:399:10)
at handleResponse (cluster.js:177:5)
at respond (cluster.js:192:5)
at Object.messageHandler.queryServer (cluster.js:247:5)
at handleMessage (cluster.js:197:32)
at ChildProcess.emit (events.js:117:20)
at handleMessage (child_process.js:322:10)
I am using MSSQL database connector for Node.js
I am facing an issue with the script.
Scenario: restarting workers.
With 2 workers (say), it stops the first worker. cluster.on('disconnect', ...) is called immediately (before worker.kill() is called by the timer). This is a bug.
Ideally, that function should be scheduled on cluster.on('exit', ...) instead.
I hope I am correct. Making this change makes the script fly. I would appreciate your input.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great script, thank you!
However, with more than two workers, a restart can take quite some time to reach all of them when they have open connections: the script waits up to 60 seconds before restarting the next worker. Wouldn't it be better if the script started restarting the next worker right away instead of waiting, or could that leave no worker available for a brief period during the restart?