Created
December 28, 2010 18:27
-
-
Save darkside/757512 to your computer and use it in GitHub Desktop.
Resque God Recipe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------------------
# Resque
# Manages background jobs and does all the hard work so we don't have to.
#
# Notes:
# This configuration assumes we are only using one background worker for our
# tasks. This makes it easier to let god take care of it, since there's only
# one pid to control.
#
# There's also a hackish, ugly thread that runs with it to kill anyone stuck,
# their sacrifice is for the greater good.
#
# Anyway, the start command does give the correct pid of the worker, feel free
# to refactor this nicely to allow multiple workers to live.
# ------------------------------------------------------------------------------
# Values supplied through the `<%= ... %>` template placeholders; they are
# expected to be filled in before god loads this file.
resque_service = "<%= resque_service %>"
# NOTE(review): resque_pid_file is assigned here but not referenced anywhere
# else in this file; the stop command below reads the watch's own pid_file.
resque_pid_file = "/var/run/god/#{resque_service}.pid"
resque_user = "<%= user %>"
resque_group = "<%= user %>"

# One watch for the single Resque worker. `rails_root` and `god_environment`
# are not defined in this file and must be provided by whatever loads it.
God.watch do |watch|
  watch.name = resque_service
  watch.group = '<%= application %>'
  watch.log = "#{rails_root}/log/resque.log"
  watch.interval = 60.seconds

  # The worker is started through rake; it is stopped by sending QUIT to the
  # pid recorded in the watch's pid file. No pid_file is assigned in this
  # block, so this presumably resolves to god's default location -- verify.
  watch.start = "/usr/bin/rake -f #{rails_root}/Rakefile environment resque:work"
  watch.stop = "kill -QUIT `cat #{watch.pid_file}`"

  # Working directory, environment and the account the worker runs as.
  watch.dir = rails_root
  watch.env = god_environment
  watch.uid = resque_user
  watch.gid = resque_group

  watch.start_grace = 20.seconds
  watch.restart_grace = 20.seconds

  watch.behavior(:clean_pid_file)

  # TODO: refactor everything below to use helper method "generic_monitor"

  # :up -> :restart when memory usage is above 350 megabytes (times = 2).
  watch.transition(:up, :restart) do |trigger|
    trigger.condition(:memory_usage) do |memory|
      memory.above = 350.megabytes
      memory.times = 2
    end
  end

  # On startup, go to :up if the process is already running, otherwise :start.
  watch.transition(:init, { true => :up, false => :start }) do |trigger|
    trigger.condition(:process_running) { |probe| probe.running = true }
  end

  # A start or restart is considered finished once the process is running.
  watch.transition([:start, :restart], :up) do |trigger|
    trigger.condition(:process_running) do |probe|
      probe.running = true
      probe.interval = 5.seconds
    end

    # Failsafe: after 5 tries, transition to :start again.
    trigger.condition(:tries) do |attempts|
      attempts.times = 5
      attempts.transition = :start
      attempts.interval = 5.seconds
    end
  end

  # :up -> :start as soon as the process is found not running.
  watch.transition(:up, :start) do |trigger|
    trigger.condition(:process_running) { |probe| probe.running = false }
  end

  # Flapping guard: 5 moves into :start/:restart within 5 minutes unmonitors
  # the watch; retry settings are 10 minutes, 5 times, within 2 hours.
  watch.lifecycle do |trigger|
    trigger.condition(:flapping) do |flap|
      flap.to_state = [:start, :restart]
      flap.times = 5
      flap.within = 5.minute
      flap.transition = :unmonitored
      flap.retry_in = 10.minutes
      flap.retry_times = 5
      flap.retry_within = 2.hours
    end
  end
end
# ------------------------------------------------------------------------------
# Resque Workers Suicide
# ------------------------------------------------------------------------------
# This will ride alongside god and kill any rogue stale worker
# processes. Their sacrifice is for the greater good.
# ------------------------------------------------------------------------------

# Seconds a worker may spend on one job before it is signalled.
WORKER_TIMEOUT = 60 * 10 # 10 minutes

# Picks the stale workers out of `ps -o pid,command` output.
#
# A line qualifies when its first token is a numeric pid, its second-to-last
# token is "at" and its last token is a numeric epoch timestamp that lies at
# least `timeout` seconds before `now`. This presumably matches Resque's
# "Forked <pid> at <epoch>" process title -- verify against your Resque version.
#
# Lines with a non-numeric pid or timestamp are skipped instead of being
# coerced to 0 by to_i: a 0 timestamp would look decades old, and a 0 pid
# would make Process.kill signal the whole process group.
#
# ps_output - String with one process per line (nil is treated as empty).
# now       - Time used as the reference point (default: Time.now).
# timeout   - Age in seconds at which a worker counts as stale.
#
# Returns an Array of Integer pids, in the order they appear in ps_output.
def stale_resque_worker_pids(ps_output, now = Time.now, timeout = WORKER_TIMEOUT)
  stale = []
  ps_output.to_s.split("\n").each do |line|
    parts = line.split(' ')
    next if parts.size < 3 || parts[-2] != "at"
    next unless parts[0] =~ /\A\d+\z/ && parts[-1] =~ /\A\d+\z/
    started = Time.at(parts[-1].to_i)
    stale << parts[0].to_i if now - started >= timeout
  end
  stale
end

Thread.new do
  loop do
    begin
      # The pattern is quoted so the shell cannot glob-expand [r]esque against
      # a file named "resque" in the working directory; the bracket keeps grep
      # from matching its own command line.
      ps_output = `ps -e -o pid,command | grep '[r]esque'`
      stale_resque_worker_pids(ps_output).each do |pid|
        begin
          # USR1 is documented by Resque as "kill the current child"; confirm
          # for the Resque version in use.
          ::Process.kill('USR1', pid)
        rescue SystemCallError
          # The process vanished between ps and kill (or cannot be signalled);
          # keep going so the remaining stale workers are still handled.
          nil
        end
      end
    rescue StandardError => e
      # don't die because of stupid exceptions, but leave a trace of them
      warn "resque reaper: #{e.class}: #{e.message}"
    end
    sleep 30
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment