Skip to content

Instantly share code, notes, and snippets.

@binarytemple
Forked from angrycub/README.md
Created October 17, 2017 10:21
Show Gist options
  • Save binarytemple/5c457cd7355d17f8188b8dabbdf969a5 to your computer and use it in GitHub Desktop.
Save binarytemple/5c457cd7355d17f8188b8dabbdf969a5 to your computer and use it in GitHub Desktop.
Erlang Supervisor Ghost Child Monitor

Erlang Supervisor Ghost Monitor

Applies to:

  • Riak EE clusters with MDC:
    • 1.4.x - all versions
    • 2.0.x - all versions prior to Riak 2.0.7
    • 2.1.x - all versions; fixed in 2.2.0

About

This workaround is a mitigation for basho/riak_kv#1178

Calls to riak_kv_get_fsm_sup:start_get_fsm leave abandoned PIDs in the riak_kv_get_fsm_sup state because FSMs are linked to a sidejob supervisor rather than the get FSM supervisor. A similar issue applies to the put FSM supervisor. This can cause extended shutdown times as the supervisor attempts to iterate through millions of dead PIDs.

You can confirm that a cluster is impacted by this issue and needs this mitigation using the following riak-attach snippet:

CountPids=fun()->length(supervisor:which_children(riak_kv_get_fsm_sup)) end, riak_core_util:rpc_every_member_ann(erlang, apply, [CountPids,[]], 10000).

If you observe any node in the output with a child count greater than 1,000, or if you run the snippet several times and see no reduction in the child count, then that cluster is likely impacted by the above issue.

Installation

  • Compile this module to a .beam file and add it to the Riak basho-patches folder.

  • Create an /etc/riak/advanced.config with the following or add the vm_args proplist to an existing advanced.config.

    [
      {vm_args, [
        {"-s supmon_i",""}
      ]}
    ].
%% -------------------------------------------------------------------
%%
%% supmon-i: periodically exorcise ghost children of supervisor processes
%%
%% Copyright (c) 2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(supmon_i).
-behaviour(gen_server).

%% Public API.
-export([start/0, start/2, stop/0]).

%% gen_server callbacks.
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

%% Exported because they are reached through spawn/3 and fully qualified
%% ?MODULE: calls rather than local calls. Listing them explicitly replaces
%% the former blanket -compile(export_all) while exporting the same functions.
-export([exorcise_ghost_children/1, mock_exit/2]).

%% Default interval between sweeps, in milliseconds. Parenthesised so the
%% macro expands safely inside larger arithmetic expressions.
-define(SAMPLE_RATE, (5 * 1000)).

%% gen_server state.
-record(state, {
    sample_rate :: integer(), % milliseconds between sweeps
    supervisor :: atom()      % registered name of the supervisor to sweep
}).
%% @doc Start the monitor with the defaults: sweep `riak_kv_get_fsm_sup'
%% every ?SAMPLE_RATE milliseconds. Returns whatever start/2 returns.
start() ->
    DefaultSupervisor = riak_kv_get_fsm_sup,
    start(?SAMPLE_RATE, DefaultSupervisor).
%% @doc Start the monitor as an unlinked gen_server registered locally under
%% the module name.
%%
%% SampleRate is the interval between sweeps in milliseconds; Supervisor is
%% the supervisor whose children are swept. Returns the result of
%% gen_server:start/4.
start(SampleRate, Supervisor) ->
    ServerName = {local, ?MODULE},
    InitArgs = [SampleRate, Supervisor],
    gen_server:start(ServerName, ?MODULE, InitArgs, []).
%% @doc Ask the locally registered monitor to shut down. The request is an
%% asynchronous cast, so this returns `ok' without waiting for the server
%% to terminate.
stop() ->
    StopRequest = stop,
    gen_server:cast(?MODULE, StopRequest).
%% @doc gen_server init callback.
%%
%% Logs the configuration, schedules the first `check' message to arrive
%% after SampleRate milliseconds, and stores both settings in the state.
init([SampleRate, Supervisor]) ->
    InitialState = #state{sample_rate = SampleRate, supervisor = Supervisor},
    error_logger:info_msg("Starting supmon_i: Checking ~p every ~p ms",
                          [Supervisor, SampleRate]),
    %% The first sweep runs one full interval after start-up, not immediately.
    erlang:send_after(SampleRate, self(), check),
    {ok, InitialState}.
%% @doc gen_server call callback. No synchronous requests are defined, so
%% every call is answered with `ok' and the state is left untouched.
handle_call(_AnyRequest, _Caller, State) ->
    Reply = ok,
    {reply, Reply, State}.
%% @doc gen_server cast callback.
%%
%% `stop' logs a message and terminates the server with reason `normal';
%% any other cast is ignored.
handle_cast(stop, State) ->
    error_logger:info_msg("Stopping supmon_i", []),
    {stop, normal, State};
handle_cast(_Ignored, State) ->
    {noreply, State}.
%% @doc gen_server info callback.
%%
%% On `check', starts a sweep of the configured supervisor and schedules
%% the next `check' message. All other messages are dropped.
handle_info(check, State) ->
    Target = State#state.supervisor,
    Interval = State#state.sample_rate,
    %% The sweep runs in its own unlinked process, so this server neither
    %% waits for it nor is affected if the sweep exits.
    erlang:spawn(?MODULE, exorcise_ghost_children, [Target]),
    %% Re-arm the timer; the interval is measured from this point, not from
    %% completion of the sweep.
    erlang:send_after(Interval, self(), check),
    {noreply, State};
handle_info(_Unexpected, State) ->
    {noreply, State}.
%% @doc gen_server terminate callback. Nothing needs cleaning up, so this
%% always returns `ok'.
terminate(_Why, _FinalState) ->
    ok.
%% @doc gen_server code_change callback. The state needs no conversion
%% between versions, so it is returned unchanged.
code_change(_FromVsn, State, _UpgradeData) ->
    {ok, State}.
%% internal functions
%% @doc Sweep Supervisor's child list once and report every dead child.
%%
%% Logs one info message, fetches the child list with
%% supervisor:which_children/1, and for each child whose pid is no longer
%% alive calls ?MODULE:mock_exit/2 so the supervisor is told about it.
%% Live children are left alone. Returns `ok'.
%%
%% Exits if supervisor:which_children/1 fails, e.g. when Supervisor is not
%% running.
exorcise_ghost_children(Supervisor) ->
    error_logger:info_msg("Checking and cleaning out ~p's children.", [Supervisor]),
    MaybeKill =
        fun
            ({_Id, Pid, _Type, _Modules}) when is_pid(Pid) ->
                case erlang:is_process_alive(Pid) of
                    false ->
                        %% Fully qualified call is kept from the original.
                        ?MODULE:mock_exit(Supervisor, Pid);
                    _ ->
                        ok
                end;
            ({_Id, _NotAPid, _Type, _Modules}) ->
                %% which_children/1 can report `undefined' or `restarting'
                %% in place of a pid. There is no process to probe, and
                %% erlang:is_process_alive/1 would raise badarg and abort
                %% the rest of the sweep, so skip the entry.
                ok
        end,
    lists:foreach(MaybeKill, supervisor:which_children(Supervisor)),
    ok.
%% @doc Send Supervisor a hand-built `{'EXIT', Pid, normal}' message on
%% behalf of the dead process Pid. Returns the message that was sent.
%%
%% Per-PID logging is left disabled (presumably to keep the log quiet when
%% many dead PIDs are cleaned up -- confirm before re-enabling):
%% error_logger:info_msg("Sending EXIT to ~p on behalf of dead PID ~p",[Supervisor, Pid]),
mock_exit(Supervisor, Pid) ->
    ExitSignal = {'EXIT', Pid, normal},
    erlang:send(Supervisor, ExitSignal).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment