Search code examples
erlangerlang-otp

Allow a supervisor child take advantage of restarts, but not kill the supervisor once it passes max restarts?


I have a simple_one_for_one supervisor that manages a fairly volatile set of children -- they often die due to external causes, e.g. their network conn being terminated. Erlang's supervision system is brilliant for this -- it just restarts them back up and everything rolls on.

The problem occurs when one of the children has a serious problem with the connection and hits the supervisor's max restart limit, at which point the supervisor kills all children, and then kills itself. Awesome, this is specified in the documentation. However, my ideal behavior would be for the supervisor to give up restarting that particular child, and continue.

I know I can implement this using separate processes which monitors the supervisor, but this seems like overkill.

Thanks for any ideas!


Solution

  • I didn't try it but I suggest that the supervisor launches another supervisor (one per process) with the restart strategy simple_one_for_one, and the restart child spec transient.

    Then this supervisor launch the process itself with the restart strategy one_for_one and the restart child spec permanent, and the maxrestarts and the maxtime fitting your need.

    There is something strange in your question, you say that the supervisor kills all the children that were started when it reach the maxrestart for one faulty child, I thought that the simple_one_for_one strategy left the workers die by themselves.

    [edit] As I was curious to test this idea, I wrote a small set of module to test it.

    her is the code of the top supervisor:

    -module (factory).
    
    -behaviour(supervisor).
    
    -export([start_link/0]).
    -export([init/1, start_process/1]).
    
    
    %% Child specification for an intermediate supervisor implemented by module I,
    %% started through I:start_link(Arglist, ...). The restart type is temporary, so
    %% this supervisor never restarts the child. Because the child is itself a
    %% supervisor, its shutdown value is infinity: with a finite timeout it could be
    %% killed before it has finished stopping its own children.
    -define(CHILD(I, Arglist), {I, {I, start_link, [Arglist]}, temporary, infinity, supervisor, [I]}).
    
    %% @doc Start the top-level supervisor and register it locally under the
    %% name of this module. The callback module is this module, with [] as the
    %% argument handed to init/1.
    start_link() ->
        SupName = {local, ?MODULE},
        supervisor:start_link(SupName, ?MODULE, []).
    
    %% @doc supervisor callback. Children are created on demand from a single
    %% proc_sup template (simple_one_for_one); the restart intensity is
    %% 0 restarts within 10 seconds.
    init([]) ->
        SupFlags = {simple_one_for_one, 0, 10},
        ChildSpecs = [?CHILD(proc_sup, [])],
        {ok, {SupFlags, ChildSpecs}}.
    
    %% @doc Ask the locally registered supervisor to start one more child.
    %% Arglist is wrapped in a list, so it is appended as a single extra
    %% argument to the start function of the child template.
    %% Returns whatever supervisor:start_child/2 returns.
    start_process(Arglist) ->
        ExtraArgs = [Arglist],
        supervisor:start_child(?MODULE, ExtraArgs).
    

    Then the code of the intermediate one, in charge to restart a few time a process in case of problem:

    -module (proc_sup).
    
    -behaviour(supervisor).
    
    -export([start_link/2]).
    -export([init/1]).
    
    %% Child specification for the worker run under this supervisor: started by
    %% applying Mod:Start to the argument list Arglist, restart type permanent,
    %% shutdown timeout 5000 ms, type worker. Mod also serves as the child id.
    -define(CHILD(Mod, Start, Arglist), {Mod, {Mod, Start, Arglist}, permanent, 5000, worker, [Mod]}).
    
    %% @doc Start one (unregistered) intermediate supervisor.
    %% The first argument is ignored. Arglist is printed for tracing and then
    %% passed on, wrapped in a list, as the argument of init/1.
    start_link(_Ignored, Arglist) ->
        io:format("proc_sup arg = ~p~n", [Arglist]),
        InitArg = [Arglist],
        supervisor:start_link(?MODULE, InitArg).
    
    %% @doc supervisor callback. Expects a one-element list holding the
    %% three-element list [Mod, Start, Arglist] and supervises a single child
    %% built from it, with a one_for_one strategy allowing at most 5 restarts
    %% within 10 seconds.
    init([[Mod, Start, Arglist]]) ->
        SupFlags = {one_for_one, 5, 10},
        Worker = ?CHILD(Mod, Start, Arglist),
        {ok, {SupFlags, [Worker]}}.
    

    And then the code of a small modules that can be stopped, receive a message, be programmed to die after a certain time, in order to test the mechanism.

    -module(dumb).
    -export([start_link/1,loop/2]).
    
    %% @doc Print the start parameter, then spawn a process linked to the
    %% caller that runs loop(Arg, init). Returns {ok, Pid}.
    start_link(Arg) ->
        io:format("dumb start param = ~p~n", [Arg]),
        Pid = spawn_link(?MODULE, loop, [Arg, init]),
        {ok, Pid}.
    
    
    %% @doc Body of the test process.
    %%
    %% loop({die, Delay}, _) consumes no messages: it waits Delay milliseconds
    %% and returns ok.
    %% loop(Tag, init) prints the pid and the argument once, then enters the
    %% counting loop at cycle 0.
    %% loop(Tag, Cycle) prints the current cycle and waits for one message:
    %% the atom stop makes the function return 'restart_:o)'; any other
    %% message moves on to the next cycle.
    loop({die, Delay}, _State) ->
        %% No message pattern here, so only the timeout can end the receive.
        receive
        after Delay ->
            ok
        end;
    loop(Tag, init) ->
        io:format("loop pid ~p with arg ~p~n", [self(), Tag]),
        loop(Tag, 0);
    loop(Tag, Cycle) ->
        io:format("loop ~p (~p) cycle ~p~n", [Tag, self(), Cycle]),
        receive
            Msg ->
                case Msg of
                    stop -> 'restart_:o)';
                    _Other -> loop(Tag, Cycle + 1)
                end
        end.
    

    Finally a copy of the shell execution:

    1> factory:start_link().
    {ok,<0.37.0>}
    2> 
    2> factory:start_process([dumb,start_link,[loop_1]]).
    proc_sup arg = [dumb,start_link,[loop_1]]
    dumb start param = loop_1
    loop pid <0.40.0> with arg loop_1
    loop loop_1 (<0.40.0>) cycle 0
    {ok,<0.39.0>}
    3> 
    3> factory:start_process([dumb,start_link,[loop_1]]).
    proc_sup arg = [dumb,start_link,[loop_1]]
    dumb start param = loop_1
    loop pid <0.43.0> with arg loop_1
    loop loop_1 (<0.43.0>) cycle 0
    {ok,<0.42.0>}
    4> 
    4> factory:start_process([dumb,start_link,[loop_2]]).
    proc_sup arg = [dumb,start_link,[loop_2]]
    dumb start param = loop_2
    loop pid <0.46.0> with arg loop_2
    loop loop_2 (<0.46.0>) cycle 0
    {ok,<0.45.0>}
    5> 
    5> pid(0, 2310, 0) ! hello.                          
    hello
    6> 
    6> pid(0, 40, 0) ! hello.  
    loop loop_1 (<0.40.0>) cycle 1
    hello
    7> pid(0, 40, 0) ! hello.
    loop loop_1 (<0.40.0>) cycle 2
    hello
    8> pid(0, 40, 0) ! hello.
    loop loop_1 (<0.40.0>) cycle 3
    hello
    9> pid(0, 43, 0) ! hello.
    loop loop_1 (<0.43.0>) cycle 1
    hello
    10> pid(0, 43, 0) ! hello.
    loop loop_1 (<0.43.0>) cycle 2
    hello
    11> pid(0, 40, 0) ! stop. 
    dumb start param = loop_1
    stop
    loop pid <0.54.0> with arg loop_1
    loop loop_1 (<0.54.0>) cycle 0
    12> pid(0, 40, 0) ! stop.
    stop
    13> pid(0, 54, 0) ! stop.
    dumb start param = loop_1
    stop
    loop pid <0.57.0> with arg loop_1
    loop loop_1 (<0.57.0>) cycle 0
    14> pid(0, 57, 0) ! hello.
    loop loop_1 (<0.57.0>) cycle 1
    hello
    15> factory:start_process([dumb,start_link,[{die,5}]]).
    proc_sup arg = [dumb,start_link,[{die,5}]]
    dumb start param = {die,5}
    {ok,<0.60.0>}
    16> 
    dumb start param = {die,5}
    dumb start param = {die,5}
    dumb start param = {die,5}
    dumb start param = {die,5}
    dumb start param = {die,5}
    16> factory:start_process([dumb,start_link,[{die,50000}]]).
    proc_sup arg = [dumb,start_link,[{die,50000}]]
    dumb start param = {die,50000}
    {ok,<0.68.0>}
    17> 
    dumb start param = {die,50000}
    17>