关注LAMP|PHP源代码分析|web架构|PHP扩展|Erlang|服务端架构

Erlang OTP之terminate 深入分析

2010-08-03

原创文章,转载请注明: 转载自庆亮的博客-webgame架构

本文链接地址: Erlang OTP之terminate 深入分析

作者:庆亮 (qing.liang.cn@gmail.com)

日期:2010-08-03

环境:centos  5.5  64     erlang 14A

 

一、terminate简述及问题产生

 

terminate是gen_server的一个回调函数,如果一个gen_server进程设置了trap_exit为true(process_flag(trap_exit, true)),则在该进程结束时会自动调用terminate。利用这个功能,我们可以在进程退出时进行一些善后工作,例如持久化数据、清理等等。但实际上terminate不一定有时间完成所有的任务,在此之前可能已经被系统强制结束了(如果使用init:stop形式结束beam)。

 

二、测试terminate

一个erlang 内部 process结束有两种形式:主动结束(如玩家下线后,玩家进程会自动结束)和被动结束(init:stop)

系统停止时(init:stop/c:q/erlang:halt),会依次停止所有的进程,如果一个进程是监控树,则该监控树会先依次停止所有的子进程,然后结束自己。对于子进程也是同样的处理方法。

 

先做测试,后分析源码。测试分为四种情况:

 

进程主动退出 + simple_one_for_one

init:stop + simple_one_for_one

进程主动退出 + one_for_one

init:stop + one_for_one

 

源码文件:

test.erl (application)

test_sup.erl (supervisor)

test_server.erl (gen_server)

 

test.erl源码:

 

%% Application callback module for the `test' OTP application.
-module(test).

-behaviour(application).

%% start/0 is a convenience entry point; start/2 and stop/1 are the
%% application behaviour callbacks.
-export([start/0, start/2, stop/1]).

%% Start the `test' application through the application controller.
start() ->
    application:start(test).

%% Application callback: start the top-level supervisor and return its
%% result unchanged ({ok, Pid} on success, anything else as the error).
start(_Type, _Args) ->
    test_sup:start_link().

%% Application callback: nothing to clean up.
stop(_State) ->
    ok.

 

非常简单,直接通过 erl -name test@192.168.1.83 -setcookie 123456 -boot start_sasl -s test start 即可启动该app

 

test_sup.erl源码:

 

%% Supervisor for test_server workers.
-module(test_sup).

-behaviour(supervisor).

%% API
-export([start_link/0]).

%% Supervisor callbacks
-export([init/1]).

-define(SERVER, ?MODULE).

%% Start the supervisor and register it locally under the module name.
start_link() ->
    supervisor:start_link({local, ?SERVER}, ?MODULE, []).

%% Supervisor callback: returns the supervisor flags and the single
%% child specification used as the template for test_server workers.
init([]) ->
    %% Switch this to one_for_one to compare the shutdown behaviour.
    RestartStrategy = simple_one_for_one,
    %% At most 1000 restarts within 3600 seconds.
    SupFlags = {RestartStrategy, 1000, 3600},
    %% transient restart, 200000 as the shutdown value, worker type.
    ChildSpec = {test_server,
                 {test_server, start_link, []},
                 transient, 200000, worker,
                 [test_server]},
    {ok, {SupFlags, [ChildSpec]}}.

 

源码骨架都是emacs生成的,我们只关注RestartStrategy = simple_one_for_one,这里,等会需要改成one_for_one以便测试对比。

 

test_server.erl

 

%% Minimal gen_server used to observe when terminate/2 runs to completion.
-module(test_server).

-behaviour(gen_server).

%% API
-export([start/0, start_link/0]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

-define(SERVER, ?MODULE).

-record(state, {}).

%% Ask test_sup to start one more worker; crashes unless the supervisor
%% answers {ok, _}.
start() ->
    {ok, _Pid} = supervisor:start_child(test_sup, []).

%% Start an unregistered, linked gen_server process.
start_link() ->
    gen_server:start_link(?MODULE, [], []).

%% Trap exits so exit signals arrive as messages and terminate/2 gets
%% a chance to run.
init([]) ->
    process_flag(trap_exit, true),
    {ok, #state{}}.

%% Every call is answered with ok.
handle_call(_Request, _From, State) ->
    {reply, ok, State}.

%% Casts are ignored.
handle_cast(_Msg, State) ->
    {noreply, State}.

%% An 'EXIT' message is logged and turned into a normal stop, which makes
%% gen_server invoke terminate/2; any other message is ignored.
handle_info({'EXIT', _, Reason}, State) ->
    io:format("exit:~p~n", [Reason]),
    {stop, normal, State};
handle_info(_Other, State) ->
    {noreply, State}.

%% Simulated cleanup: print, sleep for 10 seconds, print again. Seeing
%% "end" in the output means the cleanup ran to completion.
terminate(Reason, _State) ->
    io:format("i'm terminate:~p~n", [Reason]),
    timer:sleep(10000),
    io:format("~s", ["end"]),
    ok.

%% No state migration on code upgrade.
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

 

简单说明下test_server

2. handle_info({'EXIT', _, Reason}, State) -> 方便simple_one_for_one下进程退出操作

3. terminate中的清理工作:io timer:sleep io

 

启动命令行:erl -name test@192.168.1.83 -setcookie 123456 -boot start_sasl -s test start

 

 

1. 进程主动退出 + simple_one_for_one

(test@192.168.1.83)1> erlang:exit(list_to_pid("<0.51.0>"), test).

exit:test

true

i'm terminate:normal

(test@192.168.1.83)2> end

 

正常完成了terminate

 

2. init:stop + simple_one_for_one

 

 (test@192.168.1.83)1> test_server:start().

{ok,<0.53.0>}

(test@192.168.1.83)2> init:stop().

ok

(test@192.168.1.83)3> i'm terminate:shutdown

[root@ming2_local_dev test]#

可以看到似乎没能正常的处理完terminate

 

3. 进程主动退出 + one_for_one

(test@192.168.1.83)1> erlang:exit(list_to_pid("<0.51.0>"), test).

exit:test

true

i'm terminate:normal

(test@192.168.1.83)2> end

 

正常完成了terminate

 

4. init:stop + one_for_one

(test@192.168.1.83)2> init:stop().

ok

(test@192.168.1.83)3> i'm terminate:shutdown

end[root@ming2_local_dev test]#

 

ok,很完整的执行了我们的terminate

 

terminate执行测试结果

 

 

| | simple_one_for_one | one_for_one |
| --- | --- | --- |
| 进程主动退出 | 完整执行 | 完整执行 |
| init:stop | 不能完整执行 | 完整执行 |

 

三、底层分析

看起来很奇怪的结果,还是从源码来分析问题。从supervisor开始:

 

 

%% Supervisor terminate callback: shuts down the children listed in
%% State#state.children via terminate_children/2, then returns ok.
%% The termination reason is ignored.
terminate(_Reason, State) ->
    terminate_children(
State#state.children, State#state.name),
    ok.

 

terminate_children/2 是一个尾递归函数,依次结束每个子进程:

%% terminate_children/2: entry point; starts the walk over Children with
%% an empty accumulator.
terminate_children(Children, SupName) ->
    terminate_children(
Children, SupName, []).

%% terminate_children/3: terminates the children one at a time, in list
%% order, through do_terminate/2. Each updated child record is pushed
%% onto Res, so the returned list is in reverse order of the input.
terminate_children([Child | Children], SupName, Res) ->
   
NChild = do_terminate(Child, SupName),
    terminate_children(
Children, SupName, [NChild | Res]);
terminate_children([], _SupName, Res) ->
   
Res.

  

再看do_terminate/2:

 

%% Stops one child and returns its record with pid reset to undefined.
%% A child whose pid is already undefined is returned untouched.
%% The child's own shutdown setting (Child#child.shutdown) is handed to
%% shutdown/2; an {error, Reason} result is reported through
%% report_error/4, and the pid is cleared in that case as well.
do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
   
case shutdown(Child#child.pid,
         
Child#child.shutdown) of
    ok
->
       
Child#child{pid = undefined};
    {error,
OtherReason} ->
        report_error(shutdown_error,
OtherReason, Child, SupName),
       
Child#child{pid = undefined}
   
end;
do_terminate(Child, _SupName) ->
   
Child.

 

继续:

%% shutdown(Pid, brutal_kill | Time) -> ok | {error, Reason}
%%
%% Stops the child Pid after monitor_child/1 has set up a monitor and
%% removed the link. If monitor_child/1 reports that the child had
%% already exited, that {error, Reason} is returned as is.
%%
%% brutal_kill: the child is killed unconditionally with exit(Pid, kill).
%% The result is ok when the 'DOWN' reason is killed, otherwise
%% {error, OtherReason}.
shutdown(Pid, brutal_kill) ->
    case monitor_child(Pid) of
        ok ->
            exit(Pid, kill),
            receive
                {'DOWN', _MRef, process, Pid, killed} ->
                    ok;
                {'DOWN', _MRef, process, Pid, OtherReason} ->
                    {error, OtherReason}
            end;
        {error, Reason} ->
            {error, Reason}
    end;

%% Time: the child is first asked to stop with exit(Pid, shutdown) and
%% is given Time to go down. The result is ok only when the 'DOWN'
%% reason is shutdown. If nothing arrives within Time the child is
%% killed, and that branch always returns {error, OtherReason}.
shutdown(Pid, Time) ->
    case monitor_child(Pid) of
        ok ->
            exit(Pid, shutdown), %% Try to shutdown gracefully
            receive
                {'DOWN', _MRef, process, Pid, shutdown} ->
                    ok;
                {'DOWN', _MRef, process, Pid, OtherReason} ->
                    {error, OtherReason}
            after Time ->
                exit(Pid, kill),  %% Force termination.
                receive
                    {'DOWN', _MRef, process, Pid, OtherReason} ->
                        {error, OtherReason}
                end
            end;
        {error, Reason} ->
            {error, Reason}
    end.

 

 

Ok,结束子进程时分情况处理了。先看看monitor_child/1,代码注释得比较详细,简单地说是用于处理child自己先退出的情况。

 

%% Replaces the link to Pid with a monitor before the child is shut down.
%% Returns ok when no 'EXIT' message from Pid is waiting in the mailbox,
%% or {error, Reason} when the child had already exited; in that case
%% the matching 'DOWN' message is consumed as well.
monitor_child(Pid) ->
   
   
%% Do the monitor operation first so that if the child dies
   
%% before the monitoring is done causing a 'DOWN'-message with
   
%% reason noproc, we will get the real reason in the 'EXIT'-message
   
%% unless a naughty child has already done unlink...
   
erlang:monitor(process, Pid),
   
unlink(Pid),

    receive
   
%% If the child dies before the unlink we must empty
   
%% the mail-box of the 'EXIT'-message and the 'DOWN'-message.
    {'EXIT',
Pid, Reason} ->
       
receive
        {'DOWN', _, process,
Pid, _} ->
            {error,
Reason}
       
end
   
after 0 ->
       
%% If a naughty child did unlink and the child dies before
       
%% monitor the result will be that shutdown/2 receives a
       
%% 'DOWN'-message with reason noproc.
       
%% If the child should die after the unlink there
       
%% will be a 'DOWN'-message with a correct reason
       
%% that will be handled in shutdown/2.
        ok  
   
end.

 

回头看shutdown/2,主要区别在于exit(Pid, Reason)这一行:如果子进程的shutdown策略为brutal_kill,则子进程被直接kill,而kill消息是不能被捕捉的,也就不存在terminate被调用的可能了(terminate能被调用是因为捕捉了{'EXIT', _, _}消息,详细情况请自行查看gen_server实现)。如果你想在退出时清理数据,这里一定不能设置为brutal_kill,而是设置为一个较大的时间数值(毫秒),用于等待子进程做善后工作:

%% Excerpt from shutdown(Pid, Time): ask the child to stop, wait up to
%% Time for its 'DOWN' message, and kill it if nothing arrives in time.
exit(Pid, shutdown), %% Try to shutdown gracefully
receive
    {'DOWN', _MRef, process, Pid, shutdown} ->
        ok;
    {'DOWN', _MRef, process, Pid, OtherReason} ->
        {error, OtherReason}
after Time ->
    exit(Pid, kill),  %% Force termination.
    receive
        {'DOWN', _MRef, process, Pid, OtherReason} ->
            {error, OtherReason}
    end
end
如果在指定时间内,子进程尚未结束,则强制kill

 

从这一块的源码中我们没有看到restart strategy(one_for_one …)对terminate的影响,这跟上面的测试结果不太吻合。过一遍supervisor的代码,发现针对simple_one_for_one和one_for_one的子进程的启动过程是不同的:

 

handle_call({start_child, EArgs}, _From, State) when ?is_simple(State) ->
   
#child{mfa = {M, F, A}} = hd(State#state.children),
   
Args = A ++ EArgs,
   
case do_start_child_i(M, F, Args) of
    {ok,
Pid} ->
       
NState = State#state{dynamics =
                
?DICT:store(Pid, Args, State#state.dynamics)},
        {reply, {ok,
Pid}, NState};
    {ok,
Pid, Extra} ->
       
NState = State#state{dynamics =
                
?DICT:store(Pid, Args, State#state.dynamics)},
        {reply, {ok,
Pid, Extra}, NState};
   
What ->
        {reply,
What, State}
   
end;

%%% The requests terminate_child, delete_child and restart_child are
%%% invalid for simple_one_for_one supervisors.
handle_call({_Req, _Data}, _From, State) when ?is_simple(State) ->
    {reply, {error, simple_one_for_one},
State};

handle_call({start_child, ChildSpec}, _From, State) ->
   
case check_childspec(ChildSpec) of
    {ok,
Child} ->
        {
Resp, NState} = handle_start_child(Child, State),
        {reply,
Resp, NState};
   
What ->
        {reply, {error,
What}, State}
   
end;

 

simple_one_for_one形式启动的子进程根本没有放在supervisor的state.children里面(而是保存在state.dynamics中),也就是说supervisor在terminate的时候根本没管simple_one_for_one形式启动的子进程。如此当supervisor结束时,所有的simple_one_for_one子进程都会收到一条{'EXIT', Pid, Reason}的消息,如果子进程有处理这样的消息并返回了stop,则会调用terminate。但在执行terminate期间,app可能已经结束,从而正在停止中的系统会直接kill掉该进程(实际上是所有剩余的进程),使得其没有时间执行完所有的功能代码(参考之前的分析《init:stop浅析》)。

 

四、结论、问题与解决办法

 

1.  结论

 

 

| | simple_one_for_one | one_for_one |
| --- | --- | --- |
| 进程主动退出 | 完整执行 | 完整执行 |
| init:stop | 不能完整执行 | 完整执行 |

 

 

2.  问题

使用simple_one_for_one时,在系统关闭时,可能无法正常地完成某些善后工作,如数据持久化等等。

 

3.  解决办法

使用one_for_one,但是one_for_one的启动过程需要做一些简单的调整:

 

sup树的init返回:

 

  init([]) ->
   
RestartStrategy = one_for_one,
   
MaxRestarts = 1000,
   
MaxSecondsBetweenRestarts = 3600,
   
SupFlags = {RestartStrategy, MaxRestarts, MaxSecondsBetweenRestarts},
    {ok, {
SupFlags, []}}.

 

start_child的时候childspec需要拼凑spec id

%% Start one mod_stall_server worker under mod_stall_sup with a full
%% child spec. The child id is the string "mod_stall_server_" followed
%% by MAPID (presumably a map id -- confirm against the caller).
%% Restart type is transient, the shutdown value is 30000, type worker.
supervisor:start_child(mod_stall_sup,
                       {
lists:concat(["mod_stall_server_", MAPID]),
                        {mod_stall_server, start_link, [
MAPID]},
                        transient,
30000, worker, [mod_stall_server]})  
 

作者:庆亮 | 分类目录:Erlang | 标签:

发表评论

电子邮件地址不会被公开。 必填项已用 * 标注

*

您可以使用这些 HTML 标签和属性: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>