某Team在用Python开发一些代码,涉及子进程以及设法消除僵尸进程的需求。实践中他们碰上Python程序非预期退出的现象。最初他们决定用GDB调试Python解释器,查看exit()的源头。我听了之后,觉得这个问题应该用别的调试思路。帮他们排查这次程序故障时,除去原始问题,还衍生了其他问题。
这次的问题相比西安研发中心曾经碰上的Python信号处理问题,有不少基础知识、先验知识是共用的,此处不做再普及,感兴趣的同学可以翻看我以前发过的文章。
下文是一次具体的调试、分析记录。为了简化现场、方便调试,已将原始问题、衍生问题浓缩成DebugPythonWithGDB_6.py、DebugPythonWithGDB_7.py。
$ vi DebugPythonWithGDB_6.py
PHP
#!/usr/bin/env python# -*- encoding: utf-8 -*-import sys, os, signal, subprocess, shlex, tracebackdef on_SIGCHLD ( signum, frame ) :print "[on_SIGCHLD"sys.stdout.write( "signum = %u\n" % signum )traceback.print_stack( frame )print os.waitpid( -1, os.WNOHANG )"""try :print os.waitpid( -1, os.WNOHANG )except OSError :sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )"""print "on_SIGCHLD]"def do_more ( count ) :print '[do_more() begin %u]' % countos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )"""## 这里存在竞争条件,可以增加触发OSError异常的概率#os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )"""print '[do_more() end %u]' % countdef main ( prog, args ) :if 0 == len( args ) :print 'Usage: %s ' % progelse :sys.stdout.write( "Parent = %u\n" % os.getpid() )## 本例中,即使有下列代码,Ctrl-C仍然无效。#signal.signal( signal.SIGINT, signal.SIG_DFL )## signal.signal( signal.SIGCHLD, signal.SIG_IGN )#signal.signal( signal.SIGCHLD, on_SIGCHLD )#count = 0while True :## 本例中父进程只是一个调度框架,不需要与子进程进行通信,因此不# 需要特别处理"stdin=None, stdout=None, stderr=None"。#child = subprocess.Popen \(## 不要直接用args[0].split(),它在处理单、双引号时不是我们# 期望的行为。考虑这种例子,ls -l "/tmp/non exist"#shlex.split( args[0] ),## all file descriptors except 0, 1 and 2 will be closed# before the child process is executed#close_fds = True,cwd = "/tmp")sys.stdout.write( "Child = %u\n" % child.pid )## child.send_signal( signal.SIGTERM )# child.terminate()#child.kill()## child.wait()#do_more( count )count += 1if '__main__' == __name__ :try :main( os.path.basename( sys.argv[0] ), sys.argv[1:] )except KeyboardInterrupt :pass
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | #!/usr/bin/env python # -*- encoding: utf-8 -*- import sys, os, signal, subprocess, shlex, traceback def on_SIGCHLD ( signum, frame ) : print "[on_SIGCHLD" sys.stdout.write( "signum = %u\n" % signum ) traceback.print_stack( frame ) print os.waitpid( -1, os.WNOHANG ) """ try : print os.waitpid( -1, os.WNOHANG ) except OSError : sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno ) """ print "on_SIGCHLD]" def do_more ( count ) : print '[do_more() begin %u]' % count os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) """ # # 这里存在竞争条件,可以增加触发OSError异常的概率 # os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) """ print '[do_more() end %u]' % count def main ( prog, args ) : if 0 == len( args ) : print 'Usage: %s ' % prog else : sys.stdout.write( "Parent = %u\n" % os.getpid() ) # # 本例中,即使有下列代码,Ctrl-C仍然无效。 # signal.signal( signal.SIGINT, signal.SIG_DFL ) # # signal.signal( signal.SIGCHLD, signal.SIG_IGN ) # signal.signal( signal.SIGCHLD, on_SIGCHLD ) # count = 0 while True : # # 本例中父进程只是一个调度框架,不需要与子进程进行通信,因此不 # 需要特别处理"stdin=None, stdout=None, stderr=None"。 # child = subprocess.Popen \ ( # # 不要直接用args[0].split(),它在处理单、双引号时不是我们 # 期望的行为。考虑这种例子,ls -l "/tmp/non exist" # shlex.split( args[0] ), # # all file descriptors except 0, 1 and 2 will be closed # before the child process is executed # close_fds = True, cwd = "/tmp" ) sys.stdout.write( "Child = %u\n" % child.pid ) # # child.send_signal( signal.SIGTERM ) # child.terminate() # child.kill() # # child.wait() # do_more( count ) count += 1 if '__main__' == __name__ : try : main( os.path.basename( sys.argv[0] ), 
sys.argv[1:] ) except KeyboardInterrupt : pass |
PHP
$ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"'Parent = 10244Child = 10245[do_more() begin 0][on_SIGCHLDsignum = 17File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 20, in do_moreprint '[do_more() begin %u]' % count(10245, 9)on_SIGCHLD]Child = 10246[on_SIGCHLDsignum = 17File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 21, in do_moreos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )Traceback (most recent call last):File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 21, in do_moreos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )File "DebugPythonWithGDB_6.py", line 10, in on_SIGCHLDprint os.waitpid( -1, os.WNOHANG )OSError: [Errno 10] No child processes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | $ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"' Parent = 10244 Child = 10245 [do_more() begin 0] [on_SIGCHLD signum = 17 File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 20, in do_more print '[do_more() begin %u]' % count (10245, 9) on_SIGCHLD] Child = 10246 [on_SIGCHLD signum = 17 File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 21, in do_more os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) Traceback (most recent call last): File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 21, in do_more os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) File "DebugPythonWithGDB_6.py", line 10, in on_SIGCHLD print os.waitpid( -1, os.WNOHANG ) OSError: [Errno 10] No child processes |
流程进入on_SIGCHLD(),但os.waitpid()抛出OSError异常。帮助里写的是,如果系统调用 waitpid()返回-1,就抛出异常: An OSError is raised with the value of errno when the syscall returns -1. 10245号子进程在on_SIGCHLD()里waitpid()成功,(10245, 9)中的9表示该进程是被SIGKILL干掉的,符合预期。 10246号子进程是do_more()里的os.system()产生的shell进程,它结束时向10244号父进程投递了SIGCHLD信号。 on_SIGCHLD()里waitpid()时,已经在别处wait*()过,10246号子进程已经彻底消失,系统调用waitpid()返回 -1,Python函数os.waitpid()抛出异常。 整个过程非常复杂,用伪代码描述如下:
PHP
do_more()os.system()posix_system() // posixmodule.c__libc_system() // weak_alias (__libc_system, system)do_system() // sysdeps/posix/system.c/** SIG_IGN** Ctrl-C暂时失效*/sigaction( SIGINT, &sa, &intr )/** 屏蔽(阻塞)SIGCHLD信号*/sigaddset( &sa.sa_mask, SIGCHLD )sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )fork()子进程(10246号子进程)/** 恢复原有SIGINT信号处理方式*/sigaction( SIGINT, &intr, (struct sigaction *)NULL )/** 调用"sh -c ..."*/execve()[shell子进程结束,向DebugPythonWithGDB_6.py投递SIGCHLD][由于SIGCHLD信号已被屏蔽(阻塞),其保持在内核态的未决信号链上]父进程(10244号父进程)/** 同步调用,会阻塞。不是在信号句柄中异步调用。** 10246号子进程在此被wait*()回收后彻底消失*/waitpid( pid, &status, 0 )/** 恢复原有SIGINT信号处理方式*/sigaction( SIGINT, &intr, (struct sigaction *)NULL )/** 取消对SIGCHLD的屏蔽(阻塞)*/sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL )[SIGCHLD信号的屏蔽(阻塞)被取消][DebugPythonWithGDB_6.py的C级信号句柄signal_handler()安排"延迟调用"后返回][DebugPythonWithGDB_6.py的on_SIGCHLD()此时并未得到执行,因为built-in函数os.system()尚未返回]/** built-in函数os.system()返回后,10244号父进程开始处理"延迟调用",调用* Python级信号句柄。这个SIGCHLD信号是10246号子进程投递过来的。** DebugPythonWithGDB_6.py的on_SIGCHLD()得到执行*/on_SIGCHLD()/** 调用waitpid( -1, &status, WNOHANG ),试图处理10246号子进程。** 10246号子进程已为前述waitpid( pid, &status, 0 )所处理,此处系统调用* 返回-1,导致os.waitpid()抛出OSError异常。*/os.waitpid( -1, os.WNOHANG )
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | do_more() os.system() posix_system() // posixmodule.c __libc_system() // weak_alias (__libc_system, system) do_system() // sysdeps/posix/system.c /* * SIG_IGN * * Ctrl-C暂时失效 */ sigaction( SIGINT, &sa, &intr ) /* * 屏蔽(阻塞)SIGCHLD信号 */ sigaddset( &sa.sa_mask, SIGCHLD ) sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask ) fork() 子进程(10246号子进程) /* * 恢复原有SIGINT信号处理方式 */ sigaction( SIGINT, &intr, (struct sigaction *)NULL ) /* * 调用"sh -c ..." */ execve() [shell子进程结束,向DebugPythonWithGDB_6.py投递SIGCHLD] [由于SIGCHLD信号已被屏蔽(阻塞),其保持在内核态的未决信号链上] 父进程(10244号父进程) /* * 同步调用,会阻塞。不是在信号句柄中异步调用。 * * 10246号子进程在此被wait*()回收后彻底消失 */ waitpid( pid, &status, 0 ) /* * 恢复原有SIGINT信号处理方式 */ sigaction( SIGINT, &intr, (struct sigaction *)NULL ) /* * 取消对SIGCHLD的屏蔽(阻塞) */ sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL ) [SIGCHLD信号的屏蔽(阻塞)被取消] [DebugPythonWithGDB_6.py的C级信号句柄signal_handler()安排"延迟调用"后返回] [DebugPythonWithGDB_6.py的on_SIGCHLD()此时并未得到执行,因为built-in函数os.system()尚未返回] /* * built-in函数os.system()返回后,10244号父进程开始处理"延迟调用",调用 * Python级信号句柄。这个SIGCHLD信号是10246号子进程投递过来的。 * * DebugPythonWithGDB_6.py的on_SIGCHLD()得到执行 */ on_SIGCHLD() /* * 调用waitpid( -1, &status, WNOHANG ),试图处理10246号子进程。 * * 10246号子进程已为前述waitpid( pid, &status, 0 )所处理,此处系统调用 * 返回-1,导致os.waitpid()抛出OSError异常。 */ os.waitpid( -1, os.WNOHANG ) |
整个过程之所以如此复杂,主要是因为Python的信号处理机制比较复杂,让已经非常复杂的Linux信号机制再添变数。参看:
PHP
《2.50 对Python解释器进行调试》《22.0 Linux信号机制》
1 2 | 《2.50 对Python解释器进行调试》 《22.0 Linux信号机制》 |
就本例而言,为了确保DebugPythonWithGDB_6.py不因OSError异常而终止,只需在on_SIGCHLD()中调用os.waitpid()时捕捉OSError异常:
PHP
def on_SIGCHLD ( signum, frame ) :try :print os.waitpid( -1, os.WNOHANG )except OSError :sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )
1 2 3 4 5 | def on_SIGCHLD ( signum, frame ) : try : print os.waitpid( -1, os.WNOHANG ) except OSError : sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno ) |
前述观点有些是动态调试得到,有些是静态分析得到。有人可能问了,为什么不拦截Python进程的C级信号句柄,查看SIGCHLD信号源,以此确认10246号子进程可能被回收两次?其实我最初也想这么干来着,但这是行不通的,因为Python的C级信号句柄signal_handler()是那种最原始的单形参信号句柄,不是高大上的三形参信号句柄。 用GDB调试Python解释器:
PHP
# gdb -q -ex "b *signal_handler" -ex r --args /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py '/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'...Breakpoint 1 at 0x8216f2d: file ../Modules/signalmodule.c, line 185.Starting program: /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py /usr/bin/python2.7-dbg\ -c\ \"import\ time\;time.sleep\(3600\)\"[Thread debugging using libthread_db enabled]Using host libthread_db library "/lib/i386-linux-gnu/i686/cmov/libthread_db.so.1".Parent = 10284Child = 10288[do_more() begin 0]Child = 10289Breakpoint 1, signal_handler (sig_num=17) at ../Modules/signalmodule.c:185185 {(gdb) py-bt#10 Frame 0xb7c20034, for file DebugPythonWithGDB_6.py, line 21, in do_more (count=0)os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )#13 Frame 0xb7cb37dc, for file DebugPythonWithGDB_6.py, line 76, in main (prog='DebugPythonWithGDB_6.py', args=['/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'], count=0, child=)do_more( count )#16 Frame 0xb7cbe49c, for file DebugPythonWithGDB_6.py, line 81, in ()main( os.path.basename( sys.argv[0] ), sys.argv[1:] )(gdb) bt 7#0 signal_handler (sig_num=17) at ../Modules/signalmodule.c:185#1#2 0xb7fdcd3c in __kernel_vsyscall ()#3 0xb7db25eb in __sigprocmask (how=how@entry=2, set=0x0, set@entry=0xbffff0d4, oset=oset@entry=0x0) at ../sysdeps/unix/sysv/linux/sigprocmask.c:57#4 0xb7dc2084 in do_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:161#5 0xb7dc2380 in __libc_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:184#6 0xb7fa9bfb in system (line=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at pt-system.c:28(More stack frames follow...)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | # gdb -q -ex "b *signal_handler" -ex r --args /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py '/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"' ... Breakpoint 1 at 0x8216f2d: file ../Modules/signalmodule.c, line 185. Starting program: /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py /usr/bin/python2.7-dbg\ -c\ \"import\ time\;time.sleep\(3600\)\" [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/i386-linux-gnu/i686/cmov/libthread_db.so.1". Parent = 10284 Child = 10288 [do_more() begin 0] Child = 10289 Breakpoint 1, signal_handler (sig_num=17) at ../Modules/signalmodule.c:185 185 { (gdb) py-bt #10 Frame 0xb7c20034, for file DebugPythonWithGDB_6.py, line 21, in do_more (count=0) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) #13 Frame 0xb7cb37dc, for file DebugPythonWithGDB_6.py, line 76, in main (prog='DebugPythonWithGDB_6.py', args=['/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'], count=0, child=) do_more( count ) #16 Frame 0xb7cbe49c, for file DebugPythonWithGDB_6.py, line 81, in () main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) (gdb) bt 7 #0 signal_handler (sig_num=17) at ../Modules/signalmodule.c:185 #1 #2 0xb7fdcd3c in __kernel_vsyscall () #3 0xb7db25eb in __sigprocmask (how=how@entry=2, set=0x0, set@entry=0xbffff0d4, oset=oset@entry=0x0) at ../sysdeps/unix/sysv/linux/sigprocmask.c:57 #4 0xb7dc2084 in do_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:161 #5 0xb7dc2380 in __libc_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:184 #6 0xb7fa9bfb in system (line=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at pt-system.c:28 (More stack frames follow...) |
查看#4的system.c:161,这个位置已经在waitpid( pid, &status, 0 )之后: sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL ) 其作用是取消对SIGCHLD的屏蔽(阻塞)。 此时内存布局如下:
PHP
内存高址方向fpstate // ESP+0x2DC output/x *(struct _fpstate *)($esp+0x2dc)retcode // ESP+0x2D4 x/3i $esp+0x2d4extramask // ESP+0x2D0 x/1wx $esp+0x2d0fpstate_unused // ESP+0x60 output/x *(struct _fpstate *)($esp+0x60)sigcontext_ia32 // ESP+8 output/x *(struct sigcontext *)($esp+8)sig // ESP+4 信号值,信号句柄第一形参pretcode // ESP RetAddr=__kernel_sigreturn// hexdump $esp 0x2dc内存低址方向
1 2 3 4 5 6 7 8 9 10 11 12 | 内存高址方向 fpstate // ESP+0x2DC output/x *(struct _fpstate *)($esp+0x2dc) retcode // ESP+0x2D4 x/3i $esp+0x2d4 extramask // ESP+0x2D0 x/1wx $esp+0x2d0 fpstate_unused // ESP+0x60 output/x *(struct _fpstate *)($esp+0x60) sigcontext_ia32 // ESP+8 output/x *(struct sigcontext *)($esp+8) sig // ESP+4 信号值,信号句柄第一形参 pretcode // ESP RetAddr=__kernel_sigreturn // hexdump $esp 0x2dc 内存低址方向 |
PHP
(gdb) x/2wa $esp0xbfffea6c: 0xb7fdcd18 0x11(gdb) x/3i $esp+0x2d40xbfffed40: pop eax0xbfffed41: mov eax,0x770xbfffed46: int 0x80(gdb) output/x *(struct sigcontext *)($esp+8){gs = 0x33,__gsh = 0x0,fs = 0x0,__fsh = 0x0,es = 0x7b,__esh = 0x0,ds = 0x7b,__dsh = 0x0,edi = 0xb7f2a000,esi = 0x8,ebp = 0x1,esp = 0xbfffeff0,ebx = 0x2,edx = 0x0,ecx = 0xbffff0d4,eax = 0x0,trapno = 0x1,err = 0x0,eip = 0xb7fdcd3c,cs = 0x73,__csh = 0x0,eflags = 0x246,esp_at_signal = 0xbfffeff0,ss = 0x7b,__ssh = 0x0,fpstate = 0xbfffed50,oldmask = 0x0,cr2 = 0x0}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | (gdb) x/2wa $esp 0xbfffea6c: 0xb7fdcd18 0x11 (gdb) x/3i $esp+0x2d4 0xbfffed40: pop eax 0xbfffed41: mov eax,0x77 0xbfffed46: int 0x80 (gdb) output/x *(struct sigcontext *)($esp+8) { gs = 0x33, __gsh = 0x0, fs = 0x0, __fsh = 0x0, es = 0x7b, __esh = 0x0, ds = 0x7b, __dsh = 0x0, edi = 0xb7f2a000, esi = 0x8, ebp = 0x1, esp = 0xbfffeff0, ebx = 0x2, edx = 0x0, ecx = 0xbffff0d4, eax = 0x0, trapno = 0x1, err = 0x0, eip = 0xb7fdcd3c, cs = 0x73, __csh = 0x0, eflags = 0x246, esp_at_signal = 0xbfffeff0, ss = 0x7b, __ssh = 0x0, fpstate = 0xbfffed50, oldmask = 0x0, cr2 = 0x0 } |
因为是单形参信号句柄,没有siginfo,无法在用户态获知信号源。但我分析此时的信号源不是10289号子进程,而是10288号子进程。10288产生SIGCHLD时,SIGCHLD信号已被屏蔽(阻塞),只能保持在内核态的未决信号链上。之后待10289产生SIGCHLD时,sigpending.signal中相应位已经置位,10289产生的SIGCHLD被丢弃,不会进入内核态的未决信号链。SIGCHLD信号的屏蔽(阻塞)被取消后,从内核态的未决信号链上取出10288产生的SIGCHLD进行处理。于是断点命中。 如果完全理解了前述实验结果及分析,就会发现DebugPythonWithGDB_6.py存在竞争条件。subprocess.Popen()对应的子进程投递SIGCHLD信号时,父进程有两种可能:
PHP
1) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之前2) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之后
1 2 | 1) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之前 2) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之后 |
情况1)会触发OSError异常,情况2)不会触发OSError异常。执行: $ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"' 有时会因OSError异常而终止,有时就一直循环执行下去。出现这种差异,正是竞争条件的表征。 小结一下: 假设针对SIGCHLD安装了Python级信号句柄,其调用os.waitpid( -1, os.WNOHANG )回收子进程。如果别处会调用os.system(),则必须在os.waitpid()外侧捕捉OSError异常。不建议这种方式的混用。 对waitpid()的分析到此就结束了,说点调试过程中出现的其他问题。 意外地发现Ctrl-C无法终止情况2),而我已经调用: signal.signal( signal.SIGINT, signal.SIG_DFL ) 这是因为do_system()中一上来就调用了:
PHP
sa.sa_handler = SIG_IGN;sigaction( SIGINT, &sa, &intr );
1 2 | sa.sa_handler = SIG_IGN; sigaction( SIGINT, &sa, &intr ); |
导致Ctrl-C暂时失效,直至do_system()结束。假设DebugPythonWithGDB_6.py已经出现情况2),查看它的信号处理方式:
PHP
# ps auwx | grep pythonroot 10355 0.0 0.5 8116 5812 pts/0 S+ 15:57 0:00 python DebugPythonWithGDB_6.py python -c "import time;time.sleep(3600)"root 10389 0.0 0.0 0 0 pts/0 Z+ 15:57 0:00 [python]root 10393 0.0 0.0 2936 852 pts/1 R+ 15:57 0:00 grep python# stap -DMAXACTION=10000 -g /usr/share/doc/systemtap-doc/examples/process/psig.stp -x 1035510355: pythonHUP defaultINT ignored // 不是预期的defaultQUIT ignoredILL defaultTRAP defaultABRT defaultBUS defaultFPE defaultKILL defaultUSR1 defaultSEGV defaultUSR2 defaultPIPE ignoredALRM defaultTERM defaultSTKFLT defaultCHLD blocked,caught 0x818a480 0...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | # ps auwx | grep python root 10355 0.0 0.5 8116 5812 pts/0 S+ 15:57 0:00 python DebugPythonWithGDB_6.py python -c "import time;time.sleep(3600)" root 10389 0.0 0.0 0
网页名称:用GDB排查Python程序故障
攀枝花网站建设、攀枝花网站运维推广公司-贝锐智能,是专注品牌与效果的网络营销公司;服务项目有等 声明:本网站发布的内容(图片、视频和文字)以用户投稿、用户转载内容为主,如果涉及侵权请尽快告知,我们将会在第一时间删除。文章观点不代表本网站立场,如需处理请联系客服。电话:028-86922220;邮箱:631063699@qq.com。内容未经允许不得转载,或转载时需注明来源: 贝锐智能 贝锐智能技术为您推荐以下文章
|