用GDB排查Python程序故障

某Team在用Python开发一些代码,涉及子进程以及设法消除僵尸进程的需求。实践中他们碰上Python程序非预期退出的现象。最初他们决定 用GDB调试Python解释器,查看exit()的源头。我听了之后,觉得这个问题应该用别的调试思路。帮他们排查这次程序故障时,除去原始问题,还衍 生了其他问题。

这次的问题相比西安研发中心曾经碰上的Python信号处理问题,有不少基础知识、先验知识是共用的,此处不做再普及,感兴趣的同学可以翻看我以前发过的文章。

下文是一次具体的调试、分析记录。为了简化现场、方便调试,已将原始问题、衍生问题浓缩成DebugPythonWithGDB_6.py、DebugPythonWithGDB_7.py。

$ vi DebugPythonWithGDB_6.py

PHP

#!/usr/bin/env python# -*- encoding: utf-8 -*-import sys, os, signal, subprocess, shlex, tracebackdef on_SIGCHLD ( signum, frame ) :print "[on_SIGCHLD"sys.stdout.write( "signum = %u\n" % signum )traceback.print_stack( frame )print os.waitpid( -1, os.WNOHANG )"""try :print os.waitpid( -1, os.WNOHANG )except OSError :sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )"""print "on_SIGCHLD]"def do_more ( count ) :print '[do_more() begin %u]' % countos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )"""## 这里存在竞争条件,可以增加触发OSError异常的概率#os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )"""print '[do_more() end %u]' % countdef main ( prog, args ) :if 0 == len( args ) :print 'Usage: %s ' % progelse :sys.stdout.write( "Parent = %u\n" % os.getpid() )## 本例中,即使有下列代码,Ctrl-C仍然无效。#signal.signal( signal.SIGINT, signal.SIG_DFL )## signal.signal( signal.SIGCHLD, signal.SIG_IGN )#signal.signal( signal.SIGCHLD, on_SIGCHLD )#count = 0while True :## 本例中父进程只是一个调度框架,不需要与子进程进行通信,因此不# 需要特别处理"stdin=None, stdout=None, stderr=None"。#child = subprocess.Popen \(## 不要直接用args[0].split(),它在处理单、双引号时不是我们# 期望的行为。考虑这种例子,ls -l "/tmp/non exist"#shlex.split( args[0] ),## all file descriptors except 0, 1 and 2 will be closed# before the child process is executed#close_fds = True,cwd = "/tmp")sys.stdout.write( "Child = %u\n" % child.pid )## child.send_signal( signal.SIGTERM )# child.terminate()#child.kill()## child.wait()#do_more( count )count += 1if '__main__' == __name__ :try :main( os.path.basename( sys.argv[0] ), sys.argv[1:] )except KeyboardInterrupt :pass

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

import sys, os, signal, subprocess, shlex, traceback

def on_SIGCHLD ( signum, frame ) :

    print "[on_SIGCHLD"

    sys.stdout.write( "signum  = %u\n" % signum )

    traceback.print_stack( frame )

    print os.waitpid( -1, os.WNOHANG )

    """

    try :

        print os.waitpid( -1, os.WNOHANG )

    except OSError :

        sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )

    """

    print "on_SIGCHLD]"

def do_more ( count ) :

    print '[do_more() begin %u]' % count

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

    """

    #

    # 这里存在竞争条件,可以增加触发OSError异常的概率

    #

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

    """

    print '[do_more() end %u]' % count

def main ( prog, args ) :

    if 0 == len( args ) :

        print 'Usage: %s ' % prog

    else :

        sys.stdout.write( "Parent  = %u\n" % os.getpid() )

        #

        # 本例中,即使有下列代码,Ctrl-C仍然无效。

        #

        signal.signal( signal.SIGINT, signal.SIG_DFL )

        #

        # signal.signal( signal.SIGCHLD, signal.SIG_IGN )

        #

        signal.signal( signal.SIGCHLD, on_SIGCHLD )

        #

        count   = 0

        while True :

            #

            # 本例中父进程只是一个调度框架,不需要与子进程进行通信,因此不

            # 需要特别处理"stdin=None, stdout=None, stderr=None"。

            #

            child   = subprocess.Popen  \

            (

                #

                # 不要直接用args[0].split(),它在处理单、双引号时不是我们

                # 期望的行为。考虑这种例子,ls -l "/tmp/non exist"

                #

                shlex.split( args[0] ),

                #

                # all file descriptors except 0, 1 and 2 will be closed

                # before the child process is executed

                #

                close_fds   = True,

                cwd         = "/tmp"

            )

            sys.stdout.write( "Child   = %u\n" % child.pid )

            #

            # child.send_signal( signal.SIGTERM )

            # child.terminate()

            #

            child.kill()

            #

            # child.wait()

            #

            do_more( count )

            count  += 1

if '__main__' == __name__ :

    try :

        main( os.path.basename( sys.argv[0] ), sys.argv[1:] )

    except KeyboardInterrupt :

        pass

 

PHP

$ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"'Parent = 10244Child = 10245[do_more() begin 0][on_SIGCHLDsignum = 17File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 20, in do_moreprint '[do_more() begin %u]' % count(10245, 9)on_SIGCHLD]Child = 10246[on_SIGCHLDsignum = 17File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 21, in do_moreos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )Traceback (most recent call last):File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 21, in do_moreos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )File "DebugPythonWithGDB_6.py", line 10, in on_SIGCHLDprint os.waitpid( -1, os.WNOHANG )OSError: [Errno 10] No child processes

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

$ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"'

Parent  = 10244

Child   = 10245

[do_more() begin 0]

[on_SIGCHLD

signum  = 17

  File "DebugPythonWithGDB_6.py", line 81, in

    main( os.path.basename( sys.argv[0] ), sys.argv[1:] )

  File "DebugPythonWithGDB_6.py", line 76, in main

    do_more( count )

  File "DebugPythonWithGDB_6.py", line 20, in do_more

    print '[do_more() begin %u]' % count

(10245, 9)

on_SIGCHLD]

Child   = 10246

[on_SIGCHLD

signum  = 17

  File "DebugPythonWithGDB_6.py", line 81, in

    main( os.path.basename( sys.argv[0] ), sys.argv[1:] )

  File "DebugPythonWithGDB_6.py", line 76, in main

    do_more( count )

  File "DebugPythonWithGDB_6.py", line 21, in do_more

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

Traceback (most recent call last):

  File "DebugPythonWithGDB_6.py", line 81, in

    main( os.path.basename( sys.argv[0] ), sys.argv[1:] )

  File "DebugPythonWithGDB_6.py", line 76, in main

    do_more( count )

  File "DebugPythonWithGDB_6.py", line 21, in do_more

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

  File "DebugPythonWithGDB_6.py", line 10, in on_SIGCHLD

    print os.waitpid( -1, os.WNOHANG )

OSError: [Errno 10] No child processes

流程进入on_SIGCHLD(),但os.waitpid()抛出OSError异常。帮助里写的是,如果系统调用 waitpid()返回-1,就抛出异常: An OSError is raised with the value of errno when the syscall returns -1. 10245号子进程在on_SIGCHLD()里waitpid()成功,(10245, 9)中的9表示该进程是被SIGKILL干掉的,符合预期。 10246号子进程是do_more()里的os.system()产生的shell进程,它结束时向10244号父进程投递了SIGCHLD信号。 on_SIGCHLD()里waitpid()时,已经在别处wait*()过,10246号子进程已经彻底消失,系统调用waitpid()返回 -1,Python函数os.waitpid()抛出异常。 整个过程非常复杂,用伪代码描述如下:

PHP

do_more()os.system()posix_system() // posixmodule.c__libc_system() // weak_alias (__libc_system, system)do_system() // sysdeps/posix/system.c/** SIG_IGN** Ctrl-C暂时失效*/sigaction( SIGINT, &sa, &intr )/** 屏蔽(阻塞)SIGCHLD信号*/sigaddset( &sa.sa_mask, SIGCHLD )sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )fork()子进程(10246号子进程)/** 恢复原有SIGINT信号处理方式*/sigaction( SIGINT, &intr, (struct sigaction *)NULL )/** 调用"sh -c ..."*/execve()[shell子进程结束,向DebugPythonWithGDB_6.py投递SIGCHLD][由于SIGCHLD信号已被屏蔽(阻塞),其保持在内核态的未决信号链上]父进程(10244号父进程)/** 同步调用,会阻塞。不是在信号句柄中异步调用。** 10246号子进程在此被wait*()回收后彻底消失*/waitpid( pid, &status, 0 )/** 恢复原有SIGINT信号处理方式*/sigaction( SIGINT, &intr, (struct sigaction *)NULL )/** 取消对SIGCHLD的屏蔽(阻塞)*/sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL )[SIGCHLD信号的屏蔽(阻塞)被取消][DebugPythonWithGDB_6.py的C级信号句柄signal_handler()安排"延迟调用"后返回][DebugPythonWithGDB_6.py的on_SIGCHLD()此时并未得到执行,因为built-in函数os.system()尚未返回]/** built-in函数os.system()返回后,10244号父进程开始处理"延迟调用",调用* Python级信号句柄。这个SIGCHLD信号是10246号子进程投递过来的。** DebugPythonWithGDB_6.py的on_SIGCHLD()得到执行*/on_SIGCHLD()/** 调用waitpid( -1, &status, WNOHANG ),试图处理10246号子进程。** 10246号子进程已为前述waitpid( pid, &status, 0 )所处理,此处系统调用* 返回-1,导致os.waitpid()抛出OSError异常。*/os.waitpid( -1, os.WNOHANG )

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

do_more()

    os.system()

        posix_system()          // posixmodule.c

            __libc_system()     // weak_alias (__libc_system, system)

                do_system()     // sysdeps/posix/system.c

                    /*

                     * SIG_IGN

                     *

                     * Ctrl-C暂时失效

                     */

                    sigaction( SIGINT, &sa, &intr )

                    /*

                     * 屏蔽(阻塞)SIGCHLD信号

                     */

                    sigaddset( &sa.sa_mask, SIGCHLD )

                    sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )

                    fork()

                        子进程(10246号子进程)

                            /*

                             * 恢复原有SIGINT信号处理方式

                             */

                            sigaction( SIGINT, &intr, (struct sigaction *)NULL )

                            /*

                             * 调用"sh -c ..."

                             */

                            execve()

                            [shell子进程结束,向DebugPythonWithGDB_6.py投递SIGCHLD]

                            [由于SIGCHLD信号已被屏蔽(阻塞),其保持在内核态的未决信号链上]

                        父进程(10244号父进程)

                            /*

                             * 同步调用,会阻塞。不是在信号句柄中异步调用。

                             *

                             * 10246号子进程在此被wait*()回收后彻底消失

                             */

                            waitpid( pid, &status, 0 )

                            /*

                             * 恢复原有SIGINT信号处理方式

                             */

                            sigaction( SIGINT, &intr, (struct sigaction *)NULL )

                            /*

                             * 取消对SIGCHLD的屏蔽(阻塞)

                             */

                            sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL )

                            [SIGCHLD信号的屏蔽(阻塞)被取消]

                            [DebugPythonWithGDB_6.py的C级信号句柄signal_handler()安排"延迟调用"后返回]

                            [DebugPythonWithGDB_6.py的on_SIGCHLD()此时并未得到执行,因为built-in函数os.system()尚未返回]

/*

* built-in函数os.system()返回后,10244号父进程开始处理"延迟调用",调用

* Python级信号句柄。这个SIGCHLD信号是10246号子进程投递过来的。

*

* DebugPythonWithGDB_6.py的on_SIGCHLD()得到执行

*/

on_SIGCHLD()

    /*

     * 调用waitpid( -1, &status, WNOHANG ),试图处理10246号子进程。

     *

     * 10246号子进程已为前述waitpid( pid, &status, 0 )所处理,此处系统调用

     * 返回-1,导致os.waitpid()抛出OSError异常。

     */

    os.waitpid( -1, os.WNOHANG )

整个过程之所以如此复杂,主要是因为Python的信号处理机制比较复杂,让已经非常复杂的Linux信号机制再添变数。参看:

PHP

《2.50 对Python解释器进行调试》《22.0 Linux信号机制》

1

2

《2.50 对Python解释器进行调试》

《22.0 Linux信号机制》

就本例而言,为了确保DebugPythonWithGDB_6.py不因OSError异常而终止,只需在on_SIGCHLD()中调用os.waitpid()时捕捉OSError异常:

PHP

def on_SIGCHLD ( signum, frame ) :try :print os.waitpid( -1, os.WNOHANG )except OSError :sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )

1

2

3

4

5

def on_SIGCHLD ( signum, frame ) :

    try :

        print os.waitpid( -1, os.WNOHANG )

    except OSError :

        sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )

前述观点有些是动态调试得到,有些是静态分析得到。有人可能问了,为什么不拦截Python进程的C级信号句柄,查看SIGCHLD 信号源,以此确认10246号子进程可能被回收两次?其实我最初也想这么干来着,但这是行不通的,因为Python的C级信号句柄 signal_handler()是那种最原始的单形参信号句柄,不是高大上的三形参信号句柄。 用GDB调试Python解释器:

PHP

# gdb -q -ex "b *signal_handler" -ex r --args /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py '/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'...Breakpoint 1 at 0x8216f2d: file ../Modules/signalmodule.c, line 185.Starting program: /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py /usr/bin/python2.7-dbg\ -c\ \"import\ time\;time.sleep\(3600\)\"[Thread debugging using libthread_db enabled]Using host libthread_db library "/lib/i386-linux-gnu/i686/cmov/libthread_db.so.1".Parent = 10284Child = 10288[do_more() begin 0]Child = 10289Breakpoint 1, signal_handler (sig_num=17) at ../Modules/signalmodule.c:185185 {(gdb) py-bt#10 Frame 0xb7c20034, for file DebugPythonWithGDB_6.py, line 21, in do_more (count=0)os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )#13 Frame 0xb7cb37dc, for file DebugPythonWithGDB_6.py, line 76, in main (prog='DebugPythonWithGDB_6.py', args=['/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'], count=0, child=)do_more( count )#16 Frame 0xb7cbe49c, for file DebugPythonWithGDB_6.py, line 81, in ()main( os.path.basename( sys.argv[0] ), sys.argv[1:] )(gdb) bt 7#0 signal_handler (sig_num=17) at ../Modules/signalmodule.c:185#1#2 0xb7fdcd3c in __kernel_vsyscall ()#3 0xb7db25eb in __sigprocmask (how=how@entry=2, set=0x0, set@entry=0xbffff0d4, oset=oset@entry=0x0) at ../sysdeps/unix/sysv/linux/sigprocmask.c:57#4 0xb7dc2084 in do_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:161#5 0xb7dc2380 in __libc_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:184#6 0xb7fa9bfb in system (line=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at pt-system.c:28(More stack frames follow...)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

# gdb -q -ex "b *signal_handler" -ex r --args /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py '/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'

...

Breakpoint 1 at 0x8216f2d: file ../Modules/signalmodule.c, line 185.

Starting program: /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py /usr/bin/python2.7-dbg\ -c\ \"import\ time\;time.sleep\(3600\)\"

[Thread debugging using libthread_db enabled]

Using host libthread_db library "/lib/i386-linux-gnu/i686/cmov/libthread_db.so.1".

Parent  = 10284

Child   = 10288

[do_more() begin 0]

Child   = 10289

Breakpoint 1, signal_handler (sig_num=17) at ../Modules/signalmodule.c:185

185     {

(gdb) py-bt

#10 Frame 0xb7c20034, for file DebugPythonWithGDB_6.py, line 21, in do_more (count=0)

    os.system( r'printf "Child   = %u\n" $$;/bin/sleep 1' )

#13 Frame 0xb7cb37dc, for file DebugPythonWithGDB_6.py, line 76, in main (prog='DebugPythonWithGDB_6.py', args=['/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'], count=0, child=)

    do_more( count )

#16 Frame 0xb7cbe49c, for file DebugPythonWithGDB_6.py, line 81, in  ()

    main( os.path.basename( sys.argv[0] ), sys.argv[1:] )

(gdb) bt 7

#0  signal_handler (sig_num=17) at ../Modules/signalmodule.c:185

#1  

#2  0xb7fdcd3c in __kernel_vsyscall ()

#3  0xb7db25eb in __sigprocmask (how=how@entry=2, set=0x0, set@entry=0xbffff0d4, oset=oset@entry=0x0) at ../sysdeps/unix/sysv/linux/sigprocmask.c:57

#4  0xb7dc2084 in do_system (line=line@entry=0xb7cbf9e4 "printf \"Child   = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:161

#5  0xb7dc2380 in __libc_system (line=line@entry=0xb7cbf9e4 "printf \"Child   = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:184

#6  0xb7fa9bfb in system (line=0xb7cbf9e4 "printf \"Child   = %u\\n\" $$;/bin/sleep 1") at pt-system.c:28

(More stack frames follow...)

查看#4的system.c:161,这个位置已经在waitpid( pid, &status, 0 )之后: sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL ) 其作用是取消对SIGCHLD的屏蔽(阻塞)。 此时内存布局如下:

PHP

内存高址方向fpstate // ESP+0x2DC output/x *(struct _fpstate *)($esp+0x2dc)retcode // ESP+0x2D4 x/3i $esp+0x2d4extramask // ESP+0x2D0 x/1wx $esp+0x2d0fpstate_unused // ESP+0x60 output/x *(struct _fpstate *)($esp+0x60)sigcontext_ia32 // ESP+8 output/x *(struct sigcontext *)($esp+8)sig // ESP+4 信号值,信号句柄***形参pretcode // ESP RetAddr=__kernel_sigreturn// hexdump $esp 0x2dc内存低址方向

1

2

3

4

5

6

7

8

9

10

11

12

内存高址方向

fpstate         // ESP+0x2DC output/x *(struct _fpstate *)($esp+0x2dc)

retcode         // ESP+0x2D4 x/3i $esp+0x2d4

extramask       // ESP+0x2D0 x/1wx $esp+0x2d0

fpstate_unused  // ESP+0x60 output/x *(struct _fpstate *)($esp+0x60)

sigcontext_ia32 // ESP+8 output/x *(struct sigcontext *)($esp+8)

sig             // ESP+4 信号值,信号句柄***形参

pretcode        // ESP RetAddr=__kernel_sigreturn

                // hexdump $esp 0x2dc

内存低址方向

 

PHP

(gdb) x/2wa $esp0xbfffea6c: 0xb7fdcd18 0x11(gdb) x/3i $esp+0x2d40xbfffed40: pop eax0xbfffed41: mov eax,0x770xbfffed46: int 0x80(gdb) output/x *(struct sigcontext *)($esp+8){gs = 0x33,__gsh = 0x0,fs = 0x0,__fsh = 0x0,es = 0x7b,__esh = 0x0,ds = 0x7b,__dsh = 0x0,edi = 0xb7f2a000,esi = 0x8,ebp = 0x1,esp = 0xbfffeff0,ebx = 0x2,edx = 0x0,ecx = 0xbffff0d4,eax = 0x0,trapno = 0x1,err = 0x0,eip = 0xb7fdcd3c,cs = 0x73,__csh = 0x0,eflags = 0x246,esp_at_signal = 0xbfffeff0,ss = 0x7b,__ssh = 0x0,fpstate = 0xbfffed50,oldmask = 0x0,cr2 = 0x0}

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

(gdb) x/2wa $esp

0xbfffea6c:     0xb7fdcd18  0x11

(gdb) x/3i $esp+0x2d4

   0xbfffed40:  pop    eax

   0xbfffed41:  mov    eax,0x77

   0xbfffed46:  int    0x80

(gdb) output/x *(struct sigcontext *)($esp+8)

{

  gs = 0x33,

  __gsh = 0x0,

  fs = 0x0,

  __fsh = 0x0,

  es = 0x7b,

  __esh = 0x0,

  ds = 0x7b,

  __dsh = 0x0,

  edi = 0xb7f2a000,

  esi = 0x8,

  ebp = 0x1,

  esp = 0xbfffeff0,

  ebx = 0x2,

  edx = 0x0,

  ecx = 0xbffff0d4,

  eax = 0x0,

  trapno = 0x1,

  err = 0x0,

  eip = 0xb7fdcd3c,

  cs = 0x73,

  __csh = 0x0,

  eflags = 0x246,

  esp_at_signal = 0xbfffeff0,

  ss = 0x7b,

  __ssh = 0x0,

  fpstate = 0xbfffed50,

  oldmask = 0x0,

  cr2 = 0x0

}

因为是单形参信号句柄,没有siginfo,无法在用户态获知信号源。但我分析此时的信号源不是10289号子进程,而是10288 号子进程。10288产生SIGCHLD时,SIGCHLD信号已被屏蔽(阻塞),只能保持在内核态的未决信号链上。之后待10289产生SIGCHLD 时,sigpending.signal中相应位已经置位,10289产生的SIGCHLD被丢弃,不会进入内核态的未决信号链。SIGCHLD信号的屏 蔽(阻塞)被取消后,从内核态的未决信号链上取出10288产生的SIGCHLD进行处理。于是断点***。 如果完全理解了前述实验结果及分析,就会发现DebugPythonWithGDB_6.py存在竞争条件。subprocess.Popen()对应的 子进程投递SIGCHLD信号时,父进程有两种可能:

PHP

1) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之前2) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之后

1

2

1) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之前

2) os.system()调用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之后

情况1)会触发OSError异常,情况2)不会触发OSError异常。执行: $ python DebugPythonWithGDB_6.py ‘python -c “import time;time.sleep(3600)”‘ 有时会因OSError异常而终止,有时就一直循环执行下去。出现这种差异,正是竞争环境的表征。 小结一下: 假设针对SIGCHLD安装了Python级信号句柄,其调用os.waitpid( -1, os.WNOHANG )回收子进程。如果别处会调用os.system(),则必须在os.waitpid()外侧捕捉OSError异常。不建议这种方式的混用。 对waitpid()的分析到此就结束了,说点调试过程中出现的其他问题。 意外地发现Ctrl-C无法终止情况2),而我已经调用: signal.signal( signal.SIGINT, signal.SIG_DFL ) 这是因为do_system()中一上来就调用了:

PHP

sa.sa_handler = SIG_IGN;sigaction( SIGINT, &sa, &intr );

1

2

sa.sa_handler   = SIG_IGN;

sigaction( SIGINT, &sa, &intr );

导致Ctrl-C暂时失效,直至do_system()结束。假设DebugPythonWithGDB_6.py已经出 现情况2),查看它的信号处理方式:

PHP

# ps auwx | grep pythonroot 10355 0.0 0.5 8116 5812 pts/0 S+ 15:57 0:00 python DebugPythonWithGDB_6.py python -c "import time;time.sleep(3600)"root 10389 0.0 0.0 0 0 pts/0 Z+ 15:57 0:00 [python]root 10393 0.0 0.0 2936 852 pts/1 R+ 15:57 0:00 grep python# stap -DMAXACTION=10000 -g /usr/share/doc/systemtap-doc/examples/process/psig.stp -x 1035510355: pythonHUP defaultINT ignored // 不是预期的defaultQUIT ignoredILL defaultTRAP defaultABRT defaultBUS defaultFPE defaultKILL defaultUSR1 defaultSEGV defaultUSR2 defaultPIPE ignoredALRM defaultTERM defaultSTKFLT defaultCHLD blocked,caught 0x818a480 0...

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

# ps auwx | grep python

root     10355  0.0  0.5   8116  5812 pts/0    S+   15:57   0:00 python DebugPythonWithGDB_6.py python -c "import time;time.sleep(3600)"

root     10389  0.0  0.0      0 网页名称:用GDB排查Python程序故障
分享链接:http://www.mswzjz.cn/qtweb/news49/462149.html

攀枝花网站建设、攀枝花网站运维推广公司-贝锐智能,是专注品牌与效果的网络营销公司;服务项目有等

广告

声明:本网站发布的内容(图片、视频和文字)以用户投稿、用户转载内容为主,如果涉及侵权请尽快告知,我们将会在第一时间删除。文章观点不代表本网站立场,如需处理请联系客服。电话:028-86922220;邮箱:631063699@qq.com。内容未经允许不得转载,或转载时需注明来源: 贝锐智能