From 9c655172a298766a37cc83173438f70fff83264c Mon Sep 17 00:00:00 2001 From: JINMEI Tatuya Date: Thu, 11 Oct 2012 15:39:53 -0700 Subject: [PATCH] [1858] stop sending hopeless signal when it once fails with EPERM. --- src/bin/bind10/bind10_messages.mes | 24 ++++++++++++++++++++++++ src/bin/bind10/bind10_src.py.in | 18 +++++++++++++----- src/bin/bind10/tests/bind10_test.py.in | 19 +++++++++++++++++-- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/src/bin/bind10/bind10_messages.mes b/src/bin/bind10/bind10_messages.mes index 2f48325498..e8a2eb2ee5 100644 --- a/src/bin/bind10/bind10_messages.mes +++ b/src/bin/bind10/bind10_messages.mes @@ -305,3 +305,27 @@ the configuration manager to start up. The total length of time Boss will wait for the configuration manager before reporting an error is set with the command line --wait switch, which has a default value of ten seconds. + +% BIND10_SEND_SIGNAL_FAIL sending %1 to %2 (PID %3) failed: %4 +The boss module sent a single (either SIGTERM or SIGKILL) to a process, +but it failed due to some system level error. There are two major cases: +the target process has already terminated but the boss module had sent +the signal before it noticed the termination. In this case an error +message should indicate something like "no such process". This can be +safely ignored. The other case is that the boss module doesn't have +the privilege to send a signal to the process. It can typically +happen when the boss module started as a privileged process, spawned a +subprocess, and then dropped the privilege. It includes the case for +the socket creator when the boss process runs with the -u command line +option. In this case, the boss module simply gives up to terminate +the process explicitly because it's unlikely to succeed by keeping +sending the signal. Although the socket creator is implemented so +that it will terminate automatically when the boss process exits +(and that should be the case for any other future process running with +a higher privilege), but it's recommended to check if there's any +remaining BIND 10 process if this message is logged. For all other +cases, the boss module will keep sending the signal until it confirms +all child processes terminate. Although unlikely, this could prevent +the boss module from exiting, just keeping sending the signals. So, +again, it's advisable to check if it really terminates when this +message is logged. diff --git a/src/bin/bind10/bind10_src.py.in b/src/bin/bind10/bind10_src.py.in index f15c7185d9..7a245ae5bf 100755 --- a/src/bin/bind10/bind10_src.py.in +++ b/src/bin/bind10/bind10_src.py.in @@ -712,14 +712,22 @@ class BoB: ''' logmsg = BIND10_SEND_SIGKILL if forceful else BIND10_SEND_SIGTERM - for component in self.components.values(): + # We need to make a copy of values as the components may be modified + # in the loop. + for component in list(self.components.values()): logger.info(logmsg, component.name(), component.pid()) try: component.kill(forceful) - except OSError: - # ignore these (usually ESRCH because the child - # finally exited) - pass + except OSError as ex: + # If kill() failed due to EPERM, it doesn't make sense to + # keep trying, so we just log the fact and forget that + # component. Ignore other OSErrors (usually ESRCH because + # the child finally exited) + signame = "SIGKILL" if forceful else "SIGTERM" + logger.info(BIND10_SEND_SIGNAL_FAIL, signame, + component.name(), component.pid(), ex) + if ex.errno == errno.EPERM: + del self.components[component.pid()] def _get_process_exit_status(self): return os.waitpid(-1, os.WNOHANG) diff --git a/src/bin/bind10/tests/bind10_test.py.in b/src/bin/bind10/tests/bind10_test.py.in index 4839008560..ece6370c3a 100644 --- a/src/bin/bind10/tests/bind10_test.py.in +++ b/src/bin/bind10/tests/bind10_test.py.in @@ -1195,7 +1195,15 @@ class TestBossComponents(unittest.TestCase): (anyway it is not told so). It does not die if it is killed the first time. It dies only when killed forcefully. """ + def __init__(self): + # number of kill() calls, preventing infinite loop. + self.__call_count = 0 + def kill(self, forceful=False): + self.__call_count += 1 + if self.__call_count > 2: + raise Exception('Too many calls to ImmortalComponent.kill') + killed.append(forceful) if ex_on_kill is not None: # If exception is given by the test, raise it here. @@ -1247,10 +1255,17 @@ class TestBossComponents(unittest.TestCase): self.__real_test_kill() def test_kill_fail(self): - """Test cases where kill() results in an exception due to OS error.""" + """Test cases where kill() results in an exception due to OS error. + + The behavior should be different for EPERM, so we test two cases. + + """ ex = OSError() - ex.errno = errno.ESRCH + ex.errno, ex.strerror = errno.ESRCH, 'No such process' + self.__real_test_kill(ex_on_kill=ex) + + ex.errno, ex.strerror = errno.EPERM, 'Operation not permitted' self.__real_test_kill(ex_on_kill=ex) def test_nokill(self):