From dce8dabc766040dd3f10a977d6d23e69c0cd8ef3 Mon Sep 17 00:00:00 2001 From: Thomas Markwalder Date: Tue, 18 May 2021 09:22:51 -0400 Subject: [PATCH] [#1818] Added exception handling and ChangeLog src/hooks/dhcp/high_availability/ha_messages.mes HA_PAUSE_CLIENT_LISTENER_FAILED HA_RESUME_CLIENT_LISTENER_FAILED - new log messages src/hooks/dhcp/high_availability/ha_service.cc HAService::pauseClientAndListener() HAService::resumeClientAndListener() - made exception-safe src/lib/util/multi_threading_mgr.cc MultiThreadingMgr::stopProcessing() MultiThreadingMgr::startProcessing() - added exception-catch around callback invocations. --- ChangeLog | 8 +++++ .../dhcp/high_availability/ha_messages.cc | 4 +++ .../dhcp/high_availability/ha_messages.h | 2 ++ .../dhcp/high_availability/ha_messages.mes | 10 ++++++ .../dhcp/high_availability/ha_service.cc | 34 +++++++++++++------ src/lib/util/multi_threading_mgr.cc | 20 ++++++++--- src/lib/util/multi_threading_mgr.h | 21 +++++++++--- src/lib/util/named_callback.h | 2 +- 8 files changed, 81 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index b88a9f2cb4..39e9ed6ad4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +1899. [func] tmark,razvan + In HA+Mt mode, the HA hook library now pauses and resumes + its worker threads when Kea core enters and exits critical + sections, respectively. This eliminates race conditions + during core processing such as reconfiguration, shutdown, + and certain RESTful API commands. + (Gitlab #1818) + 1898. [func] fdupont The DROP class may now depend on the KNOWN or UNKNOWN classes and may be used after the host reservation lookup. diff --git a/src/hooks/dhcp/high_availability/ha_messages.cc b/src/hooks/dhcp/high_availability/ha_messages.cc index deaca724a0..93cca6f8eb 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.cc +++ b/src/hooks/dhcp/high_availability/ha_messages.cc @@ -84,9 +84,11 @@ extern const isc::log::MessageID HA_MAINTENANCE_STARTED = "HA_MAINTENANCE_STARTE extern const isc::log::MessageID HA_MAINTENANCE_STARTED_IN_PARTNER_DOWN = "HA_MAINTENANCE_STARTED_IN_PARTNER_DOWN"; extern const isc::log::MessageID HA_MAINTENANCE_START_HANDLER_FAILED = "HA_MAINTENANCE_START_HANDLER_FAILED"; extern const isc::log::MessageID HA_MISSING_CONFIGURATION = "HA_MISSING_CONFIGURATION"; +extern const isc::log::MessageID HA_PAUSE_CLIENT_LISTENER_FAILED = "HA_PAUSE_CLIENT_LISTENER_FAILED"; extern const isc::log::MessageID HA_RESET_COMMUNICATIONS_FAILED = "HA_RESET_COMMUNICATIONS_FAILED"; extern const isc::log::MessageID HA_RESET_FAILED = "HA_RESET_FAILED"; extern const isc::log::MessageID HA_RESET_HANDLER_FAILED = "HA_RESET_HANDLER_FAILED"; +extern const isc::log::MessageID HA_RESUME_CLIENT_LISTENER_FAILED = "HA_RESUME_CLIENT_LISTENER_FAILED"; extern const isc::log::MessageID HA_SCOPES_HANDLER_FAILED = "HA_SCOPES_HANDLER_FAILED"; extern const isc::log::MessageID HA_SERVICE_STARTED = "HA_SERVICE_STARTED"; extern const isc::log::MessageID HA_STATE_MACHINE_CONTINUED = "HA_STATE_MACHINE_CONTINUED"; @@ -183,9 +185,11 @@ const char* values[] = { "HA_MAINTENANCE_STARTED_IN_PARTNER_DOWN", "the server is now in the partner-down mode as a result of requested maintenance", "HA_MAINTENANCE_START_HANDLER_FAILED", "ha-maintenance-start command failed: %1", "HA_MISSING_CONFIGURATION", "high-availability parameter not specified for High Availability hooks library", + "HA_PAUSE_CLIENT_LISTENER_FAILED", "Pausing mutli-threaded HTTP processing failed: %1", "HA_RESET_COMMUNICATIONS_FAILED", "failed to send ha-reset command to %1: %2", "HA_RESET_FAILED", "failed to reset HA state machine of %1: %2", "HA_RESET_HANDLER_FAILED", "ha-reset command failed: %1", + "HA_RESUME_CLIENT_LISTENER_FAILED", "Resuming mutli-threaded HTTP processing failed: %1", "HA_SCOPES_HANDLER_FAILED", "ha-scopes command failed: %1", "HA_SERVICE_STARTED", "started high availability service in %1 mode as %2 server", "HA_STATE_MACHINE_CONTINUED", "state machine is un-paused", diff --git a/src/hooks/dhcp/high_availability/ha_messages.h b/src/hooks/dhcp/high_availability/ha_messages.h index 8750df2866..78ae6fb2ed 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.h +++ b/src/hooks/dhcp/high_availability/ha_messages.h @@ -85,9 +85,11 @@ extern const isc::log::MessageID HA_MAINTENANCE_STARTED; extern const isc::log::MessageID HA_MAINTENANCE_STARTED_IN_PARTNER_DOWN; extern const isc::log::MessageID HA_MAINTENANCE_START_HANDLER_FAILED; extern const isc::log::MessageID HA_MISSING_CONFIGURATION; +extern const isc::log::MessageID HA_PAUSE_CLIENT_LISTENER_FAILED; extern const isc::log::MessageID HA_RESET_COMMUNICATIONS_FAILED; extern const isc::log::MessageID HA_RESET_FAILED; extern const isc::log::MessageID HA_RESET_HANDLER_FAILED; +extern const isc::log::MessageID HA_RESUME_CLIENT_LISTENER_FAILED; extern const isc::log::MessageID HA_SCOPES_HANDLER_FAILED; extern const isc::log::MessageID HA_SERVICE_STARTED; extern const isc::log::MessageID HA_STATE_MACHINE_CONTINUED; diff --git a/src/hooks/dhcp/high_availability/ha_messages.mes b/src/hooks/dhcp/high_availability/ha_messages.mes index 1acee64e5f..5799a601ea 100644 --- a/src/hooks/dhcp/high_availability/ha_messages.mes +++ b/src/hooks/dhcp/high_availability/ha_messages.mes @@ -481,6 +481,11 @@ This error message is issued to indicate that the configuration for the High Availability hooks library hasn't been specified. The 'high-availability' parameter must be specified for the hooks library to load properly. +% HA_PAUSE_CLIENT_LISTENER_FAILED Pausing mutli-threaded HTTP processing failed: %1 +This error message is emitted when an attempting to pause HA's HTTP client and +the listener threads. This error is highly unlikely and indicates a programmatic +issue that should be reported as a defect. + % HA_RESET_COMMUNICATIONS_FAILED failed to send ha-reset command to %1: %2 This warning message indicates a problem with communication with a HA peer while sending the ha-reset command. The first argument specifies a remote @@ -496,6 +501,11 @@ This error message is issued to indicate that the ha-reset command handler failed while processing the command. The argument provides the reason for failure. +% HA_RESUME_CLIENT_LISTENER_FAILED Resuming mutli-threaded HTTP processing failed: %1 +This error message is emitted when an attempting to resume HA's HTTP client and +the listener threads. This is unlikely to occur and indicates a programmatic +issue that should be reported as a defect. + % HA_SCOPES_HANDLER_FAILED ha-scopes command failed: %1 This error message is issued to indicate that the ha-scopes command handler failed while processing the command. The argument provides reason for diff --git a/src/hooks/dhcp/high_availability/ha_service.cc b/src/hooks/dhcp/high_availability/ha_service.cc index a7402d284d..2e9b883411 100644 --- a/src/hooks/dhcp/high_availability/ha_service.cc +++ b/src/hooks/dhcp/high_availability/ha_service.cc @@ -2831,23 +2831,37 @@ HAService::startClientAndListener() { void HAService::pauseClientAndListener() { - if (client_) { - client_->pause(); - } + // Since we're used as CS callback we need to suppress + // any exceptions, unlikey though they may be. + try { + if (client_) { + client_->pause(); + } - if (listener_) { - listener_->pause(); + if (listener_) { + listener_->pause(); + } + } catch (std::exception& ex) { + LOG_ERROR(ha_logger, HA_PAUSE_CLIENT_LISTENER_FAILED) + .arg(ex.what()); } } void HAService::resumeClientAndListener() { - if (client_) { - client_->resume(); - } + // Since we're used as CS callback we need to suppress + // any exceptions, unlikey though they may be. + try { + if (client_) { + client_->resume(); + } - if (listener_) { - listener_->resume(); + if (listener_) { + listener_->resume(); + } + } catch (std::exception& ex) { + LOG_ERROR(ha_logger, HA_RESUME_CLIENT_LISTENER_FAILED) + .arg(ex.what()); } } diff --git a/src/lib/util/multi_threading_mgr.cc b/src/lib/util/multi_threading_mgr.cc index f783581061..608e01ed85 100644 --- a/src/lib/util/multi_threading_mgr.cc +++ b/src/lib/util/multi_threading_mgr.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2020 Internet Systems Consortium, Inc. ("ISC") +// Copyright (C) 2019-2021 Internet Systems Consortium, Inc. ("ISC") // // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this @@ -126,8 +126,13 @@ MultiThreadingMgr::stopProcessing() { } for (auto cb : critical_entry_cbs_.getCallbacks() ) { - // @todo need to think about what we do with exceptions - (cb.callback_)(); + try { + (cb.callback_)(); + } catch (...) { + // We can't log it and throwing could be chaos. + // We'll swallow it and tell people their callbacks + // must be exception-proof + } } } } @@ -140,8 +145,13 @@ MultiThreadingMgr::startProcessing() { } for (auto cb : critical_exit_cbs_.getCallbacks() ) { - // @todo need to think about what we do with exceptions - (cb.callback_)(); + try { + (cb.callback_)(); + } catch (...) { + // We can't log it and throwing could be chaos. + // We'll swallow it and tell people their callbacks + // must be exception-proof + } } } } diff --git a/src/lib/util/multi_threading_mgr.h b/src/lib/util/multi_threading_mgr.h index 9ad4373f41..0a0c2d4ae1 100644 --- a/src/lib/util/multi_threading_mgr.h +++ b/src/lib/util/multi_threading_mgr.h @@ -132,10 +132,14 @@ public: /// @brief Adds a pair of callbacks to the list of CriticalSection callbacks. /// + /// @note Callbacks must be exception-safe, handling any errors themselves. + /// /// @param name Name of the set of callbacks. This value is used by the - /// callback owner to remove or replace them.Duplicates are not allowed. - /// @param entry_cb Callback to invoke upon CriticalSection entry. - /// @param exit_cb Callback to invoke upon CriticalSection exit. + /// callback owner to add and remove them. Duplicates are not allowed. + /// @param entry_cb Callback to invoke upon CriticalSection entry. Cannot be + /// empty. + /// @param exit_cb Callback to invoke upon CriticalSection exit. Cannot be + /// be empty. void addCriticalSectionCallbacks(const std::string& name, const NamedCallback::Callback& entry_cb, const NamedCallback::Callback& exit_cb); @@ -143,7 +147,8 @@ public: /// @brief Removes the set of callbacks associated with a given name /// from the list of CriticalSection callbacks. /// - /// If the name is not found in the list, it simply returns. + /// If the name is not found in the list, it simply returns, otherwise + /// both callbacks registered under the name are removed. /// /// @param name Name of the set of callbacks to remove. void removeCriticalSectionCallbacks(const std::string& name); @@ -166,6 +171,10 @@ private: /// Stops the DHCP thread pool if it's running and invokes /// all CriticalSection entry callbacks. Has no effect /// in single-threaded mode. + /// + /// @note This function swallows exceptions thrown by exit + /// callbacks without logging to avoid breaking the CS + /// chain. void stopProcessing(); /// @brief Class method (re)starts non-critical processing. @@ -173,6 +182,10 @@ private: /// Starts the DHCP thread pool according to current configuration, /// and invokes all CriticalSection exit callbacks. Has no effect /// in single-threaded mode. + /// + /// @note This function swallows exceptions thrown by entry + /// callbacks without logging to avoid breaking the CS + /// chain. void startProcessing(); /// @brief The current multi-threading mode. diff --git a/src/lib/util/named_callback.h b/src/lib/util/named_callback.h index ac98459baa..13a1ba6336 100644 --- a/src/lib/util/named_callback.h +++ b/src/lib/util/named_callback.h @@ -39,7 +39,7 @@ struct NamedCallback { /// @brief Maintains list of unique NamedCallbacks. /// -/// The list emphasizes iteration order and speed over +/// The list emphasizes iteration order and speed over /// retrieval by name. When iterating over the list of /// callbacks, they are returned in the order they were /// added, not by name.