[master] Patch the failover code to avoid deadlocks

Patch for 36810 & 20352. This covers several related problems: 1) When the primary is in conflict-done it allows the secondary to transition around resolution-interrupted and potential-conflict; previously the primary would die on an illegal state. 2) It allows the servers to restart a bind update request. Previously, if one of the servers sent an update request and then died (or had the communications interrupted), in some states the first server wouldn't retransmit a new update request and the other server wouldn't send any bind updates. This was noticed in potential-conflict. 3) Updated the state transitions to move the leases on the ack queue back to the update queue in case of conflict-done, as we might need to retransmit them all. 4) Updated a transition from startup to potential-conflict instead of resolution-interrupted when the servers reconnect during the startup phase, in order to avoid a different deadlock.
2025-08-22 09:57:20 +00:00 · 2014-11-10 19:04:13 -08:00 · 2014-11-10 19:04:13 -08:00 · 21d3034757
commit 21d3034757
parent 6444928cf5
3 changed files with 142 additions and 72 deletions
--- a/9
+++ b/9
@ -137,6 +137,15 @@ by Eric Young (eay@cryptsoft.com).
  the patch upon which the fix is based.
  [ISC-Bugs #32222]

+- In the failover code, handle the case of communications being interrupted
+  when the servers are dealing with POTENTIAL-CONFLICT.  This patch allows
+  the primary to accept the secondary moving from POTENTIAL-CONFLICT to
+  RESOLUTION-INTERRUPTED as well as handling the bind update process better.
+  In addition the code to resend update or update all requests has been
+  modified to send requests more often.
+  [ISC-Bugs #36810]
+  [ISC-Bugs #20352]
+
 			Changes since 4.3.1b1

 - Modify the linux and openwrt dhclient scripts to process information
--- a/includes/dhcpd.h
+++ b/includes/dhcpd.h
@ -3419,6 +3419,7 @@ isc_result_t dhcp_failover_state_signal (omapi_object_t *,
 isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *,
 					     const char *);
 isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state);
+void dhcp_failover_rescind_updates (dhcp_failover_state_t *);
 isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *,
 				      enum failover_state);
 isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *,
--- a/server/failover.c
+++ b/server/failover.c
@ -1519,8 +1519,16 @@ isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *state,
 		      /* In these situations, we remain in the current
 		       * state, or if in startup enter those states.
 		       */
-		      case communications_interrupted:
 		      case conflict_done:
+			/* As the peer may not have received or may have
+			 * lost track of updates we sent previously we
+			 * rescind them, causing us to retransmit them
+			 * on an update request.
+			 */
+			dhcp_failover_rescind_updates(state);
+			/* fall through */
+
+		      case communications_interrupted:
 		      case partner_down:
 		      case paused:
 		      case recover:
@ -1703,6 +1711,52 @@ isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state)
 	return ISC_R_SUCCESS;
 }

+/*!
+ * \brief Return any leases on the ack queue back to the update queue
+ *
+ * Re-schedule any pending updates by moving them from the ack queue
+ * (update sent awaiting response) back to the update queue (need to
+ * send an update for this lease).  This will result in a retransmission
+ * of the update.
+ *
+ * If the ack queue is empty (ack_queue_tail is NULL) nothing is changed.
+ * Otherwise the ack queue entries are placed ahead of whatever is already
+ * on the update queue, the ack queue is left empty and
+ * cur_unacked_updates is reset to zero.
+ *
+ * \param state is the state block for the failover connection we are
+ * updating.
+ */
+
+void dhcp_failover_rescind_updates (dhcp_failover_state_t *state)
+{
+    struct lease *lp;
+
+    if (state->ack_queue_tail == NULL)
+	    return;
+
+    /* Zap the flags: every lease on the ack queue has ON_ACK_QUEUE
+     * cleared and ON_UPDATE_QUEUE set, so its flags match the queue
+     * it is about to be moved to. */
+    for (lp = state->ack_queue_head; lp; lp = lp->next_pending)
+	    lp->flags = ((lp->flags & ~ON_ACK_QUEUE) | ON_UPDATE_QUEUE);
+
+    /* Now hook the ack queue to the beginning of the update queue.
+     * If the update queue already has entries they are chained after
+     * the ack queue tail; if it has no tail, the ack queue tail also
+     * becomes the update queue tail.  Finally the ack queue's own
+     * head and tail references are dropped and the count of unacked
+     * updates is cleared. */
+    if (state->update_queue_head) {
+	    lease_reference(&state->ack_queue_tail->next_pending,
+			    state->update_queue_head, MDL);
+	    lease_dereference(&state->update_queue_head, MDL);
+    }
+    lease_reference(&state->update_queue_head, state->ack_queue_head, MDL);
+
+    if (!state->update_queue_tail) {
+#if defined (POINTER_DEBUG)
+	    if (state->ack_queue_tail->next_pending) {
+		    log_error("next pending on ack queue tail.");
+		    abort();
+	    }
+#endif
+	    lease_reference(&state->update_queue_tail,
+			    state->ack_queue_tail, MDL);
+    }
+    lease_dereference(&state->ack_queue_tail, MDL);
+    lease_dereference(&state->ack_queue_head, MDL);
+    state->cur_unacked_updates = 0;
+}
+
 isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
 				      enum failover_state new_state)
 {
@ -1721,37 +1775,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
      case normal:
      case potential_conflict:
      case partner_down:
-	if (state -> ack_queue_tail) {
-	    struct lease *lp;
+	/* Move the ack queue to the update queue */
+	dhcp_failover_rescind_updates(state);

-	    /* Zap the flags. */
-	    for (lp = state -> ack_queue_head; lp; lp = lp -> next_pending)
-		    lp -> flags = ((lp -> flags & ~ON_ACK_QUEUE) |
-				   ON_UPDATE_QUEUE);
-
-	    /* Now hook the ack queue to the beginning of the update
-	       queue. */
-	    if (state -> update_queue_head) {
-		lease_reference (&state -> ack_queue_tail -> next_pending,
-				 state -> update_queue_head, MDL);
-		lease_dereference (&state -> update_queue_head, MDL);
-	    }
-	    lease_reference (&state -> update_queue_head,
-			     state -> ack_queue_head, MDL);
-	    if (!state -> update_queue_tail) {
-#if defined (POINTER_DEBUG)
-		if (state -> ack_queue_tail -> next_pending) {
-		    log_error ("next pending on ack queue tail.");
-		    abort ();
-		}
-#endif
-		lease_reference (&state -> update_queue_tail,
-				 state -> ack_queue_tail, MDL);
-	    }
-	    lease_dereference (&state -> ack_queue_tail, MDL);
-	    lease_dereference (&state -> ack_queue_head, MDL);
-	    state -> cur_unacked_updates = 0;
-	}
 	/* We will re-queue a timeout later, if applicable. */
 	cancel_timeout (dhcp_failover_keepalive, state);
 	break;
@ -1859,7 +1885,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
 	    break;

 	  case potential_conflict:
-	    if (state -> i_am == primary)
+	    if ((state->i_am == primary) ||
+		((state->i_am == secondary) &&
+		 (state->partner.state == conflict_done)))
 		    dhcp_failover_send_update_request (state);
 	    break;

@ -1960,7 +1988,18 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
 	if (state -> partner.state == new_state && state -> me.state) {
 		switch (state -> me.state) {
 		      case startup:
-			dhcp_failover_set_state (state, state -> saved_state);
+			/*
+			 * If we have a peer state we must be connected.
+			 * If so we should move to potential_conflict
+			 * instead of resolution_interrupted, otherwise
+			 * back to wherever we were before we stopped.
+			 */
+			if (state->saved_state == resolution_interrupted)
+				dhcp_failover_set_state(state,
+							potential_conflict);
+			else 
+				dhcp_failover_set_state(state,
+							state->saved_state);
 			return ISC_R_SUCCESS;

 		      case unknown_state:
@ -2183,6 +2222,17 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
 			dhcp_failover_set_state(state, new_state);
 			break;

+		      case potential_conflict:
+		      case resolution_interrupted:
+			/*
+			 * This can happen when the connection is lost and 
+			 * recovered after the primary has moved to 
+			 * conflict-done but the secondary is still in 
+			 * potential-conflict.  In that case, we have to 
+			 * remain in conflict-done.
+			 */
+			break;
+
 		      default:
 			log_fatal("Peer %s: Invalid attempt to move from %s "
 				"to %s while local state is conflict-done.",
@ -4863,21 +4913,22 @@ isc_result_t dhcp_failover_send_update_request (dhcp_failover_state_t *state)

 	if (!state->link_to_peer ||
 	    state->link_to_peer->type != dhcp_type_failover_link)
-		return DHCP_R_INVALIDARG;
+		return (DHCP_R_INVALIDARG);
 	link = (dhcp_failover_link_t *)state->link_to_peer;

 	if (!link->outer || link->outer->type != omapi_type_connection)
-		return DHCP_R_INVALIDARG;
+		return (DHCP_R_INVALIDARG);

-	if (state -> curUPD)
-		return ISC_R_ALREADYRUNNING;
+	/* We allow an update to be restarted in case we requested an update
+	 * and were interrupted by something. If we had an ALL going we need
+	 * to restart that.  Otherwise we simply continue with the request */
+	if (state->curUPD == FTM_UPDREQALL) {
+		return (dhcp_failover_send_update_request_all(state));
+	}

-	status = (dhcp_failover_put_message
-		  (link, link -> outer,
-		   FTM_UPDREQ, link->xid++,
-		   (failover_option_t *)0));
+	status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQ,
+					    link->xid++, NULL));

-	if (status == ISC_R_SUCCESS)
 	state->curUPD = FTM_UPDREQ;

 #if defined (DEBUG_FAILOVER_MESSAGES)
@ -4888,8 +4939,14 @@ isc_result_t dhcp_failover_send_update_request (dhcp_failover_state_t *state)
 		log_debug("%s", obuf);
 	}
 #endif
+
+	if (status == ISC_R_SUCCESS) {
 		log_info("Sent update request message to %s", state->name);
-	return status;
+	} else {
+		log_error("Failed to send update request all message to %s: %s",
+			 state->name, isc_result_totext(status));
+	}
+	return (status);
 }

 isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
@ -4909,22 +4966,19 @@ isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t

 	if (!state->link_to_peer ||
 	    state->link_to_peer->type != dhcp_type_failover_link)
-		return DHCP_R_INVALIDARG;
+		return (DHCP_R_INVALIDARG);
 	link = (dhcp_failover_link_t *)state->link_to_peer;

 	if (!link->outer || link->outer->type != omapi_type_connection)
-		return DHCP_R_INVALIDARG;
+		return (DHCP_R_INVALIDARG);

-	/* If there is an UPDREQ in progress, then upgrade to UPDREQALL. */
-	if (state -> curUPD && (state -> curUPD != FTM_UPDREQ))
-		return ISC_R_ALREADYRUNNING;
+	/* We allow an update to be restarted in case we requested an update
+	 * and were interrupted by something.
+	 */

-	status = (dhcp_failover_put_message
-		  (link, link -> outer,
-		   FTM_UPDREQALL, link->xid++,
-		   (failover_option_t *)0));
+	status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQALL,
+					    link->xid++, NULL));

-	if (status == ISC_R_SUCCESS)
 	state->curUPD = FTM_UPDREQALL;

 #if defined (DEBUG_FAILOVER_MESSAGES)
@ -4935,8 +4989,14 @@ isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
 		log_debug("%s", obuf);
 	}
 #endif
+
+	if (status == ISC_R_SUCCESS) {
 		log_info("Sent update request all message to %s", state->name);
-	return status;
+	} else {
+		log_error("Failed to send update request all message to %s: %s",
+			 state->name, isc_result_totext(status));
+	}
+	return (status);
 }

 isc_result_t dhcp_failover_send_update_done (dhcp_failover_state_t *state)