2
0
mirror of https://gitlab.isc.org/isc-projects/dhcp synced 2025-08-22 09:57:20 +00:00

[master] Patch the failover code to avoid deadlocks

Patch for 36810 & 20352
This coves several related problems
1) When the primary is in conflict done it allows the secondary to
transition around resolution interrupted and potentical conflict previously
the primary would die on an illegal state.

2) It allows the servers to restart a bind update request.  Previously if
one of the servers sent an udpate request and there died (or had the communications
interrupted) in some states the first server wouldn't retransmit a new
update request and the other server wouldn't send any bind updates. This
was noticed in potential conflict.

3) Updated the state transitions to move the leases on the ack queue
back to the update queue in case of conflict-done as we might need to
retransmit them all.

4) Updated a transition from startup to potentical conflict instead
of resolution interrupted when the servers reconnect during the startup
phase in order to avoid a diffferent dead lock.
This commit is contained in:
Shawn Routhier 2014-11-10 19:04:13 -08:00
parent 6444928cf5
commit 21d3034757
3 changed files with 142 additions and 72 deletions

View File

@ -137,6 +137,15 @@ by Eric Young (eay@cryptsoft.com).
the patch upon which the fix is based.
[ISC-Bugs #32222]
- In the failover code, handle the case of communications being interrupted
when the servers are dealing with POTENTIAL-CONFLICT. This patch allows
the primary to accept the secondary moving from POTENTIAL-CONFLICT to
RESOLUTION-INTERRUPTED as well as handling the bind update process better.
In addition the code to resend update or update all requests has been
modified to send requests more often.
[ISC-Bugs #36810]
[ISC-Bugs #20352]
Changes since 4.3.1b1
- Modify the linux and openwrt dhclient scripts to process information

View File

@ -3419,6 +3419,7 @@ isc_result_t dhcp_failover_state_signal (omapi_object_t *,
isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *,
const char *);
isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state);
void dhcp_failover_rescind_updates (dhcp_failover_state_t *);
isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *,
enum failover_state);
isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *,

View File

@ -1519,8 +1519,16 @@ isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *state,
/* In these situations, we remain in the current
* state, or if in startup enter those states.
*/
case communications_interrupted:
case conflict_done:
/* As the peer may not have received or may have
* lost track of updates we sent previously we
* rescind them, causing us to retransmit them
* on an update request.
*/
dhcp_failover_rescind_updates(state);
/* fall through */
case communications_interrupted:
case partner_down:
case paused:
case recover:
@ -1703,6 +1711,52 @@ isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state)
return ISC_R_SUCCESS;
}
/*!
* \brief Return any leases on the ack queue back to the update queue
*
* Re-schedule any pending updates by moving them from the ack queue
* (update sent awaiting response) back to the update queue (need to
* send an update for this lease). This will result in a retransmission
* of the update.
*
* \param state is the state block for the failover connection we are
* updating.
*/
void dhcp_failover_rescind_updates (dhcp_failover_state_t *state)
{
struct lease *lp;
if (state->ack_queue_tail == NULL)
return;
/* Zap the flags. */
for (lp = state->ack_queue_head; lp; lp = lp->next_pending)
lp->flags = ((lp->flags & ~ON_ACK_QUEUE) | ON_UPDATE_QUEUE);
/* Now hook the ack queue to the beginning of the update queue. */
if (state->update_queue_head) {
lease_reference(&state->ack_queue_tail->next_pending,
state->update_queue_head, MDL);
lease_dereference(&state->update_queue_head, MDL);
}
lease_reference(&state->update_queue_head, state->ack_queue_head, MDL);
if (!state->update_queue_tail) {
#if defined (POINTER_DEBUG)
if (state->ack_queue_tail->next_pending) {
log_error("next pending on ack queue tail.");
abort();
}
#endif
lease_reference(&state->update_queue_tail,
state->ack_queue_tail, MDL);
}
lease_dereference(&state->ack_queue_tail, MDL);
lease_dereference(&state->ack_queue_head, MDL);
state->cur_unacked_updates = 0;
}
isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
enum failover_state new_state)
{
@ -1721,37 +1775,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
case normal:
case potential_conflict:
case partner_down:
if (state -> ack_queue_tail) {
struct lease *lp;
/* Move the ack queue to the update queue */
dhcp_failover_rescind_updates(state);
/* Zap the flags. */
for (lp = state -> ack_queue_head; lp; lp = lp -> next_pending)
lp -> flags = ((lp -> flags & ~ON_ACK_QUEUE) |
ON_UPDATE_QUEUE);
/* Now hook the ack queue to the beginning of the update
queue. */
if (state -> update_queue_head) {
lease_reference (&state -> ack_queue_tail -> next_pending,
state -> update_queue_head, MDL);
lease_dereference (&state -> update_queue_head, MDL);
}
lease_reference (&state -> update_queue_head,
state -> ack_queue_head, MDL);
if (!state -> update_queue_tail) {
#if defined (POINTER_DEBUG)
if (state -> ack_queue_tail -> next_pending) {
log_error ("next pending on ack queue tail.");
abort ();
}
#endif
lease_reference (&state -> update_queue_tail,
state -> ack_queue_tail, MDL);
}
lease_dereference (&state -> ack_queue_tail, MDL);
lease_dereference (&state -> ack_queue_head, MDL);
state -> cur_unacked_updates = 0;
}
/* We will re-queue a timeout later, if applicable. */
cancel_timeout (dhcp_failover_keepalive, state);
break;
@ -1859,7 +1885,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
break;
case potential_conflict:
if (state -> i_am == primary)
if ((state->i_am == primary) ||
((state->i_am == secondary) &&
(state->partner.state == conflict_done)))
dhcp_failover_send_update_request (state);
break;
@ -1960,7 +1988,18 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
if (state -> partner.state == new_state && state -> me.state) {
switch (state -> me.state) {
case startup:
dhcp_failover_set_state (state, state -> saved_state);
/*
* If we have a peer state we must be connected.
* If so we should move to potential_conflict
* instead of resolution_interrupted, otherwise
* back to whereever we were before we stopped.
*/
if (state->saved_state == resolution_interrupted)
dhcp_failover_set_state(state,
potential_conflict);
else
dhcp_failover_set_state(state,
state->saved_state);
return ISC_R_SUCCESS;
case unknown_state:
@ -2183,6 +2222,17 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
dhcp_failover_set_state(state, new_state);
break;
case potential_conflict:
case resolution_interrupted:
/*
* This can happen when the connection is lost and
* recovered after the primary has moved to
* conflict-done but the secondary is still in
* potential-conflict. In that case, we have to
* remain in conflict-done.
*/
break;
default:
log_fatal("Peer %s: Invalid attempt to move from %s "
"to %s while local state is conflict-done.",
@ -4863,21 +4913,22 @@ isc_result_t dhcp_failover_send_update_request (dhcp_failover_state_t *state)
if (!state->link_to_peer ||
state->link_to_peer->type != dhcp_type_failover_link)
return DHCP_R_INVALIDARG;
return (DHCP_R_INVALIDARG);
link = (dhcp_failover_link_t *)state->link_to_peer;
if (!link->outer || link->outer->type != omapi_type_connection)
return DHCP_R_INVALIDARG;
return (DHCP_R_INVALIDARG);
if (state -> curUPD)
return ISC_R_ALREADYRUNNING;
/* We allow an update to be restarted in case we requested an update
* and were interrupted by something. If we had an ALL going we need
* to restart that. Otherwise we simply continue with the request */
if (state->curUPD == FTM_UPDREQALL) {
return (dhcp_failover_send_update_request_all(state));
}
status = (dhcp_failover_put_message
(link, link -> outer,
FTM_UPDREQ, link->xid++,
(failover_option_t *)0));
status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQ,
link->xid++, NULL));
if (status == ISC_R_SUCCESS)
state->curUPD = FTM_UPDREQ;
#if defined (DEBUG_FAILOVER_MESSAGES)
@ -4888,8 +4939,14 @@ isc_result_t dhcp_failover_send_update_request (dhcp_failover_state_t *state)
log_debug("%s", obuf);
}
#endif
if (status == ISC_R_SUCCESS) {
log_info("Sent update request message to %s", state->name);
return status;
} else {
log_error("Failed to send update request all message to %s: %s",
state->name, isc_result_totext(status));
}
return (status);
}
isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
@ -4909,22 +4966,19 @@ isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
if (!state->link_to_peer ||
state->link_to_peer->type != dhcp_type_failover_link)
return DHCP_R_INVALIDARG;
return (DHCP_R_INVALIDARG);
link = (dhcp_failover_link_t *)state->link_to_peer;
if (!link->outer || link->outer->type != omapi_type_connection)
return DHCP_R_INVALIDARG;
return (DHCP_R_INVALIDARG);
/* If there is an UPDREQ in progress, then upgrade to UPDREQALL. */
if (state -> curUPD && (state -> curUPD != FTM_UPDREQ))
return ISC_R_ALREADYRUNNING;
/* We allow an update to be restarted in case we requested an update
* and were interrupted by something.
*/
status = (dhcp_failover_put_message
(link, link -> outer,
FTM_UPDREQALL, link->xid++,
(failover_option_t *)0));
status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQALL,
link->xid++, NULL));
if (status == ISC_R_SUCCESS)
state->curUPD = FTM_UPDREQALL;
#if defined (DEBUG_FAILOVER_MESSAGES)
@ -4935,8 +4989,14 @@ isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
log_debug("%s", obuf);
}
#endif
if (status == ISC_R_SUCCESS) {
log_info("Sent update request all message to %s", state->name);
return status;
} else {
log_error("Failed to send update request all message to %s: %s",
state->name, isc_result_totext(status));
}
return (status);
}
isc_result_t dhcp_failover_send_update_done (dhcp_failover_state_t *state)