mirror of
https://gitlab.isc.org/isc-projects/bind9
synced 2025-09-01 06:55:30 +00:00
Refactor stale lookups, ignore active RRsets
When doing a staleonly lookup, ignore active RRsets from cache. If we don't, we may add a duplicate RRset to the message, and hit an assertion failure in query.c because adding the duplicate RRset to the ANSWER section failed. This can happen on a race condition. When a client query is received, the recursion is started. When 'stale-answer-client-timeout' triggers around the same time the recursion completes, the following sequence of events may happen: 1. Queue the "try stale" fetch_callback() event to the client task. 2. Add the RRsets from the authoritative response to the cache. 3. Queue the "fetch complete" fetch_callback() event to the client task. 4. Execute the "try stale" fetch_callback(), which retrieves the just-inserted RRset from the database. 5. In "ns_query_done()" we are still recursing, but the "staleonly" query attribute has already been cleared. In other words, the query will resume when recursion ends (it already has ended but is still on the task queue). 6. Execute the "fetch complete" fetch_callback(). It finds the answer from recursion in the cache again and tries to add the duplicate to the answer section. This commit changes the logic for finding stale answers in the cache, such that on "stale_only" lookups actually only stale RRsets are considered. It refactors the code so that code paths for "dbfind_stale", "stale_refresh_window", and "stale_only" are more clear. First we call some generic code that applies in all three cases, formatting the domain name for logging purposes, increment the trystale stats, and check if we actually found stale data that we can use. The "dbfind_stale" lookup will return SERVFAIL if we didn't found a usable answer, otherwise we will continue with the lookup (query_gotanswer()). This is no different as before the introduction of "stale-answer-client-timeout" and "stale-refresh-time". The "stale_refresh_window" lookup is similar to the "dbfind_stale" lookup: return SERVFAIL if we didn't found a usable answer, otherwise continue with the lookup (query_gotanswer()). Finally the "stale_only" lookup. If the "stale_only" lookup was triggered because of an actual client timeout (stale-answer-client-timeout > 0), and if database lookup returned a stale usable RRset, trigger a response to the client. Otherwise return and wait until the recursion completes (or the resolver query times out). If the "stale_only" lookup is a "stale-anwer-client-timeout 0" lookup, preferring stale data over a lookup. In this case if there was no stale data, or the data was not a positive answer, retry the lookup with the stale options cleared, a.k.a. a normal lookup. Otherwise, continue with the lookup (query_gotanswer()) and refresh the stale RRset. This will trigger a response to the client, but will not detach the handle because a fetch will be created to refresh the RRset.
This commit is contained in:
117
lib/ns/query.c
117
lib/ns/query.c
@@ -5744,6 +5744,7 @@ query_lookup(query_ctx_t *qctx) {
|
|||||||
dns_clientinfomethods_t cm;
|
dns_clientinfomethods_t cm;
|
||||||
dns_clientinfo_t ci;
|
dns_clientinfo_t ci;
|
||||||
dns_name_t *rpzqname = NULL;
|
dns_name_t *rpzqname = NULL;
|
||||||
|
char namebuf[DNS_NAME_FORMATSIZE];
|
||||||
unsigned int dboptions;
|
unsigned int dboptions;
|
||||||
dns_ttl_t stale_refresh = 0;
|
dns_ttl_t stale_refresh = 0;
|
||||||
bool dbfind_stale = false;
|
bool dbfind_stale = false;
|
||||||
@@ -5861,7 +5862,8 @@ query_lookup(query_ctx_t *qctx) {
|
|||||||
if (dbfind_stale ||
|
if (dbfind_stale ||
|
||||||
(STALE(qctx->rdataset) && (stale_only || stale_refresh_window)))
|
(STALE(qctx->rdataset) && (stale_only || stale_refresh_window)))
|
||||||
{
|
{
|
||||||
char namebuf[DNS_NAME_FORMATSIZE];
|
dns_name_format(qctx->client->query.qname, namebuf,
|
||||||
|
sizeof(namebuf));
|
||||||
|
|
||||||
inc_stats(qctx->client, ns_statscounter_trystale);
|
inc_stats(qctx->client, ns_statscounter_trystale);
|
||||||
|
|
||||||
@@ -5874,96 +5876,90 @@ query_lookup(query_ctx_t *qctx) {
|
|||||||
} else {
|
} else {
|
||||||
stale_found = false;
|
stale_found = false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
dns_name_format(qctx->client->query.qname, namebuf,
|
|
||||||
sizeof(namebuf));
|
|
||||||
|
|
||||||
if (dbfind_stale) {
|
if (dbfind_stale) {
|
||||||
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
|
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
|
||||||
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
|
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
|
||||||
"%s resolver failure, stale answer %s",
|
"%s resolver failure, stale answer %s", namebuf,
|
||||||
namebuf,
|
|
||||||
stale_found ? "used" : "unavailable");
|
stale_found ? "used" : "unavailable");
|
||||||
if (!stale_found) {
|
if (!stale_found) {
|
||||||
/*
|
/*
|
||||||
* Resolver failure, no stale data, nothing
|
* Resolver failure, no stale data, nothing more we
|
||||||
* more we can do, return SERVFAIL.
|
* can do, return SERVFAIL.
|
||||||
*/
|
*/
|
||||||
QUERY_ERROR(qctx, DNS_R_SERVFAIL);
|
QUERY_ERROR(qctx, DNS_R_SERVFAIL);
|
||||||
return (ns_query_done(qctx));
|
return (ns_query_done(qctx));
|
||||||
}
|
}
|
||||||
|
qctx->client->nodetach = false;
|
||||||
stale_only = false;
|
|
||||||
} else if (stale_refresh_window) {
|
} else if (stale_refresh_window) {
|
||||||
/*
|
/*
|
||||||
* A recent lookup failed, so during this time window
|
* A recent lookup failed, so during this time window we are
|
||||||
* we are allowed to return stale data immediately.
|
* allowed to return stale data immediately.
|
||||||
*/
|
*/
|
||||||
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
|
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
|
||||||
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
|
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
|
||||||
"%s query within stale refresh time, "
|
"%s query within stale refresh time, stale "
|
||||||
"stale answer %s",
|
"answer %s",
|
||||||
namebuf,
|
namebuf, stale_found ? "used" : "unavailable");
|
||||||
stale_found ? "used" : "unavailable");
|
|
||||||
|
|
||||||
if (!stale_found) {
|
if (!stale_found) {
|
||||||
/*
|
/*
|
||||||
* During the stale refresh window explicitly
|
* During the stale refresh window explicitly do not try
|
||||||
* do not try to refresh the data, because a
|
* to refresh the data, because a recent lookup failed.
|
||||||
* recent lookup failed.
|
|
||||||
*/
|
*/
|
||||||
QUERY_ERROR(qctx, DNS_R_SERVFAIL);
|
QUERY_ERROR(qctx, DNS_R_SERVFAIL);
|
||||||
return (ns_query_done(qctx));
|
return (ns_query_done(qctx));
|
||||||
}
|
}
|
||||||
|
qctx->client->nodetach = false;
|
||||||
stale_only = false;
|
} else if (stale_only) {
|
||||||
} else if ((dboptions & DNS_DBFIND_STALEONLY) != 0) {
|
|
||||||
if ((qctx->options & DNS_GETDB_STALEFIRST) != 0) {
|
|
||||||
if (stale_found && result == ISC_R_SUCCESS) {
|
|
||||||
/*
|
/*
|
||||||
* Immediately return the stale answer,
|
* This is a staleonly lookup: if there wass no stale answer
|
||||||
* start a resolver fetch to refresh
|
* in cache, treat as we don't have an answer and wait for the
|
||||||
* the data in cache.
|
* resolver fetch to finish.
|
||||||
*/
|
*/
|
||||||
isc_log_write(
|
qctx->client->nodetach = !stale_found;
|
||||||
ns_lctx,
|
|
||||||
NS_LOGCATEGORY_SERVE_STALE,
|
if ((qctx->options & DNS_GETDB_STALEFIRST) != 0) {
|
||||||
NS_LOGMODULE_QUERY,
|
if (!stale_found || result != ISC_R_SUCCESS) {
|
||||||
ISC_LOG_INFO,
|
|
||||||
"%s stale answer used, an "
|
|
||||||
"attempt to refresh the RRset "
|
|
||||||
"will still be made",
|
|
||||||
namebuf);
|
|
||||||
refresh_rrset = true;
|
|
||||||
} else {
|
|
||||||
/*
|
/*
|
||||||
* We have nothing useful in cache to
|
* We have nothing useful in cache to return
|
||||||
* return immediately.
|
* immediately.
|
||||||
*/
|
*/
|
||||||
qctx_clean(qctx);
|
qctx_clean(qctx);
|
||||||
qctx_freedata(qctx);
|
qctx_freedata(qctx);
|
||||||
dns_db_attach(
|
dns_db_attach(qctx->client->view->cachedb,
|
||||||
qctx->client->view->cachedb,
|
|
||||||
&qctx->db);
|
&qctx->db);
|
||||||
qctx->client->query.dboptions &=
|
qctx->client->query.dboptions &=
|
||||||
~DNS_DBFIND_STALEONLY;
|
~DNS_DBFIND_STALEONLY;
|
||||||
qctx->client->nodetach = false;
|
|
||||||
qctx->options &= ~DNS_GETDB_STALEFIRST;
|
qctx->options &= ~DNS_GETDB_STALEFIRST;
|
||||||
if (qctx->client->query.fetch != NULL) {
|
if (qctx->client->query.fetch != NULL) {
|
||||||
dns_resolver_destroyfetch(
|
dns_resolver_destroyfetch(
|
||||||
&qctx->client->query
|
&qctx->client->query.fetch);
|
||||||
.fetch);
|
|
||||||
}
|
}
|
||||||
return query_lookup(qctx);
|
return query_lookup(qctx);
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
|
INSIST(stale_found);
|
||||||
|
INSIST(result == ISC_R_SUCCESS);
|
||||||
/*
|
/*
|
||||||
* The 'stale-answer-client-timeout' triggered,
|
* Immediately return the stale answer, start a
|
||||||
* return the stale answer if available,
|
* resolver fetch to refresh the data in cache.
|
||||||
* otherwise wait until the resolver finishes.
|
|
||||||
*/
|
*/
|
||||||
isc_log_write(
|
isc_log_write(
|
||||||
ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
|
ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
|
||||||
|
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
|
||||||
|
"%s stale answer used, an attempt to "
|
||||||
|
"refresh the RRset will still be made",
|
||||||
|
namebuf);
|
||||||
|
refresh_rrset = STALE(qctx->rdataset);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* The 'stale-answer-client-timeout' triggered, return
|
||||||
|
* the stale answer if available, otherwise wait until
|
||||||
|
* the resolver finishes.
|
||||||
|
*/
|
||||||
|
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
|
||||||
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
|
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
|
||||||
"%s client timeout, stale answer %s",
|
"%s client timeout, stale answer %s",
|
||||||
namebuf,
|
namebuf,
|
||||||
@@ -5973,29 +5969,6 @@ query_lookup(query_ctx_t *qctx) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (stale_only && result != ISC_R_SUCCESS) {
|
|
||||||
/*
|
|
||||||
* This is a staleonly lookup and no stale answer was found
|
|
||||||
* in cache. Treat as we don't have an answer and wait for
|
|
||||||
* the resolver fetch to finish.
|
|
||||||
*/
|
|
||||||
if ((qctx->options & DNS_GETDB_STALEFIRST) == 0) {
|
|
||||||
return (result);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
stale_only = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!stale_only) {
|
|
||||||
/*
|
|
||||||
* The DNS_DBFIND_STALEONLY option may be set, but if we
|
|
||||||
* didn't have stale data in cache, or we responded with
|
|
||||||
* a stale answer because of 'stale-refresh-time', this is
|
|
||||||
* actually not a 'stale-only' lookup. Clear the flag to
|
|
||||||
* allow the client to be detach the handle.
|
|
||||||
*/
|
|
||||||
qctx->client->nodetach = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
result = query_gotanswer(qctx, result);
|
result = query_gotanswer(qctx, result);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user