diff --git a/CHANGES b/CHANGES index fb144ded61..b406b1904d 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +2538. [bug] cache/ADB memory could grow over max-cache-size, + especially with threads and smaller max-cache-size + values. [RT #19240] + 2537. [func] Added more statistics counters including those on socket I/O events and query RTT histograms. [RT #18802] diff --git a/lib/dns/adb.c b/lib/dns/adb.c index ded6415df4..29203cb81c 100644 --- a/lib/dns/adb.c +++ b/lib/dns/adb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: adb.c,v 1.245 2009/01/27 23:47:54 tbox Exp $ */ +/* $Id: adb.c,v 1.246 2009/01/28 23:20:23 jinmei Exp $ */ /*! \file * @@ -141,6 +141,7 @@ struct dns_adb { * XXXRTH Have a per-bucket structure that contains all of these? */ dns_adbnamelist_t names[NBUCKETS]; + dns_adbnamelist_t deadnames[NBUCKETS]; /*% See dns_adbnamelist_t */ isc_mutex_t namelocks[NBUCKETS]; /*% See dns_adbnamelist_t */ @@ -154,6 +155,7 @@ struct dns_adb { * XXXRTH Have a per-bucket structure that contains all of these? */ dns_adbentrylist_t entries[NBUCKETS]; + dns_adbentrylist_t deadentries[NBUCKETS]; isc_mutex_t entrylocks[NBUCKETS]; isc_boolean_t entry_sd[NBUCKETS]; /*%< shutting down */ unsigned int entry_refcnt[NBUCKETS]; @@ -279,7 +281,8 @@ static inline void free_adbfetch(dns_adb_t *, dns_adbfetch_t **); static inline dns_adbname_t *find_name_and_lock(dns_adb_t *, dns_name_t *, unsigned int, int *); static inline dns_adbentry_t *find_entry_and_lock(dns_adb_t *, - isc_sockaddr_t *, int *); + isc_sockaddr_t *, int *, + isc_stdtime_t); static void dump_adb(dns_adb_t *, FILE *, isc_boolean_t debug, isc_stdtime_t); static void print_dns_name(FILE *, dns_name_t *); static void print_namehook_list(FILE *, const char *legend, @@ -296,12 +299,13 @@ static inline void inc_entry_refcnt(dns_adb_t *, dns_adbentry_t *, static inline isc_boolean_t dec_entry_refcnt(dns_adb_t *, dns_adbentry_t *, isc_boolean_t); static inline void violate_locking_hierarchy(isc_mutex_t *, isc_mutex_t *); -static isc_boolean_t clean_namehooks(dns_adb_t *, dns_adbnamehooklist_t *, - isc_boolean_t); +static isc_boolean_t clean_namehooks(dns_adb_t *, dns_adbnamehooklist_t *); static void clean_target(dns_adb_t *, dns_name_t *); static void clean_finds_at_name(dns_adbname_t *, isc_eventtype_t, unsigned int); static isc_boolean_t check_expire_namehooks(dns_adbname_t *, isc_stdtime_t); +static isc_boolean_t check_expire_entry(dns_adb_t *, dns_adbentry_t **, + isc_stdtime_t); static void cancel_fetches_at_name(dns_adbname_t *); static isc_result_t dbfind_name(dns_adbname_t *, isc_stdtime_t, dns_rdatatype_t); @@ -315,8 +319,7 @@ static inline void link_name(dns_adb_t *, int, dns_adbname_t *); static inline isc_boolean_t unlink_name(dns_adb_t *, dns_adbname_t *); static inline void link_entry(dns_adb_t *, int, dns_adbentry_t *); static inline isc_boolean_t unlink_entry(dns_adb_t *, dns_adbentry_t *); -static isc_boolean_t kill_name(dns_adbname_t **, isc_eventtype_t, - isc_boolean_t); +static isc_boolean_t kill_name(dns_adbname_t **, isc_eventtype_t); static void water(void *, int); static void dump_entry(FILE *, dns_adbentry_t *, isc_boolean_t, isc_stdtime_t); @@ -338,6 +341,12 @@ static void dump_entry(FILE *, dns_adbentry_t *, isc_boolean_t, isc_stdtime_t); #define NAME_GLUEOK(n) (((n)->flags & NAME_GLUE_OK) != 0) #define NAME_HINTOK(n) (((n)->flags & NAME_HINT_OK) != 0) +/* + * Private flag(s) for entries. + * MUST NOT overlap FCTX_ADDRINFO_xxx and DNS_FETCHOPT_NOEDNS0. + */ +#define ENTRY_IS_DEAD 0x80000000 + /* * To the name, address classes are all that really exist. If it has a * V6 address it doesn't care if it came from a AAAA query. @@ -540,7 +549,8 @@ import_rdataset(dns_adbname_t *adbname, dns_rdataset_t *rdataset, goto fail; } - foundentry = find_entry_and_lock(adb, &sockaddr, &addr_bucket); + foundentry = find_entry_and_lock(adb, &sockaddr, &addr_bucket, + now); if (foundentry == NULL) { dns_adbentry_t *entry; @@ -617,10 +627,11 @@ import_rdataset(dns_adbname_t *adbname, dns_rdataset_t *rdataset, * Requires the name's bucket be locked. */ static isc_boolean_t -kill_name(dns_adbname_t **n, isc_eventtype_t ev, isc_boolean_t is_purge) { +kill_name(dns_adbname_t **n, isc_eventtype_t ev) { dns_adbname_t *name; isc_boolean_t result = ISC_FALSE; isc_boolean_t result4, result6; + int bucket; dns_adb_t *adb; INSIST(n != NULL); @@ -649,8 +660,8 @@ kill_name(dns_adbname_t **n, isc_eventtype_t ev, isc_boolean_t is_purge) { * in that they will always empty the list. */ clean_finds_at_name(name, ev, DNS_ADBFIND_ADDRESSMASK); - result4 = clean_namehooks(adb, &name->v4, is_purge); - result6 = clean_namehooks(adb, &name->v6, is_purge); + result4 = clean_namehooks(adb, &name->v4); + result6 = clean_namehooks(adb, &name->v6); clean_target(adb, &name->target); result = ISC_TF(result4 || result6); @@ -665,8 +676,13 @@ kill_name(dns_adbname_t **n, isc_eventtype_t ev, isc_boolean_t is_purge) { if (result) result = dec_adb_irefcnt(adb); } else { - name->flags |= NAME_IS_DEAD; cancel_fetches_at_name(name); + if (!NAME_DEAD(name)) { + bucket = name->lock_bucket; + ISC_LIST_UNLINK(adb->names[bucket], name, plink); + ISC_LIST_APPEND(adb->deadnames[bucket], name, plink); + name->flags |= NAME_IS_DEAD; + } } return (result); } @@ -690,7 +706,7 @@ check_expire_namehooks(dns_adbname_t *name, isc_stdtime_t now) { if (!NAME_FETCH_V4(name) && EXPIRE_OK(name->expire_v4, now)) { if (NAME_HAS_V4(name)) { DP(DEF_LEVEL, "expiring v4 for name %p", name); - result4 = clean_namehooks(adb, &name->v4, ISC_FALSE); + result4 = clean_namehooks(adb, &name->v4); name->partial_result &= ~DNS_ADBFIND_INET; } name->expire_v4 = INT_MAX; @@ -703,7 +719,7 @@ check_expire_namehooks(dns_adbname_t *name, isc_stdtime_t now) { if (!NAME_FETCH_V6(name) && EXPIRE_OK(name->expire_v6, now)) { if (NAME_HAS_V6(name)) { DP(DEF_LEVEL, "expiring v6 for name %p", name); - result6 = clean_namehooks(adb, &name->v6, ISC_FALSE); + result6 = clean_namehooks(adb, &name->v6); name->partial_result &= ~DNS_ADBFIND_INET6; } name->expire_v6 = INT_MAX; @@ -743,7 +759,10 @@ unlink_name(dns_adb_t *adb, dns_adbname_t *name) { bucket = name->lock_bucket; INSIST(bucket != DNS_ADB_INVALIDBUCKET); - ISC_LIST_UNLINK(adb->names[bucket], name, plink); + if (NAME_DEAD(name)) + ISC_LIST_UNLINK(adb->deadnames[bucket], name, plink); + else + ISC_LIST_UNLINK(adb->names[bucket], name, plink); name->lock_bucket = DNS_ADB_INVALIDBUCKET; INSIST(adb->name_refcnt[bucket] > 0); adb->name_refcnt[bucket]--; @@ -757,6 +776,26 @@ unlink_name(dns_adb_t *adb, dns_adbname_t *name) { */ static inline void link_entry(dns_adb_t *adb, int bucket, dns_adbentry_t *entry) { + int i; + dns_adbentry_t *e; + + if (adb->overmem) { + for (i = 0; i < 2; i++) { + e = ISC_LIST_TAIL(adb->entries[bucket]); + if (e == NULL) + break; + if (e->refcnt == 0) { + unlink_entry(adb, e); + free_adbentry(adb, &e); + continue; + } + INSIST((e->flags & ENTRY_IS_DEAD) == 0); + e->flags |= ENTRY_IS_DEAD; + ISC_LIST_UNLINK(adb->entries[bucket], e, plink); + ISC_LIST_PREPEND(adb->deadentries[bucket], e, plink); + } + } + ISC_LIST_PREPEND(adb->entries[bucket], entry, plink); entry->lock_bucket = bucket; adb->entry_refcnt[bucket]++; @@ -773,7 +812,10 @@ unlink_entry(dns_adb_t *adb, dns_adbentry_t *entry) { bucket = entry->lock_bucket; INSIST(bucket != DNS_ADB_INVALIDBUCKET); - ISC_LIST_UNLINK(adb->entries[bucket], entry, plink); + if ((entry->flags & ENTRY_IS_DEAD) != 0) + ISC_LIST_UNLINK(adb->deadentries[bucket], entry, plink); + else + ISC_LIST_UNLINK(adb->entries[bucket], entry, plink); entry->lock_bucket = DNS_ADB_INVALIDBUCKET; INSIST(adb->entry_refcnt[bucket] > 0); adb->entry_refcnt[bucket]--; @@ -826,8 +868,7 @@ shutdown_names(dns_adb_t *adb) { next_name = ISC_LIST_NEXT(name, plink); INSIST(result == ISC_FALSE); result = kill_name(&name, - DNS_EVENT_ADBSHUTDOWN, - ISC_FALSE); + DNS_EVENT_ADBSHUTDOWN); name = next_name; } } @@ -853,7 +894,7 @@ shutdown_entries(dns_adb_t *adb) { adb->entry_sd[bucket] = ISC_TRUE; entry = ISC_LIST_HEAD(adb->entries[bucket]); - if (entry == NULL) { + if (adb->entry_refcnt[bucket] == 0) { /* * This bucket has no entries. We must decrement the * irefcnt ourselves, since it will not be @@ -899,9 +940,7 @@ cancel_fetches_at_name(dns_adbname_t *name) { * Assumes the name bucket is locked. */ static isc_boolean_t -clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks, - isc_boolean_t is_purge) -{ +clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks) { dns_adbentry_t *entry; dns_adbnamehook_t *namehook; int addr_bucket; @@ -926,13 +965,6 @@ clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks, LOCK(&adb->entrylocks[addr_bucket]); } - /* - * If we are in an overmem situation, force expiration - * so that # of names and # of entries are well - * balanced. - */ - if (is_purge) - entry->expires = 0; result = dec_entry_refcnt(adb, entry, ISC_FALSE); } @@ -1220,7 +1252,8 @@ dec_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { destroy_entry = ISC_FALSE; if (entry->refcnt == 0 && - (adb->entry_sd[bucket] || entry->expires == 0)) { + (adb->entry_sd[bucket] || entry->expires == 0 || adb->overmem || + (entry->flags & ENTRY_IS_DEAD) != 0)) { destroy_entry = ISC_TRUE; result = unlink_entry(adb, entry); } @@ -1622,8 +1655,10 @@ find_name_and_lock(dns_adb_t *adb, dns_name_t *name, * the bucket changes. */ static inline dns_adbentry_t * -find_entry_and_lock(dns_adb_t *adb, isc_sockaddr_t *addr, int *bucketp) { - dns_adbentry_t *entry; +find_entry_and_lock(dns_adb_t *adb, isc_sockaddr_t *addr, int *bucketp, + isc_stdtime_t now) +{ + dns_adbentry_t *entry, *entry_next; int bucket; bucket = isc_sockaddr_hash(addr, ISC_TRUE) % NBUCKETS; @@ -1637,11 +1672,18 @@ find_entry_and_lock(dns_adb_t *adb, isc_sockaddr_t *addr, int *bucketp) { *bucketp = bucket; } - entry = ISC_LIST_HEAD(adb->entries[bucket]); - while (entry != NULL) { - if (isc_sockaddr_equal(addr, &entry->sockaddr)) + /* Search the list, while cleaning up expired entries. */ + for (entry = ISC_LIST_HEAD(adb->entries[bucket]); + entry != NULL; + entry = entry_next) { + entry_next = ISC_LIST_NEXT(entry, plink); + (void)check_expire_entry(adb, &entry, now); + if (entry != NULL && + isc_sockaddr_equal(addr, &entry->sockaddr)) { + ISC_LIST_UNLINK(adb->entries[bucket], entry, plink); + ISC_LIST_PREPEND(adb->entries[bucket], entry, plink); return (entry); - entry = ISC_LIST_NEXT(entry, plink); + } } return (NULL); @@ -1809,7 +1851,7 @@ check_expire_name(dns_adbname_t **namep, isc_stdtime_t now) { /* * The name is empty. Delete it. */ - result = kill_name(&name, DNS_EVENT_ADBEXPIRED, ISC_FALSE); + result = kill_name(&name, DNS_EVENT_ADBEXPIRED); *namep = NULL; /* @@ -1852,16 +1894,9 @@ check_stale_name(dns_adb_t *adb, int bucket, isc_stdtime_t now) { for (victims = 0; victim != NULL && victims < max_victims && scans < 10; victim = next_victim) { + INSIST(!NAME_DEAD(victim)); scans++; next_victim = ISC_LIST_PREV(victim, plink); - - /* - * If the victim is already dead, it simply waits for some - * final events. Ignore it. - */ - if (NAME_DEAD(victim)) - goto next; - result = check_expire_name(&victim, now); if (victim == NULL) { victims++; @@ -1871,8 +1906,7 @@ check_stale_name(dns_adb_t *adb, int bucket, isc_stdtime_t now) { if (!NAME_FETCH(victim) && (overmem || victim->last_used + ADB_STALE_MARGIN <= now)) { RUNTIME_CHECK(kill_name(&victim, - DNS_EVENT_ADBCANCELED, - ISC_TRUE) == + DNS_EVENT_ADBCANCELED) == ISC_FALSE); victims++; } @@ -2070,12 +2104,14 @@ dns_adb_create(isc_mem_t *mem, dns_view_t *view, isc_timermgr_t *timermgr, goto fail1; for (i = 0; i < NBUCKETS; i++) { ISC_LIST_INIT(adb->names[i]); + ISC_LIST_INIT(adb->deadnames[i]); adb->name_sd[i] = ISC_FALSE; adb->name_refcnt[i] = 0; adb->irefcnt++; } for (i = 0; i < NBUCKETS; i++) { ISC_LIST_INIT(adb->entries[i]); + ISC_LIST_INIT(adb->deadentries[i]); adb->entry_sd[i] = ISC_FALSE; adb->entry_refcnt[i] = 0; adb->irefcnt++; @@ -3157,8 +3193,7 @@ fetch_callback(isc_task_t *task, isc_event_t *ev) { free_adbfetch(adb, &fetch); isc_event_free(&ev); - want_check_exit = kill_name(&name, DNS_EVENT_ADBCANCELED, - ISC_FALSE); + want_check_exit = kill_name(&name, DNS_EVENT_ADBCANCELED); UNLOCK(&adb->namelocks[bucket]); @@ -3457,7 +3492,7 @@ dns_adb_findaddrinfo(dns_adb_t *adb, isc_sockaddr_t *sa, result = ISC_R_SUCCESS; bucket = DNS_ADB_INVALIDBUCKET; - entry = find_entry_and_lock(adb, sa, &bucket); + entry = find_entry_and_lock(adb, sa, &bucket, now); if (adb->entry_sd[bucket]) { result = ISC_R_SHUTTINGDOWN; goto unlock; @@ -3570,8 +3605,7 @@ dns_adb_flushname(dns_adb_t *adb, dns_name_t *name) { if (!NAME_DEAD(adbname) && dns_name_equal(name, &adbname->name)) { RUNTIME_CHECK(kill_name(&adbname, - DNS_EVENT_ADBCANCELED, - ISC_TRUE) == + DNS_EVENT_ADBCANCELED) == ISC_FALSE); } adbname = nextname; diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index e6220971b9..02c1c47849 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.272 2009/01/17 23:47:42 tbox Exp $ */ +/* $Id: rbtdb.c,v 1.273 2009/01/28 23:20:23 jinmei Exp $ */ /*! \file */ @@ -190,6 +190,15 @@ typedef isc_mutex_t nodelock_t; #define NODE_WEAKDOWNGRADE(l) ((void)0) #endif +/*% + * Whether to rate-limit updating the LRU to avoid possible thread contention. + * Our performance measurement has shown the cost is marginal, so it's defined + * to be 0 by default either with or without threads. + */ +#ifndef DNS_RBTDB_LIMITLRUUPDATE +#define DNS_RBTDB_LIMITLRUUPDATE 0 +#endif + /* * Allow clients with a virtual time of up to 5 minutes in the past to see * records that would have otherwise have expired. @@ -8302,15 +8311,17 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, /*% * See if a given cache entry that is being reused needs to be updated - * in the LRU-list. For the non-threaded case this is always true unless the - * entry has already been marked as stale; for the threaded case, updating - * the entry every time it is referenced might be expensive because it requires - * a node write lock. Thus this function returns true if the entry has not been - * updated for some period of time. We differentiate the NS or glue address - * case and the others since experiments have shown that the former tends to be - * accessed relatively infrequently and the cost of cache miss is higher - * (e.g., a missing NS records may cause external queries at a higher level - * zone, involving more transactions). + * in the LRU-list. From the LRU management point of view, this function is + * expected to return true for almost all cases. When used with threads, + * however, this may cause a non-negligible performance penalty because a + * writer lock will have to be acquired before updating the list. + * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this + * function returns true if the entry has not been updated for some period of + * time. We differentiate the NS or glue address case and the others since + * experiments have shown that the former tends to be accessed relatively + * infrequently and the cost of cache miss is higher (e.g., a missing NS records + * may cause external queries at a higher level zone, involving more + * transactions). * * Caller must hold the node (read or write) lock. */ @@ -8320,7 +8331,7 @@ need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) { (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) return (ISC_FALSE); -#ifdef ISC_PLATFORM_USETHREADS +#if DNS_RBTDB_LIMITLRUUPDATE if (header->type == dns_rdatatype_ns || (header->trust == dns_trust_glue && (header->type == dns_rdatatype_a || @@ -8399,6 +8410,15 @@ overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, header != NULL && purgecount > 0; header = header_prev) { header_prev = ISC_LIST_PREV(header, lru_link); + /* + * Unlink the entry at this point to avoid checking it + * again even if it's currently used someone else and + * cannot be purged at this moment. This entry won't be + * referenced any more (so unlinking is safe) since the + * TTL was reset to 0. + */ + ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header, + lru_link); expire_header(rbtdb, header, tree_locked); purgecount--; }