diff --git a/CHANGES b/CHANGES index 099d3ca434..4575338d2b 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +2475. [bug] LRU cache cleanup under overmem condition could purge + particular entries more aggressively. [RT #17628] + 2474. [bug] ACL structures could be allocated with insufficient space, causing an array overrun. [RT #18765] diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index fa813f99b8..41d04cbe25 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.265 2008/10/24 00:11:17 marka Exp $ */ +/* $Id: rbtdb.c,v 1.266 2008/10/27 03:52:43 marka Exp $ */ /*! \file */ @@ -322,7 +322,26 @@ struct acachectl { (((header)->attributes & RDATASET_ATTR_OPTOUT) != 0) #define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */ -#define DEFAULT_CACHE_NODE_LOCK_COUNT 1009 /*%< Should be prime. */ + +/*% + * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps). + * There is a tradeoff issue about configuring this value: if this is too + * small, it may cause heavier contention between threads; if this is too large, + * LRU purge algorithm won't work well (entries tend to be purged prematurely). + * The default value should work well for most environments, but this can + * also be configurable at compilation time via the + * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable. This value must be larger than + * 1 due to the assumption of overmem_purge(). 
+ */ +#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT +#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 +#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1" +#else +#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT +#endif +#else +#define DEFAULT_CACHE_NODE_LOCK_COUNT 16 +#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */ typedef struct { nodelock_t lock; @@ -499,8 +518,10 @@ static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now); static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now); -static void check_stale_cache(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, - isc_stdtime_t now, isc_boolean_t tree_locked); +static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, + isc_boolean_t tree_locked); +static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, + isc_stdtime_t now, isc_boolean_t tree_locked); static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader); @@ -5776,6 +5797,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, rbtdb_version_t *rbtversion = version; isc_region_t region; rdatasetheader_t *newheader; + rdatasetheader_t *header; isc_result_t result; isc_boolean_t delegating; isc_boolean_t tree_locked = ISC_FALSE; @@ -5871,6 +5893,9 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); } + if (IS_CACHE(rbtdb) && rbtdb->overmem) + overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); @@ -5882,7 +5907,10 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, if (IS_CACHE(rbtdb)) { if (tree_locked) cleanup_dead_nodes(rbtdb, rbtnode->locknum); - check_stale_cache(rbtdb, rbtnode, now, tree_locked); + + header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1); + if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) + 
expire_header(rbtdb, header, tree_locked); /* * If we've been holding a write lock on the tree just for @@ -8323,79 +8351,70 @@ update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, } /*% - * Examine the tail entry of the LRU list to see if it expires or is stale - * (unused for some period). If so, it's marked as stale and possibly freed. - * If the DB is in the overmem condition, the tail and the next to tail entries - * will be unconditionally marked. We don't care about a race on 'overmem' - * at the risk of causing some collateral damage or a small delay in starting - * cleanup, so we don't bother to lock rbtdb. - * - * Caller must hold the node (write) lock. - * - * We can get away with locking only one node here, since it will lock all - * other nodes in that lock pool bucket. + * Purge some expired and/or stale (i.e. unused for some period) cache entries + * under an overmem condition. To recover from this condition quickly, up to + * 2 entries will be purged. This process is triggered while adding a new + * entry, and we specifically avoid purging entries in the same LRU bucket as + * the one to which the new entry will belong. Otherwise, we might purge + * entries of the same name of different RR types while adding RRsets from a + * single response (consider the case where we're adding A and AAAA glue records + * of the same NS name). */ static void -check_stale_cache(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, - isc_stdtime_t now, isc_boolean_t tree_locked) +overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, + isc_stdtime_t now, isc_boolean_t tree_locked) { - rdatasetheader_t *victim; - isc_boolean_t overmem = rbtdb->overmem; - int victims = 0; + rdatasetheader_t *header, *header_prev; + unsigned int locknum; + int purgecount = 2; - /* - * Check for TTL-based expiry. 
- */ - victim = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1); - if (victim != NULL && victim->rdh_ttl <= now - RBTDB_VIRTUAL) { - INSIST(victim->node->locknum == rbtnode->locknum); - victims++; + for (locknum = (locknum_start + 1) % rbtdb->node_lock_count; + locknum != locknum_start && purgecount > 0; + locknum = (locknum + 1) % rbtdb->node_lock_count) { + NODE_LOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); - set_ttl(rbtdb, victim, 0); - victim->attributes |= RDATASET_ATTR_STALE; - victim->node->dirty = 1; - - if (dns_rbtnode_refcurrent(victim->node) == 0) { - INSIST(rbtnode != victim->node); - /* - * If no one else is using the node, we can - * clean it up now. We first need to gain - * a new reference to the node to meet a - * requirement of decrement_reference(). - */ - new_reference(rbtdb, victim->node); - decrement_reference(rbtdb, victim->node, 0, - isc_rwlocktype_write, - tree_locked ? isc_rwlocktype_write : - isc_rwlocktype_none); + header = isc_heap_element(rbtdb->heaps[locknum], 1); + if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) { + expire_header(rbtdb, header, tree_locked); + purgecount--; } - } - /* - * If we are over memory, delete the end entry from the LRU. - */ - victim = ISC_LIST_TAIL(rbtdb->rdatasets[rbtnode->locknum]); - if (victim != NULL && overmem) { - INSIST(victim->node->locknum == rbtnode->locknum); - victims++; - - set_ttl(rbtdb, victim, 0); - victim->attributes |= RDATASET_ATTR_STALE; - victim->node->dirty = 1; - - if (dns_rbtnode_refcurrent(victim->node) == 0) { - INSIST(rbtnode != victim->node); - /* - * If no one else is using the node, we can - * clean it up now. We first need to gain - * a new reference to the node to meet a - * requirement of decrement_reference(). - */ - new_reference(rbtdb, victim->node); - decrement_reference(rbtdb, victim->node, 0, - isc_rwlocktype_write, - tree_locked ? 
isc_rwlocktype_write : - isc_rwlocktype_none); + for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]); + header != NULL && purgecount > 0; + header = header_prev) { + header_prev = ISC_LIST_PREV(header, lru_link); + expire_header(rbtdb, header, tree_locked); + purgecount--; } + + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + } +} + +static void +expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, + isc_boolean_t tree_locked) +{ + set_ttl(rbtdb, header, 0); + header->attributes |= RDATASET_ATTR_STALE; + header->node->dirty = 1; + + /* + * Caller must hold the node (write) lock. + */ + + if (dns_rbtnode_refcurrent(header->node) == 0) { + /* + * If no one else is using the node, we can clean it up now. + * We first need to gain a new reference to the node to meet a + * requirement of decrement_reference(). + */ + new_reference(rbtdb, header->node); + decrement_reference(rbtdb, header->node, 0, + isc_rwlocktype_write, + tree_locked ? isc_rwlocktype_write : + isc_rwlocktype_none); } }