diff --git a/CHANGES b/CHANGES index 187887cb35..4431c6c050 100644 --- a/CHANGES +++ b/CHANGES @@ -24,7 +24,8 @@ 2376. [bug] Change #2144 was not complete. -2375. [placeholder] +2375. [security] Fully randomize UDP query ports to improve + forgery resilience. [RT #17949, #18098] 2374. [bug] "blackhole" ACLs could cause named to segfault due to some uninitialized memory. [RT #18095] diff --git a/bin/named/bind9.xsl b/bin/named/bind9.xsl index f592e9d98d..cf3e04bcc6 100644 --- a/bin/named/bind9.xsl +++ b/bin/named/bind9.xsl @@ -15,7 +15,7 @@ - PERFORMANCE OF THIS SOFTWARE. --> - +
Bind 9 Configuration and Statistics
-
diff --git a/bin/named/bind9.xsl.h b/bin/named/bind9.xsl.h index 39edc8baeb..2c413786b2 100644 --- a/bin/named/bind9.xsl.h +++ b/bin/named/bind9.xsl.h @@ -91,7 +91,6 @@ static char xslmsg[] = " \n" " \n" "
Bind 9 Configuration and Statistics
\n" - "\n" "
\n" "\n" "
\n" diff --git a/bin/named/client.c b/bin/named/client.c index 87b172270f..756d3c84d7 100644 --- a/bin/named/client.c +++ b/bin/named/client.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: client.c,v 1.257 2008/04/03 06:09:04 tbox Exp $ */ +/* $Id: client.c,v 1.258 2008/06/23 19:41:18 jinmei Exp $ */ #include @@ -1524,14 +1524,6 @@ client_request(isc_task_t *task, isc_event_t *event) { dns_generalstats_increment(ns_g_server->nsstats, dns_nsstatscounter_tcp); - /* - * Hash the incoming request here as it is after - * dns_dispatch_importrecv(). - */ - dns_dispatch_hash(&client->now, sizeof(client->now)); - dns_dispatch_hash(isc_buffer_base(buffer), - isc_buffer_usedlength(buffer)); - /* * It's a request. Parse it. */ diff --git a/bin/named/server.c b/bin/named/server.c index 02dc33bf7d..377716dbc9 100644 --- a/bin/named/server.c +++ b/bin/named/server.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: server.c,v 1.508 2008/05/21 23:47:00 tbox Exp $ */ +/* $Id: server.c,v 1.509 2008/06/23 19:41:18 jinmei Exp $ */ /*! \file */ @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -538,13 +539,15 @@ mustbesecure(const cfg_obj_t *mbs, dns_resolver_t *resolver) */ static isc_result_t get_view_querysource_dispatch(const cfg_obj_t **maps, - int af, dns_dispatch_t **dispatchp) + int af, dns_dispatch_t **dispatchp, + isc_boolean_t is_firstview) { isc_result_t result; dns_dispatch_t *disp; isc_sockaddr_t sa; unsigned int attrs, attrmask; const cfg_obj_t *obj = NULL; + unsigned int maxdispatchbuffers; /* * Make compiler happy. @@ -596,6 +599,20 @@ get_view_querysource_dispatch(const cfg_obj_t **maps, attrs |= DNS_DISPATCHATTR_IPV6; break; } + if (isc_sockaddr_getport(&sa) == 0) { + attrs |= DNS_DISPATCHATTR_EXCLUSIVE; + maxdispatchbuffers = 4096; + } else { + INSIST(obj != NULL); + if (is_firstview) { + cfg_obj_log(obj, ns_g_lctx, ISC_LOG_INFO, + "using specific query-source port " + "suppresses port randomization and can be " + "insecure."); + } + maxdispatchbuffers = 1000; + } + attrmask = 0; attrmask |= DNS_DISPATCHATTR_UDP; attrmask |= DNS_DISPATCHATTR_TCP; @@ -605,7 +622,7 @@ get_view_querysource_dispatch(const cfg_obj_t **maps, disp = NULL; result = dns_dispatch_getudp(ns_g_dispatchmgr, ns_g_socketmgr, ns_g_taskmgr, &sa, 4096, - 1000, 32768, 16411, 16433, + maxdispatchbuffers, 32768, 16411, 16433, attrs, attrmask, &disp); if (result != ISC_R_SUCCESS) { isc_sockaddr_t any; @@ -1015,7 +1032,6 @@ configure_view(dns_view_t *view, const cfg_obj_t *config, isc_boolean_t rfc1918; isc_boolean_t empty_zones_enable; const cfg_obj_t *disablelist = NULL; - isc_uint32_t nqports, qports_updateinterval; dns_stats_t *resstats = NULL; dns_stats_t *resquerystats = NULL; @@ -1272,8 +1288,12 @@ configure_view(dns_view_t *view, const cfg_obj_t *config, * * XXXRTH Hardwired number of tasks. */ - CHECK(get_view_querysource_dispatch(maps, AF_INET, &dispatch4)); - CHECK(get_view_querysource_dispatch(maps, AF_INET6, &dispatch6)); + CHECK(get_view_querysource_dispatch(maps, AF_INET, &dispatch4, + ISC_TF(ISC_LIST_PREV(view, link) + == NULL))); + CHECK(get_view_querysource_dispatch(maps, AF_INET6, &dispatch6, + ISC_TF(ISC_LIST_PREV(view, link) + == NULL))); if (dispatch4 == NULL && dispatch6 == NULL) { UNEXPECTED_ERROR(__FILE__, __LINE__, "unable to obtain neither an IPv4 nor" @@ -1281,93 +1301,11 @@ configure_view(dns_view_t *view, const cfg_obj_t *config, result = ISC_R_UNEXPECTED; goto cleanup; } - - obj = NULL; - (void)ns_config_get(maps, "use-queryport-pool", &obj); - if (obj == NULL || cfg_obj_asboolean(obj)) { - isc_sockaddr_t sa; - isc_boolean_t logit4 = ISC_FALSE, logit6 = ISC_FALSE; - - resopts |= (DNS_RESOLVER_USEDISPATCHPOOL4 | - DNS_RESOLVER_USEDISPATCHPOOL6); - - /* Check consistency with query-source(-v6) */ - if (dispatch4 == NULL) - resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL4; - else { - result = dns_dispatch_getlocaladdress(dispatch4, &sa); - INSIST(result == ISC_R_SUCCESS); - if (isc_sockaddr_getport(&sa) != 0) { - logit4 = ISC_TRUE; - resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL4; - } - } - - if (dispatch6 == NULL) - resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL6; - else { - result = dns_dispatch_getlocaladdress(dispatch6, &sa); - INSIST(result == ISC_R_SUCCESS); - if (isc_sockaddr_getport(&sa) != 0) { - logit6 = ISC_TRUE; - resopts &= ~DNS_RESOLVER_USEDISPATCHPOOL6; - } - } - if (logit4 && obj != NULL) - cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR, - "specific query-source port " - "cannot coexist with queryport-pool. " - "(Pool disabled)"); - if (logit6 && obj != NULL) - cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR, - "specific query-source-v6 port " - "cannot coexist with queryport-pool. " - "(Pool disabled)"); - } - CHECK(dns_view_createresolver(view, ns_g_taskmgr, 31, ns_g_socketmgr, ns_g_timermgr, resopts, ns_g_dispatchmgr, dispatch4, dispatch6)); - /* - * Query-port pool parameters. - */ - obj = NULL; - nqports = 8; - result = ns_config_get(maps, "queryport-pool-ports", &obj); - if (result == ISC_R_SUCCESS) { - if ((resopts & (DNS_RESOLVER_USEDISPATCHPOOL4 | - DNS_RESOLVER_USEDISPATCHPOOL6)) == 0) { - cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR, - "queryport-pool-ports is effective only " - "with 'use-queryport-pool yes' (ignored)"); - } else - nqports = cfg_obj_asuint32(obj); - } - - obj = NULL; - qports_updateinterval = 15; - result = ns_config_get(maps, "queryport-pool-updateinterval", &obj); - if (result == ISC_R_SUCCESS) { - if ((resopts & (DNS_RESOLVER_USEDISPATCHPOOL4 | - DNS_RESOLVER_USEDISPATCHPOOL6)) == 0) { - cfg_obj_log(obj, ns_g_lctx, ISC_LOG_ERROR, - "queryport-pool-updateinterval is " - "effective only with 'use-queryport-pool " - "yes' (ignored)"); - } else - qports_updateinterval = cfg_obj_asuint32(obj); - } - - if ((resopts & (DNS_RESOLVER_USEDISPATCHPOOL4 | - DNS_RESOLVER_USEDISPATCHPOOL6)) != 0) { - CHECK(dns_resolver_createdispatchpool(view->resolver, - nqports, - qports_updateinterval - * 60)); - } - if (resstats == NULL) { CHECK(dns_generalstats_create(mctx, &resstats, dns_resstatscounter_max)); @@ -2674,8 +2612,6 @@ adjust_interfaces(ns_server_t *server, isc_mem_t *mctx) { view != NULL; view = ISC_LIST_NEXT(view, link)) { dns_dispatch_t *dispatch6; - isc_boolean_t use_portpool = ISC_FALSE; - unsigned int resopts; dispatch6 = dns_resolver_dispatchv6(view->resolver); if (dispatch6 == NULL) @@ -2683,19 +2619,16 @@ adjust_interfaces(ns_server_t *server, isc_mem_t *mctx) { result = dns_dispatch_getlocaladdress(dispatch6, &addr); if (result != ISC_R_SUCCESS) goto fail; - resopts = dns_resolver_getoptions(view->resolver); - if ((resopts & (DNS_RESOLVER_USEDISPATCHPOOL4 | - DNS_RESOLVER_USEDISPATCHPOOL6)) != 0) { - /* - * If the resolver uses a dynamic pool of query ports - * with a specific source address, some of the current - * and future ports may override an existing wildcard - * IPv6 port. So we need to allow wildcard match - * in this case. - */ - use_portpool = ISC_TRUE; - } - result = add_listenelt(mctx, list, &addr, use_portpool); + + /* + * We always add non-wildcard address regardless of whether + * the port is 'any' (the fourth arg is TRUE): if the port is + * specific, we need to add it since it may conflict with a + * listening interface; if it's zero, we'll dynamically open + * query ports, and some of them may override an existing + * wildcard IPv6 port. + */ + result = add_listenelt(mctx, list, &addr, ISC_TRUE); if (result != ISC_R_SUCCESS) goto fail; } @@ -2884,24 +2817,41 @@ set_limits(const cfg_obj_t **maps) { SETLIMIT("files", openfiles, "open files"); } -static isc_result_t -portlist_fromconf(dns_portlist_t *portlist, unsigned int family, - const cfg_obj_t *ports) +static void +portset_fromconf(isc_portset_t *portset, const cfg_obj_t *ports, + isc_boolean_t positive) { const cfg_listelt_t *element; - isc_result_t result = ISC_R_SUCCESS; for (element = cfg_list_first(ports); element != NULL; element = cfg_list_next(element)) { const cfg_obj_t *obj = cfg_listelt_value(element); - in_port_t port = (in_port_t)cfg_obj_asuint32(obj); - result = dns_portlist_add(portlist, family, port); - if (result != ISC_R_SUCCESS) - break; + if (cfg_obj_isuint32(obj)) { + in_port_t port = (in_port_t)cfg_obj_asuint32(obj); + + if (positive) + isc_portset_add(portset, port); + else + isc_portset_remove(portset, port); + } else { + const cfg_obj_t *obj_loport, *obj_hiport; + in_port_t loport, hiport; + + obj_loport = cfg_tuple_get(obj, "loport"); + loport = (in_port_t)cfg_obj_asuint32(obj_loport); + obj_hiport = cfg_tuple_get(obj, "hiport"); + hiport = (in_port_t)cfg_obj_asuint32(obj_hiport); + + if (positive) + isc_portset_addrange(portset, loport, hiport); + else { + isc_portset_removerange(portset, loport, + hiport); + } + } } - return (result); } static isc_result_t @@ -2940,7 +2890,7 @@ load_configuration(const char *filename, ns_server_t *server, const cfg_obj_t *options; const cfg_obj_t *views; const cfg_obj_t *obj; - const cfg_obj_t *v4ports, *v6ports; + const cfg_obj_t *usev4ports, *avoidv4ports, *usev6ports, *avoidv6ports; const cfg_obj_t *maps[3]; const cfg_obj_t *builtin_views; const cfg_listelt_t *element; @@ -2952,7 +2902,9 @@ load_configuration(const char *filename, ns_server_t *server, isc_uint32_t interface_interval; isc_uint32_t heartbeat_interval; isc_uint32_t udpsize; - in_port_t listen_port; + in_port_t listen_port, udpport_low, udpport_high; + isc_portset_t *v4portset = NULL; + isc_portset_t *v6portset = NULL; int i; cfg_aclconfctx_init(&aclconfctx); @@ -3069,24 +3021,64 @@ load_configuration(const char *filename, ns_server_t *server, CHECKM(ns_statschannels_configure(ns_g_server, config, &aclconfctx), "configuring statistics server(s)"); - v4ports = NULL; - v6ports = NULL; - (void)ns_config_get(maps, "avoid-v4-udp-ports", &v4ports); - (void)ns_config_get(maps, "avoid-v6-udp-ports", &v6ports); - if (v4ports != NULL || v6ports != NULL) { - dns_portlist_t *portlist = NULL; - result = dns_portlist_create(ns_g_mctx, &portlist); - if (result == ISC_R_SUCCESS && v4ports != NULL) - result = portlist_fromconf(portlist, AF_INET, v4ports); - if (result == ISC_R_SUCCESS && v6ports != NULL) - portlist_fromconf(portlist, AF_INET6, v6ports); - if (result == ISC_R_SUCCESS) - dns_dispatchmgr_setblackportlist(ns_g_dispatchmgr, portlist); - if (portlist != NULL) - dns_portlist_detach(&portlist); - CHECK(result); - } else - dns_dispatchmgr_setblackportlist(ns_g_dispatchmgr, NULL); + /* + * Configure sets of UDP query source ports. + */ + CHECKM(isc_portset_create(ns_g_mctx, &v4portset), + "creating UDP port set"); + CHECKM(isc_portset_create(ns_g_mctx, &v6portset), + "creating UDP port set"); + + usev4ports = NULL; + usev6ports = NULL; + avoidv4ports = NULL; + avoidv6ports = NULL; + + (void)ns_config_get(maps, "use-v4-udp-ports", &usev4ports); + if (usev4ports != NULL) + portset_fromconf(v4portset, usev4ports, ISC_TRUE); + else { + CHECKM(isc_net_getudpportrange(AF_INET, &udpport_low, + &udpport_high), + "get the default UDP/IPv4 port range"); + if (udpport_low == udpport_high) + isc_portset_add(v4portset, udpport_low); + else { + isc_portset_addrange(v4portset, udpport_low, + udpport_high); + } + isc_log_write(ns_g_lctx, NS_LOGCATEGORY_GENERAL, + NS_LOGMODULE_SERVER, ISC_LOG_INFO, + "using default UDP/IPv4 port range: [%d, %d]", + udpport_low, udpport_high); + } + (void)ns_config_get(maps, "avoid-v4-udp-ports", &avoidv4ports); + if (avoidv4ports != NULL) + portset_fromconf(v4portset, avoidv4ports, ISC_FALSE); + + (void)ns_config_get(maps, "use-v6-udp-ports", &usev6ports); + if (usev6ports != NULL) + portset_fromconf(v6portset, usev6ports, ISC_TRUE); + else { + CHECKM(isc_net_getudpportrange(AF_INET6, &udpport_low, + &udpport_high), + "get the default UDP/IPv6 port range"); + if (udpport_low == udpport_high) + isc_portset_add(v6portset, udpport_low); + else { + isc_portset_addrange(v6portset, udpport_low, + udpport_high); + } + isc_log_write(ns_g_lctx, NS_LOGCATEGORY_GENERAL, + NS_LOGMODULE_SERVER, ISC_LOG_INFO, + "using default UDP/IPv6 port range: [%d, %d]", + udpport_low, udpport_high); + } + (void)ns_config_get(maps, "avoid-v6-udp-ports", &avoidv6ports); + if (avoidv6ports != NULL) + portset_fromconf(v6portset, avoidv6ports, ISC_FALSE); + + dns_dispatchmgr_setavailports(ns_g_dispatchmgr, v4portset, v6portset); /* * Set the EDNS UDP size when we don't match a view. @@ -3569,6 +3561,12 @@ load_configuration(const char *filename, ns_server_t *server, result = ISC_R_SUCCESS; cleanup: + if (v4portset != NULL) + isc_portset_destroy(ns_g_mctx, &v4portset); + + if (v6portset != NULL) + isc_portset_destroy(ns_g_mctx, &v6portset); + cfg_aclconfctx_destroy(&aclconfctx); if (parser != NULL) { diff --git a/configure.in b/configure.in index e31442a880..0ed9c480c5 100644 --- a/configure.in +++ b/configure.in @@ -18,7 +18,7 @@ AC_DIVERT_PUSH(1)dnl esyscmd([sed "s/^/# /" COPYRIGHT])dnl AC_DIVERT_POP()dnl -AC_REVISION($Revision: 1.443 $) +AC_REVISION($Revision: 1.444 $) AC_INIT(lib/dns/name.c) AC_PREREQ(2.59) @@ -317,6 +317,43 @@ lifconf.lifc_len = 0; ISC_PLATFORM_HAVELIFCONF="#undef ISC_PLATFORM_HAVELIFCONF"]) AC_SUBST(ISC_PLATFORM_HAVELIFCONF) +# +# check if we have kqueue +# +AC_CHECK_FUNC(kqueue, ac_cv_have_kqueue=yes, ac_cv_have_kqueue=no) +case $ac_cv_have_kqueue in +yes) + ISC_PLATFORM_HAVEKQUEUE="#define ISC_PLATFORM_HAVEKQUEUE 1" + ;; +*) + ISC_PLATFORM_HAVEKQUEUE="#undef ISC_PLATFORM_HAVEKQUEUE" + ;; +esac +AC_SUBST(ISC_PLATFORM_HAVEKQUEUE) + +# +# check if we have epoll +# +AC_CHECK_FUNC(epoll_create, ac_cv_have_epoll=yes, ac_cv_have_epoll=no) +case $ac_cv_have_epoll in +yes) + ISC_PLATFORM_HAVEEPOLL="#define ISC_PLATFORM_HAVEEPOLL 1" + ;; +*) + ISC_PLATFORM_HAVEEPOLL="#undef ISC_PLATFORM_HAVEEPOLL" + ;; +esac +AC_SUBST(ISC_PLATFORM_HAVEEPOLL) + +# +# check if we support /dev/poll +# +AC_CHECK_HEADERS(sys/devpoll.h, +ISC_PLATFORM_HAVEDEVPOLL="#define ISC_PLATFORM_HAVEDEVPOLL 1" +, +ISC_PLATFORM_HAVEDEVPOLL="#undef ISC_PLATFORM_HAVEDEVPOLL" +) +AC_SUBST(ISC_PLATFORM_HAVEDEVPOLL) # # check if we need to #include sys/select.h explicitly diff --git a/doc/arm/Bv9ARM-book.xml b/doc/arm/Bv9ARM-book.xml index db900d2e3a..c696568ffd 100644 --- a/doc/arm/Bv9ARM-book.xml +++ b/doc/arm/Bv9ARM-book.xml @@ -18,7 +18,7 @@ - PERFORMANCE OF THIS SOFTWARE. --> - + BIND 9 Administrator Reference Manual @@ -2955,6 +2955,33 @@ $ORIGIN 0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa. + + + + port_list + + + + + A list of an ip_port or a port + range. + A port range is specified in the form of + range followed by + two ip_ports, + port_low and + port_high, which represents + port numbers from port_low through + port_high, inclusive. + port_low must not be larger than + port_high. + For example, + range 1024 65535 represents + ports from 1024 through 65535. + In either case an asterisk (`*') character is not + allowed as a valid ip_port. + + + @@ -4513,7 +4540,9 @@ category notify { null; }; try-tcp-refresh yes_or_no; allow-v6-synthesis { address_match_list }; blackhole { address_match_list }; + use-v4-udp-ports { port_list }; avoid-v4-udp-ports { port_list }; + use-v6-udp-ports { port_list }; avoid-v6-udp-ports { port_list }; listen-on port ip_port { address_match_list }; listen-on-v6 port ip_port { address_match_list }; @@ -6269,32 +6298,104 @@ listen-on-v6 port 1234 { !2001:db8::/32; any; }; If address is * (asterisk) or is omitted, a wildcard IP address (INADDR_ANY) will be used. + + + If port is * or is omitted, - a pool of random unprivileged ports will be used. See the - use-queryport-pool, - queryport-pool-ports and - queryport-pool-updateinterval options below for how the pool - is configured. - The avoid-v4-udp-ports - and avoid-v6-udp-ports options can be used - to prevent named - from selecting certain ports. - The defaults are: + a random port number from a pre-configured + range is picked up and will be used for each query. + The port range(s) is that specified in + the use-v4-udp-ports (for IPv4) + and use-v6-udp-ports (for IPv6) + options, excluding the ranges specified in + the avoid-v4-udp-ports + and avoid-v6-udp-ports options, respectively. + + + + The defaults of the query-source and + query-source-v6 options + are: query-source address * port *; query-source-v6 address * port *; + + If use-v4-udp-ports or + use-v6-udp-ports is unspecified, + named will check if the operating + system provides a programming interface to retrieve the + system's default range for ephemeral ports. + If such an interface is available, + named will use the corresponding system + default range; otherwise, it will use its own defaults: + + +use-v4-udp-ports { range 1024 65535; }; +use-v6-udp-ports { range 1024 65535; }; + + + + Note: make sure the ranges be sufficiently large for + security. A desirable size depends on various parameters, + but we generally recommend it contain at least 16384 ports + (14 bits of entropy). + Note also that the system's default range when used may be + too small for this purpose, and that the range may even be + changed while named is running; the new + range will automatically be applied when named + is reloaded. + It is encouraged to + configure use-v4-udp-ports and + use-v6-udp-ports explicitly so that the + ranges are sufficiently large and are reasonably + independent from the ranges used by other applications. + + + + Note: the operational configuration + where named runs may prohibit the use + of some ports. For example, UNIX systems will not allow + named running without a root privilege + to use ports less than 1024. + If such ports are included in the specified (or detected) + set of query ports, the corresponding query attempts will + fail, resulting in resolution failures or delay. + It is therefore important to configure the set of ports + that can be safely used in the expected operational environment. + + + + The defaults of the avoid-v4-udp-ports and + avoid-v6-udp-ports options + are: + + +avoid-v4-udp-ports {}; +avoid-v6-udp-ports {}; + + + + Note: BIND 9.5.0 introduced + the use-queryport-pool + option to support a pool of such random ports, but this + option is now obsolete because reusing the same ports in + the pool may not be sufficiently secure. + For the same reason, it is generally strongly discouraged to + specify a particular port for the + query-source or + query-source-v6 options; + it implicitly disables the use of randomized port numbers. + + use-queryport-pool - Enable the use of query port pools. By default query port - pools are enabled unless there is a explicit port defined - in query-source or - query-source-v6. + This option is obsolete. @@ -6303,7 +6404,7 @@ query-source-v6 address * port *; queryport-pool-ports - Specify how many pool ports to use. The default is 8. + This option is obsolete. @@ -6312,9 +6413,7 @@ query-source-v6 address * port *; queryport-pool-updateinterval - Specify how often, in minutes, that the queryport pool - should be recreated (new ports selected). The default - is 15 minutes. + This option is obsolete. @@ -6664,17 +6763,48 @@ query-source-v6 address * port *; - Bad UDP Port Lists - avoid-v4-udp-ports - and avoid-v6-udp-ports specify a list - of IPv4 and IPv6 UDP ports that will not be used as system - assigned source ports for UDP sockets. These lists - prevent named from choosing as its random source port a - port that is blocked by your firewall. If a query went - out with such a source port, the answer would not get by - the firewall and the name server would have to query - again. + UDP Port Lists + + use-v4-udp-ports, + avoid-v4-udp-ports, + use-v6-udp-ports, and + avoid-v6-udp-ports + specify a list of IPv4 and IPv6 UDP ports that will be + used or not used as source ports for UDP messages. + See about how the + available ports are determined. + For example, with the following configuration + + +use-v6-udp-ports { range 32768 65535; }; +avoid-v6-udp-ports { 40000; range 50000 60000; }; + + + + UDP ports of IPv6 messages sent + from named will be in one + of the following ranges: 32768 to 39999, 40001 to 49999, + and 60001 to 65535. + + + + avoid-v4-udp-ports and + avoid-v6-udp-ports can be used + to prevent named from choosing as its random source port a + port that is blocked by your firewall or a port that is + used by other applications; + if a query went out with a source port blocked by a + firewall, the + answer would not get by the firewall and the name server would + have to query again. + Note: the desired range can also be represented only with + use-v4-udp-ports and + use-v6-udp-ports, and the + avoid- options are redundant in that + sense; they are provided for backward compatibility and + to possibly simplify the port specification. + diff --git a/lib/dns/dispatch.c b/lib/dns/dispatch.c index 4e19282990..43ee5ada2a 100644 --- a/lib/dns/dispatch.c +++ b/lib/dns/dispatch.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: dispatch.c,v 1.140 2008/04/03 06:09:04 tbox Exp $ */ +/* $Id: dispatch.c,v 1.141 2008/06/23 19:41:19 jinmei Exp $ */ /*! \file */ @@ -24,11 +24,14 @@ #include #include #include +#include #include #include #include +#include #include +#include #include #include #include @@ -46,24 +49,23 @@ typedef ISC_LIST(dns_dispentry_t) dns_displist_t; -/* transaction ID */ -typedef struct dns_tid { - isc_uint16_t tid_state; - isc_uint16_t *tid_vtable; - isc_uint16_t *tid_pool; - isc_uint16_t tid_a1, tid_a2, tid_a3; - isc_uint16_t tid_c1, tid_c2, tid_c3; - isc_uint16_t tid_state2; - isc_boolean_t tid_usepool; -} dns_tid_t; +/* ARC4 Random generator state */ +typedef struct arc4ctx { + isc_uint8_t i; + isc_uint8_t j; + isc_uint8_t s[256]; + int count; + isc_entropy_t *entropy; /*%< entropy source for ARC4 */ + isc_mutex_t *lock; +} arc4ctx_t; typedef struct dns_qid { unsigned int magic; unsigned int qid_nbuckets; /*%< hash table size */ unsigned int qid_increment; /*%< id increment on collision */ isc_mutex_t lock; - dns_tid_t tid; dns_displist_t *qid_table; /*%< the table itself */ + dns_displist_t *addr_table; /*%< address/port table */ } dns_qid_t; struct dns_dispatchmgr { @@ -73,12 +75,17 @@ struct dns_dispatchmgr { dns_acl_t *blackhole; dns_portlist_t *portlist; dns_stats_t *stats; + isc_entropy_t *entropy; /*%< entropy source */ /* Locked by "lock". */ isc_mutex_t lock; unsigned int state; ISC_LIST(dns_dispatch_t) list; + /* Locked by arc4_lock. */ + isc_mutex_t arc4_lock; + arc4ctx_t arc4ctx; /*%< ARC4 context for QID */ + /* locked by buffer lock */ dns_qid_t *qid; isc_mutex_t buffer_lock; @@ -92,8 +99,27 @@ struct dns_dispatchmgr { isc_mempool_t *rpool; /*%< memory pool for replies */ isc_mempool_t *dpool; /*%< dispatch allocations */ isc_mempool_t *bpool; /*%< memory pool for buffers */ + isc_mempool_t *spool; /*%< memory pool for dispsocs */ - isc_entropy_t *entropy; /*%< entropy source */ + /*% + * Locked by qid->lock if qid exists; otherwise, can be used without + * being locked. + * Memory footprint considerations: this is a simple implementation of + * available ports, i.e., an ordered array of the actual port numbers. + * This will require about 256KB of memory in the worst case (128KB for + * each of IPv4 and IPv6). We could reduce it by representing it as a + * more sophisticated way such as a list (or array) of ranges that are + * searched to identify a specific port. Our decision here is the saved + * memory isn't worth the implementation complexity, considering the + * fact that the whole BIND9 process (which is mainly named) already + * requires a pretty large memory footprint. We may, however, have to + * revisit the decision when we want to use it as a separate module for + * an environment where memory requirement is severer. + */ + in_port_t *v4ports; /*%< available ports for IPv4 */ + unsigned int nv4ports; /*%< # of available ports for IPv4 */ + in_port_t *v6ports; /*%< available ports for IPv4 */ + unsigned int nv6ports; /*%< # of available ports for IPv4 */ }; #define MGR_SHUTTINGDOWN 0x00000001U @@ -101,29 +127,79 @@ struct dns_dispatchmgr { #define IS_PRIVATE(d) (((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0) +typedef struct dispsocket dispsocket_t; + struct dns_dispentry { unsigned int magic; dns_dispatch_t *disp; dns_messageid_t id; + in_port_t port; unsigned int bucket; + unsigned int abucket; isc_sockaddr_t host; isc_task_t *task; isc_taskaction_t action; void *arg; isc_boolean_t item_out; + dispsocket_t *dispsocket; ISC_LIST(dns_dispatchevent_t) items; ISC_LINK(dns_dispentry_t) link; + ISC_LINK(dns_dispentry_t) alink; +}; + +/*% + * Maximum number of dispatch sockets that can be pooled for reuse. The + * appropriate value may vary, but experiments have shown a busy caching server + * may need more than 1000 sockets concurrently opened. The maximum allowable + * number of dispatch sockets (per manager) will be set to the double of this + * value. + */ +#ifndef DNS_DISPATCH_POOLSOCKS +#define DNS_DISPATCH_POOLSOCKS 2048 +#endif + +/*% + * Quota to control the number of dispatch sockets. If a dispatch has more + * than the quota of sockets, new queries will purge oldest ones, so that + * a massive number of outstanding queries won't prevent subsequent queries + * (especially if the older ones take longer time and result in timeout). + */ +#ifndef DNS_DISPATCH_SOCKSQUOTA +#define DNS_DISPATCH_SOCKSQUOTA 3072 +#endif + +struct dispsocket { + unsigned int magic; + isc_socket_t *socket; + dns_dispatch_t *disp; + dns_dispentry_t *resp; + isc_task_t *task; + ISC_LINK(dispsocket_t) link; }; #define INVALID_BUCKET (0xffffdead) +/*% + * Number of tasks for each dispatch that use separate sockets for different + * transactions. This must be a power of 2 as it will divide 32 bit numbers + * to get an uniformly random tasks selection. See get_dispsocket(). + */ +#define MAX_INTERNAL_TASKS 64 + struct dns_dispatch { /* Unlocked. */ unsigned int magic; /*%< magic */ dns_dispatchmgr_t *mgr; /*%< dispatch manager */ - isc_task_t *task; /*%< internal task */ + int ntasks; + /*% + * internal task buckets. We use multiple tasks to distribute various + * socket events well when using separate dispatch sockets. We use the + * 1st task (task[0]) for internal control events. + */ + isc_task_t *task[MAX_INTERNAL_TASKS]; isc_socket_t *socket; /*%< isc socket attached to */ isc_sockaddr_t local; /*%< local address */ + in_port_t localport; /*%< local UDP port */ unsigned int maxrequests; /*%< max requests */ isc_event_t *ctlevent; @@ -142,10 +218,14 @@ struct dns_dispatch { tcpmsg_valid : 1, recv_pending : 1; /*%< is a recv() pending? */ isc_result_t shutdown_why; + ISC_LIST(dispsocket_t) activesockets; + ISC_LIST(dispsocket_t) inactivesockets; + unsigned int nsockets; unsigned int requests; /*%< how many requests we have */ unsigned int tcpbuffers; /*%< allocated buffers */ dns_tcpmsg_t tcpmsg; /*%< for tcp streams */ dns_qid_t *qid; + arc4ctx_t arc4ctx; /*%< for QID/UDP port num */ }; #define QID_MAGIC ISC_MAGIC('Q', 'i', 'd', ' ') @@ -154,6 +234,9 @@ struct dns_dispatch { #define RESPONSE_MAGIC ISC_MAGIC('D', 'r', 's', 'p') #define VALID_RESPONSE(e) ISC_MAGIC_VALID((e), RESPONSE_MAGIC) +#define DISPSOCK_MAGIC ISC_MAGIC('D', 's', 'o', 'c') +#define VALID_DISPSOCK(e) ISC_MAGIC_VALID((e), DISPSOCK_MAGIC) + #define DISPATCH_MAGIC ISC_MAGIC('D', 'i', 's', 'p') #define VALID_DISPATCH(e) ISC_MAGIC_VALID((e), DISPATCH_MAGIC) @@ -162,18 +245,36 @@ struct dns_dispatch { #define DNS_QID(disp) ((disp)->socktype == isc_sockettype_tcp) ? \ (disp)->qid : (disp)->mgr->qid +#define DISP_ARC4CTX(disp) ((disp)->socktype == isc_sockettype_udp) ? \ + (&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx) + +/*% + * Locking a query port buffer is a bit tricky. We access the buffer without + * locking until qid is created. Technically, there is a possibility of race + * between the creation of qid and access to the port buffer; in practice, + * however, this should be safe because qid isn't created until the first + * dispatch is created and there should be no contending situation until then. + */ +#define PORTBUFLOCK(mgr) if ((mgr)->qid != NULL) LOCK(&((mgr)->qid->lock)) +#define PORTBUFUNLOCK(mgr) if ((mgr)->qid != NULL) UNLOCK((&(mgr)->qid->lock)) + /* * Statics. */ -static dns_dispentry_t *bucket_search(dns_qid_t *, isc_sockaddr_t *, - dns_messageid_t, unsigned int); +static dns_dispentry_t *bucket_search(dns_qid_t *, dns_displist_t *, + isc_sockaddr_t *, dns_messageid_t, + in_port_t, unsigned int, isc_boolean_t); static isc_boolean_t destroy_disp_ok(dns_dispatch_t *); static void destroy_disp(isc_task_t *task, isc_event_t *event); -static void udp_recv(isc_task_t *, isc_event_t *); +static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **); +static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *); +static void udp_exrecv(isc_task_t *, isc_event_t *); +static void udp_shrecv(isc_task_t *, isc_event_t *); +static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *); static void tcp_recv(isc_task_t *, isc_event_t *); -static void startrecv(dns_dispatch_t *); -static dns_messageid_t dns_randomid(dns_tid_t *); -static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t); +static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *); +static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t, + in_port_t); static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len); static void *allocate_udp_buffer(dns_dispatch_t *disp); static inline void free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev); @@ -183,6 +284,12 @@ static dns_dispentry_t *linear_first(dns_qid_t *disp); static dns_dispentry_t *linear_next(dns_qid_t *disp, dns_dispentry_t *resp); static void dispatch_free(dns_dispatch_t **dispp); +static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr, + dns_dispatch_t *disp, + isc_socketmgr_t *sockmgr, + isc_sockaddr_t *localaddr, + isc_socket_t **sockp, + unsigned int maxtry); static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, isc_taskmgr_t *taskmgr, @@ -193,13 +300,13 @@ static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr, static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr); static void destroy_mgr(dns_dispatchmgr_t **mgrp); static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets, - unsigned int increment, isc_boolean_t usepool, - dns_qid_t **qidp); + unsigned int increment, dns_qid_t **qidp, + isc_boolean_t needaddrtable); static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp); -static isc_uint16_t tid_next(dns_tid_t *tid); -static isc_result_t tid_init(isc_mem_t *mctx, dns_tid_t *tid, - isc_boolean_t usepool); -static void tid_destroy(isc_mem_t *mctx, dns_tid_t *tid); +static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local, + isc_socket_t **sockp); +static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock, + isc_sockaddr_t *sockaddrp); #define LVL(x) ISC_LOG_DEBUG(x) @@ -279,41 +386,178 @@ request_log(dns_dispatch_t *disp, dns_dispentry_t *resp, } } -/* - * Return an unpredictable non-reserved UDP port. We share the QID - * framework for this purpose. +/*% + * ARC4 random number generator derived from OpenBSD. + * Only dispatch_arc4random() and dispatch_arc4uniformrandom() are expected + * to be called from general dispatch routines; the rest of them are subroutines + * for these two. + * + * The original copyright follows: + * Copyright (c) 1996, David Mazieres + * Copyright (c) 2008, Damien Miller + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -static in_port_t -get_randomport(dns_tid_t *tid) { - isc_uint16_t p; - - p = tid_next(tid); - - /* XXX: should the range be configurable? */ - return ((in_port_t)(1024 + (p % (65535 - 1024)))); +static void +dispatch_arc4init(arc4ctx_t *actx, isc_entropy_t *entropy, isc_mutex_t *lock) { + int n; + for (n = 0; n < 256; n++) + actx->s[n] = n; + actx->i = 0; + actx->j = 0; + actx->count = 0; + actx->entropy = entropy; /* don't have to attach */ + actx->lock = lock; } -/* - * Return an unpredictable message ID. - */ -static dns_messageid_t -dns_randomid(dns_tid_t *tid) { - isc_uint32_t id; +static void +dispatch_arc4addrandom(arc4ctx_t *actx, unsigned char *dat, int datlen) { + int n; + isc_uint8_t si; - id = tid_next(tid); + actx->i--; + for (n = 0; n < 256; n++) { + actx->i = (actx->i + 1); + si = actx->s[actx->i]; + actx->j = (actx->j + si + dat[n % datlen]); + actx->s[actx->i] = actx->s[actx->j]; + actx->s[actx->j] = si; + } + actx->j = actx->i; +} - return ((dns_messageid_t)id); +static inline isc_uint8_t +dispatch_arc4get8(arc4ctx_t *actx) { + isc_uint8_t si, sj; + + actx->i = (actx->i + 1); + si = actx->s[actx->i]; + actx->j = (actx->j + si); + sj = actx->s[actx->j]; + actx->s[actx->i] = sj; + actx->s[actx->j] = si; + + return (actx->s[(si + sj) & 0xff]); +} + +static inline isc_uint16_t +dispatch_arc4get16(arc4ctx_t *actx) { + isc_uint16_t val; + + val = dispatch_arc4get8(actx) << 8; + val |= dispatch_arc4get8(actx); + + return (val); +} + +static void +dispatch_arc4stir(arc4ctx_t *actx) { + int i; + union { + unsigned char rnd[128]; + isc_uint32_t rnd32[32]; + } rnd; + isc_result_t result; + + if (actx->entropy != NULL) { + /* + * We accept any quality of random data to avoid blocking. + */ + result = isc_entropy_getdata(actx->entropy, rnd.rnd, + sizeof(rnd), NULL, 0); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + } else { + for (i = 0; i < 32; i++) + isc_random_get(&rnd.rnd32[i]); + } + dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd)); + + /* + * Discard early keystream, as per recommendations in: + * http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps + */ + for (i = 0; i < 256; i++) + (void)dispatch_arc4get8(actx); + + /* + * Derived from OpenBSD's implementation. The rationale is not clear, + * but should be conservative enough in safety, and reasonably large + * for efficiency. + */ + actx->count = 1600000; +} + +static isc_uint16_t +dispatch_arc4random(arc4ctx_t *actx) { + isc_uint16_t result; + + if (actx->lock != NULL) + LOCK(actx->lock); + + actx->count -= sizeof(isc_uint16_t); + if (actx->count <= 0) + dispatch_arc4stir(actx); + result = dispatch_arc4get16(actx); + + if (actx->lock != NULL) + UNLOCK(actx->lock); + + return (result); +} + +static isc_uint16_t +dispatch_arc4uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) { + isc_uint16_t min, r; + + if (upper_bound < 2) + return (0); + + /* + * Ensure the range of random numbers [min, 0xffff] be a multiple of + * upper_bound and contain at least a half of the 16 bit range. + */ + + if (upper_bound > 0x8000) + min = 1 + ~upper_bound; /* 0x8000 - upper_bound */ + else + min = (isc_uint16_t)(0x10000 % (isc_uint32_t)upper_bound); + + /* + * This could theoretically loop forever but each retry has + * p > 0.5 (worst case, usually far better) of selecting a + * number inside the range we need, so it should rarely need + * to re-roll. + */ + for (;;) { + r = dispatch_arc4random(actx); + if (r >= min) + break; + } + + return (r % upper_bound); } /* * Return a hash of the destination and message id. */ static isc_uint32_t -dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id) { +dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id, + in_port_t port) +{ unsigned int ret; ret = isc_sockaddr_hash(dest, ISC_TRUE); - ret ^= id; + ret ^= (id << 16) | port; ret %= qid->qid_nbuckets; INSIST(ret < qid->qid_nbuckets); @@ -378,13 +622,15 @@ destroy_disp_ok(dns_dispatch_t *disp) if (disp->recv_pending != 0) return (ISC_FALSE); + if (!ISC_LIST_EMPTY(disp->activesockets)) + return (ISC_FALSE); + if (disp->shutting_down == 0) return (ISC_FALSE); return (ISC_TRUE); } - /* * Called when refcount reaches 0 (and safe to destroy). * @@ -396,6 +642,8 @@ destroy_disp(isc_task_t *task, isc_event_t *event) { dns_dispatch_t *disp; dns_dispatchmgr_t *mgr; isc_boolean_t killmgr; + dispsocket_t *dispsocket; + int i; INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL); @@ -409,10 +657,16 @@ destroy_disp(isc_task_t *task, isc_event_t *event) { dispatch_log(disp, LVL(90), "shutting down; detaching from sock %p, task %p", - disp->socket, disp->task); + disp->socket, disp->task[0]); /* XXXX */ - isc_socket_detach(&disp->socket); - isc_task_detach(&disp->task); + if (disp->socket != NULL) + isc_socket_detach(&disp->socket); + while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) { + ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link); + destroy_dispsocket(disp, &dispsocket); + } + for (i = 0; i < disp->ntasks; i++) + isc_task_detach(&disp->task[i]); isc_event_free(&event); dispatch_free(&disp); @@ -423,24 +677,171 @@ destroy_disp(isc_task_t *task, isc_event_t *event) { destroy_mgr(&mgr); } +/*% + * Make a new socket for a single dispatch with a random port number. + * The caller must hold the disp->lock and qid->lock. + */ +static isc_result_t +get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest, + isc_socketmgr_t *sockmgr, dns_qid_t *qid, + dispsocket_t **dispsockp, unsigned int *abucketp, + in_port_t *portp) +{ + int i; + isc_uint32_t r; + dns_dispatchmgr_t *mgr = disp->mgr; + isc_socket_t *sock = NULL; + isc_result_t result = ISC_R_FAILURE; + in_port_t port; + isc_sockaddr_t localaddr; + unsigned int abucket = 0; + dispsocket_t *dispsock; + unsigned int nports; + in_port_t *ports; + + if (isc_sockaddr_pf(&disp->local) == AF_INET) { + nports = disp->mgr->nv4ports; + ports = disp->mgr->v4ports; + } else { + nports = disp->mgr->nv6ports; + ports = disp->mgr->v6ports; + } + if (nports == 0) + return (ISC_R_ADDRNOTAVAIL); + + dispsock = ISC_LIST_HEAD(disp->inactivesockets); + if (dispsock != NULL) { + ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link); + sock = dispsock->socket; + dispsock->socket = NULL; + } else { + dispsock = isc_mempool_get(mgr->spool); + if (dispsock == NULL) + return (ISC_R_NOMEMORY); + + disp->nsockets++; + dispsock->socket = NULL; + dispsock->disp = disp; + dispsock->resp = NULL; + isc_random_get(&r); + dispsock->task = NULL; + isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task); + ISC_LINK_INIT(dispsock, link); + dispsock->magic = DISPSOCK_MAGIC; + } + + /* + * Pick up a random UDP port and open a new socket with it. Avoid + * choosing ports that share the same destination because it will be + * very likely to fail in bind(2) or connect(2). + */ + localaddr = disp->local; + for (i = 0; i < 64; i++) { + port = ports[dispatch_arc4uniformrandom(DISP_ARC4CTX(disp), + nports)]; + isc_sockaddr_setport(&localaddr, port); + + abucket = dns_hash(qid, dest, 0, port); + if (bucket_search(qid, qid->addr_table, dest, 0, port, abucket, + ISC_TRUE) != NULL) { + continue; + } + + result = open_socket(sockmgr, &localaddr, &sock); + if (result == ISC_R_SUCCESS || result != ISC_R_ADDRINUSE) + break; + } + + if (result == ISC_R_SUCCESS) { + dispsock->socket = sock; + *dispsockp = dispsock; + *abucketp = abucket; + *portp = port; + } else { + /* + * We could keep it in the inactive list, but since this should + * be an exceptional case and might be resource shortage, we'd + * rather destroy it. + */ + if (sock != NULL) + isc_socket_detach(&sock); + destroy_dispsocket(disp, &dispsock); + } + + return (result); +} + +/*% + * Destroy a dedicated dispatch socket. + */ +static void +destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) { + dispsocket_t *dispsock; + + /* + * The dispatch must be locked. + */ + + REQUIRE(dispsockp != NULL && *dispsockp != NULL); + dispsock = *dispsockp; + REQUIRE(!ISC_LINK_LINKED(dispsock, link)); + + disp->nsockets--; + dispsock->magic = 0; + if (dispsock->socket != NULL) + isc_socket_detach(&dispsock->socket); + if (dispsock->task != NULL) + isc_task_detach(&dispsock->task); + isc_mempool_put(disp->mgr->spool, dispsock); + + *dispsockp = NULL; +} + +/*% + * Deactivate a dedicated dispatch socket. Move it to the inactive list for + * future reuse unless the total number of sockets are exceeding the maximum. + */ +static void +deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) { + /* + * The dispatch must be locked. + */ + ISC_LIST_UNLINK(disp->activesockets, dispsock, link); + if (dispsock->resp != NULL) { + INSIST(dispsock->resp->dispsocket == dispsock); + dispsock->resp->dispsocket = NULL; + } + + if (disp->nsockets > DNS_DISPATCH_POOLSOCKS) + destroy_dispsocket(disp, &dispsock); + else { + isc_socket_close(dispsock->socket); + ISC_LIST_APPEND(disp->inactivesockets, dispsock, link); + } +} /* - * Find an entry for query ID 'id' and socket address 'dest' in 'qid'. + * Find an entry for query ID 'id', socket address 'dest', and port number + * 'port' in 'table'. * Return NULL if no such entry exists. */ static dns_dispentry_t * -bucket_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id, - unsigned int bucket) +bucket_search(dns_qid_t *qid, dns_displist_t *table, isc_sockaddr_t *dest, + dns_messageid_t id, in_port_t port, unsigned int bucket, + isc_boolean_t ignoreid) { dns_dispentry_t *res; REQUIRE(bucket < qid->qid_nbuckets); - res = ISC_LIST_HEAD(qid->qid_table[bucket]); + res = ISC_LIST_HEAD(table[bucket]); while (res != NULL) { - if ((res->id == id) && isc_sockaddr_equal(dest, &res->host)) + if ((ignoreid || res->id == id) && + isc_sockaddr_equal(dest, &res->host) && + res->port == port) { return (res); + } res = ISC_LIST_NEXT(res, link); } @@ -511,6 +912,26 @@ allocate_event(dns_dispatch_t *disp) { return (ev); } +static void +udp_exrecv(isc_task_t *task, isc_event_t *ev) { + dispsocket_t *dispsock = ev->ev_arg; + + UNUSED(task); + + REQUIRE(VALID_DISPSOCK(dispsock)); + udp_recv(ev, dispsock->disp, dispsock); +} + +static void +udp_shrecv(isc_task_t *task, isc_event_t *ev) { + dns_dispatch_t *disp = ev->ev_arg; + + UNUSED(task); + + REQUIRE(VALID_DISPATCH(disp)); + udp_recv(ev, disp, NULL); +} + /* * General flow: * @@ -526,14 +947,13 @@ allocate_event(dns_dispatch_t *disp) { * restart. */ static void -udp_recv(isc_task_t *task, isc_event_t *ev_in) { +udp_recv(isc_event_t *ev_in, dns_dispatch_t *disp, dispsocket_t *dispsock) { isc_socketevent_t *ev = (isc_socketevent_t *)ev_in; - dns_dispatch_t *disp = ev_in->ev_arg; dns_messageid_t id; isc_result_t dres; isc_buffer_t source; unsigned int flags; - dns_dispentry_t *resp; + dns_dispentry_t *resp = NULL; dns_dispatchevent_t *rev; unsigned int bucket; isc_boolean_t killit; @@ -542,8 +962,8 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { dns_qid_t *qid; isc_netaddr_t netaddr; int match; - - UNUSED(task); + int result; + isc_boolean_t qidlocked = ISC_FALSE; LOCK(&disp->lock); @@ -554,7 +974,7 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { "got packet: requests %d, buffers %d, recvs %d", disp->requests, disp->mgr->buffers, disp->recv_pending); - if (ev->ev_type == ISC_SOCKEVENT_RECVDONE) { + if (dispsock == NULL && ev->ev_type == ISC_SOCKEVENT_RECVDONE) { /* * Unless the receive event was imported from a listening * interface, in which case the event type is @@ -564,6 +984,19 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { disp->recv_pending = 0; } + if (dispsock != NULL && + (ev->result == ISC_R_CANCELED || dispsock->resp == NULL)) { + /* + * dispsock->resp can be NULL if this transaction was canceled + * just after receiving a response. Since this socket is + * exclusively used and there should be at most one receive + * event the canceled event should have been no effect. So + * we can (and should) deactivate the socket right now. + */ + deactivate_dispsocket(disp, dispsock); + dispsock = NULL; + } + if (disp->shutting_down) { /* * This dispatcher is shutting down. @@ -576,12 +1009,25 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { killit = destroy_disp_ok(disp); UNLOCK(&disp->lock); if (killit) - isc_task_send(disp->task, &disp->ctlevent); + isc_task_send(disp->task[0], &disp->ctlevent); return; } - if (ev->result != ISC_R_SUCCESS) { + if (dispsock != NULL && + (disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) { + resp = dispsock->resp; + id = resp->id; + if (ev->result != ISC_R_SUCCESS) { + /* + * This is most likely a network error on a connected + * socket. It makes no sense to check the address or + * parse the packet, but it will help to return the + * error to the caller. + */ + goto sendresponse; + } + } else if (ev->result != ISC_R_SUCCESS) { free_buffer(disp, ev->region.base, ev->region.length); if (ev->result != ISC_R_CANCELED) @@ -642,18 +1088,31 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { goto restart; } - dns_dispatch_hash(&ev->timestamp, sizeof(&ev->timestamp)); - dns_dispatch_hash(ev->region.base, ev->region.length); - - /* response */ - bucket = dns_hash(qid, &ev->address, id); - LOCK(&qid->lock); - resp = bucket_search(qid, &ev->address, id, bucket); - dispatch_log(disp, LVL(90), - "search for response in bucket %d: %s", - bucket, (resp == NULL ? "not found" : "found")); - + /* + * Search for the corresponding response. If we are using an exclusive + * socket, we've already identified it and we can skip the search; but + * the ID and the address must match the expected ones. + */ if (resp == NULL) { + bucket = dns_hash(qid, &ev->address, id, disp->localport); + LOCK(&qid->lock); + qidlocked = ISC_TRUE; + resp = bucket_search(qid, qid->qid_table, &ev->address, id, + disp->localport, bucket, ISC_FALSE); + dispatch_log(disp, LVL(90), + "search for response in bucket %d: %s", + bucket, (resp == NULL ? "not found" : "found")); + + if (resp == NULL) { + dns_generalstats_increment(mgr->stats, + dns_resstatscounter_mismatch); + free_buffer(disp, ev->region.base, ev->region.length); + goto unlock; + } + } else if (resp->id != id || !isc_sockaddr_equal(&ev->address, + &resp->host)) { + dispatch_log(disp, LVL(90), + "response to an exclusive socket doesn't match"); dns_generalstats_increment(mgr->stats, dns_resstatscounter_mismatch); free_buffer(disp, ev->region.base, ev->region.length); @@ -703,6 +1162,7 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { } } + sendresponse: queue_response = resp->item_out; rev = allocate_event(resp->disp); if (rev == NULL) { @@ -717,7 +1177,7 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { */ isc_buffer_init(&rev->buffer, ev->region.base, ev->region.length); isc_buffer_add(&rev->buffer, ev->n); - rev->result = ISC_R_SUCCESS; + rev->result = ev->result; rev->id = id; rev->addr = ev->address; rev->pktinfo = ev->pktinfo; @@ -736,14 +1196,23 @@ udp_recv(isc_task_t *task, isc_event_t *ev_in) { isc_task_send(resp->task, ISC_EVENT_PTR(&rev)); } unlock: - UNLOCK(&qid->lock); + if (qidlocked) + UNLOCK(&qid->lock); /* * Restart recv() to get the next packet. */ restart: - startrecv(disp); - + result = startrecv(disp, dispsock); + if (result != ISC_R_SUCCESS && dispsock != NULL) { + /* + * XXX: wired. There seems to be no recovery process other than + * deactivate this socket anyway (since we cannot start + * receiving, we won't be able to receive a cancel event + * from the user). + */ + deactivate_dispsocket(disp, dispsock); + } UNLOCK(&disp->lock); isc_event_free(&ev_in); @@ -843,7 +1312,7 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) { killit = destroy_disp_ok(disp); UNLOCK(&disp->lock); if (killit) - isc_task_send(disp->task, &disp->ctlevent); + isc_task_send(disp->task[0], &disp->ctlevent); return; } @@ -881,14 +1350,13 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) { goto restart; } - dns_dispatch_hash(tcpmsg->buffer.base, tcpmsg->buffer.length); - /* * Response. */ - bucket = dns_hash(qid, &tcpmsg->address, id); + bucket = dns_hash(qid, &tcpmsg->address, id, disp->localport); LOCK(&qid->lock); - resp = bucket_search(qid, &tcpmsg->address, id, bucket); + resp = bucket_search(qid, qid->qid_table, &tcpmsg->address, id, + disp->localport, bucket, ISC_FALSE); dispatch_log(disp, LVL(90), "search for response in bucket %d: %s", bucket, (resp == NULL ? "not found" : "found")); @@ -929,7 +1397,7 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) { * Restart recv() to get the next packet. */ restart: - startrecv(disp); + (void)startrecv(disp, NULL); UNLOCK(&disp->lock); @@ -939,22 +1407,33 @@ tcp_recv(isc_task_t *task, isc_event_t *ev_in) { /* * disp must be locked. */ -static void -startrecv(dns_dispatch_t *disp) { +static isc_result_t +startrecv(dns_dispatch_t *disp, dispsocket_t *dispsock) { isc_result_t res; isc_region_t region; + isc_socket_t *socket; if (disp->shutting_down == 1) - return; + return (ISC_R_SUCCESS); if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) - return; + return (ISC_R_SUCCESS); - if (disp->recv_pending != 0) - return; + if (disp->recv_pending != 0 && dispsock == NULL) + return (ISC_R_SUCCESS); if (disp->mgr->buffers >= disp->mgr->maxbuffers) - return; + return (ISC_R_NOMEMORY); + + if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 && + dispsock == NULL) + return (ISC_R_SUCCESS); + + if (dispsock != NULL) + socket = dispsock->socket; + else + socket = disp->socket; + INSIST(socket != NULL); switch (disp->socktype) { /* @@ -964,28 +1443,38 @@ startrecv(dns_dispatch_t *disp) { region.length = disp->mgr->buffersize; region.base = allocate_udp_buffer(disp); if (region.base == NULL) - return; - res = isc_socket_recv(disp->socket, ®ion, 1, - disp->task, udp_recv, disp); - if (res != ISC_R_SUCCESS) { - free_buffer(disp, region.base, region.length); - disp->shutdown_why = res; - disp->shutting_down = 1; - do_cancel(disp); - return; + return (ISC_R_NOMEMORY); + if (dispsock != NULL) { + res = isc_socket_recv(socket, ®ion, 1, + dispsock->task, udp_exrecv, + dispsock); + if (res != ISC_R_SUCCESS) { + free_buffer(disp, region.base, region.length); + return (res); + } + } else { + res = isc_socket_recv(socket, ®ion, 1, + disp->task[0], udp_shrecv, disp); + if (res != ISC_R_SUCCESS) { + free_buffer(disp, region.base, region.length); + disp->shutdown_why = res; + disp->shutting_down = 1; + do_cancel(disp); + return (ISC_R_SUCCESS); /* recover by cancel */ + } + INSIST(disp->recv_pending == 0); + disp->recv_pending = 1; } - INSIST(disp->recv_pending == 0); - disp->recv_pending = 1; break; case isc_sockettype_tcp: - res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task, + res = dns_tcpmsg_readmessage(&disp->tcpmsg, disp->task[0], tcp_recv, disp); if (res != ISC_R_SUCCESS) { disp->shutdown_why = res; disp->shutting_down = 1; do_cancel(disp); - return; + return (ISC_R_SUCCESS); /* recover by cancel */ } INSIST(disp->recv_pending == 0); disp->recv_pending = 1; @@ -994,6 +1483,8 @@ startrecv(dns_dispatch_t *disp) { INSIST(0); break; } + + return (ISC_R_SUCCESS); } /* @@ -1040,10 +1531,13 @@ destroy_mgr(dns_dispatchmgr_t **mgrp) { DESTROYLOCK(&mgr->lock); mgr->state = 0; + DESTROYLOCK(&mgr->arc4_lock); + isc_mempool_destroy(&mgr->epool); isc_mempool_destroy(&mgr->rpool); isc_mempool_destroy(&mgr->dpool); isc_mempool_destroy(&mgr->bpool); + isc_mempool_destroy(&mgr->spool); DESTROYLOCK(&mgr->pool_lock); @@ -1057,36 +1551,50 @@ destroy_mgr(dns_dispatchmgr_t **mgrp) { if (mgr->blackhole != NULL) dns_acl_detach(&mgr->blackhole); - if (mgr->portlist != NULL) - dns_portlist_detach(&mgr->portlist); - if (mgr->stats != NULL) dns_stats_detach(&mgr->stats); + if (mgr->v4ports != NULL) { + isc_mem_put(mctx, mgr->v4ports, + mgr->nv4ports * sizeof(in_port_t)); + } + if (mgr->v6ports != NULL) { + isc_mem_put(mctx, mgr->v6ports, + mgr->nv6ports * sizeof(in_port_t)); + } isc_mem_put(mctx, mgr, sizeof(dns_dispatchmgr_t)); isc_mem_detach(&mctx); } static isc_result_t -create_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local, - isc_socket_t **sockp) +open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local, + isc_socket_t **sockp) { isc_socket_t *sock; isc_result_t result; - sock = NULL; - result = isc_socket_create(mgr, isc_sockaddr_pf(local), - isc_sockettype_udp, &sock); - if (result != ISC_R_SUCCESS) - return (result); - isc_socket_setname(sock, "dispatcher", NULL); + sock = *sockp; + if (sock == NULL) { + result = isc_socket_create(mgr, isc_sockaddr_pf(local), + isc_sockettype_udp, &sock); + if (result != ISC_R_SUCCESS) + return (result); + isc_socket_setname(sock, "dispatcher", NULL); + } else { + result = isc_socket_open(sock); + if (result != ISC_R_SUCCESS) + return (result); + } #ifndef ISC_ALLOW_MAPPED isc_socket_ipv6only(sock, ISC_TRUE); #endif result = isc_socket_bind(sock, local); if (result != ISC_R_SUCCESS) { - isc_socket_detach(&sock); + if (*sockp == NULL) + isc_socket_detach(&sock); + else + isc_socket_close(sock); return (result); } @@ -1094,6 +1602,24 @@ create_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local, return (ISC_R_SUCCESS); } +/*% + * Create a temporary port list to set the initial default set of dispatch + * ports: [1024, 65535]. This is almost meaningless as the application will + * normally set the ports explicitly, but is provided to fill some minor corner + * cases. + */ +static isc_result_t +create_default_portset(isc_mem_t *mctx, isc_portset_t **portsetp) { + isc_result_t result; + + result = isc_portset_create(mctx, portsetp); + if (result != ISC_R_SUCCESS) + return (result); + isc_portset_addrange(*portsetp, 1024, 65535); + + return (ISC_R_SUCCESS); +} + /* * Publics. */ @@ -1104,6 +1630,8 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy, { dns_dispatchmgr_t *mgr; isc_result_t result; + isc_portset_t *v4portset = NULL; + isc_portset_t *v6portset = NULL; REQUIRE(mctx != NULL); REQUIRE(mgrp != NULL && *mgrp == NULL); @@ -1116,17 +1644,20 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy, isc_mem_attach(mctx, &mgr->mctx); mgr->blackhole = NULL; - mgr->portlist = NULL; mgr->stats = NULL; result = isc_mutex_init(&mgr->lock); if (result != ISC_R_SUCCESS) goto deallocate; - result = isc_mutex_init(&mgr->buffer_lock); + result = isc_mutex_init(&mgr->arc4_lock); if (result != ISC_R_SUCCESS) goto kill_lock; + result = isc_mutex_init(&mgr->buffer_lock); + if (result != ISC_R_SUCCESS) + goto kill_arc4_lock; + result = isc_mutex_init(&mgr->pool_lock); if (result != ISC_R_SUCCESS) goto kill_buffer_lock; @@ -1168,18 +1699,43 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy, mgr->buffersize = 0; mgr->maxbuffers = 0; mgr->bpool = NULL; + mgr->spool = NULL; mgr->entropy = NULL; mgr->qid = NULL; mgr->state = 0; ISC_LIST_INIT(mgr->list); + mgr->v4ports = NULL; + mgr->v6ports = NULL; + mgr->nv4ports = 0; + mgr->nv6ports = 0; mgr->magic = DNS_DISPATCHMGR_MAGIC; + result = create_default_portset(mctx, &v4portset); + if (result == ISC_R_SUCCESS) { + result = create_default_portset(mctx, &v6portset); + if (result == ISC_R_SUCCESS) { + result = dns_dispatchmgr_setavailports(mgr, + v4portset, + v6portset); + } + } + if (v4portset != NULL) + isc_portset_destroy(mctx, &v4portset); + if (v6portset != NULL) + isc_portset_destroy(mctx, &v6portset); + if (result != ISC_R_SUCCESS) + goto kill_dpool; + if (entropy != NULL) isc_entropy_attach(entropy, &mgr->entropy); + dispatch_arc4init(&mgr->arc4ctx, mgr->entropy, &mgr->arc4_lock); + *mgrp = mgr; return (ISC_R_SUCCESS); + kill_dpool: + isc_mempool_destroy(&mgr->dpool); kill_rpool: isc_mempool_destroy(&mgr->rpool); kill_epool: @@ -1188,6 +1744,8 @@ dns_dispatchmgr_create(isc_mem_t *mctx, isc_entropy_t *entropy, DESTROYLOCK(&mgr->pool_lock); kill_buffer_lock: DESTROYLOCK(&mgr->buffer_lock); + kill_arc4_lock: + DESTROYLOCK(&mgr->arc4_lock); kill_lock: DESTROYLOCK(&mgr->lock); deallocate: @@ -1216,22 +1774,88 @@ dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr, dns_portlist_t *portlist) { REQUIRE(VALID_DISPATCHMGR(mgr)); - if (mgr->portlist != NULL) - dns_portlist_detach(&mgr->portlist); - if (portlist != NULL) - dns_portlist_attach(portlist, &mgr->portlist); + UNUSED(portlist); + + /* This function is deprecated: use dns_dispatchmgr_setavailports(). */ + return; } dns_portlist_t * dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr) { REQUIRE(VALID_DISPATCHMGR(mgr)); - return (mgr->portlist); + return (NULL); /* this function is deprecated */ +} + +isc_result_t +dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset, + isc_portset_t *v6portset) +{ + in_port_t *v4ports, *v6ports, p; + unsigned int nv4ports, nv6ports, i4, i6; + + REQUIRE(VALID_DISPATCHMGR(mgr)); + + nv4ports = isc_portset_nports(v4portset); + nv6ports = isc_portset_nports(v6portset); + + v4ports = NULL; + if (nv4ports != 0) { + v4ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv4ports); + if (v4ports == NULL) + return (ISC_R_NOMEMORY); + } + v6ports = NULL; + if (nv6ports != 0) { + v6ports = isc_mem_get(mgr->mctx, sizeof(in_port_t) * nv6ports); + if (v6ports == NULL) { + if (v4ports != NULL) { + isc_mem_put(mgr->mctx, v4ports, + sizeof(in_port_t) * + isc_portset_nports(v4portset)); + } + return (ISC_R_NOMEMORY); + } + } + + p = 0; + i4 = 0; + i6 = 0; + do { + if (isc_portset_isset(v4portset, p)) { + INSIST(i4 < nv4ports); + v4ports[i4++] = p; + } + if (isc_portset_isset(v6portset, p)) { + INSIST(i6 < nv6ports); + v6ports[i6++] = p; + } + } while (p++ < 65535); + INSIST(i4 == nv4ports && i6 == nv6ports); + + PORTBUFLOCK(mgr); + if (mgr->v4ports != NULL) { + isc_mem_put(mgr->mctx, mgr->v4ports, + mgr->nv4ports * sizeof(in_port_t)); + } + mgr->v4ports = v4ports; + mgr->nv4ports = nv4ports; + + if (mgr->v6ports != NULL) { + isc_mem_put(mgr->mctx, mgr->v6ports, + mgr->nv6ports * sizeof(in_port_t)); + } + mgr->v6ports = v6ports; + mgr->nv6ports = nv6ports; + PORTBUFUNLOCK(mgr); + + return (ISC_R_SUCCESS); } static isc_result_t dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr, - unsigned int buffersize, unsigned int maxbuffers, - unsigned int buckets, unsigned int increment) + unsigned int buffersize, unsigned int maxbuffers, + unsigned int maxrequests, unsigned int buckets, + unsigned int increment) { isc_result_t result; @@ -1258,24 +1882,39 @@ dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr, maxbuffers = 8; LOCK(&mgr->buffer_lock); + + /* Create or adjust buffer pool */ if (mgr->bpool != NULL) { isc_mempool_setmaxalloc(mgr->bpool, maxbuffers); mgr->maxbuffers = maxbuffers; + } else { + result = isc_mempool_create(mgr->mctx, buffersize, &mgr->bpool); + if (result != ISC_R_SUCCESS) { + UNLOCK(&mgr->buffer_lock); + return (result); + } + isc_mempool_setname(mgr->bpool, "dispmgr_bpool"); + isc_mempool_setmaxalloc(mgr->bpool, maxbuffers); + isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock); + } + + /* Create or adjust socket pool */ + if (mgr->spool != NULL) { + isc_mempool_setmaxalloc(mgr->spool, DNS_DISPATCH_POOLSOCKS * 2); UNLOCK(&mgr->buffer_lock); return (ISC_R_SUCCESS); } - - if (isc_mempool_create(mgr->mctx, buffersize, - &mgr->bpool) != ISC_R_SUCCESS) { + result = isc_mempool_create(mgr->mctx, sizeof(dispsocket_t), + &mgr->spool); + if (result != ISC_R_SUCCESS) { UNLOCK(&mgr->buffer_lock); - return (ISC_R_NOMEMORY); + goto cleanup; } + isc_mempool_setname(mgr->spool, "dispmgr_spool"); + isc_mempool_setmaxalloc(mgr->spool, maxrequests); + isc_mempool_associatelock(mgr->spool, &mgr->pool_lock); - isc_mempool_setname(mgr->bpool, "dispmgr_bpool"); - isc_mempool_setmaxalloc(mgr->bpool, maxbuffers); - isc_mempool_associatelock(mgr->bpool, &mgr->pool_lock); - - result = qid_allocate(mgr, buckets, increment, ISC_TRUE, &mgr->qid); + result = qid_allocate(mgr, buckets, increment, &mgr->qid, ISC_TRUE); if (result != ISC_R_SUCCESS) goto cleanup; @@ -1286,8 +1925,10 @@ dns_dispatchmgr_setudp(dns_dispatchmgr_t *mgr, cleanup: isc_mempool_destroy(&mgr->bpool); + if (mgr->spool != NULL) + isc_mempool_destroy(&mgr->spool); UNLOCK(&mgr->buffer_lock); - return (ISC_R_NOMEMORY); + return (result); } void @@ -1322,29 +1963,56 @@ dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, dns_stats_t *stats) { dns_stats_attach(stats, &mgr->stats); } +static int +port_cmp(const void *key, const void *ent) { + in_port_t p1 = *(const in_port_t *)key; + in_port_t p2 = *(const in_port_t *)ent; + + if (p1 < p2) + return (-1); + else if (p1 == p2) + return (0); + else + return (1); +} + static isc_boolean_t -blacklisted(dns_dispatchmgr_t *mgr, isc_socket_t *sock, - isc_sockaddr_t *sockaddrp) +portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock, + isc_sockaddr_t *sockaddrp) { isc_sockaddr_t sockaddr; isc_result_t result; + in_port_t *ports, port; + unsigned int nports; + isc_boolean_t available = ISC_FALSE; REQUIRE(sock != NULL || sockaddrp != NULL); - if (mgr->portlist == NULL) - return (ISC_FALSE); - + PORTBUFLOCK(mgr); if (sock != NULL) { sockaddrp = &sockaddr; result = isc_socket_getsockname(sock, sockaddrp); if (result != ISC_R_SUCCESS) - return (ISC_FALSE); + goto unlock; } - if (dns_portlist_match(mgr->portlist, isc_sockaddr_pf(sockaddrp), - isc_sockaddr_getport(sockaddrp))) - return (ISC_TRUE); - return (ISC_FALSE); + if (isc_sockaddr_pf(sockaddrp) == AF_INET) { + ports = mgr->v4ports; + nports = mgr->nv4ports; + } else { + ports = mgr->v6ports; + nports = mgr->nv6ports; + } + if (ports == NULL) + goto unlock; + + port = isc_sockaddr_getport(sockaddrp); + if (bsearch(&port, ports, nports, sizeof(in_port_t), port_cmp) != NULL) + available = ISC_TRUE; + +unlock: + PORTBUFUNLOCK(mgr); + return (available); } #define ATTRMATCH(_a1, _a2, _mask) (((_a1) & (_mask)) == ((_a2) & (_mask))) @@ -1358,13 +2026,17 @@ local_addr_match(dns_dispatch_t *disp, isc_sockaddr_t *addr) { return (ISC_TRUE); /* - * Don't match wildcard ports against newly blacklisted ports. + * Don't match wildcard ports unless the port is available in the + * current configuration. We can skip this check when disp->socket is + * NULL because such a dispatcher will choose ports on-demand from + * the available set. */ - if (disp->mgr->portlist != NULL && - isc_sockaddr_getport(addr) == 0 && + if (isc_sockaddr_getport(addr) == 0 && isc_sockaddr_getport(&disp->local) == 0 && - blacklisted(disp->mgr, disp->socket, NULL)) + disp->socket != NULL && + !portavailable(disp->mgr, disp->socket, NULL)) { return (ISC_FALSE); + } /* * Check if we match the binding . @@ -1436,7 +2108,8 @@ dispatch_find(dns_dispatchmgr_t *mgr, isc_sockaddr_t *local, static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets, - unsigned int increment, isc_boolean_t usepool, dns_qid_t **qidp) + unsigned int increment, dns_qid_t **qidp, + isc_boolean_t needaddrtable) { dns_qid_t *qid; unsigned int i; @@ -1458,25 +2131,35 @@ qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets, return (ISC_R_NOMEMORY); } - result = tid_init(mgr->mctx, &qid->tid, usepool); - if (result != ISC_R_SUCCESS) { - isc_mem_put(mgr->mctx, qid->qid_table, - buckets * sizeof(dns_displist_t)); - isc_mem_put(mgr->mctx, qid, sizeof(*qid)); - return (ISC_R_NOMEMORY); + qid->addr_table = NULL; + if (needaddrtable) { + qid->addr_table = isc_mem_get(mgr->mctx, + buckets * sizeof(dns_displist_t)); + if (qid->addr_table == NULL) { + isc_mem_put(mgr->mctx, qid, sizeof(*qid)); + isc_mem_put(mgr->mctx, qid->qid_table, + buckets * sizeof(dns_displist_t)); + return (ISC_R_NOMEMORY); + } } result = isc_mutex_init(&qid->lock); if (result != ISC_R_SUCCESS) { - tid_destroy(mgr->mctx, &qid->tid); + if (qid->addr_table != NULL) { + isc_mem_put(mgr->mctx, qid->addr_table, + buckets * sizeof(dns_displist_t)); + } isc_mem_put(mgr->mctx, qid->qid_table, buckets * sizeof(dns_displist_t)); isc_mem_put(mgr->mctx, qid, sizeof(*qid)); return (result); } - for (i = 0; i < buckets; i++) + for (i = 0; i < buckets; i++) { ISC_LIST_INIT(qid->qid_table[i]); + if (qid->addr_table != NULL) + ISC_LIST_INIT(qid->addr_table[i]); + } qid->qid_nbuckets = buckets; qid->qid_increment = increment; @@ -1496,9 +2179,12 @@ qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp) { *qidp = NULL; qid->magic = 0; - tid_destroy(mctx, &qid->tid); isc_mem_put(mctx, qid->qid_table, qid->qid_nbuckets * sizeof(dns_displist_t)); + if (qid->addr_table != NULL) { + isc_mem_put(mctx, qid->addr_table, + qid->qid_nbuckets * sizeof(dns_displist_t)); + } DESTROYLOCK(&qid->lock); isc_mem_put(mctx, qid, sizeof(*qid)); } @@ -1533,6 +2219,7 @@ dispatch_allocate(dns_dispatchmgr_t *mgr, unsigned int maxrequests, disp->refcount = 1; disp->recv_pending = 0; memset(&disp->local, 0, sizeof(disp->local)); + disp->localport = 0; disp->shutting_down = 0; disp->shutdown_out = 0; disp->connected = 0; @@ -1541,6 +2228,10 @@ dispatch_allocate(dns_dispatchmgr_t *mgr, unsigned int maxrequests, disp->requests = 0; disp->tcpbuffers = 0; disp->qid = NULL; + ISC_LIST_INIT(disp->activesockets); + ISC_LIST_INIT(disp->inactivesockets); + disp->nsockets = 0; + dispatch_arc4init(&disp->arc4ctx, mgr->entropy, NULL); result = isc_mutex_init(&disp->lock); if (result != ISC_R_SUCCESS) @@ -1593,6 +2284,8 @@ dispatch_free(dns_dispatch_t **dispp) INSIST(disp->tcpbuffers == 0); INSIST(disp->requests == 0); INSIST(disp->recv_pending == 0); + INSIST(ISC_LIST_EMPTY(disp->activesockets)); + INSIST(ISC_LIST_EMPTY(disp->inactivesockets)); isc_mempool_put(mgr->epool, disp->failsafe_ev); disp->failsafe_ev = NULL; @@ -1638,7 +2331,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock, return (result); } - result = qid_allocate(mgr, buckets, increment, ISC_FALSE, &disp->qid); + result = qid_allocate(mgr, buckets, increment, &disp->qid, ISC_FALSE); if (result != ISC_R_SUCCESS) goto deallocate_dispatch; @@ -1646,8 +2339,9 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock, disp->socket = NULL; isc_socket_attach(sock, &disp->socket); - disp->task = NULL; - result = isc_task_create(taskmgr, 0, &disp->task); + disp->ntasks = 1; + disp->task[0] = NULL; + result = isc_task_create(taskmgr, 0, &disp->task[0]); if (result != ISC_R_SUCCESS) goto kill_socket; @@ -1660,7 +2354,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock, goto kill_task; } - isc_task_setname(disp->task, "tcpdispatch", disp); + isc_task_setname(disp->task[0], "tcpdispatch", disp); dns_tcpmsg_init(mgr->mctx, disp->socket, &disp->tcpmsg); disp->tcpmsg_valid = 1; @@ -1674,7 +2368,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock, UNLOCK(&mgr->lock); mgr_log(mgr, LVL(90), "created TCP dispatcher %p", disp); - dispatch_log(disp, LVL(90), "created task %p", disp->task); + dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); *dispp = disp; @@ -1684,7 +2378,7 @@ dns_dispatch_createtcp(dns_dispatchmgr_t *mgr, isc_socket_t *sock, * Error returns. */ kill_task: - isc_task_detach(&disp->task); + isc_task_detach(&disp->task[0]); kill_socket: isc_socket_detach(&disp->socket); deallocate_dispatch: @@ -1719,13 +2413,13 @@ dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, REQUIRE((attributes & DNS_DISPATCHATTR_TCP) == 0); result = dns_dispatchmgr_setudp(mgr, buffersize, maxbuffers, - buckets, increment); + maxrequests, buckets, increment); if (result != ISC_R_SUCCESS) return (result); LOCK(&mgr->lock); - if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) != 0) { + if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) { REQUIRE(isc_sockaddr_getport(localaddr) == 0); goto createudp; } @@ -1745,7 +2439,7 @@ dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, { disp->attributes |= DNS_DISPATCHATTR_NOLISTEN; if (disp->recv_pending != 0) - isc_socket_cancel(disp->socket, disp->task, + isc_socket_cancel(disp->socket, disp->task[0], ISC_SOCKCANCEL_RECV); } @@ -1781,6 +2475,100 @@ dns_dispatch_getudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, #define DNS_DISPATCH_HELD 20U #endif +static isc_result_t +get_udpsocket(dns_dispatchmgr_t *mgr, dns_dispatch_t *disp, + isc_socketmgr_t *sockmgr, isc_sockaddr_t *localaddr, + isc_socket_t **sockp, unsigned int maxtry) +{ + unsigned int i, j; + isc_socket_t *held[DNS_DISPATCH_HELD]; + isc_sockaddr_t localaddr_bound; + isc_socket_t *sock = NULL; + isc_result_t result = ISC_R_SUCCESS; + isc_boolean_t anyport; + + INSIST(sockp != NULL && *sockp == NULL); + + localaddr_bound = *localaddr; + anyport = ISC_TF(isc_sockaddr_getport(localaddr) == 0); + + if (anyport) { + unsigned int nports; + in_port_t *ports; + + /* + * If no port is specified, we first try to pick up a random + * port by ourselves. + */ + if (isc_sockaddr_pf(&disp->local) == AF_INET) { + nports = disp->mgr->nv4ports; + ports = disp->mgr->v4ports; + } else { + nports = disp->mgr->nv6ports; + ports = disp->mgr->v6ports; + } + if (nports == 0) + return (ISC_R_ADDRNOTAVAIL); + + for (i = 0; i < 1024; i++) { + in_port_t prt; + + prt = ports[dispatch_arc4uniformrandom( + DISP_ARC4CTX(disp), + nports)]; + isc_sockaddr_setport(&localaddr_bound, prt); + result = open_socket(sockmgr, &localaddr_bound, &sock); + if (result == ISC_R_SUCCESS || + result != ISC_R_ADDRINUSE) { + disp->localport = prt; + *sockp = sock; + return (result); + } + } + + /* + * If this fails 1024 times, we then ask the kernel for + * choosing one. + */ + } + + memset(held, 0, sizeof(held)); + i = 0; + + for (j = 0; j < maxtry; j++) { + result = open_socket(sockmgr, localaddr, &sock); + if (result != ISC_R_SUCCESS) + goto end; + else if (!anyport) + break; + else if (portavailable(mgr, sock, NULL)) + break; + if (held[i] != NULL) + isc_socket_detach(&held[i]); + held[i++] = sock; + sock = NULL; + if (i == DNS_DISPATCH_HELD) + i = 0; + } + if (j == maxtry) { + mgr_log(mgr, ISC_LOG_ERROR, + "avoid-v%s-udp-ports: unable to allocate " + "an available port", + isc_sockaddr_pf(localaddr) == AF_INET ? "4" : "6"); + result = ISC_R_FAILURE; + goto end; + } + *sockp = sock; + +end: + for (i = 0; i < DNS_DISPATCH_HELD; i++) { + if (held[i] != NULL) + isc_socket_detach(&held[i]); + } + + return (result); +} + static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, isc_taskmgr_t *taskmgr, @@ -1792,9 +2580,7 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, isc_result_t result; dns_dispatch_t *disp; isc_socket_t *sock = NULL; - isc_socket_t *held[DNS_DISPATCH_HELD]; - unsigned int i = 0, j = 0, k = 0; - isc_sockaddr_t localaddr_bound; + int i = 0; /* * dispatch_allocate() checks mgr for us. @@ -1804,61 +2590,30 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, if (result != ISC_R_SUCCESS) return (result); - /* - * Try to allocate a socket that is not on the blacklist. - * Hold up to DNS_DISPATCH_HELD sockets to prevent the OS - * from returning the same port to us too quickly. - */ - memset(held, 0, sizeof(held)); - localaddr_bound = *localaddr; - getsocket: - if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) != 0) { - isc_sockaddr_setport(&localaddr_bound, - get_randomport(&mgr->qid->tid)); - if (blacklisted(mgr, NULL, &localaddr_bound)) { - if (++k == 1024) - attributes &= ~DNS_DISPATCHATTR_RANDOMPORT; - goto getsocket; - } - result = create_socket(sockmgr, &localaddr_bound, &sock); - if (result == ISC_R_ADDRINUSE) { - if (++k == 1024) - attributes &= ~DNS_DISPATCHATTR_RANDOMPORT; - goto getsocket; - } - } else - result = create_socket(sockmgr, localaddr, &sock); - if (result != ISC_R_SUCCESS) - goto deallocate_dispatch; - if ((attributes & DNS_DISPATCHATTR_RANDOMPORT) == 0 && - isc_sockaddr_getport(localaddr) == 0 && - blacklisted(mgr, sock, NULL)) - { - if (held[i] != NULL) - isc_socket_detach(&held[i]); - held[i++] = sock; - sock = NULL; - if (i == DNS_DISPATCH_HELD) - i = 0; - if (j++ == 0xffffU) { - mgr_log(mgr, ISC_LOG_ERROR, "avoid-v%s-udp-ports: " - "unable to allocate a non-blacklisted port", - isc_sockaddr_pf(localaddr) == AF_INET ? - "4" : "6"); - result = ISC_R_FAILURE; + if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0) { + result = get_udpsocket(mgr, disp, sockmgr, localaddr, &sock, + 0xffffU); + if (result != ISC_R_SUCCESS) goto deallocate_dispatch; - } - goto getsocket; } - disp->socktype = isc_sockettype_udp; disp->socket = sock; disp->local = *localaddr; - disp->task = NULL; - result = isc_task_create(taskmgr, 0, &disp->task); - if (result != ISC_R_SUCCESS) - goto kill_socket; + if ((attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) + disp->ntasks = MAX_INTERNAL_TASKS; + else + disp->ntasks = 1; + for (i = 0; i < disp->ntasks; i++) { + disp->task[i] = NULL; + result = isc_task_create(taskmgr, 0, &disp->task[i]); + if (result != ISC_R_SUCCESS) { + while (--i >= 0) + isc_task_destroy(&disp->task[i]); + goto kill_socket; + } + isc_task_setname(disp->task[i], "udpdispatch", disp); + } disp->ctlevent = isc_event_allocate(mgr->mctx, disp, DNS_EVENT_DISPATCHCONTROL, @@ -1869,8 +2624,6 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, goto kill_task; } - isc_task_setname(disp->task, "udpdispatch", disp); - attributes &= ~DNS_DISPATCHATTR_TCP; attributes |= DNS_DISPATCHATTR_UDP; disp->attributes = attributes; @@ -1881,26 +2634,25 @@ dispatch_createudp(dns_dispatchmgr_t *mgr, isc_socketmgr_t *sockmgr, ISC_LIST_APPEND(mgr->list, disp, link); mgr_log(mgr, LVL(90), "created UDP dispatcher %p", disp); - dispatch_log(disp, LVL(90), "created task %p", disp->task); - dispatch_log(disp, LVL(90), "created socket %p", disp->socket); + dispatch_log(disp, LVL(90), "created task %p", disp->task[0]); /* XXX */ + if (disp->socket != NULL) + dispatch_log(disp, LVL(90), "created socket %p", disp->socket); *dispp = disp; - - goto cleanheld; + return (result); /* * Error returns. */ kill_task: - isc_task_detach(&disp->task); + for (i = 0; i < disp->ntasks; i++) + isc_task_detach(&disp->task[i]); kill_socket: - isc_socket_detach(&disp->socket); + if (disp->socket != NULL) + isc_socket_detach(&disp->socket); deallocate_dispatch: dispatch_free(&disp); - cleanheld: - for (i = 0; i < DNS_DISPATCH_HELD; i++) - if (held[i] != NULL) - isc_socket_detach(&held[i]); + return (result); } @@ -1926,6 +2678,7 @@ dns_dispatch_attach(dns_dispatch_t *disp, dns_dispatch_t **dispp) { void dns_dispatch_detach(dns_dispatch_t **dispp) { dns_dispatch_t *disp; + dispsocket_t *dispsock; isc_boolean_t killit; REQUIRE(dispp != NULL && VALID_DISPATCH(*dispp)); @@ -1940,8 +2693,14 @@ dns_dispatch_detach(dns_dispatch_t **dispp) { killit = ISC_FALSE; if (disp->refcount == 0) { if (disp->recv_pending > 0) - isc_socket_cancel(disp->socket, disp->task, + isc_socket_cancel(disp->socket, disp->task[0], ISC_SOCKCANCEL_RECV); + for (dispsock = ISC_LIST_HEAD(disp->activesockets); + dispsock != NULL; + dispsock = ISC_LIST_NEXT(dispsock, link)) { + isc_socket_cancel(dispsock->socket, dispsock->task, + ISC_SOCKCANCEL_RECV); + } disp->shutting_down = 1; } @@ -1950,26 +2709,33 @@ dns_dispatch_detach(dns_dispatch_t **dispp) { killit = destroy_disp_ok(disp); UNLOCK(&disp->lock); if (killit) - isc_task_send(disp->task, &disp->ctlevent); + isc_task_send(disp->task[0], &disp->ctlevent); } isc_result_t -dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest, - isc_task_t *task, isc_taskaction_t action, void *arg, - dns_messageid_t *idp, dns_dispentry_t **resp) +dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest, + isc_task_t *task, isc_taskaction_t action, void *arg, + dns_messageid_t *idp, dns_dispentry_t **resp, + isc_socketmgr_t *sockmgr) { dns_dispentry_t *res; unsigned int bucket; + unsigned int abucket; + in_port_t localport = 0; dns_messageid_t id; int i; isc_boolean_t ok; dns_qid_t *qid; + dispsocket_t *dispsocket = NULL; + isc_result_t result; REQUIRE(VALID_DISPATCH(disp)); REQUIRE(task != NULL); REQUIRE(dest != NULL); REQUIRE(resp != NULL && *resp == NULL); REQUIRE(idp != NULL); + if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) + REQUIRE(sockmgr != NULL); LOCK(&disp->lock); @@ -1983,22 +2749,77 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest, return (ISC_R_QUOTA); } + if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0 && + disp->nsockets > DNS_DISPATCH_SOCKSQUOTA) { + dispsocket_t *oldestsocket; + dns_dispentry_t *oldestresp; + dns_dispatchevent_t *rev; + + /* + * Kill oldest outstanding query if the number of sockets + * exceeds the quota to keep the room for new queries. + */ + oldestsocket = ISC_LIST_HEAD(disp->activesockets); + oldestresp = oldestsocket->resp; + if (oldestresp != NULL && !oldestresp->item_out) { + rev = allocate_event(oldestresp->disp); + if (rev != NULL) { + rev->buffer.base = NULL; + rev->result = ISC_R_CANCELED; + rev->id = oldestresp->id; + ISC_EVENT_INIT(rev, sizeof(*rev), 0, + NULL, DNS_EVENT_DISPATCH, + oldestresp->action, + oldestresp->arg, oldestresp, + NULL, NULL); + oldestresp->item_out = ISC_TRUE; + isc_task_send(oldestresp->task, + ISC_EVENT_PTR(&rev)); + } + } + + /* + * Move this entry to the tail so that it won't (easily) be + * examined before actually being canceled. + */ + ISC_LIST_UNLINK(disp->activesockets, oldestsocket, link); + ISC_LIST_APPEND(disp->activesockets, oldestsocket, link); + } + + qid = DNS_QID(disp); + LOCK(&qid->lock); + + if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) { + /* + * Get a separate UDP socket with a random port number. + */ + result = get_dispsocket(disp, dest, sockmgr, qid, &dispsocket, + &abucket, &localport); + if (result != ISC_R_SUCCESS) { + UNLOCK(&qid->lock); + UNLOCK(&disp->lock); + return (result); + } + } else { + abucket = 0; /* meaningless, but set explicitly */ + localport = disp->localport; + } + /* * Try somewhat hard to find an unique ID. */ - qid = DNS_QID(disp); - LOCK(&qid->lock); - id = dns_randomid(&qid->tid); - bucket = dns_hash(qid, dest, id); + id = (dns_messageid_t)dispatch_arc4random(DISP_ARC4CTX(disp)); + bucket = dns_hash(qid, dest, id, localport); ok = ISC_FALSE; for (i = 0; i < 64; i++) { - if (bucket_search(qid, dest, id, bucket) == NULL) { + if (bucket_search(qid, qid->qid_table, dest, id, localport, + bucket, ISC_FALSE) == NULL) { ok = ISC_TRUE; break; } id += qid->qid_increment; id &= 0x0000ffff; - bucket = dns_hash(qid, dest, id); + bucket = dns_hash(qid, dest, id, localport); } if (!ok) { @@ -2011,6 +2832,8 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest, if (res == NULL) { UNLOCK(&qid->lock); UNLOCK(&disp->lock); + if (dispsocket != NULL) + destroy_dispsocket(disp, &dispsocket); return (ISC_R_NOMEMORY); } @@ -2020,42 +2843,89 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest, isc_task_attach(task, &res->task); res->disp = disp; res->id = id; + res->port = localport; res->bucket = bucket; + res->abucket = abucket; res->host = *dest; res->action = action; res->arg = arg; + res->dispsocket = dispsocket; + if (dispsocket != NULL) + dispsocket->resp = res; res->item_out = ISC_FALSE; ISC_LIST_INIT(res->items); ISC_LINK_INIT(res, link); + ISC_LINK_INIT(res, alink); res->magic = RESPONSE_MAGIC; ISC_LIST_APPEND(qid->qid_table[bucket], res, link); + if (dispsocket != NULL) + ISC_LIST_APPEND(qid->addr_table[abucket], res, alink); UNLOCK(&qid->lock); request_log(disp, res, LVL(90), "attached to task %p", res->task); if (((disp->attributes & DNS_DISPATCHATTR_UDP) != 0) || - ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) - startrecv(disp); + ((disp->attributes & DNS_DISPATCHATTR_CONNECTED) != 0)) { + result = startrecv(disp, dispsocket); + if (result != ISC_R_SUCCESS) { + LOCK(&qid->lock); + ISC_LIST_UNLINK(qid->qid_table[bucket], res, link); + if (ISC_LINK_LINKED(res, alink)) { + ISC_LIST_UNLINK(qid->addr_table[abucket], res, + alink); + } + UNLOCK(&qid->lock); + + if (dispsocket != NULL) + destroy_dispsocket(disp, &dispsocket); + + disp->refcount--; + disp->requests--; + + UNLOCK(&disp->lock); + isc_task_detach(&res->task); + isc_mempool_put(disp->mgr->rpool, res); + return (result); + } + } + + if (dispsocket != NULL) + ISC_LIST_APPEND(disp->activesockets, dispsocket, link); UNLOCK(&disp->lock); *idp = id; *resp = res; + if ((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) != 0) + INSIST(res->dispsocket != NULL); + return (ISC_R_SUCCESS); } +isc_result_t +dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest, + isc_task_t *task, isc_taskaction_t action, void *arg, + dns_messageid_t *idp, dns_dispentry_t **resp) +{ + REQUIRE(VALID_DISPATCH(disp)); + REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0); + + return (dns_dispatch_addresponse2(disp, dest, task, action, arg, + idp, resp, NULL)); +} + void dns_dispatch_starttcp(dns_dispatch_t *disp) { REQUIRE(VALID_DISPATCH(disp)); - dispatch_log(disp, LVL(90), "starttcp %p", disp->task); + dispatch_log(disp, LVL(90), "starttcp %p", disp->task[0]); LOCK(&disp->lock); disp->attributes |= DNS_DISPATCHATTR_CONNECTED; - startrecv(disp); + (void)startrecv(disp, NULL); UNLOCK(&disp->lock); } @@ -2066,6 +2936,7 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp, dns_dispatchmgr_t *mgr; dns_dispatch_t *disp; dns_dispentry_t *res; + dispsocket_t *dispsock; dns_dispatchevent_t *ev; unsigned int bucket; isc_boolean_t killit; @@ -2103,8 +2974,14 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp, killit = ISC_FALSE; if (disp->refcount == 0) { if (disp->recv_pending > 0) - isc_socket_cancel(disp->socket, disp->task, + isc_socket_cancel(disp->socket, disp->task[0], ISC_SOCKCANCEL_RECV); + for (dispsock = ISC_LIST_HEAD(disp->activesockets); + dispsock != NULL; + dispsock = ISC_LIST_NEXT(dispsock, link)) { + isc_socket_cancel(dispsock->socket, dispsock->task, + ISC_SOCKCANCEL_RECV); + } disp->shutting_down = 1; } @@ -2112,6 +2989,8 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp, LOCK(&qid->lock); ISC_LIST_UNLINK(qid->qid_table[bucket], res, link); + if (ISC_LINK_LINKED(res, alink)) + ISC_LIST_UNLINK(qid->addr_table[res->abucket], res, alink); UNLOCK(&qid->lock); if (ev == NULL && res->item_out) { @@ -2140,6 +3019,12 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp, request_log(disp, res, LVL(90), "detaching from task %p", res->task); isc_task_detach(&res->task); + if (res->dispsocket != NULL) { + isc_socket_cancel(res->dispsocket->socket, + res->dispsocket->task, ISC_SOCKCANCEL_RECV); + res->dispsocket->resp = NULL; + } + /* * Free any buffered requests as well */ @@ -2156,12 +3041,12 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp, if (disp->shutting_down == 1) do_cancel(disp); else - startrecv(disp); + (void)startrecv(disp, NULL); killit = destroy_disp_ok(disp); UNLOCK(&disp->lock); if (killit) - isc_task_send(disp->task, &disp->ctlevent); + isc_task_send(disp->task[0], &disp->ctlevent); } static void @@ -2176,13 +3061,15 @@ do_cancel(dns_dispatch_t *disp) { qid = DNS_QID(disp); /* - * Search for the first response handler without packets outstanding. + * Search for the first response handler without packets outstanding + * unless a specific hander is given. */ LOCK(&qid->lock); for (resp = linear_first(qid); - resp != NULL && resp->item_out != ISC_FALSE; + resp != NULL && !resp->item_out; /* Empty. */) resp = linear_next(qid, resp); + /* * No one to send the cancel event to, so nothing to do. */ @@ -2215,6 +3102,16 @@ dns_dispatch_getsocket(dns_dispatch_t *disp) { return (disp->socket); } +isc_socket_t * +dns_dispatch_getentrysocket(dns_dispentry_t *resp) { + REQUIRE(VALID_RESPONSE(resp)); + + if (resp->dispsocket != NULL) + return (resp->dispsocket->socket); + else + return (NULL); +} + isc_result_t dns_dispatch_getlocaladdress(dns_dispatch_t *disp, isc_sockaddr_t *addrp) { @@ -2248,11 +3145,27 @@ dns_dispatch_cancel(dns_dispatch_t *disp) { return; } +unsigned int +dns_dispatch_getattributes(dns_dispatch_t *disp) { + REQUIRE(VALID_DISPATCH(disp)); + + /* + * We don't bother locking disp here; it's the caller's responsibility + * to use only non volatile flags. + */ + return (disp->attributes); +} + void dns_dispatch_changeattributes(dns_dispatch_t *disp, unsigned int attributes, unsigned int mask) { REQUIRE(VALID_DISPATCH(disp)); + /* Exclusive attribute can only be set on creation */ + REQUIRE((attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0); + /* Also, a dispatch with randomport specified cannot start listening */ + REQUIRE((disp->attributes & DNS_DISPATCHATTR_EXCLUSIVE) == 0 || + (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0); /* XXXMLG * Should check for valid attributes here! @@ -2264,13 +3177,13 @@ dns_dispatch_changeattributes(dns_dispatch_t *disp, if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) != 0 && (attributes & DNS_DISPATCHATTR_NOLISTEN) == 0) { disp->attributes &= ~DNS_DISPATCHATTR_NOLISTEN; - startrecv(disp); + (void)startrecv(disp, NULL); } else if ((disp->attributes & DNS_DISPATCHATTR_NOLISTEN) == 0 && (attributes & DNS_DISPATCHATTR_NOLISTEN) != 0) { disp->attributes |= DNS_DISPATCHATTR_NOLISTEN; if (disp->recv_pending != 0) - isc_socket_cancel(disp->socket, disp->task, + isc_socket_cancel(disp->socket, disp->task[0], ISC_SOCKCANCEL_RECV); } } @@ -2294,7 +3207,7 @@ dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) { INSIST(sevent->n <= disp->mgr->buffersize); newsevent = (isc_socketevent_t *) isc_event_allocate(disp->mgr->mctx, NULL, - DNS_EVENT_IMPORTRECVDONE, udp_recv, + DNS_EVENT_IMPORTRECVDONE, udp_shrecv, disp, sizeof(isc_socketevent_t)); if (newsevent == NULL) return; @@ -2314,7 +3227,7 @@ dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event) { newsevent->pktinfo = sevent->pktinfo; newsevent->attributes = sevent->attributes; - isc_task_send(disp->task, ISC_EVENT_PTR(&newsevent)); + isc_task_send(disp->task[0], ISC_EVENT_PTR(&newsevent)); } #if 0 @@ -2331,418 +3244,3 @@ dns_dispatchmgr_dump(dns_dispatchmgr_t *mgr) { } } #endif - -/* - * Allow the user to pick one of two ID randomization algorithms. - * - * The first algorithm is an adaptation of the sequence shuffling - * algorithm discovered by Carter Bays and S. D. Durham [ACM Trans. Math. - * Software 2 (1976), 59-64], as documented as Algorithm B in Chapter - * 3.2.2 in Volume 2 of Knuth's "The Art of Computer Programming". We use - * a randomly selected linear congruential random number generator with a - * modulus of 2^16, whose increment is a randomly picked odd number, and - * whose multiplier is picked from a set which meets the following - * criteria: - * Is of the form 8*n+5, which ensures "high potency" according to - * principle iii in the summary chapter 3.6. This form also has a - * gcd(a-1,m) of 4 which is good according to principle iv. - * - * Is between 0.01 and 0.99 times the modulus as specified by - * principle iv. - * - * Passes the spectral test "with flying colors" (ut >= 1) in - * dimensions 2 through 6 as calculated by Algorithm S in Chapter - * 3.3.4 and the ratings calculated by formula 35 in section E. - * - * Of the multipliers that pass this test, pick the set that is - * best according to the theoretical bounds of the serial - * correlation test. This was calculated using a simplified - * version of Knuth's Theorem K in Chapter 3.3.3. - * - * These criteria may not be important for this use, but we might as well - * pick from the best generators since there are so many possible ones and - * we don't have that many random bits to do the picking. - * - * We use a modulus of 2^16 instead of something bigger so that we will - * tend to cycle through all the possible IDs before repeating any, - * however the shuffling will perturb this somewhat. Theoretically there - * is no minimimum interval between two uses of the same ID, but in - * practice it seems to be >64000. - * - * Our adaptatation of Algorithm B mixes the hash state which has - * captured various random events into the shuffler to perturb the - * sequence. - * - * One disadvantage of this algorithm is that if the generator parameters - * were to be guessed, it would be possible to mount a limited brute force - * attack on the ID space since the IDs are only shuffled within a limited - * range. - * - * The second algorithm uses the same random number generator to populate - * a pool of 65536 IDs. The hash state is used to pick an ID from a window - * of 4096 IDs in this pool, then the chosen ID is swapped with the ID - * at the beginning of the window and the window position is advanced. - * This means that the interval between uses of the ID will be no less - * than 65536-4096. The ID sequence in the pool will become more random - * over time. - * - * For both algorithms, two more linear congruential random number generators - * are selected. The ID from the first part of algorithm is used to seed - * the first of these generators, and its output is used to seed the second. - * The strategy is use these generators as 1 to 1 hashes to obfuscate the - * properties of the generator used in the first part of either algorithm. - * - * The first algorithm may be suitable for use in a client resolver since - * its memory requirements are fairly low and it's pretty random out of - * the box. It is somewhat succeptible to a limited brute force attack, - * so the second algorithm is probably preferable for a longer running - * program that issues a large number of queries and has time to randomize - * the pool. - */ - -#define TID_SHUFFLE_TABLE_SIZE 100 /* Suggested by Knuth */ -/* - * Pick one of the next 4096 IDs in the pool. - * There is a tradeoff here between randomness and how often and ID is reused. - */ -#define TID_LOOKAHEAD 4096 /* Must be a power of 2 */ -#define TID_SHUFFLE_ONLY 1 /* algorithm 1 */ -#define TID_USE_POOL 2 /* algorithm 2 */ -#define TID_HASHSHIFT 3 -#define TID_HASHROTATE(v) \ - (((v) << TID_HASHSHIFT) | ((v) >> ((sizeof(v) * 8) - TID_HASHSHIFT))) - -static isc_uint32_t tid_hash_state; - -/* - * Keep a running hash of various bits of data that we'll use to - * stir the ID pool or perturb the ID generator - */ -static void -tid_hash(void *data, size_t len) { - unsigned char *p = data; - /* - * Hash function similar to the one we use for hashing names. - * We don't fold case or toss the upper bit here, though. - * This hash doesn't do much interesting when fed binary zeros, - * so there may be a better hash function. - * This function doesn't need to be very strong since we're - * only using it to stir the pool, but it should be reasonably - * fast. - */ - /* - * We don't care about locking access to tid_hash_state. - * In fact races make the result even more non deteministic. - */ - while (len-- > 0U) { - tid_hash_state = TID_HASHROTATE(tid_hash_state); - tid_hash_state += *p++; - } -} - -/* - * Table of good linear congruential multipliers for modulus 2^16 - * in order of increasing serial correlation bounds (so trim from - * the end). - */ -static const isc_uint16_t tid_multiplier_table[] = { - 17565, 25013, 11733, 19877, 23989, 23997, 24997, 25421, - 26781, 27413, 35901, 35917, 35973, 36229, 38317, 38437, - 39941, 40493, 41853, 46317, 50581, 51429, 53453, 53805, - 11317, 11789, 12045, 12413, 14277, 14821, 14917, 18989, - 19821, 23005, 23533, 23573, 23693, 27549, 27709, 28461, - 29365, 35605, 37693, 37757, 38309, 41285, 45261, 47061, - 47269, 48133, 48597, 50277, 50717, 50757, 50805, 51341, - 51413, 51581, 51597, 53445, 11493, 14229, 20365, 20653, - 23485, 25541, 27429, 29421, 30173, 35445, 35653, 36789, - 36797, 37109, 37157, 37669, 38661, 39773, 40397, 41837, - 41877, 45293, 47277, 47845, 49853, 51085, 51349, 54085, - 56933, 8877, 8973, 9885, 11365, 11813, 13581, 13589, - 13613, 14109, 14317, 15765, 15789, 16925, 17069, 17205, - 17621, 17941, 19077, 19381, 20245, 22845, 23733, 24869, - 25453, 27213, 28381, 28965, 29245, 29997, 30733, 30901, - 34877, 35485, 35613, 36133, 36661, 36917, 38597, 40285, - 40693, 41413, 41541, 41637, 42053, 42349, 45245, 45469, - 46493, 48205, 48613, 50861, 51861, 52877, 53933, 54397, - 55669, 56453, 56965, 58021, 7757, 7781, 8333, 9661, - 12229, 14373, 14453, 17549, 18141, 19085, 20773, 23701, - 24205, 24333, 25261, 25317, 27181, 30117, 30477, 34757, - 34885, 35565, 35885, 36541, 37957, 39733, 39813, 41157, - 41893, 42317, 46621, 48117, 48181, 49525, 55261, 55389, - 56845, 7045, 7749, 7965, 8469, 9133, 9549, 9789, - 10173, 11181, 11285, 12253, 13453, 13533, 13757, 14477, - 15053, 16901, 17213, 17269, 17525, 17629, 18605, 19013, - 19829, 19933, 20069, 20093, 23261, 23333, 24949, 25309, - 27613, 28453, 28709, 29301, 29541, 34165, 34413, 37301, - 37773, 38045, 38405, 41077, 41781, 41925, 42717, 44437, - 44525, 44613, 45933, 45941, 47077, 50077, 50893, 52117, - 5293, 55069, 55989, 58125, 59205, 6869, 14685, 15453, - 16821, 17045, 17613, 18437, 21029, 22773, 22909, 25445, - 25757, 26541, 30709, 30909, 31093, 31149, 37069, 37725, - 37925, 38949, 39637, 39701, 40765, 40861, 42965, 44813, - 45077, 45733, 47045, 50093, 52861, 52957, 54181, 56325, - 56365, 56381, 56877, 57013, 5741, 58101, 58669, 8613, - 10045, 10261, 10653, 10733, 11461, 12261, 14069, 15877, - 17757, 21165, 23885, 24701, 26429, 26645, 27925, 28765, - 29197, 30189, 31293, 39781, 39909, 40365, 41229, 41453, - 41653, 42165, 42365, 47421, 48029, 48085, 52773, 5573, - 57037, 57637, 58341, 58357, 58901, 6357, 7789, 9093, - 10125, 10709, 10765, 11957, 12469, 13437, 13509, 14773, - 15437, 15773, 17813, 18829, 19565, 20237, 23461, 23685, - 23725, 23941, 24877, 25461, 26405, 29509, 30285, 35181, - 37229, 37893, 38565, 40293, 44189, 44581, 45701, 47381, - 47589, 48557, 4941, 51069, 5165, 52797, 53149, 5341, - 56301, 56765, 58581, 59493, 59677, 6085, 6349, 8293, - 8501, 8517, 11597, 11709, 12589, 12693, 13517, 14909, - 17397, 18085, 21101, 21269, 22717, 25237, 25661, 29189, - 30101, 31397, 33933, 34213, 34661, 35533, 36493, 37309, - 40037, 4189, 42909, 44309, 44357, 44389, 4541, 45461, - 46445, 48237, 54149, 55301, 55853, 56621, 56717, 56901, - 5813, 58437, 12493, 15365, 15989, 17829, 18229, 19341, - 21013, 21357, 22925, 24885, 26053, 27581, 28221, 28485, - 30605, 30613, 30789, 35437, 36285, 37189, 3941, 41797, - 4269, 42901, 43293, 44645, 45221, 46893, 4893, 50301, - 50325, 5189, 52109, 53517, 54053, 54485, 5525, 55949, - 56973, 59069, 59421, 60733, 61253, 6421, 6701, 6709, - 7101, 8669, 15797, 19221, 19837, 20133, 20957, 21293, - 21461, 22461, 29085, 29861, 30869, 34973, 36469, 37565, - 38125, 38829, 39469, 40061, 40117, 44093, 47429, 48341, - 50597, 51757, 5541, 57629, 58405, 59621, 59693, 59701, - 61837, 7061, 10421, 11949, 15405, 20861, 25397, 25509, - 25893, 26037, 28629, 28869, 29605, 30213, 34205, 35637, - 36365, 37285, 3773, 39117, 4021, 41061, 42653, 44509, - 4461, 44829, 4725, 5125, 52269, 56469, 59085, 5917, - 60973, 8349, 17725, 18637, 19773, 20293, 21453, 22533, - 24285, 26333, 26997, 31501, 34541, 34805, 37509, 38477, - 41333, 44125, 46285, 46997, 47637, 48173, 4925, 50253, - 50381, 50917, 51205, 51325, 52165, 52229, 5253, 5269, - 53509, 56253, 56341, 5821, 58373, 60301, 61653, 61973, - 62373, 8397, 11981, 14341, 14509, 15077, 22261, 22429, - 24261, 28165, 28685, 30661, 34021, 34445, 39149, 3917, - 43013, 43317, 44053, 44101, 4533, 49541, 49981, 5277, - 54477, 56357, 57261, 57765, 58573, 59061, 60197, 61197, - 62189, 7725, 8477, 9565, 10229, 11437, 14613, 14709, - 16813, 20029, 20677, 31445, 3165, 31957, 3229, 33541, - 36645, 3805, 38973, 3965, 4029, 44293, 44557, 46245, - 48917, 4909, 51749, 53709, 55733, 56445, 5925, 6093, - 61053, 62637, 8661, 9109, 10821, 11389, 13813, 14325, - 15501, 16149, 18845, 22669, 26437, 29869, 31837, 33709, - 33973, 34173, 3677, 3877, 3981, 39885, 42117, 4421, - 44221, 44245, 44693, 46157, 47309, 5005, 51461, 52037, - 55333, 55693, 56277, 58949, 6205, 62141, 62469, 6293, - 10101, 12509, 14029, 17997, 20469, 21149, 25221, 27109, - 2773, 2877, 29405, 31493, 31645, 4077, 42005, 42077, - 42469, 42501, 44013, 48653, 49349, 4997, 50101, 55405, - 56957, 58037, 59429, 60749, 61797, 62381, 62837, 6605, - 10541, 23981, 24533, 2701, 27333, 27341, 31197, 33805, - 3621, 37381, 3749, 3829, 38533, 42613, 44381, 45901, - 48517, 51269, 57725, 59461, 60045, 62029, 13805, 14013, - 15461, 16069, 16157, 18573, 2309, 23501, 28645, 3077, - 31541, 36357, 36877, 3789, 39429, 39805, 47685, 47949, - 49413, 5485, 56757, 57549, 57805, 58317, 59549, 62213, - 62613, 62853, 62933, 8909, 12941, 16677, 20333, 21541, - 24429, 26077, 26421, 2885, 31269, 33381, 3661, 40925, - 42925, 45173, 4525, 4709, 53133, 55941, 57413, 57797, - 62125, 62237, 62733, 6773, 12317, 13197, 16533, 16933, - 18245, 2213, 2477, 29757, 33293, 35517, 40133, 40749, - 4661, 49941, 62757, 7853, 8149, 8573, 11029, 13421, - 21549, 22709, 22725, 24629, 2469, 26125, 2669, 34253, - 36709, 41013, 45597, 46637, 52285, 52333, 54685, 59013, - 60997, 61189, 61981, 62605, 62821, 7077, 7525, 8781, - 10861, 15277, 2205, 22077, 28517, 28949, 32109, 33493, - 4661, 49941, 62757, 7853, 8149, 8573, 11029, 13421, - 21549, 22709, 22725, 24629, 2469, 26125, 2669, 34253, - 36709, 41013, 45597, 46637, 52285, 52333, 54685, 59013, - 60997, 61189, 61981, 62605, 62821, 7077, 7525, 8781, - 10861, 15277, 2205, 22077, 28517, 28949, 32109, 33493, - 3685, 39197, 39869, 42621, 44997, 48565, 5221, 57381, - 61749, 62317, 63245, 63381, 23149, 2549, 28661, 31653, - 33885, 36341, 37053, 39517, 42805, 45853, 48997, 59349, - 60053, 62509, 63069, 6525, 1893, 20181, 2365, 24893, - 27397, 31357, 32277, 33357, 34437, 36677, 37661, 43469, - 43917, 50997, 53869, 5653, 13221, 16741, 17893, 2157, - 28653, 31789, 35301, 35821, 61613, 62245, 12405, 14517, - 17453, 18421, 3149, 3205, 40341, 4109, 43941, 46869, - 48837, 50621, 57405, 60509, 62877, 8157, 12933, 12957, - 16501, 19533, 3461, 36829, 52357, 58189, 58293, 63053, - 17109, 1933, 32157, 37701, 59005, 61621, 13029, 15085, - 16493, 32317, 35093, 5061, 51557, 62221, 20765, 24613, - 2629, 30861, 33197, 33749, 35365, 37933, 40317, 48045, - 56229, 61157, 63797, 7917, 17965, 1917, 1973, 20301, - 2253, 33157, 58629, 59861, 61085, 63909, 8141, 9221, - 14757, 1581, 21637, 26557, 33869, 34285, 35733, 40933, - 42517, 43501, 53653, 61885, 63805, 7141, 21653, 54973, - 31189, 60061, 60341, 63357, 16045, 2053, 26069, 33997, - 43901, 54565, 63837, 8949, 17909, 18693, 32349, 33125, - 37293, 48821, 49053, 51309, 64037, 7117, 1445, 20405, - 23085, 26269, 26293, 27349, 32381, 33141, 34525, 36461, - 37581, 43525, 4357, 43877, 5069, 55197, 63965, 9845, - 12093, 2197, 2229, 32165, 33469, 40981, 42397, 8749, - 10853, 1453, 18069, 21693, 30573, 36261, 37421, 42533 -}; - -#define TID_MULT_TABLE_SIZE \ - ((sizeof tid_multiplier_table) / \ - (sizeof tid_multiplier_table[0])) -#define TID_RANGE_MASK (TID_LOOKAHEAD - 1) -#define TID_POOL_MASK 0xFFFF /* used to wrap the pool index */ -#define TID_SHUFFLE_ONLY 1 -#define TID_USE_POOL 2 - -static isc_uint16_t -tid_next(dns_tid_t *tid) { - isc_uint16_t id, compressed_hash; - isc_uint16_t j; - - compressed_hash = ((tid_hash_state >> 16) ^ - (tid_hash_state)) & 0xFFFF; - - if (tid->tid_usepool) { - isc_uint16_t pick; - - pick = compressed_hash & TID_RANGE_MASK; - pick = (tid->tid_state + pick) & TID_POOL_MASK; - id = tid->tid_pool[pick]; - if (pick != 0) { - /* Swap two IDs to stir the pool */ - tid->tid_pool[pick] = - tid->tid_pool[tid->tid_state]; - tid->tid_pool[tid->tid_state] = id; - } - - /* increment the base pointer into the pool */ - if (tid->tid_state == 65535) - tid->tid_state = 0; - else - tid->tid_state++; - } else { - /* - * This is the original Algorithm B - * j = ((u_long) - * QUERID_SHUFFLE_TABLE_SIZE * tid_state2) >> 16; - * - * We'll perturb it with some random stuff ... - */ - j = ((isc_uint32_t) TID_SHUFFLE_TABLE_SIZE * - (tid->tid_state2 ^ compressed_hash)) >> 16; - tid->tid_state2 = id = tid->tid_vtable[j]; - tid->tid_state = (((isc_uint32_t) tid->tid_a1 * - tid->tid_state) + - tid->tid_c1) & 0xFFFF; - tid->tid_vtable[j] = tid->tid_state; - } - - /* Now lets obfuscate ... */ - id = (((isc_uint32_t) tid->tid_a2 * id) + - tid->tid_c2) & 0xFFFF; - id = (((isc_uint32_t) tid->tid_a3 * id) + - tid->tid_c3) & 0xFFFF; - - return (id); -} - -static isc_result_t -tid_init(isc_mem_t *mctx, dns_tid_t *tid, isc_boolean_t usepool) { - isc_time_t now; - pid_t mypid; - isc_uint16_t a1ndx, a2ndx, a3ndx, c1ndx, c2ndx, c3ndx; - int i; - - isc_time_now(&now); - mypid = getpid(); - - /* Initialize the state */ - memset(tid, 0, sizeof(*tid)); - tid_hash(&now, sizeof now); - tid_hash(&mypid, sizeof mypid); - - /* - * Select our random number generators and initial seed. - * We could really use more random bits at this point, - * but we'll try to make a silk purse out of a sows ear ... - */ - /* generator 1 */ - a1ndx = ((isc_uint32_t) TID_MULT_TABLE_SIZE * - (tid_hash_state & 0xFFFF)) >> 16; - tid->tid_a1 = tid_multiplier_table[a1ndx]; - c1ndx = (tid_hash_state >> 9) & 0x7FFF; - tid->tid_c1 = 2 * c1ndx + 1; - - /* generator 2, distinct from 1 */ - a2ndx = ((isc_uint32_t) (TID_MULT_TABLE_SIZE - 1) * - ((tid_hash_state >> 10) & 0xFFFF)) >> 16; - if (a2ndx >= a1ndx) - a2ndx++; - tid->tid_a2 = tid_multiplier_table[a2ndx]; - c2ndx = tid_hash_state % 32767; - if (c2ndx >= c1ndx) - c2ndx++; - tid->tid_c2 = 2*c2ndx + 1; - - /* generator 3, distinct from 1 and 2 */ - a3ndx = ((isc_uint32_t) (TID_MULT_TABLE_SIZE - 2) * - ((tid_hash_state >> 20) & 0xFFFF)) >> 16; - if (a3ndx >= a1ndx || a3ndx >= a2ndx) - a3ndx++; - if (a3ndx >= a1ndx && a3ndx >= a2ndx) - a3ndx++; - tid->tid_a3 = tid_multiplier_table[a3ndx]; - c3ndx = tid_hash_state % 32766; - if (c3ndx >= c1ndx || c3ndx >= c2ndx) - c3ndx++; - if (c3ndx >= c1ndx && c3ndx >= c2ndx) - c3ndx++; - tid->tid_c3 = 2*c3ndx + 1; - - tid->tid_state = - ((tid_hash_state >> 16) ^ (tid_hash_state)) & 0xFFFF; - - tid->tid_usepool = usepool; - if (tid->tid_usepool) { - tid->tid_pool = isc_mem_get(mctx, - 0x10000 * sizeof(isc_uint16_t)); - if (tid->tid_pool == NULL) - return (ISC_R_NOMEMORY); - for (i = 0; ; i++) { - tid->tid_pool[i] = tid->tid_state; - tid->tid_state = - (((u_long) tid->tid_a1 * - tid->tid_state) + - tid->tid_c1) & 0xFFFF; - if (i == 0xFFFF) - break; - } - } else { - tid->tid_vtable = isc_mem_get(mctx, TID_SHUFFLE_TABLE_SIZE * - (sizeof(isc_uint16_t)) ); - if (tid->tid_vtable == NULL) - return (ISC_R_NOMEMORY); - - for (i = 0; i < TID_SHUFFLE_TABLE_SIZE; i++) { - tid->tid_vtable[i] = tid->tid_state; - tid->tid_state = - (((isc_uint32_t) tid->tid_a1 * - tid->tid_state) + - tid->tid_c1) & 0xFFFF; - } - tid->tid_state2 = tid->tid_state; - } - return (ISC_R_SUCCESS); -} - -static void -tid_destroy(isc_mem_t *mctx, dns_tid_t *tid) { - if (tid->tid_usepool) - isc_mem_put(mctx, tid->tid_pool, - 0x10000 * sizeof(isc_uint16_t)); - else - isc_mem_put(mctx, tid->tid_vtable, - TID_SHUFFLE_TABLE_SIZE * - (sizeof(isc_uint16_t)) ); - memset(tid, 0, sizeof(*tid)); -} - -void -dns_dispatch_hash(void *data, size_t len) { - tid_hash(data, len); -} diff --git a/lib/dns/include/dns/dispatch.h b/lib/dns/include/dns/dispatch.h index 20b763cf3a..480b5a34bb 100644 --- a/lib/dns/include/dns/dispatch.h +++ b/lib/dns/include/dns/dispatch.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: dispatch.h,v 1.58 2008/04/03 06:09:04 tbox Exp $ */ +/* $Id: dispatch.h,v 1.59 2008/06/23 19:41:19 jinmei Exp $ */ #ifndef DNS_DISPATCH_H #define DNS_DISPATCH_H 1 @@ -105,7 +105,7 @@ struct dns_dispatchevent { * The dispatcher is a TCP or UDP socket. * * _IPV4, _IPV6 - * The dispatcher uses an ipv4 or ipv6 socket. + * The dispatcher uses an IPv4 or IPv6 socket. * * _NOLISTEN * The dispatcher should not listen on the socket. @@ -115,7 +115,12 @@ struct dns_dispatchevent { * accept replies from them. * * _RANDOMPORT - * TBD + * Previously used to indicate that the port of a dispatch UDP must be + * chosen randomly. This behavior now always applies and the attribute + * is obsoleted. + * + * _EXCLUSIVE + * A separate socket will be used on-demand for each transaction. */ #define DNS_DISPATCHATTR_PRIVATE 0x00000001U #define DNS_DISPATCHATTR_TCP 0x00000002U @@ -125,7 +130,8 @@ struct dns_dispatchevent { #define DNS_DISPATCHATTR_NOLISTEN 0x00000020U #define DNS_DISPATCHATTR_MAKEQUERY 0x00000040U #define DNS_DISPATCHATTR_CONNECTED 0x00000080U -#define DNS_DISPATCHATTR_RANDOMPORT 0x00000100U +/*#define DNS_DISPATCHATTR_RANDOMPORT 0x00000100U*/ +#define DNS_DISPATCHATTR_EXCLUSIVE 0x00000200U /*@}*/ isc_result_t @@ -189,23 +195,33 @@ void dns_dispatchmgr_setblackportlist(dns_dispatchmgr_t *mgr, dns_portlist_t *portlist); /*%< - * Sets a list of UDP ports that won't be used when creating a udp - * dispatch with a wildcard port. + * This function is deprecated. Use dns_dispatchmgr_setavailports() instead. * * Requires: *\li mgr is a valid dispatchmgr - *\li portlist to be NULL or a valid port list. */ dns_portlist_t * dns_dispatchmgr_getblackportlist(dns_dispatchmgr_t *mgr); /*%< - * Return the current port list. + * This function is deprecated and always returns NULL. * * Requires: *\li mgr is a valid dispatchmgr */ +isc_result_t +dns_dispatchmgr_setavailports(dns_dispatchmgr_t *mgr, isc_portset_t *v4portset, + isc_portset_t *v6portset); +/*%< + * Sets a list of UDP ports that can be used for outgoing UDP messages. + * + * Requires: + *\li mgr is a valid dispatchmgr + *\li v4portset is NULL or a valid port set + *\li v6portset is NULL or a valid port set + */ + void dns_dispatchmgr_setstats(dns_dispatchmgr_t *mgr, dns_stats_t *stats); /*%< @@ -331,6 +347,12 @@ dns_dispatch_starttcp(dns_dispatch_t *disp); *\li 'disp' is valid. */ +isc_result_t +dns_dispatch_addresponse2(dns_dispatch_t *disp, isc_sockaddr_t *dest, + isc_task_t *task, isc_taskaction_t action, void *arg, + isc_uint16_t *idp, dns_dispentry_t **resp, + isc_socketmgr_t *sockmgr); + isc_result_t dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest, isc_task_t *task, isc_taskaction_t action, void *arg, @@ -354,6 +376,10 @@ dns_dispatch_addresponse(dns_dispatch_t *disp, isc_sockaddr_t *dest, * *\li "resp" be non-NULL and *resp be NULL * + *\li "sockmgr" be NULL or a valid socket manager. If 'disp' has + * the DNS_DISPATCHATTR_EXCLUSIVE attribute, this must not be NULL, + * which also means dns_dispatch_addresponse() cannot be used. + * * Ensures: * *\li <id, dest> is a unique tuple. That means incoming messages @@ -384,6 +410,8 @@ dns_dispatch_removeresponse(dns_dispentry_t **resp, * argument to dns_dispatch_addresponse() when allocating '*resp'. */ +isc_socket_t * +dns_dispatch_getentrysocket(dns_dispentry_t *resp); isc_socket_t * dns_dispatch_getsocket(dns_dispatch_t *disp); @@ -421,6 +449,16 @@ dns_dispatch_cancel(dns_dispatch_t *disp); *\li disp is valid. */ +unsigned int +dns_dispatch_getattributes(dns_dispatch_t *disp); +/*%< + * Return the attributes (DNS_DISPATCHATTR_xxx) of this dispatch. Only the + * non-changeable attributes are expected to be referenced by the caller. + * + * Requires: + *\li disp is valid. + */ + void dns_dispatch_changeattributes(dns_dispatch_t *disp, unsigned int attributes, unsigned int mask); @@ -458,13 +496,6 @@ dns_dispatch_importrecv(dns_dispatch_t *disp, isc_event_t *event); * event != NULL */ -void -dns_dispatch_hash(void *data, size_t len); -/*%< - * Feed 'data' to the dispatch query id generator where 'len' is the size - * of 'data'. - */ - ISC_LANG_ENDDECLS #endif /* DNS_DISPATCH_H */ diff --git a/lib/dns/include/dns/resolver.h b/lib/dns/include/dns/resolver.h index 4b36be958d..6d12a28c0a 100644 --- a/lib/dns/include/dns/resolver.h +++ b/lib/dns/include/dns/resolver.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: resolver.h,v 1.58 2008/04/03 06:09:05 tbox Exp $ */ +/* $Id: resolver.h,v 1.59 2008/06/23 19:41:19 jinmei Exp $ */ #ifndef DNS_RESOLVER_H #define DNS_RESOLVER_H 1 @@ -107,8 +107,6 @@ typedef struct dns_fetchevent { #define DNS_RESOLVER_CHECKNAMES 0x01 #define DNS_RESOLVER_CHECKNAMESFAIL 0x02 -#define DNS_RESOLVER_USEDISPATCHPOOL4 0x04 -#define DNS_RESOLVER_USEDISPATCHPOOL6 0x08 isc_result_t dns_resolver_create(dns_view_t *view, diff --git a/lib/dns/request.c b/lib/dns/request.c index dd3e670020..009426f389 100644 --- a/lib/dns/request.c +++ b/lib/dns/request.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: request.c,v 1.79 2007/06/19 23:47:16 tbox Exp $ */ +/* $Id: request.c,v 1.80 2008/06/23 19:41:19 jinmei Exp $ */ /*! \file */ @@ -121,6 +121,7 @@ static isc_result_t req_render(dns_message_t *message, isc_buffer_t **buffer, static void req_senddone(isc_task_t *task, isc_event_t *event); static void req_response(isc_task_t *task, isc_event_t *event); static void req_timeout(isc_task_t *task, isc_event_t *event); +static isc_socket_t * req_getsocket(dns_request_t *request); static void req_connected(isc_task_t *task, isc_event_t *event); static void req_sendevent(dns_request_t *request, isc_result_t result); static void req_cancel(dns_request_t *request); @@ -146,6 +147,7 @@ dns_requestmgr_create(isc_mem_t *mctx, isc_socket_t *socket; isc_result_t result; int i; + unsigned int dispattr; req_log(ISC_LOG_DEBUG(3), "dns_requestmgr_create"); @@ -154,13 +156,14 @@ dns_requestmgr_create(isc_mem_t *mctx, REQUIRE(socketmgr != NULL); REQUIRE(taskmgr != NULL); REQUIRE(dispatchmgr != NULL); + UNUSED(socket); if (dispatchv4 != NULL) { - socket = dns_dispatch_getsocket(dispatchv4); - REQUIRE(isc_socket_gettype(socket) == isc_sockettype_udp); + dispattr = dns_dispatch_getattributes(dispatchv4); + REQUIRE((dispattr & DNS_DISPATCHATTR_UDP) != 0); } if (dispatchv6 != NULL) { - socket = dns_dispatch_getsocket(dispatchv6); - REQUIRE(isc_socket_gettype(socket) == isc_sockettype_udp); + dispattr = dns_dispatch_getattributes(dispatchv6); + REQUIRE((dispattr & DNS_DISPATCHATTR_UDP) != 0); } requestmgr = isc_mem_get(mctx, sizeof(*requestmgr)); @@ -425,12 +428,19 @@ req_send(dns_request_t *request, isc_task_t *task, isc_sockaddr_t *address) { isc_region_t r; isc_socket_t *socket; isc_result_t result; + unsigned int dispattr; req_log(ISC_LOG_DEBUG(3), "req_send: request %p", request); REQUIRE(VALID_REQUEST(request)); - socket = dns_dispatch_getsocket(request->dispatch); + dispattr = dns_dispatch_getattributes(request->dispatch); + socket = req_getsocket(request); isc_buffer_usedregion(request->query, &r); + /* + * We could connect the socket when we are using an exclusive dispatch + * as we do in resolver.c, but we prefer implementation simplicity + * at this moment. + */ result = isc_socket_sendto(socket, &r, task, req_senddone, request, address, NULL); if (result == ISC_R_SUCCESS) @@ -742,14 +752,16 @@ dns_request_createraw3(dns_requestmgr_t *requestmgr, isc_buffer_t *msgbuf, if (result != ISC_R_SUCCESS) goto cleanup; - socket = dns_dispatch_getsocket(request->dispatch); - INSIST(socket != NULL); - result = dns_dispatch_addresponse(request->dispatch, destaddr, task, - req_response, request, &id, - &request->dispentry); + result = dns_dispatch_addresponse2(request->dispatch, destaddr, task, + req_response, request, &id, + &request->dispentry, + requestmgr->socketmgr); if (result != ISC_R_SUCCESS) goto cleanup; + socket = req_getsocket(request); + INSIST(socket != NULL); + result = isc_buffer_allocate(mctx, &request->query, r.length + (tcp ? 2 : 0)); if (result != ISC_R_SUCCESS) @@ -935,13 +947,14 @@ dns_request_createvia3(dns_requestmgr_t *requestmgr, dns_message_t *message, if (result != ISC_R_SUCCESS) goto cleanup; - socket = dns_dispatch_getsocket(request->dispatch); - INSIST(socket != NULL); - result = dns_dispatch_addresponse(request->dispatch, destaddr, task, - req_response, request, &id, - &request->dispentry); + result = dns_dispatch_addresponse2(request->dispatch, destaddr, task, + req_response, request, &id, + &request->dispentry, + requestmgr->socketmgr); if (result != ISC_R_SUCCESS) goto cleanup; + socket = req_getsocket(request); + INSIST(socket != NULL); message->id = id; if (setkey) { @@ -1226,6 +1239,21 @@ dns_request_destroy(dns_request_t **requestp) { *** Private: request. ***/ +static isc_socket_t * +req_getsocket(dns_request_t *request) { + unsigned int dispattr; + isc_socket_t *socket; + + dispattr = dns_dispatch_getattributes(request->dispatch); + if ((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0) { + INSIST(request->dispentry != NULL); + socket = dns_dispatch_getentrysocket(request->dispentry); + } else + socket = dns_dispatch_getsocket(request->dispatch); + + return (socket); +} + static void req_connected(isc_task_t *task, isc_event_t *event) { isc_socketevent_t *sevent = (isc_socketevent_t *)event; @@ -1425,6 +1453,7 @@ req_destroy(dns_request_t *request) { static void req_cancel(dns_request_t *request) { isc_socket_t *socket; + unsigned int dispattr; REQUIRE(VALID_REQUEST(request)); @@ -1437,16 +1466,23 @@ req_cancel(dns_request_t *request) { if (request->timer != NULL) isc_timer_detach(&request->timer); + dispattr = dns_dispatch_getattributes(request->dispatch); + socket = NULL; + if (DNS_REQUEST_CONNECTING(request) || DNS_REQUEST_SENDING(request)) { + if ((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0) { + if (request->dispentry != NULL) { + socket = dns_dispatch_getentrysocket( + request->dispentry); + } + } else + socket = dns_dispatch_getsocket(request->dispatch); + if (DNS_REQUEST_CONNECTING(request) && socket != NULL) + isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_CONNECT); + if (DNS_REQUEST_SENDING(request) && socket != NULL) + isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND); + } if (request->dispentry != NULL) dns_dispatch_removeresponse(&request->dispentry, NULL); - if (DNS_REQUEST_CONNECTING(request)) { - socket = dns_dispatch_getsocket(request->dispatch); - isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_CONNECT); - } - if (DNS_REQUEST_SENDING(request)) { - socket = dns_dispatch_getsocket(request->dispatch); - isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND); - } dns_dispatch_detach(&request->dispatch); } diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index 599d6b98d7..9b07d1bbd2 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: resolver.c,v 1.372 2008/06/17 22:35:08 jinmei Exp $ */ +/* $Id: resolver.c,v 1.373 2008/06/23 19:41:19 jinmei Exp $ */ /*! \file */ @@ -124,6 +124,7 @@ typedef struct query { isc_mem_t * mctx; dns_dispatchmgr_t * dispatchmgr; dns_dispatch_t * dispatch; + isc_boolean_t exclusivesocket; dns_adbaddrinfo_t * addrinfo; isc_socket_t * tcpsocket; isc_time_t start; @@ -294,24 +295,6 @@ typedef struct alternate { ISC_LINK(struct alternate) link; } alternate_t; -#ifdef ISC_RWLOCK_USEATOMIC -#define DNS_RESOLVER_USERWLOCK 1 -#else -#define DNS_RESOLVER_USERWLOCK 0 -#endif - -#if DNS_RESOLVER_USERWLOCK -#define RES_INITLOCK(l) isc_rwlock_init((l), 0, 0) -#define RES_DESTROYLOCK(l) isc_rwlock_destroy(l) -#define RES_LOCK(l, t) RWLOCK((l), (t)) -#define RES_UNLOCK(l, t) RWUNLOCK((l), (t)) -#else -#define RES_INITLOCK(l) isc_mutex_init(l) -#define RES_DESTROYLOCK(l) DESTROYLOCK(l) -#define RES_LOCK(l, t) LOCK(l) -#define RES_UNLOCK(l, t) UNLOCK(l) -#endif - struct dns_resolver { /* Unlocked. */ unsigned int magic; @@ -319,11 +302,6 @@ struct dns_resolver { isc_mutex_t lock; isc_mutex_t nlock; isc_mutex_t primelock; -#if DNS_RESOLVER_USERWLOCK - isc_rwlock_t poollock; -#else - isc_mutex_t poollock; -#endif dns_rdataclass_t rdclass; isc_socketmgr_t * socketmgr; isc_timermgr_t * timermgr; @@ -333,7 +311,9 @@ struct dns_resolver { unsigned int options; dns_dispatchmgr_t * dispatchmgr; dns_dispatch_t * dispatchv4; + isc_boolean_t exclusivev4; dns_dispatch_t * dispatchv6; + isc_boolean_t exclusivev6; unsigned int ndisps; unsigned int nbuckets; fctxbucket_t * buckets; @@ -352,7 +332,6 @@ struct dns_resolver { unsigned int spillatmin; isc_timer_t * spillattimer; isc_boolean_t zero_no_soa_ttl; - isc_timer_t * disppooltimer; /* Locked by lock. */ unsigned int references; @@ -366,9 +345,6 @@ struct dns_resolver { dns_fetch_t * primefetch; /* Locked by nlock. */ unsigned int nfctx; - /* Locked by poollock. */ - dns_dispatch_t ** dispatchv4pool; - dns_dispatch_t ** dispatchv6pool; }; #define RES_MAGIC ISC_MAGIC('R', 'e', 's', '!') @@ -603,6 +579,7 @@ fctx_cancelquery(resquery_t **queryp, dns_dispatchevent_t **deventp, unsigned int factor; dns_adbfind_t *find; dns_adbaddrinfo_t *addrinfo; + isc_socket_t *socket; query = *queryp; fctx = query->fctx; @@ -684,6 +661,38 @@ fctx_cancelquery(resquery_t **queryp, dns_dispatchevent_t **deventp, 0, factor); } + /* + * Check for any outstanding socket events. If they exist, cancel + * them and let the event handlers finish the cleanup. The resolver + * only needs to worry about managing the connect and send events; + * the dispatcher manages the recv events. + */ + if (RESQUERY_CONNECTING(query)) { + /* + * Cancel the connect. + */ + if (query->tcpsocket != NULL) { + isc_socket_cancel(query->tcpsocket, NULL, + ISC_SOCKCANCEL_CONNECT); + } else if (query->dispentry != NULL) { + INSIST(query->exclusivesocket); + socket = dns_dispatch_getentrysocket(query->dispentry); + if (socket != NULL) + isc_socket_cancel(socket, NULL, + ISC_SOCKCANCEL_CONNECT); + } + } else if (RESQUERY_SENDING(query)) { + /* + * Cancel the pending send. + */ + if (query->exclusivesocket && query->dispentry != NULL) + socket = dns_dispatch_getentrysocket(query->dispentry); + else + socket = dns_dispatch_getsocket(query->dispatch); + if (socket != NULL) + isc_socket_cancel(socket, NULL, ISC_SOCKCANCEL_SEND); + } + if (query->dispentry != NULL) dns_dispatch_removeresponse(&query->dispentry, deventp); @@ -695,25 +704,6 @@ fctx_cancelquery(resquery_t **queryp, dns_dispatchevent_t **deventp, if (query->tsigkey != NULL) dns_tsigkey_detach(&query->tsigkey); - /* - * Check for any outstanding socket events. If they exist, cancel - * them and let the event handlers finish the cleanup. The resolver - * only needs to worry about managing the connect and send events; - * the dispatcher manages the recv events. - */ - if (RESQUERY_CONNECTING(query)) - /* - * Cancel the connect. - */ - isc_socket_cancel(query->tcpsocket, NULL, - ISC_SOCKCANCEL_CONNECT); - else if (RESQUERY_SENDING(query)) - /* - * Cancel the pending send. - */ - isc_socket_cancel(dns_dispatch_getsocket(query->dispatch), - NULL, ISC_SOCKCANCEL_SEND); - if (query->dispatch != NULL) dns_dispatch_detach(&query->dispatch); @@ -912,43 +902,25 @@ fctx_done(fetchctx_t *fctx, isc_result_t result) { } static void -resquery_senddone(isc_task_t *task, isc_event_t *event) { +process_sendevent(resquery_t *query, isc_event_t *event) { isc_socketevent_t *sevent = (isc_socketevent_t *)event; - resquery_t *query = event->ev_arg; isc_boolean_t retry = ISC_FALSE; isc_result_t result; fetchctx_t *fctx; - REQUIRE(event->ev_type == ISC_SOCKEVENT_SENDDONE); - - QTRACE("senddone"); - - /* - * XXXRTH - * - * Currently we don't wait for the senddone event before retrying - * a query. This means that if we get really behind, we may end - * up doing extra work! - */ - - UNUSED(task); - - INSIST(RESQUERY_SENDING(query)); - - query->sends--; fctx = query->fctx; if (RESQUERY_CANCELED(query)) { - if (query->sends == 0) { + if (query->sends == 0 && query->connects == 0) { /* * This query was canceled while the - * isc_socket_sendto() was in progress. + * isc_socket_sendto/connect() was in progress. */ if (query->tcpsocket != NULL) isc_socket_detach(&query->tcpsocket); resquery_destroy(&query); } - } else + } else { switch (sevent->result) { case ISC_R_SUCCESS: break; @@ -970,6 +942,7 @@ resquery_senddone(isc_task_t *task, isc_event_t *event) { fctx_cancelquery(&query, NULL, NULL, ISC_FALSE); break; } + } isc_event_free(&event); @@ -987,6 +960,48 @@ resquery_senddone(isc_task_t *task, isc_event_t *event) { } } +static void +resquery_udpconnected(isc_task_t *task, isc_event_t *event) { + resquery_t *query = event->ev_arg; + + REQUIRE(event->ev_type == ISC_SOCKEVENT_CONNECT); + + QTRACE("udpconnected"); + + UNUSED(task); + + INSIST(RESQUERY_CONNECTING(query)); + + query->connects--; + + process_sendevent(query, event); +} + +static void +resquery_senddone(isc_task_t *task, isc_event_t *event) { + resquery_t *query = event->ev_arg; + + REQUIRE(event->ev_type == ISC_SOCKEVENT_SENDDONE); + + QTRACE("senddone"); + + /* + * XXXRTH + * + * Currently we don't wait for the senddone event before retrying + * a query. This means that if we get really behind, we may end + * up doing extra work! + */ + + UNUSED(task); + + INSIST(RESQUERY_SENDING(query)); + + query->sends--; + + process_sendevent(query, event); +} + static inline isc_result_t fctx_addopt(dns_message_t *message, unsigned int version, isc_uint16_t udpsize, isc_boolean_t request_nsid) @@ -1139,6 +1154,7 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo, */ query->dispatchmgr = res->dispatchmgr; query->dispatch = NULL; + query->exclusivesocket = ISC_FALSE; query->tcpsocket = NULL; if (res->view->peers != NULL) { dns_peer_t *peer = NULL; @@ -1221,39 +1237,16 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo, if (result != ISC_R_SUCCESS) goto cleanup_query; } else { - int did = 0; - isc_uint32_t val; - - if (res->ndisps > 0) { - isc_random_get(&val); - did = val % res->ndisps; - } switch (isc_sockaddr_pf(&addrinfo->sockaddr)) { case PF_INET: - if (res->dispatchv4pool != NULL) { - RES_LOCK(&res->poollock, - isc_rwlocktype_read); - dns_dispatch_attach(res->dispatchv4pool[did], - &query->dispatch); - RES_UNLOCK(&res->poollock, - isc_rwlocktype_read); - } else { - dns_dispatch_attach(res->dispatchv4, - &query->dispatch); - } + dns_dispatch_attach(res->dispatchv4, + &query->dispatch); + query->exclusivesocket = res->exclusivev4; break; case PF_INET6: - if (res->dispatchv6pool != NULL) { - RES_LOCK(&res->poollock, - isc_rwlocktype_read); - dns_dispatch_attach(res->dispatchv6pool[did], - &query->dispatch); - RES_UNLOCK(&res->poollock, - isc_rwlocktype_read); - } else { - dns_dispatch_attach(res->dispatchv6, - &query->dispatch); - } + dns_dispatch_attach(res->dispatchv6, + &query->dispatch); + query->exclusivesocket = res->exclusivev6; break; default: result = ISC_R_NOTIMPLEMENTED; @@ -1449,13 +1442,14 @@ resquery_send(resquery_t *query) { /* * Get a query id from the dispatch. */ - result = dns_dispatch_addresponse(query->dispatch, - &query->addrinfo->sockaddr, - task, - resquery_response, - query, - &query->id, - &query->dispentry); + result = dns_dispatch_addresponse2(query->dispatch, + &query->addrinfo->sockaddr, + task, + resquery_response, + query, + &query->id, + &query->dispentry, + res->socketmgr); if (result != ISC_R_SUCCESS) goto cleanup_temps; @@ -1672,12 +1666,24 @@ resquery_send(resquery_t *query) { */ dns_message_reset(fctx->qmessage, DNS_MESSAGE_INTENTRENDER); - socket = dns_dispatch_getsocket(query->dispatch); + if (query->exclusivesocket) + socket = dns_dispatch_getentrysocket(query->dispentry); + else + socket = dns_dispatch_getsocket(query->dispatch); /* * Send the query! */ - if ((query->options & DNS_FETCHOPT_TCP) == 0) + if ((query->options & DNS_FETCHOPT_TCP) == 0) { address = &query->addrinfo->sockaddr; + if (query->exclusivesocket) { + result = isc_socket_connect(socket, address, task, + resquery_udpconnected, + query); + if (result != ISC_R_SUCCESS) + goto cleanup_message; + query->connects++; + } + } isc_buffer_usedregion(buffer, &r); /* @@ -2779,6 +2785,8 @@ fctx_destroy(fetchctx_t *fctx) { static void fctx_timeout(isc_task_t *task, isc_event_t *event) { fetchctx_t *fctx = event->ev_arg; + isc_timerevent_t *tevent = (isc_timerevent_t *)event; + resquery_t *query; REQUIRE(VALID_FCTX(fctx)); @@ -2794,8 +2802,18 @@ fctx_timeout(isc_task_t *task, isc_event_t *event) { fctx->timeouts++; /* * We could cancel the running queries here, or we could let - * them keep going. Right now we choose the latter... + * them keep going. Since we normally use separate sockets for + * different queries, we adopt the former approach to reduce + * the number of open sockets: cancel the oldest query if it + * expired before the query had started (this is usually the + * case but is not always so, depending on the task schedule + * timing). */ + query = ISC_LIST_HEAD(fctx->queries); + if (query != NULL && + isc_time_compare(&tevent->due, &query->start) >= 0) { + fctx_cancelquery(&query, NULL, NULL, ISC_TRUE); + } fctx->attributes &= ~FCTX_ATTR_ADDRWAIT; /* * Our timer has triggered. Reestablish the fctx lifetime @@ -5659,6 +5677,19 @@ resquery_response(isc_task_t *task, isc_event_t *event) { * There's no hope for this query. */ keep_trying = ISC_TRUE; + + /* + * If this is a network error on an exclusive query + * socket, mark the server as bad so that we won't try + * it for this fetch again. + */ + if (query->exclusivesocket && + (devent->result == ISC_R_HOSTUNREACH || + devent->result == ISC_R_NETUNREACH || + devent->result == ISC_R_CONNREFUSED || + devent->result == ISC_R_CANCELED)) { + broken_server = devent->result; + } } goto done; } @@ -6262,7 +6293,6 @@ destroy(dns_resolver_t *res) { INSIST(res->nfctx == 0); - RES_DESTROYLOCK(&res->poollock); DESTROYLOCK(&res->primelock); DESTROYLOCK(&res->nlock); DESTROYLOCK(&res->lock); @@ -6279,26 +6309,12 @@ destroy(dns_resolver_t *res) { dns_dispatch_detach(&res->dispatchv4); if (res->dispatchv6 != NULL) dns_dispatch_detach(&res->dispatchv6); - if (res->dispatchv4pool != NULL) { - for (i = 0; i < res->ndisps; i++) - dns_dispatch_detach(&res->dispatchv4pool[i]); - isc_mem_put(res->mctx, res->dispatchv4pool, - res->ndisps * sizeof(dns_dispatch_t *)); - } - if (res->dispatchv6pool != NULL) { - for (i = 0; i < res->ndisps; i++) - dns_dispatch_detach(&res->dispatchv6pool[i]); - isc_mem_put(res->mctx, res->dispatchv6pool, - res->ndisps * sizeof(dns_dispatch_t *)); - } while ((a = ISC_LIST_HEAD(res->alternates)) != NULL) { ISC_LIST_UNLINK(res->alternates, a, link); if (!a->isaddress) dns_name_free(&a->_u._n.name, res->mctx); isc_mem_put(res->mctx, a, sizeof(*a)); } - if (res->disppooltimer != NULL) - isc_timer_detach(&res->disppooltimer); dns_resolver_reset_algorithms(res); dns_resolver_resetmustbesecure(res); #if USE_ALGLOCK @@ -6395,6 +6411,7 @@ dns_resolver_create(dns_view_t *view, unsigned int i, buckets_created = 0; isc_task_t *task = NULL; char name[16]; + unsigned dispattr; /* * Create a resolver. @@ -6429,9 +6446,6 @@ dns_resolver_create(dns_view_t *view, res->zero_no_soa_ttl = ISC_FALSE; res->ndisps = 0; res->nextdisp = 0; /* meaningless at this point, but init it */ - res->dispatchv4pool = NULL; - res->dispatchv6pool = NULL; - res->disppooltimer = NULL; res->nbuckets = ntasks; res->activebuckets = ntasks; res->buckets = isc_mem_get(view->mctx, @@ -6475,12 +6489,20 @@ dns_resolver_create(dns_view_t *view, } res->dispatchv4 = NULL; - if (dispatchv4 != NULL) - dns_dispatch_attach(dispatchv4, &res->dispatchv4); + if (dispatchv4 != NULL) { + dns_dispatch_attach(dispatchv4, &res->dispatchv4); + dispattr = dns_dispatch_getattributes(dispatchv4); + res->exclusivev4 = + ISC_TF((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0); + } res->dispatchv6 = NULL; - if (dispatchv6 != NULL) + if (dispatchv6 != NULL) { dns_dispatch_attach(dispatchv6, &res->dispatchv6); + dispattr = dns_dispatch_getattributes(dispatchv6); + res->exclusivev6 = + ISC_TF((dispattr & DNS_DISPATCHATTR_EXCLUSIVE) != 0); + } res->references = 1; res->exiting = ISC_FALSE; @@ -6502,21 +6524,17 @@ dns_resolver_create(dns_view_t *view, if (result != ISC_R_SUCCESS) goto cleanup_nlock; - result = RES_INITLOCK(&res->poollock); - if (result != ISC_R_SUCCESS) - goto cleanup_primelock; - task = NULL; result = isc_task_create(taskmgr, 0, &task); if (result != ISC_R_SUCCESS) - goto cleanup_poollock; + goto cleanup_primelock; result = isc_timer_create(timermgr, isc_timertype_inactive, NULL, NULL, task, spillattimer_countdown, res, &res->spillattimer); isc_task_detach(&task); if (result != ISC_R_SUCCESS) - goto cleanup_poollock; + goto cleanup_primelock; #if USE_ALGLOCK result = isc_rwlock_init(&res->alglock, 0, 0); @@ -6546,9 +6564,6 @@ dns_resolver_create(dns_view_t *view, isc_timer_detach(&res->spillattimer); #endif - cleanup_poollock: - RES_DESTROYLOCK(&res->poollock); - cleanup_primelock: DESTROYLOCK(&res->primelock); @@ -6770,12 +6785,12 @@ dns_resolver_shutdown(dns_resolver_t *res) { fctx != NULL; fctx = ISC_LIST_NEXT(fctx, link)) fctx_shutdown(fctx); - if (res->dispatchv4 != NULL) { + if (res->dispatchv4 != NULL && !res->exclusivev4) { sock = dns_dispatch_getsocket(res->dispatchv4); isc_socket_cancel(sock, res->buckets[i].task, ISC_SOCKCANCEL_ALL); } - if (res->dispatchv6 != NULL) { + if (res->dispatchv6 != NULL && !res->exclusivev6) { sock = dns_dispatch_getsocket(res->dispatchv6); isc_socket_cancel(sock, res->buckets[i].task, ISC_SOCKCANCEL_ALL); @@ -7459,245 +7474,3 @@ dns_resolver_getoptions(dns_resolver_t *resolver) { return (resolver->options); } - -static void -disppooltimer_update(isc_task_t *task, isc_event_t *event) { - dns_resolver_t *res = event->ev_arg; - isc_sockaddr_t addr4, addr6; - dns_dispatch_t *disp4 = NULL, *disp6 = NULL; - isc_result_t result; - unsigned int nxt; - unsigned int attrs_base, attrs, attrmask; - - REQUIRE(VALID_RESOLVER(res)); - REQUIRE((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0 || - (res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0); - - UNUSED(task); - isc_event_free(&event); - - LOCK(&res->lock); - nxt = res->nextdisp++; - if (res->nextdisp == res->ndisps) - res->nextdisp = 0; - UNLOCK(&res->lock); - - attrs_base = 0; - attrs_base |= DNS_DISPATCHATTR_UDP; - attrs_base |= DNS_DISPATCHATTR_RANDOMPORT; - - attrmask = 0; - attrmask |= DNS_DISPATCHATTR_UDP; - attrmask |= DNS_DISPATCHATTR_TCP; - attrmask |= DNS_DISPATCHATTR_IPV4; - attrmask |= DNS_DISPATCHATTR_IPV6; - - RES_LOCK(&res->poollock, isc_rwlocktype_read); - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) { - result = dns_dispatch_getlocaladdress(res->dispatchv4pool[nxt], - &addr4); - INSIST(result == ISC_R_SUCCESS); - } - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) { - result = dns_dispatch_getlocaladdress(res->dispatchv6pool[nxt], - &addr6); - INSIST(result == ISC_R_SUCCESS); - } - RES_UNLOCK(&res->poollock, isc_rwlocktype_read); - - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) { - attrs = attrs_base; - attrs |= DNS_DISPATCHATTR_IPV4; - - result = dns_dispatch_getudp(res->dispatchmgr, - res->socketmgr, - res->taskmgr, &addr4, - 4096, 1000, 32768, 16411, - 16433, attrs, attrmask, - &disp4); - if (result != ISC_R_SUCCESS) { - isc_log_write(dns_lctx, DNS_LOGCATEGORY_RESOLVER, - DNS_LOGMODULE_RESOLVER, ISC_LOG_ERROR, - "could not update an IPv4 random query " - "port: %s", - isc_result_totext(result)); - /* keep the old one */ - } - - /* - * We don't try to ensure the new dispatch is unique (see the - * comments in dns_resolver_createdispatchpool()). - */ - } - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) { - attrs = attrs_base; - attrs |= DNS_DISPATCHATTR_IPV6; - - result = dns_dispatch_getudp(res->dispatchmgr, - res->socketmgr, - res->taskmgr, &addr6, - 4096, 1000, 32768, 16411, - 16433, attrs, attrmask, - &disp6); - if (result != ISC_R_SUCCESS) { - isc_log_write(dns_lctx, DNS_LOGCATEGORY_RESOLVER, - DNS_LOGMODULE_RESOLVER, ISC_LOG_ERROR, - "could not update an IPv6 random query " - "port: %s", - isc_result_totext(result)); - } - } - - RES_LOCK(&res->poollock, isc_rwlocktype_write); - if (disp4 != NULL) { - dns_dispatch_detach(&res->dispatchv4pool[nxt]); - res->dispatchv4pool[nxt] = disp4; - } - if (disp6 != NULL) { - dns_dispatch_detach(&res->dispatchv6pool[nxt]); - res->dispatchv6pool[nxt] = disp6; - } - RES_UNLOCK(&res->poollock, isc_rwlocktype_write); - - return; -} - -isc_result_t -dns_resolver_createdispatchpool(dns_resolver_t *res, unsigned int ndisps, - unsigned int tick) -{ - unsigned int i; - isc_result_t result = ISC_R_SUCCESS; - unsigned int attrs_base, attrs, attrmask; - isc_sockaddr_t addr4, addr6; - dns_dispatch_t *disp; - isc_task_t *task; - isc_interval_t interval; - - REQUIRE(VALID_RESOLVER(res)); - REQUIRE(!res->frozen); /* meaning we don't have to lock res */ - REQUIRE(ndisps > 0); - REQUIRE((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0 || - (res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0); - - attrs_base = 0; - attrs_base |= DNS_DISPATCHATTR_UDP; - attrs_base |= DNS_DISPATCHATTR_RANDOMPORT; - - attrmask = 0; - attrmask |= DNS_DISPATCHATTR_UDP; - attrmask |= DNS_DISPATCHATTR_TCP; - attrmask |= DNS_DISPATCHATTR_IPV4; - attrmask |= DNS_DISPATCHATTR_IPV6; - - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) { - INSIST(res->dispatchv4 != NULL); - result = dns_dispatch_getlocaladdress(res->dispatchv4, &addr4); - INSIST(result == ISC_R_SUCCESS && - isc_sockaddr_getport(&addr4) == 0); - res->dispatchv4pool = isc_mem_get(res->mctx, - sizeof(dns_dispatch_t *) * - ndisps); - if (res->dispatchv4pool == NULL) - return (ISC_R_NOMEMORY); - for (i = 0; i < ndisps; i++) - res->dispatchv4pool[i] = NULL; - } - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) { - INSIST(res->dispatchv6 != NULL); - result = dns_dispatch_getlocaladdress(res->dispatchv6, &addr6); - INSIST(result == ISC_R_SUCCESS && - isc_sockaddr_getport(&addr6) == 0); - res->dispatchv6pool = isc_mem_get(res->mctx, - sizeof(dns_dispatch_t *) * - ndisps); - if (res->dispatchv6pool == NULL) { - isc_mem_put(res->mctx, res->dispatchv4pool, - sizeof(dns_dispatch_t *) * ndisps); - res->dispatchv4pool = NULL; - return (ISC_R_NOMEMORY); - } - for (i = 0; i < ndisps; i++) - res->dispatchv6pool[i] = NULL; - } - - for (i = 0; i < ndisps; i++) { - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL4) != 0) { - attrs = attrs_base; - attrs |= DNS_DISPATCHATTR_IPV4; - - disp = NULL; - result = dns_dispatch_getudp(res->dispatchmgr, - res->socketmgr, - res->taskmgr, &addr4, - 4096, 1000, 32768, 16411, - 16433, attrs, attrmask, - &disp); - if (result != ISC_R_SUCCESS) - goto cleanup; - res->dispatchv4pool[i] = disp; - - /* - * It might be better to ensure all ports are - * different, but in practice it's probably okay to - * assume dns_dispatch_getudp() made reasonable - * choices. - */ - } - if ((res->options & DNS_RESOLVER_USEDISPATCHPOOL6) != 0) { - attrs = attrs_base; - attrs |= DNS_DISPATCHATTR_IPV6; - - disp = NULL; - result = dns_dispatch_getudp(res->dispatchmgr, - res->socketmgr, - res->taskmgr, &addr6, - 4096, 1000, 32768, 16411, - 16433, attrs, attrmask, - &disp); - if (result != ISC_R_SUCCESS) - goto cleanup; - - res->dispatchv6pool[i] = disp; - } - } - - /* start update timer */ - if (tick != 0) { - task = NULL; - result = isc_task_create(res->taskmgr, 0, &task); - if (result != ISC_R_SUCCESS) - goto cleanup; - isc_interval_set(&interval, tick, 0); - result = isc_timer_create(res->timermgr, isc_timertype_ticker, - NULL, &interval, task, - disppooltimer_update, - res, &res->disppooltimer); - isc_task_detach(&task); - if (result != ISC_R_SUCCESS) - goto cleanup; - } - - res->ndisps = ndisps; - res->nextdisp = 0; - - return (result); - - cleanup: - if (res->dispatchv4pool != NULL) { - for (i = 0; i < ndisps; i++) - if (res->dispatchv4pool[i] != NULL) - dns_dispatch_detach(&res->dispatchv4pool[i]); - isc_mem_put(res->mctx, res->dispatchv4pool, - sizeof(dns_dispatch_t *) * ndisps); - } - if (res->dispatchv6pool != NULL) { - for (i = 0; i < ndisps; i++) - if (res->dispatchv6pool[i] != NULL) - dns_dispatch_detach(&res->dispatchv6pool[i]); - isc_mem_put(res->mctx, res->dispatchv6pool, - sizeof(dns_dispatch_t *) * ndisps); - } - - return (result); -} diff --git a/lib/isc/Makefile.in b/lib/isc/Makefile.in index e0a420c7fe..e88457209e 100644 --- a/lib/isc/Makefile.in +++ b/lib/isc/Makefile.in @@ -13,7 +13,7 @@ # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THIS SOFTWARE. -# $Id: Makefile.in,v 1.93 2007/09/14 03:39:29 marka Exp $ +# $Id: Makefile.in,v 1.94 2008/06/23 19:41:19 jinmei Exp $ srcdir = @srcdir@ VPATH = @srcdir@ @@ -58,7 +58,7 @@ OBJS = @ISC_EXTRA_OBJS@ \ lex.@O@ lfsr.@O@ lib.@O@ log.@O@ \ md5.@O@ mem.@O@ mutexblock.@O@ \ netaddr.@O@ netscope.@O@ ondestroy.@O@ \ - parseint.@O@ quota.@O@ radix.@O@ random.@O@ \ + parseint.@O@ portset.@O@ quota.@O@ radix.@O@ random.@O@ \ ratelimiter.@O@ refcount.@O@ region.@O@ result.@O@ rwlock.@O@ \ serial.@O@ sha1.@O@ sha2.@O@ sockaddr.@O@ \ string.@O@ strtoul.@O@ symtab.@O@ task.@O@ taskpool.@O@ \ @@ -73,7 +73,7 @@ SRCS = @ISC_EXTRA_SRCS@ \ lex.c lfsr.c lib.c log.c \ md5.c mem.c mutexblock.c \ netaddr.c netscope.c ondestroy.c \ - parseint.c quota.c radix.c random.c \ + parseint.c portset.c quota.c radix.c random.c \ ratelimiter.c refcount.c region.c result.c rwlock.c \ serial.c sha1.c sha2.c sockaddr.c string.c strtoul.c \ symtab.c task.c taskpool.c timer.c version.c diff --git a/lib/isc/include/isc/platform.h.in b/lib/isc/include/isc/platform.h.in index 0ef1b65a73..79439109b0 100644 --- a/lib/isc/include/isc/platform.h.in +++ b/lib/isc/include/isc/platform.h.in @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: platform.h.in,v 1.47 2008/01/24 23:47:00 tbox Exp $ */ +/* $Id: platform.h.in,v 1.48 2008/06/23 19:41:19 jinmei Exp $ */ #ifndef ISC_PLATFORM_H #define ISC_PLATFORM_H 1 @@ -136,6 +136,21 @@ */ @ISC_PLATFORM_FIXIN6ISADDR@ +/*! \brief + * Define if the system supports kqueue multiplexing + */ +@ISC_PLATFORM_HAVEKQUEUE@ + +/*! \brief + * Define if the system supports epoll multiplexing + */ +@ISC_PLATFORM_HAVEEPOLL@ + +/*! \brief + * Define if the system supports /dev/poll multiplexing + */ +@ISC_PLATFORM_HAVEDEVPOLL@ + /* *** Printing. ***/ diff --git a/lib/isc/include/isc/portset.h b/lib/isc/include/isc/portset.h new file mode 100644 index 0000000000..3ad7867983 --- /dev/null +++ b/lib/isc/include/isc/portset.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2008 Internet Systems Consortium, Inc. ("ISC") + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: portset.h,v 1.2 2008/06/23 19:41:19 jinmei Exp $ */ + +/*! \file isc/portset.h + * \brief Transport Protocol Port Manipuration Module + * + * This module provides simple utilities to handle a set of transport protocol + * (UDP or TCP) port numbers, e.g., for creating an ACL list. An isc_portset_t + * object is an opaque instance of a port set, for which the user can add or + * remove a specific port or a range of consecutive ports. This object is + * expected to be used as a temporary work space only, and does not protect + * simultaneous access from multiple threads. Therefore it must not be stored + * in a place that can be accessed from multiple threads. + */ + +#ifndef ISC_PORTSET_H +#define ISC_PORTSET_H 1 + +/*** + *** Imports + ***/ + +#include + +/*** + *** Functions + ***/ + +ISC_LANG_BEGINDECLS + +isc_result_t +isc_portset_create(isc_mem_t *mctx, isc_portset_t **portsetp); +/*%< + * Create a port set and initialize it as an empty set. + * + * Requires: + *\li 'mctx' to be valid. + *\li 'portsetp' to be non NULL and '*portsetp' to be NULL; + * + * Returns: + *\li #ISC_R_SUCCESS + *\li #ISC_R_NOMEMORY + */ + +void +isc_portset_destroy(isc_mem_t *mctx, isc_portset_t **portsetp); +/*%< + * Destroy a port set. + * + * Requires: + *\li 'mctx' to be valid and must be the same context given when the port set + * was created. + *\li '*portsetp' to be a valid set. + */ + +isc_boolean_t +isc_portset_isset(isc_portset_t *portset, in_port_t port); +/*%< + * Test whether the given port is stored in the portset. + * + * Requires: + *\li 'portset' to be a valid set. + * + * Returns + * \li #ISC_TRUE if the port is found, ISC_FALSE otherwise. + */ + +unsigned int +isc_portset_nports(isc_portset_t *portset); +/*%< + * Provides the number of ports stored in the given portset. + * + * Requires: + *\li 'portset' to be a valid set. + * + * Returns + * \li the number of ports stored in portset. + */ + +void +isc_portset_add(isc_portset_t *portset, in_port_t port); +/*%< + * Add the given port to the portset. The port may or may not be stored in + * the portset. + * + * Requires: + *\li 'portlist' to be valid. + */ + +void +isc_portset_remove(isc_portset_t *portset, in_port_t port); +/*%< + * Remove the given port to the portset. The port may or may not be stored in + * the portset. + * + * Requires: + *\li 'portlist' to be valid. + */ + +void +isc_portset_addrange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi); +/*%< + * Add a subset of [port_lo, port_hi] (inclusive) to the portset. Ports in the + * subset may or may not be stored in portset. + * + * Requires: + *\li 'portlist' to be valid. + *\li port_lo <= port_hi + */ + +void +isc_portset_removerange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi); +/*%< + * Subtract a subset of [port_lo, port_hi] (inclusive) from the portset. Ports + * in the subset may or may not be stored in portset. + * + * Requires: + *\li 'portlist' to be valid. + *\li port_lo <= port_hi + */ + +ISC_LANG_ENDDECLS + +#endif /* ISC_NETADDR_H */ diff --git a/lib/isc/include/isc/socket.h b/lib/isc/include/isc/socket.h index c3558fed7b..dc18f02c2c 100644 --- a/lib/isc/include/isc/socket.h +++ b/lib/isc/include/isc/socket.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket.h,v 1.74 2008/06/04 23:47:11 tbox Exp $ */ +/* $Id: socket.h,v 1.75 2008/06/23 19:41:19 jinmei Exp $ */ #ifndef ISC_SOCKET_H #define ISC_SOCKET_H 1 @@ -360,6 +360,45 @@ isc_socket_detach(isc_socket_t **socketp); * All resources used by the socket have been freed */ +isc_result_t +isc_socket_open(isc_socket_t *sock); +/*%< + * Open a new socket file descriptor of the given socket structure. It simply + * opens a new descriptor; all of the other parameters including the socket + * type are inherited from the existing socket. This function is provided to + * avoid overhead of destroying and creating sockets when many short-lived + * sockets are frequently opened and closed. When the efficiency is not an + * issue, it should be safer to detach the unused socket and re-create a new + * one. + * + * Requires: + * + * \li there must be no other reference to this socket. + * + * \li 'socket' is a valid and previously closed by isc_socket_close() + * + * Returns: + * Same as isc_socket_create(). + */ + +void +isc_socket_close(isc_socket_t *sock); +/*%< + * Close a socket file descriptor of the given socket structure. This function + * is provided as an alternative to destroying an unused socket when overhead + * destroying/re-creating sockets can be significant, and is expected to be + * used with isc_socket_open(). + * + * Requires: + * + * \li The socket must have a valid descriptor. + * + * \li There must be no other reference to this socket. + * + * \li There must be no pending I/O requests. + * + */ + isc_result_t isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *addressp); /*%< diff --git a/lib/isc/include/isc/timer.h b/lib/isc/include/isc/timer.h index 69b9d2b097..8c3de399ca 100644 --- a/lib/isc/include/isc/timer.h +++ b/lib/isc/include/isc/timer.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: timer.h,v 1.38 2007/06/19 23:47:18 tbox Exp $ */ +/* $Id: timer.h,v 1.39 2008/06/23 19:41:19 jinmei Exp $ */ #ifndef ISC_TIMER_H #define ISC_TIMER_H 1 @@ -76,6 +76,7 @@ #include #include #include +#include ISC_LANG_BEGINDECLS @@ -93,6 +94,7 @@ typedef enum { typedef struct isc_timerevent { struct isc_event common; + isc_time_t due; } isc_timerevent_t; #define ISC_TIMEREVENT_FIRSTEVENT (ISC_EVENTCLASS_TIMER + 0) diff --git a/lib/isc/include/isc/types.h b/lib/isc/include/isc/types.h index 5f4ca82901..88c06ff666 100644 --- a/lib/isc/include/isc/types.h +++ b/lib/isc/include/isc/types.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: types.h,v 1.45 2008/01/17 23:47:00 tbox Exp $ */ +/* $Id: types.h,v 1.46 2008/06/23 19:41:19 jinmei Exp $ */ #ifndef ISC_TYPES_H #define ISC_TYPES_H 1 @@ -70,6 +70,7 @@ typedef struct isc_mempool isc_mempool_t; /*%< Memory Pool */ typedef struct isc_msgcat isc_msgcat_t; /*%< Message Catalog */ typedef struct isc_ondestroy isc_ondestroy_t; /*%< On Destroy */ typedef struct isc_netaddr isc_netaddr_t; /*%< Net Address */ +typedef struct isc_portset isc_portset_t; /*%< Port Set */ typedef struct isc_quota isc_quota_t; /*%< Quota */ typedef struct isc_random isc_random_t; /*%< Random */ typedef struct isc_ratelimiter isc_ratelimiter_t; /*%< Rate Limiter */ diff --git a/lib/isc/portset.c b/lib/isc/portset.c new file mode 100644 index 0000000000..e4c5da0f39 --- /dev/null +++ b/lib/isc/portset.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2008 Internet Systems Consortium, Inc. ("ISC") + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: portset.c,v 1.2 2008/06/23 19:41:19 jinmei Exp $ */ + +/*! \file */ +#include +#include +#include +#include +#include + +#define ISC_PORTSET_BUFSIZE (65536 / (sizeof(isc_uint32_t) * 8)) + +/*% + * Internal representation of portset. It's an array of 32-bit integers, each + * bit corresponding to a single port in the ascending order. For example, + * the second most significant bit of buf[0] corresponds to port 1. + */ +struct isc_portset { + unsigned int nports; /*%< number of ports in the set */ + isc_uint32_t buf[ISC_PORTSET_BUFSIZE]; +}; + +static inline isc_boolean_t +portset_isset(isc_portset_t *portset, in_port_t port) { + return (ISC_TF((portset->buf[port >> 5] & (1 << (port & 31))) != 0)); +} + +static inline void +portset_add(isc_portset_t *portset, in_port_t port) { + if (!portset_isset(portset, port)) { + portset->nports++; + portset->buf[port >> 5] |= (1 << (port & 31)); + } +} + +static inline void +portset_remove(isc_portset_t *portset, in_port_t port) { + if (portset_isset(portset, port)) { + portset->nports--; + portset->buf[port >> 5] &= ~(1 << (port & 31)); + } +} + +isc_result_t +isc_portset_create(isc_mem_t *mctx, isc_portset_t **portsetp) { + isc_portset_t *portset; + + REQUIRE(portsetp != NULL && *portsetp == NULL); + + portset = isc_mem_get(mctx, sizeof(*portset)); + if (portset == NULL) + return (ISC_R_NOMEMORY); + + /* Make the set 'empty' by default */ + memset(portset, 0, sizeof(*portset)); + *portsetp = portset; + + return (ISC_R_SUCCESS); +} + +void +isc_portset_destroy(isc_mem_t *mctx, isc_portset_t **portsetp) { + isc_portset_t *portset; + + REQUIRE(portsetp != NULL); + portset = *portsetp; + + isc_mem_put(mctx, portset, sizeof(*portset)); +} + +isc_boolean_t +isc_portset_isset(isc_portset_t *portset, in_port_t port) { + REQUIRE(portset != NULL); + + return (portset_isset(portset, port)); +} + +unsigned int +isc_portset_nports(isc_portset_t *portset) { + REQUIRE(portset != NULL); + + return (portset->nports); +} + +void +isc_portset_add(isc_portset_t *portset, in_port_t port) { + REQUIRE(portset != NULL); + + portset_add(portset, port); +} + +void +isc_portset_remove(isc_portset_t *portset, in_port_t port) { + portset_remove(portset, port); +} + +void +isc_portset_addrange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi) +{ + in_port_t p; + + REQUIRE(portset != NULL); + REQUIRE(port_lo <= port_hi); + + p = port_lo; + do { + portset_add(portset, p); + } while (p++ < port_hi); +} + +void +isc_portset_removerange(isc_portset_t *portset, in_port_t port_lo, + in_port_t port_hi) +{ + in_port_t p; + + REQUIRE(portset != NULL); + REQUIRE(port_lo <= port_hi); + + p = port_lo; + do { + portset_remove(portset, p); + } while (p++ < port_hi); +} diff --git a/lib/isc/timer.c b/lib/isc/timer.c index fb599796d8..15f793fe16 100644 --- a/lib/isc/timer.c +++ b/lib/isc/timer.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: timer.c,v 1.81 2007/10/24 00:57:23 marka Exp $ */ +/* $Id: timer.c,v 1.82 2008/06/23 19:41:19 jinmei Exp $ */ /*! \file */ @@ -577,7 +577,7 @@ isc_timer_detach(isc_timer_t **timerp) { static void dispatch(isc_timermgr_t *manager, isc_time_t *now) { isc_boolean_t done = ISC_FALSE, post_event, need_schedule; - isc_event_t *event; + isc_timerevent_t *event; isc_eventtype_t type = 0; isc_timer_t *timer; isc_result_t result; @@ -650,16 +650,18 @@ dispatch(isc_timermgr_t *manager, isc_time_t *now) { /* * XXX We could preallocate this event. */ - event = isc_event_allocate(manager->mctx, + event = (isc_timerevent_t *)isc_event_allocate(manager->mctx, timer, type, timer->action, timer->arg, sizeof(*event)); - if (event != NULL) - isc_task_send(timer->task, &event); - else + if (event != NULL) { + event->due = timer->due; + isc_task_send(timer->task, + (isc_event_t **)&event); + } else UNEXPECTED_ERROR(__FILE__, __LINE__, isc_msgcat_get(isc_msgcat, ISC_MSGSET_TIMER, diff --git a/lib/isc/unix/app.c b/lib/isc/unix/app.c index a50189d66b..23e573809c 100644 --- a/lib/isc/unix/app.c +++ b/lib/isc/unix/app.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: app.c,v 1.57 2008/01/18 23:46:58 tbox Exp $ */ +/* $Id: app.c,v 1.58 2008/06/23 19:41:19 jinmei Exp $ */ /*! \file */ @@ -30,6 +30,9 @@ #include #include #include +#ifdef HAVE_EPOLL +#include +#endif #include #include @@ -303,8 +306,7 @@ evloop() { int n; isc_time_t when, now; struct timeval tv, *tvp; - fd_set readfds, writefds; - int maxfd; + isc_socketwait_t *swait; isc_boolean_t readytasks; isc_boolean_t call_timer_dispatch = ISC_FALSE; @@ -331,8 +333,8 @@ evloop() { } } - isc__socketmgr_getfdsets(&readfds, &writefds, &maxfd); - n = select(maxfd, &readfds, &writefds, NULL, tvp); + swait = NULL; + n = isc__socketmgr_waitevents(tvp, &swait); if (n == 0 || call_timer_dispatch) { /* @@ -352,8 +354,7 @@ evloop() { isc__timermgr_dispatch(); } if (n > 0) - (void)isc__socketmgr_dispatch(&readfds, &writefds, - maxfd); + (void)isc__socketmgr_dispatch(swait); (void)isc__taskmgr_dispatch(); if (want_reload) { diff --git a/lib/isc/unix/include/isc/net.h b/lib/isc/unix/include/isc/net.h index cb40de2c42..32abb0ba8b 100644 --- a/lib/isc/unix/include/isc/net.h +++ b/lib/isc/unix/include/isc/net.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: net.h,v 1.46 2007/06/19 23:47:19 tbox Exp $ */ +/* $Id: net.h,v 1.47 2008/06/23 19:41:20 jinmei Exp $ */ #ifndef ISC_NET_H #define ISC_NET_H 1 @@ -324,6 +324,23 @@ isc_net_probeunix(void); * Returns whether UNIX domain sockets are supported. */ +isc_result_t +isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high); +/*%< + * Returns system's default range of ephemeral UDP ports, if defined. + * If the range is not available or unknown, ISC_NET_PORTRANGELOW and + * ISC_NET_PORTRANGEHIGH will be returned. + * + * Requires: + * + *\li 'low' and 'high' must be non NULL. + * + * Returns: + * + *\li *low and *high will be the ports specifying the low and high ends of + * the range. + */ + #ifdef ISC_PLATFORM_NEEDNTOP const char * isc_net_ntop(int af, const void *src, char *dst, size_t size); diff --git a/lib/isc/unix/net.c b/lib/isc/unix/net.c index b19ace9c9d..f1d23bd1eb 100644 --- a/lib/isc/unix/net.c +++ b/lib/isc/unix/net.c @@ -15,10 +15,13 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: net.c,v 1.36 2007/09/13 04:45:18 each Exp $ */ +/* $Id: net.c,v 1.37 2008/06/23 19:41:19 jinmei Exp $ */ #include +#include +#include + #include #include @@ -30,6 +33,50 @@ #include #include +/*% + * Definitions about UDP port range specification. This is a total mess of + * portability variants: some use sysctl (but the sysctl names vary), some use + * system-specific interfaces, some have the same interface for IPv4 and IPv6, + * some separate them, etc... + */ + +/*% + * The last resort defaults: use all non well known port space + */ +#ifndef ISC_NET_PORTRANGELOW +#define ISC_NET_PORTRANGELOW 1024 +#endif /* ISC_NET_PORTRANGELOW */ +#ifndef ISC_NET_PORTRANGEHIGH +#define ISC_NET_PORTRANGEHIGH 65535 +#endif /* ISC_NET_PORTRANGEHIGH */ + +/*% + * sysctl variants + */ +#if defined(__FreeBSD__) || defined(__APPLE__) +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portrange.first" +#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.portrange.last" +#define SYSCTL_V6PORTRANGE_LOW "net.inet.ip.portrange.first" +#define SYSCTL_V6PORTRANGE_HIGH "net.inet.ip.portrange.last" +#endif + +#ifdef __NetBSD__ +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.anonportmin" +#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.anonportmax" +#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.portrange.first" +#define SYSCTL_V6PORTRANGE_HIGH "net.inet6.ip6.portrange.last" +#endif + +#ifdef __OpenBSD__ +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portfirst" +#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.portlast" +#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.portrange.first" +#define SYSCTL_V6PORTRANGE_HIGH "net.inet6.ip6.portrange.last" +#endif + #if defined(ISC_PLATFORM_HAVEIPV6) # if defined(ISC_PLATFORM_NEEDIN6ADDRANY) const struct in6_addr isc_net_in6addrany = IN6ADDR_ANY_INIT; @@ -338,6 +385,60 @@ isc_net_probe_ipv6pktinfo(void) { return (ipv6pktinfo_result); } +#ifdef USE_SYSCTL_PORTRANGE +static isc_result_t +getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { + int port_low, port_high; + size_t portlen; + const char *sysctlname_lowport, *sysctlname_hiport; + + if (af == AF_INET) { + sysctlname_lowport = SYSCTL_V4PORTRANGE_LOW; + sysctlname_hiport = SYSCTL_V4PORTRANGE_HIGH; + } else { + sysctlname_lowport = SYSCTL_V6PORTRANGE_LOW; + sysctlname_hiport = SYSCTL_V6PORTRANGE_HIGH; + } + portlen = sizeof(portlen); + if (sysctlbyname(sysctlname_lowport, &port_low, &portlen, + NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + portlen = sizeof(portlen); + if (sysctlbyname(sysctlname_hiport, &port_high, &portlen, + NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0) + return (ISC_R_RANGE); + + *low = (in_port_t)port_low; + *high = (in_port_t)port_high; + + return (ISC_R_SUCCESS); +} +#endif + +isc_result_t +isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) { + int result = ISC_R_FAILURE; + + REQUIRE(low != NULL && high != NULL); + +#ifdef USE_SYSCTL_PORTRANGE + result = getudpportrange_sysctl(af, low, high); +#else + UNUSED(af); +#endif + + if (result != ISC_R_SUCCESS) { + *low = ISC_NET_PORTRANGELOW; + *high = ISC_NET_PORTRANGEHIGH; + } + + return (ISC_R_SUCCESS); /* we currently never fail in this function */ +} + void isc_net_disableipv4(void) { initialize(); diff --git a/lib/isc/unix/socket.c b/lib/isc/unix/socket.c index 5192576b70..d5a332ad5a 100644 --- a/lib/isc/unix/socket.c +++ b/lib/isc/unix/socket.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket.c,v 1.280 2008/03/27 21:08:51 jinmei Exp $ */ +/* $Id: socket.c,v 1.281 2008/06/23 19:41:19 jinmei Exp $ */ /*! \file */ @@ -25,9 +25,6 @@ #include #include #include -#ifdef ISC_PLATFORM_HAVESYSUNH -#include -#endif #include #include @@ -59,6 +56,19 @@ #include #include +#ifdef ISC_PLATFORM_HAVESYSUNH +#include +#endif +#ifdef ISC_PLATFORM_HAVEKQUEUE +#include +#endif +#ifdef ISC_PLATFORM_HAVEEPOLL +#include +#endif +#ifdef ISC_PLATFORM_HAVEDEVPOLL +#include +#endif + #include "errno2result.h" #ifndef ISC_PLATFORM_USETHREADS @@ -69,6 +79,70 @@ #include #endif +/*% + * Choose the most preferable multiplex method. + */ +#ifdef ISC_PLATFORM_HAVEKQUEUE +#define USE_KQUEUE +#elif defined (ISC_PLATFORM_HAVEEPOLL) +#define USE_EPOLL +#elif defined (ISC_PLATFORM_HAVEDEVPOLL) +#define USE_DEVPOLL +typedef struct { + unsigned int want_read : 1, + want_write : 1; +} pollinfo_t; +#else +#define USE_SELECT +#endif /* ISC_PLATFORM_HAVEKQUEUE */ + +#ifndef ISC_PLATFORM_USETHREADS +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +struct isc_socketwait { + int nevents; +}; +#elif defined (USE_SELECT) +struct isc_socketwait { + fd_set readset; + fd_set writeset; + int nfds; + int maxfd; +}; +#endif /* USE_KQUEUE */ +#endif /* !ISC_PLATFORM_USETHREADS */ + +/*% + * Maximum number of allowable open sockets. This is also the maximum + * allowable socket file descriptor. This definition is meaningless with + * USE_SELECT due to the API limitation of select(2). + */ +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +#ifndef ISC_SOCKET_MAXSOCKETS +#define ISC_SOCKET_MAXSOCKETS 4096 +#endif +#endif /* USE_KQUEUE || USE_EPOLL || USE_DEVPOLL */ + +/*% + * Size of per-FD lock buckets. + */ +#ifdef ISC_PLATFORM_USETHREADS +#define FDLOCK_COUNT 1024 +#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) +#else +#define FDLOCK_COUNT 1 +#define FDLOCK_ID(fd) 0 +#endif /* ISC_PLATFORM_USETHREADS */ + +/*% + * Maximum number of events communicated with the kernel. There should normally + * be no need for having a large number. + */ +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +#ifndef ISC_SOCKET_MAXEVENTS +#define ISC_SOCKET_MAXEVENTS 64 +#endif +#endif + /*% * Some systems define the socket length argument as an int, some as size_t, * some as socklen_t. This is here so it can be easily changed if needed. @@ -209,17 +283,44 @@ struct isc_socketmgr { unsigned int magic; isc_mem_t *mctx; isc_mutex_t lock; + isc_mutex_t *fdlock; +#ifdef USE_KQUEUE + int kqueue_fd; + int nevents; + struct kevent *events; +#endif /* USE_KQUEUE */ +#ifdef USE_EPOLL + int epoll_fd; + int nevents; + struct epoll_event *events; +#endif /* USE_EPOLL */ +#ifdef USE_DEVPOLL + int devpoll_fd; + int nevents; + struct pollfd *events; +#endif /* USE_DEVPOLL */ + unsigned int maxsocks; +#ifdef ISC_PLATFORM_USETHREADS + int pipe_fds[2]; +#endif + + /* Locked by fdlock. */ + isc_socket_t **fds; + int *fdstate; +#ifdef USE_DEVPOLL + pollinfo_t *fdpollinfo; +#endif + /* Locked by manager lock. */ ISC_LIST(isc_socket_t) socklist; +#ifdef USE_SELECT fd_set read_fds; fd_set write_fds; - isc_socket_t *fds[FD_SETSIZE]; - int fdstate[FD_SETSIZE]; int maxfd; +#endif /* USE_SELECT */ #ifdef ISC_PLATFORM_USETHREADS isc_thread_t watcher; isc_condition_t shutdown_ok; - int pipe_fds[2]; #else /* ISC_PLATFORM_USETHREADS */ unsigned int refs; #endif /* ISC_PLATFORM_USETHREADS */ @@ -261,6 +362,9 @@ static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *, struct msghdr *, struct iovec *, size_t *); static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *, struct msghdr *, struct iovec *, size_t *); +#ifdef ISC_PLATFORM_USETHREADS +static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager); +#endif #define SELECT_POKE_SHUTDOWN (-1) #define SELECT_POKE_NOTHING (-2) @@ -329,9 +433,164 @@ socket_log(isc_socket_t *sock, isc_sockaddr_t *address, } } +static inline isc_result_t +watch_fd(isc_socketmgr_t *manager, int fd, int msg) { + isc_result_t result = ISC_R_SUCCESS; + +#ifdef USE_KQUEUE + struct kevent evchange; + + memset(&evchange, 0, sizeof(evchange)); + if (msg == SELECT_POKE_READ) + evchange.filter = EVFILT_READ; + else + evchange.filter = EVFILT_WRITE; + evchange.flags = EV_ADD; + evchange.ident = fd; + if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) + result = isc__errno2result(errno); + + return (result); +#elif defined(USE_EPOLL) + struct epoll_event event; + + if (msg == SELECT_POKE_READ) + event.events = EPOLLIN; + else + event.events = EPOLLOUT; + event.data.fd = fd; + if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 && + errno != EEXIST) { + result = isc__errno2result(errno); + } + + return (result); +#elif defined(USE_DEVPOLL) + struct pollfd pfd; + int lockid = FDLOCK_ID(fd); + + memset(&pfd, 0, sizeof(pfd)); + if (msg == SELECT_POKE_READ) + pfd.events = POLLIN; + else + pfd.events = POLLOUT; + pfd.fd = fd; + pfd.revents = 0; + LOCK(&manager->fdlock[lockid]); + if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1) + result = isc__errno2result(errno); + else { + if (msg == SELECT_POKE_READ) + manager->fdpollinfo[fd].want_read = 1; + else + manager->fdpollinfo[fd].want_write = 1; + } + UNLOCK(&manager->fdlock[lockid]); + + return (result); +#elif defined(USE_SELECT) + LOCK(&manager->lock); + if (msg == SELECT_POKE_READ) + FD_SET(fd, &manager->read_fds); + if (msg == SELECT_POKE_WRITE) + FD_SET(fd, &manager->write_fds); + UNLOCK(&manager->lock); + + return (result); +#endif +} + +static inline isc_result_t +unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) { + isc_result_t result = ISC_R_SUCCESS; + +#ifdef USE_KQUEUE + struct kevent evchange; + + memset(&evchange, 0, sizeof(evchange)); + if (msg == SELECT_POKE_READ) + evchange.filter = EVFILT_READ; + else + evchange.filter = EVFILT_WRITE; + evchange.flags = EV_DELETE; + evchange.ident = fd; + if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) + result = isc__errno2result(errno); + + return (result); +#elif defined(USE_EPOLL) + struct epoll_event event; + + if (msg == SELECT_POKE_READ) + event.events = EPOLLIN; + else + event.events = EPOLLOUT; + event.data.fd = fd; + if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 && + errno != ENOENT) { + char strbuf[ISC_STRERRORSIZE]; + isc__strerror(errno, strbuf, sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "epoll_ctl(DEL), %d: %s", fd, strbuf); + result = ISC_R_UNEXPECTED; + } + return (result); +#elif defined(USE_DEVPOLL) + struct pollfd pfds[2]; + size_t writelen = sizeof(pfds[0]); + int lockid = FDLOCK_ID(fd); + + memset(pfds, 0, sizeof(pfds)); + pfds[0].events = POLLREMOVE; + pfds[0].fd = fd; + + /* + * Canceling read or write polling via /dev/poll is tricky. Since it + * only provides a way of canceling per FD, we may need to re-poll the + * socket for the other operation. + */ + LOCK(&manager->fdlock[lockid]); + if (msg == SELECT_POKE_READ && + manager->fdpollinfo[fd].want_write == 1) { + pfds[1].events = POLLOUT; + pfds[1].fd = fd; + writelen += sizeof(pfds[1]); + } + if (msg == SELECT_POKE_WRITE && + manager->fdpollinfo[fd].want_read == 1) { + pfds[1].events = POLLIN; + pfds[1].fd = fd; + writelen += sizeof(pfds[1]); + } + + if (write(manager->devpoll_fd, pfds, writelen) == -1) + result = isc__errno2result(errno); + else { + if (msg == SELECT_POKE_READ) + manager->fdpollinfo[fd].want_read = 0; + else + manager->fdpollinfo[fd].want_write = 0; + } + UNLOCK(&manager->fdlock[lockid]); + + return (result); +#elif defined(USE_SELECT) + LOCK(&manager->lock); + if (msg == SELECT_POKE_READ) + FD_CLR(fd, &manager->read_fds); + else if (msg == SELECT_POKE_WRITE) + FD_CLR(fd, &manager->write_fds); + UNLOCK(&manager->lock); + + return (result); +#endif +} + static void wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { - isc_socket_t *sock; + isc_result_t result; + isc_boolean_t needclose; + int lockid = FDLOCK_ID(fd); /* * This is a wakeup on a socket. If the socket is not in the @@ -339,29 +598,50 @@ wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { * or writes. */ - INSIST(fd >= 0 && fd < (int)FD_SETSIZE); + INSIST(fd >= 0 && fd < (int)manager->maxsocks); + LOCK(&manager->fdlock[lockid]); if (manager->fdstate[fd] == CLOSE_PENDING || manager->fdstate[fd] == MANAGER_CLOSE_PENDING) { - FD_CLR(fd, &manager->read_fds); - FD_CLR(fd, &manager->write_fds); - if (manager->fdstate[fd] == CLOSE_PENDING) - (void)close(fd); + needclose = ISC_TF(manager->fdstate[fd] == CLOSE_PENDING); manager->fdstate[fd] = CLOSED; + UNLOCK(&manager->fdlock[lockid]); + + /* + * We accept (and ignore) any error from unwatch_fd() as we are + * closing the socket, hoping it doesn't leave dangling state in + * the kernel. + * Note that unwatch_fd() must be called after releasing the + * fdlock; otherwise it could cause deadlock due to a lock order + * reversal. + */ + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + if (needclose) + (void)close(fd); return; } - if (manager->fdstate[fd] != MANAGED) + if (manager->fdstate[fd] != MANAGED) { + UNLOCK(&manager->fdlock[lockid]); return; - - sock = manager->fds[fd]; + } + UNLOCK(&manager->fdlock[lockid]); /* * Set requested bit. */ - if (msg == SELECT_POKE_READ) - FD_SET(sock->fd, &manager->read_fds); - if (msg == SELECT_POKE_WRITE) - FD_SET(sock->fd, &manager->write_fds); + result = watch_fd(manager, fd, msg); + if (result != ISC_R_SUCCESS) { + /* + * XXXJT: what should we do? Ignoring the failure of watching + * a socket will make the application dysfunctional, but there + * seems to be no reasonable recovery process. + */ + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, + "failed to start watching FD (%d): %s", + fd, isc_result_totext(result)); + } } #ifdef ISC_PLATFORM_USETHREADS @@ -656,7 +936,7 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, memset(msg, 0, sizeof(*msg)); - if (sock->type == isc_sockettype_udp) { + if (!sock->connected) { msg->msg_name = (void *)&dev->address.type.sa; msg->msg_namelen = dev->address.length; } else { @@ -1221,8 +1501,56 @@ doio_send(isc_socket_t *sock, isc_socketevent_t *dev) { * Caller must ensure that the socket is not locked and no external * references exist. */ +static void +closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) { + int lockid = FDLOCK_ID(fd); + + /* + * No one has this socket open, so the watcher doesn't have to be + * poked, and the socket doesn't have to be locked. + */ + LOCK(&manager->fdlock[lockid]); + manager->fds[fd] = NULL; + if (type == isc_sockettype_fdwatch) + manager->fdstate[fd] = MANAGER_CLOSE_PENDING; + else + manager->fdstate[fd] = CLOSE_PENDING; + UNLOCK(&manager->fdlock[lockid]); + select_poke(manager, fd, SELECT_POKE_CLOSE); + + /* + * update manager->maxfd here (XXX: this should be implemented more + * efficiently) + */ +#ifdef USE_SELECT + LOCK(&manager->lock); + if (manager->maxfd == fd) { + int i; + + manager->maxfd = 0; + for (i = fd - 1; i >= 0; i--) { + lockid = FDLOCK_ID(i); + + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[i] == MANAGED) { + manager->maxfd = i; + UNLOCK(&manager->fdlock[lockid]); + break; + } + UNLOCK(&manager->fdlock[lockid]); + } +#ifdef ISC_PLATFORM_USETHREADS + if (manager->maxfd < manager->pipe_fds[0]) + manager->maxfd = manager->pipe_fds[0]; +#endif + } + UNLOCK(&manager->lock); +#endif /* USE_SELECT */ +} + static void destroy(isc_socket_t **sockp) { + int fd; isc_socket_t *sock = *sockp; isc_socketmgr_t *manager = sock->manager; @@ -1233,20 +1561,16 @@ destroy(isc_socket_t **sockp) { INSIST(ISC_LIST_EMPTY(sock->recv_list)); INSIST(ISC_LIST_EMPTY(sock->send_list)); INSIST(sock->connect_ev == NULL); - REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE); + REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks); + + if (sock->fd >= 0) { + fd = sock->fd; + sock->fd = -1; + closesocket(manager, sock->type, fd); + } LOCK(&manager->lock); - /* - * No one has this socket open, so the watcher doesn't have to be - * poked, and the socket doesn't have to be locked. - */ - manager->fds[sock->fd] = NULL; - if (sock->type == isc_sockettype_fdwatch) - manager->fdstate[sock->fd] = MANAGER_CLOSE_PENDING; - else - manager->fdstate[sock->fd] = CLOSE_PENDING; - select_poke(manager, sock->fd, SELECT_POKE_CLOSE); ISC_LIST_UNLINK(manager->socklist, sock, link); #ifdef ISC_PLATFORM_USETHREADS @@ -1254,10 +1578,6 @@ destroy(isc_socket_t **sockp) { SIGNAL(&manager->shutdown_ok); #endif /* ISC_PLATFORM_USETHREADS */ - /* - * XXX should reset manager->maxfd here - */ - UNLOCK(&manager->lock); free_socket(sockp); @@ -1445,18 +1765,11 @@ clear_bsdcompat(void) { } #endif -/*% - * Create a new 'type' socket managed by 'manager'. Events - * will be posted to 'task' and when dispatched 'action' will be - * called with 'arg' as the arg value. The new socket is returned - * in 'socketp'. - */ -isc_result_t -isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, - isc_socket_t **socketp) -{ - isc_socket_t *sock = NULL; - isc_result_t result; +static isc_result_t +opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) { + char strbuf[ISC_STRERRORSIZE]; + const char *err = "socket"; + int tries = 0; #if defined(USE_CMSG) || defined(SO_BSDCOMPAT) int on = 1; #endif @@ -1464,31 +1777,20 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, ISC_SOCKADDR_LEN_T optlen; int size; #endif - char strbuf[ISC_STRERRORSIZE]; - const char *err = "socket"; - int tries = 0; - REQUIRE(VALID_MANAGER(manager)); - REQUIRE(socketp != NULL && *socketp == NULL); - - result = allocate_socket(manager, type, &sock); - if (result != ISC_R_SUCCESS) - return (result); - - sock->pf = pf; again: - switch (type) { + switch (sock->type) { case isc_sockettype_udp: - sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP); + sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP); break; case isc_sockettype_tcp: - sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP); + sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP); break; case isc_sockettype_unix: - sock->fd = socket(pf, SOCK_STREAM, 0); + sock->fd = socket(sock->pf, SOCK_STREAM, 0); break; case isc_sockettype_fdwatch: - INSIST(type != isc_sockettype_fdwatch); + INSIST(sock->type != isc_sockettype_fdwatch); break; } if (sock->fd == -1 && errno == EINTR && tries++ < 42) @@ -1509,20 +1811,17 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #endif - if (sock->fd >= (int)FD_SETSIZE) { + if (sock->fd >= (int)manager->maxsocks) { (void)close(sock->fd); isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_TOOMANYFDS, "%s: too many open file descriptors", "socket"); - free_socket(&sock); return (ISC_R_NORESOURCES); } if (sock->fd < 0) { - free_socket(&sock); - switch (errno) { case EMFILE: case ENFILE: @@ -1554,14 +1853,13 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, if (make_nonblock(sock->fd) != ISC_R_SUCCESS) { (void)close(sock->fd); - free_socket(&sock); return (ISC_R_UNEXPECTED); } #ifdef SO_BSDCOMPAT RUNTIME_CHECK(isc_once_do(&bsdcompat_once, clear_bsdcompat) == ISC_R_SUCCESS); - if (type != isc_sockettype_unix && bsdcompat && + if (sock->type != isc_sockettype_unix && bsdcompat && setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT, (void *)&on, sizeof(on)) < 0) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1590,7 +1888,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif #if defined(USE_CMSG) || defined(SO_RCVBUF) - if (type == isc_sockettype_udp) { + if (sock->type == isc_sockettype_udp) { #if defined(USE_CMSG) #if defined(SO_TIMESTAMP) @@ -1611,7 +1909,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif /* SO_TIMESTAMP */ #if defined(ISC_PLATFORM_HAVEIPV6) - if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { + if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { /* * Warn explicitly because this anomaly can be hidden * in usual operation (and unexpectedly appear later). @@ -1623,7 +1921,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #ifdef ISC_PLATFORM_HAVEIN6PKTINFO #ifdef IPV6_RECVPKTINFO /* RFC 3542 */ - if ((pf == AF_INET6) + if ((sock->pf == AF_INET6) && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, (void *)&on, sizeof(on)) < 0)) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1638,7 +1936,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #else /* RFC 2292 */ - if ((pf == AF_INET6) + if ((sock->pf == AF_INET6) && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO, (void *)&on, sizeof(on)) < 0)) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1655,7 +1953,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */ #ifdef IPV6_USE_MIN_MTU /* RFC 3542, not too common yet*/ /* use minimum MTU */ - if (pf == AF_INET6) { + if (sock->pf == AF_INET6) { (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU, (void *)&on, sizeof(on)); @@ -1687,25 +1985,64 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */ + return (ISC_R_SUCCESS); +} + +/*% + * Create a new 'type' socket managed by 'manager'. Events + * will be posted to 'task' and when dispatched 'action' will be + * called with 'arg' as the arg value. The new socket is returned + * in 'socketp'. + */ +isc_result_t +isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, + isc_socket_t **socketp) +{ + isc_socket_t *sock = NULL; + isc_result_t result; + int lockid; + + REQUIRE(VALID_MANAGER(manager)); + REQUIRE(socketp != NULL && *socketp == NULL); + + result = allocate_socket(manager, type, &sock); + if (result != ISC_R_SUCCESS) + return (result); + + sock->pf = pf; + result = opensocket(manager, sock); + if (result != ISC_R_SUCCESS) { + free_socket(&sock); + return (result); + } + memset(sock->name, 0, sizeof(sock->name)); sock->tag = NULL; sock->references = 1; *socketp = sock; - LOCK(&manager->lock); - /* * Note we don't have to lock the socket like we normally would because * there are no external references to it yet. */ + lockid = FDLOCK_ID(sock->fd); + LOCK(&manager->fdlock[lockid]); manager->fds[sock->fd] = sock; manager->fdstate[sock->fd] = MANAGED; +#ifdef USE_DEVPOLL + INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && + sock->manager->fdpollinfo[sock->fd].want_write == 0); +#endif + UNLOCK(&manager->fdlock[lockid]); + + LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, sock, link); +#ifdef USE_SELECT if (manager->maxfd < sock->fd) manager->maxfd = sock->fd; - +#endif UNLOCK(&manager->lock); socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, @@ -1714,6 +2051,48 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, return (ISC_R_SUCCESS); } +isc_result_t +isc_socket_open(isc_socket_t *sock) { + isc_result_t result; + + REQUIRE(VALID_SOCKET(sock)); + + LOCK(&sock->lock); + REQUIRE(sock->references == 1); + UNLOCK(&sock->lock); + /* + * We don't need to retain the lock hereafter, since no one else has + * this socket. + */ + REQUIRE(sock->fd == -1); + + result = opensocket(sock->manager, sock); + if (result != ISC_R_SUCCESS) + sock->fd = -1; + + if (result == ISC_R_SUCCESS) { + int lockid = FDLOCK_ID(sock->fd); + + LOCK(&sock->manager->fdlock[lockid]); + sock->manager->fds[sock->fd] = sock; + sock->manager->fdstate[sock->fd] = MANAGED; +#ifdef USE_DEVPOLL + INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && + sock->manager->fdpollinfo[sock->fd].want_write == 0); +#endif + UNLOCK(&sock->manager->fdlock[lockid]); + +#ifdef USE_SELECT + LOCK(&sock->manager->lock); + if (sock->manager->maxfd < sock->fd) + sock->manager->maxfd = sock->fd; + UNLOCK(&sock->manager->lock); +#endif + } + + return (result); +} + /* * Create a new 'type' socket managed by 'manager'. Events * will be posted to 'task' and when dispatched 'action' will be @@ -1727,6 +2106,7 @@ isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags, { isc_socket_t *sock = NULL; isc_result_t result; + int lockid; REQUIRE(VALID_MANAGER(manager)); REQUIRE(socketp != NULL && *socketp == NULL); @@ -1744,19 +2124,23 @@ isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags, sock->references = 1; *socketp = sock; - LOCK(&manager->lock); - /* * Note we don't have to lock the socket like we normally would because * there are no external references to it yet. */ + lockid = FDLOCK_ID(sock->fd); + LOCK(&manager->fdlock[lockid]); manager->fds[sock->fd] = sock; manager->fdstate[sock->fd] = MANAGED; + UNLOCK(&manager->fdlock[lockid]); + + LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, sock, link); +#ifdef USE_SELECT if (manager->maxfd < sock->fd) manager->maxfd = sock->fd; - +#endif UNLOCK(&manager->lock); if (flags & ISC_SOCKFDWATCH_READ) @@ -1811,6 +2195,42 @@ isc_socket_detach(isc_socket_t **socketp) { *socketp = NULL; } +void +isc_socket_close(isc_socket_t *sock) { + int fd; + + REQUIRE(VALID_SOCKET(sock)); + + LOCK(&sock->lock); + REQUIRE(sock->references == 1); + UNLOCK(&sock->lock); + /* + * We don't need to retain the lock hereafter, since no one else has + * this socket. + */ + + REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); + + INSIST(!sock->connecting); + INSIST(!sock->pending_recv); + INSIST(!sock->pending_send); + INSIST(!sock->pending_accept); + INSIST(ISC_LIST_EMPTY(sock->recv_list)); + INSIST(ISC_LIST_EMPTY(sock->send_list)); + INSIST(ISC_LIST_EMPTY(sock->accept_list)); + INSIST(sock->connect_ev == NULL); + + fd = sock->fd; + sock->fd = -1; + sock->listener = 0; + sock->connected = 0; + sock->connecting = 0; + sock->bound = 0; + isc_sockaddr_any(&sock->peer_address); + + closesocket(sock->manager, sock->type, fd); +} + /* * I/O is possible on a given socket. Schedule an event to this task that * will call an internal function to do the I/O. This will charge the @@ -1825,7 +2245,15 @@ dispatch_recv(isc_socket_t *sock) { isc_socketevent_t *ev; isc_task_t *sender; +#if 0 + /* + * XXXJT: this assertion seems to strong, but leave it here for + * reference. + */ INSIST(!sock->pending_recv); +#endif + if (sock->pending_recv != 0) + return; if (sock->type != isc_sockettype_fdwatch) { ev = ISC_LIST_HEAD(sock->recv_list); @@ -2132,7 +2560,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { sock->pf); (void)close(fd); goto soft_error; - } else if (fd >= (int)FD_SETSIZE) { + } else if (fd >= (int)manager->maxsocks) { isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, isc_msgcat, ISC_MSGSET_SOCKET, @@ -2172,6 +2600,13 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { * -1 means the new socket didn't happen. */ if (fd != -1) { + int lockid = FDLOCK_ID(fd); + + LOCK(&manager->fdlock[lockid]); + manager->fds[fd] = dev->newsocket; + manager->fdstate[fd] = MANAGED; + UNLOCK(&manager->fdlock[lockid]); + LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, dev->newsocket, link); @@ -2184,10 +2619,10 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { */ dev->address = dev->newsocket->peer_address; - manager->fds[fd] = dev->newsocket; - manager->fdstate[fd] = MANAGED; +#ifdef USE_SELECT if (manager->maxfd < fd) manager->maxfd = fd; +#endif socket_log(sock, &dev->newsocket->peer_address, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, @@ -2416,80 +2851,238 @@ internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) { UNLOCK(&sock->lock); } +/* + * Process read/writes on each fd here. Avoid locking + * and unlocking twice if both reads and writes are possible. + */ +static void +process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable, + isc_boolean_t writeable) +{ + isc_socket_t *sock; + isc_boolean_t unlock_sock; + isc_boolean_t needclose; + int lockid = FDLOCK_ID(fd); + + /* + * If we need to close the socket, do it now. + */ + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[fd] == CLOSE_PENDING + || manager->fdstate[fd] == MANAGER_CLOSE_PENDING) { + needclose = ISC_TF(manager->fdstate[fd] == CLOSE_PENDING); + manager->fdstate[fd] = CLOSED; + UNLOCK(&manager->fdlock[lockid]); + + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + if (needclose) + (void)close(fd); + return; + } + + sock = manager->fds[fd]; + UNLOCK(&manager->fdlock[lockid]); + unlock_sock = ISC_FALSE; + if (readable) { + if (sock == NULL) { + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + goto check_write; + } + unlock_sock = ISC_TRUE; + LOCK(&sock->lock); + if (!SOCK_DEAD(sock)) { + if (sock->listener) + dispatch_accept(sock); + else + dispatch_recv(sock); + } + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + } +check_write: + if (writeable) { + if (sock == NULL) { + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + return; + } + if (!unlock_sock) { + unlock_sock = ISC_TRUE; + LOCK(&sock->lock); + } + if (!SOCK_DEAD(sock)) { + if (sock->connecting) + dispatch_connect(sock); + else + dispatch_send(sock); + } + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + } + if (unlock_sock) + UNLOCK(&sock->lock); +} + +#ifdef USE_KQUEUE +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) { + int i; + isc_boolean_t readable, writable; + isc_boolean_t done = ISC_FALSE; + + if (nevents == manager->nevents) { + /* + * This is not an error, but something unexpected. If this + * happens, it may indicate the need for increasing + * ISC_SOCKET_MAXEVENTS. + */ + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].ident < manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) { + done = process_ctlfd(manager); + continue; + } +#endif + readable = ISC_TF(events[i].filter == EVFILT_READ); + writable = ISC_TF(events[i].filter == EVFILT_WRITE); + process_fd(manager, events[i].ident, readable, writable); + } + + return (done); +} +#elif defined(USE_EPOLL) +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) { + int i; + isc_boolean_t done = ISC_FALSE; + + if (nevents == manager->nevents) { + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].data.fd < (int)manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].data.fd == manager->pipe_fds[0]) { + done = process_ctlfd(manager); + continue; + } +#endif + if ((events[i].events & EPOLLERR) != 0 || + (events[i].events & EPOLLHUP) != 0) { + /* + * epoll does not set IN/OUT bits on an erroneous + * condition, so we need to try both anyway. This is a + * bit inefficient, but should be okay for such rare + * events. Note also that the read or write attempt + * won't block because we use non-blocking sockets. + */ + events[i].events |= (EPOLLIN | EPOLLOUT); + } + process_fd(manager, events[i].data.fd, + (events[i].events & EPOLLIN) != 0, + (events[i].events & EPOLLOUT) != 0); + } + + return (done); +} +#elif defined(USE_DEVPOLL) +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) { + int i; + isc_boolean_t done = ISC_FALSE; + + if (nevents == manager->nevents) { + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].fd < (int)manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].fd == manager->pipe_fds[0]) { + done = process_ctlfd(manager); + continue; + } +#endif + process_fd(manager, events[i].fd, + (events[i].events & POLLIN) != 0, + (events[i].events & POLLOUT) != 0); + } + + return (done); +} +#elif defined(USE_SELECT) static void process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds, fd_set *writefds) { int i; - isc_socket_t *sock; - isc_boolean_t unlock_sock; - REQUIRE(maxfd <= (int)FD_SETSIZE); + REQUIRE(maxfd <= (int)manager->maxsocks); - /* - * Process read/writes on other fds here. Avoid locking - * and unlocking twice if both reads and writes are possible. - */ for (i = 0; i < maxfd; i++) { #ifdef ISC_PLATFORM_USETHREADS if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) continue; #endif /* ISC_PLATFORM_USETHREADS */ - - /* - * If we need to close the socket, do it now. - */ - if (manager->fdstate[i] == CLOSE_PENDING - || manager->fdstate[i] == MANAGER_CLOSE_PENDING) { - FD_CLR(i, &manager->read_fds); - FD_CLR(i, &manager->write_fds); - if (manager->fdstate[i] == CLOSE_PENDING) - (void)close(i); - manager->fdstate[i] = CLOSED; - continue; - } - - sock = manager->fds[i]; - unlock_sock = ISC_FALSE; - if (FD_ISSET(i, readfds)) { - if (sock == NULL) { - FD_CLR(i, &manager->read_fds); - goto check_write; - } - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - if (!SOCK_DEAD(sock)) { - if (sock->listener) - dispatch_accept(sock); - else - dispatch_recv(sock); - } - FD_CLR(i, &manager->read_fds); - } - check_write: - if (FD_ISSET(i, writefds)) { - if (sock == NULL) { - FD_CLR(i, &manager->write_fds); - continue; - } - if (!unlock_sock) { - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - } - if (!SOCK_DEAD(sock)) { - if (sock->connecting) - dispatch_connect(sock); - else - dispatch_send(sock); - } - FD_CLR(i, &manager->write_fds); - } - if (unlock_sock) - UNLOCK(&sock->lock); + process_fd(manager, i, FD_ISSET(i, readfds), + FD_ISSET(i, writefds)); } } +#endif #ifdef ISC_PLATFORM_USETHREADS +static isc_boolean_t +process_ctlfd(isc_socketmgr_t *manager) { + int msg, fd; + + for (;;) { + select_readmsg(manager, &fd, &msg); + + manager_log(manager, IOEVENT, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_WATCHERMSG, + "watcher got message %d " + "for socket %d"), msg, fd); + + /* + * Nothing to read? + */ + if (msg == SELECT_POKE_NOTHING) + return (ISC_FALSE); + + /* + * Handle shutdown message. We really should + * jump out of this loop right away, but + * it doesn't matter if we have to do a little + * more work first. + */ + if (msg == SELECT_POKE_SHUTDOWN) + return (ISC_TRUE); + + /* + * This is a wakeup on a socket. Look + * at the event queue for both read and write, + * and decide if we need to watch on it now + * or not. + */ + wakeup_socket(manager, fd, msg); + } + + return (ISC_FALSE); +} + /* * This is the thread that will loop forever, always in a select or poll * call. @@ -2503,97 +3096,77 @@ watcher(void *uap) { isc_boolean_t done; int ctlfd; int cc; +#ifdef USE_KQUEUE + const char *fnname = "kevent()"; +#elif defined (USE_EPOLL) + const char *fnname = "epoll_wait()"; +#elif defined(USE_DEVPOLL) + const char *fnname = "ioctl(DP_POLL)"; + struct dvpoll dvp; +#elif defined (USE_SELECT) + const char *fnname = "select()"; fd_set readfds; fd_set writefds; - int msg, fd; int maxfd; +#endif char strbuf[ISC_STRERRORSIZE]; /* * Get the control fd here. This will never change. */ - LOCK(&manager->lock); ctlfd = manager->pipe_fds[0]; - done = ISC_FALSE; while (!done) { do { +#ifdef USE_KQUEUE + cc = kevent(manager->kqueue_fd, NULL, 0, + manager->events, manager->nevents, NULL); +#elif defined(USE_EPOLL) + cc = epoll_wait(manager->epoll_fd, manager->events, + manager->nevents, -1); +#elif defined(USE_DEVPOLL) + dvp.dp_fds = manager->events; + dvp.dp_nfds = manager->nevents; + dvp.dp_timeout = -1; + cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp); +#elif defined(USE_SELECT) + LOCK(&manager->lock); readfds = manager->read_fds; writefds = manager->write_fds; maxfd = manager->maxfd + 1; - UNLOCK(&manager->lock); cc = select(maxfd, &readfds, &writefds, NULL, NULL); - if (cc < 0) { - if (!SOFT_ERROR(errno)) { - isc__strerror(errno, strbuf, - sizeof(strbuf)); - FATAL_ERROR(__FILE__, __LINE__, - "select() %s: %s", - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, - "failed"), - strbuf); - } - } +#endif /* USE_KQUEUE */ - LOCK(&manager->lock); + if (cc < 0 && !SOFT_ERROR(errno)) { + isc__strerror(errno, strbuf, sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + "%s %s: %s", fnname, + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), strbuf); + } } while (cc < 0); - +#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL) + done = process_fds(manager, manager->events, cc); +#elif defined(USE_SELECT) /* * Process reads on internal, control fd. */ - if (FD_ISSET(ctlfd, &readfds)) { - for (;;) { - select_readmsg(manager, &fd, &msg); - - manager_log(manager, IOEVENT, - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_SOCKET, - ISC_MSG_WATCHERMSG, - "watcher got message %d" - " for socket %d"), - msg, fd); - - /* - * Nothing to read? - */ - if (msg == SELECT_POKE_NOTHING) - break; - - /* - * Handle shutdown message. We really should - * jump out of this loop right away, but - * it doesn't matter if we have to do a little - * more work first. - */ - if (msg == SELECT_POKE_SHUTDOWN) { - done = ISC_TRUE; - - break; - } - - /* - * This is a wakeup on a socket. Look - * at the event queue for both read and write, - * and decide if we need to watch on it now - * or not. - */ - wakeup_socket(manager, fd, msg); - } - } + if (FD_ISSET(ctlfd, &readfds)) + done = process_ctlfd(manager); process_fds(manager, maxfd, &readfds, &writefds); +#endif } manager_log(manager, TRACE, isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_EXITING, "watcher exiting")); - UNLOCK(&manager->lock); return ((isc_threadresult_t)0); } #endif /* ISC_PLATFORM_USETHREADS */ @@ -2601,14 +3174,157 @@ watcher(void *uap) { /* * Create a new socket manager. */ + +static isc_result_t +setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { + isc_result_t result; + +#ifdef USE_KQUEUE + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct kevent) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + manager->kqueue_fd = kqueue(); + if (manager->kqueue_fd == -1) { + result = isc__errno2result(errno); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); + return (result); + } + +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->kqueue_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_EPOLL) + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + manager->epoll_fd = epoll_create(manager->nevents); + if (manager->epoll_fd == -1) { + result = isc__errno2result(errno); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); + return (result); + } +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->epoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_DEVPOLL) + /* + * XXXJT: /dev/poll seems to reject large numbers of events, + * so we should be careful about redefining ISC_SOCKET_MAXEVENTS. + */ + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct pollfd) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + /* + * Note: fdpollinfo should be able to support all possible FDs, so + * it must have maxsocks entries (not nevents). + */ + manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) * + manager->maxsocks); + if (manager->fdpollinfo == NULL) { + isc_mem_put(mctx, manager->events, + sizeof(pollinfo_t) * manager->maxsocks); + return (ISC_R_NOMEMORY); + } + memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks); + manager->devpoll_fd = open("/dev/poll", O_RDWR); + if (manager->devpoll_fd == -1) { + result = isc__errno2result(errno); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); + return (result); + } +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->devpoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_SELECT) + UNUSED(mctx); + UNUSED(result); + + FD_ZERO(&manager->read_fds); + FD_ZERO(&manager->write_fds); +#ifdef ISC_PLATFORM_USETHREADS + (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + manager->maxfd = manager->pipe_fds[0]; +#else /* ISC_PLATFORM_USETHREADS */ + manager->maxfd = 0; +#endif /* ISC_PLATFORM_USETHREADS */ +#endif /* USE_KQUEUE */ + + return (ISC_R_SUCCESS); +} + +static void +cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { +#ifdef ISC_PLATFORM_USETHREADS + isc_result_t result; + + result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + UNEXPECTED_ERROR(__FILE__, __LINE__, + "epoll_ctl(DEL) %s", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed")); + } +#endif /* ISC_PLATFORM_USETHREADS */ + +#ifdef USE_KQUEUE + close(manager->kqueue_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); +#elif defined(USE_EPOLL) + close(manager->epoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); +#elif defined(USE_DEVPOLL) + close(manager->devpoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); +#elif defined(USE_SELECT) + UNUSED(mctx); + UNUSED(manager); +#endif /* USE_KQUEUE */ +} + isc_result_t isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { + int i; isc_socketmgr_t *manager; #ifdef ISC_PLATFORM_USETHREADS char strbuf[ISC_STRERRORSIZE]; #endif isc_result_t result; - REQUIRE(managerp != NULL && *managerp == NULL); #ifndef ISC_PLATFORM_USETHREADS @@ -2623,24 +3339,59 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { if (manager == NULL) return (ISC_R_NOMEMORY); + /* zero-clear so that necessary cleanup on failure will be easy */ + memset(manager, 0, sizeof(*manager)); + +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) + manager->maxsocks = ISC_SOCKET_MAXSOCKETS; +#elif defined (USE_SELECT) + manager->maxsocks = FD_SETSIZE; +#endif + + manager->fds = isc_mem_get(mctx, + manager->maxsocks * sizeof(isc_socket_t *)); + if (manager->fds == NULL) { + result = ISC_R_NOMEMORY; + goto free_manager; + } + manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int)); + if (manager->fds == NULL) { + result = ISC_R_NOMEMORY; + goto free_manager; + } + manager->magic = SOCKET_MANAGER_MAGIC; manager->mctx = NULL; - memset(manager->fds, 0, sizeof(manager->fds)); + memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); ISC_LIST_INIT(manager->socklist); result = isc_mutex_init(&manager->lock); - if (result != ISC_R_SUCCESS) { - isc_mem_put(mctx, manager, sizeof(*manager)); - return (result); + if (result != ISC_R_SUCCESS) + goto free_manager; + manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t)); + if (manager->fdlock == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup_lock; } + for (i = 0; i < FDLOCK_COUNT; i++) { + result = isc_mutex_init(&manager->fdlock[i]); + if (result != ISC_R_SUCCESS) { + while (--i >= 0) + DESTROYLOCK(&manager->fdlock[i]); + isc_mem_put(mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + manager->fdlock = NULL; + goto cleanup_lock; + } + } + #ifdef ISC_PLATFORM_USETHREADS if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) { - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_condition_init() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); - return (ISC_R_UNEXPECTED); + result = ISC_R_UNEXPECTED; + goto cleanup_lock; } /* @@ -2648,16 +3399,14 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { * select/poll loop when something internal needs to be done. */ if (pipe(manager->pipe_fds) != 0) { - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); isc__strerror(errno, strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() %s: %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed"), strbuf); - - return (ISC_R_UNEXPECTED); + result = ISC_R_UNEXPECTED; + goto cleanup_condition; } RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS); @@ -2671,15 +3420,10 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { /* * Set up initial state for the select loop */ - FD_ZERO(&manager->read_fds); - FD_ZERO(&manager->write_fds); -#ifdef ISC_PLATFORM_USETHREADS - FD_SET(manager->pipe_fds[0], &manager->read_fds); - manager->maxfd = manager->pipe_fds[0]; -#else /* ISC_PLATFORM_USETHREADS */ - manager->maxfd = 0; -#endif /* ISC_PLATFORM_USETHREADS */ - memset(manager->fdstate, 0, sizeof(manager->fdstate)); + result = setup_watcher(mctx, manager); + if (result != ISC_R_SUCCESS) + goto cleanup; + memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); #ifdef ISC_PLATFORM_USETHREADS /* @@ -2687,15 +3431,13 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { */ if (isc_thread_create(watcher, manager, &manager->watcher) != ISC_R_SUCCESS) { - (void)close(manager->pipe_fds[0]); - (void)close(manager->pipe_fds[1]); - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_thread_create() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); - return (ISC_R_UNEXPECTED); + cleanup_watcher(mctx, manager); + result = ISC_R_UNEXPECTED; + goto cleanup; } #endif /* ISC_PLATFORM_USETHREADS */ isc_mem_attach(mctx, &manager->mctx); @@ -2706,6 +3448,42 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { *managerp = manager; return (ISC_R_SUCCESS); + +cleanup: +#ifdef ISC_PLATFORM_USETHREADS + (void)close(manager->pipe_fds[0]); + (void)close(manager->pipe_fds[1]); +#endif /* ISC_PLATFORM_USETHREADS */ + +#ifdef ISC_PLATFORM_USETHREADS +cleanup_condition: + (void)isc_condition_destroy(&manager->shutdown_ok); +#endif /* ISC_PLATFORM_USETHREADS */ + + +cleanup_lock: + if (manager->fdlock != NULL) { + for (i = 0; i < FDLOCK_COUNT; i++) + DESTROYLOCK(&manager->fdlock[i]); + } + DESTROYLOCK(&manager->lock); + +free_manager: + if (manager->fdlock != NULL) { + isc_mem_put(mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + } + if (manager->fdstate != NULL) { + isc_mem_put(mctx, manager->fdstate, + manager->maxsocks * sizeof(int)); + } + if (manager->fds != NULL) { + isc_mem_put(mctx, manager->fds, + manager->maxsocks * sizeof(isc_socket_t *)); + } + isc_mem_put(mctx, manager, sizeof(*manager)); + + return (result); } void @@ -2779,16 +3557,29 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) { /* * Clean up. */ + cleanup_watcher(manager->mctx, manager); + #ifdef ISC_PLATFORM_USETHREADS (void)close(manager->pipe_fds[0]); (void)close(manager->pipe_fds[1]); (void)isc_condition_destroy(&manager->shutdown_ok); #endif /* ISC_PLATFORM_USETHREADS */ - for (i = 0; i < (int)FD_SETSIZE; i++) - if (manager->fdstate[i] == CLOSE_PENDING) + for (i = 0; i < (int)manager->maxsocks; i++) + if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ (void)close(i); + isc_mem_put(manager->mctx, manager->fds, + manager->maxsocks * sizeof(isc_socket_t *)); + isc_mem_put(manager->mctx, manager->fdstate, + manager->maxsocks * sizeof(int)); + + if (manager->fdlock != NULL) { + for (i = 0; i < FDLOCK_COUNT; i++) + DESTROYLOCK(&manager->fdlock[i]); + isc_mem_put(manager->mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + } DESTROYLOCK(&manager->lock); manager->magic = 0; mctx= manager->mctx; @@ -4005,26 +4796,84 @@ isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) { } #ifndef ISC_PLATFORM_USETHREADS -void -isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) { +/* In our assumed scenario, we can simply use a single static object. */ +static isc_socketwait_t swait_private; + +int +isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) { + int n; +#ifdef USE_KQUEUE + struct timespec ts, *tsp; +#endif +#ifdef USE_EPOLL + int timeout; +#endif +#ifdef USE_DEVPOLL + struct dvpoll dvp; +#endif + + REQUIRE(swaitp != NULL && *swaitp == NULL); + if (socketmgr == NULL) - *maxfd = 0; - else { - *readset = socketmgr->read_fds; - *writeset = socketmgr->write_fds; - *maxfd = socketmgr->maxfd + 1; - } + return (0); + +#ifdef USE_KQUEUE + if (tvp != NULL) { + ts.tv_sec = tvp->tv_sec; + ts.tv_nsec = tvp->tv_usec * 1000; + tsp = &ts; + } else + tsp = NULL; + swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0, + socketmgr->events, socketmgr->nevents, + tsp); + n = swait_private.nevents; +#elif defined(USE_EPOLL) + if (tvp != NULL) + timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000; + else + timeout = -1; + swait_private.nevents = epoll_wait(socketmgr->epoll_fd, + socketmgr->events, + socketmgr->nevents, timeout); + n = swait_private.nevents; +#elif defined(USE_DEVPOLL) + dvp.dp_fds = socketmgr->events; + dvp.dp_nfds = socketmgr->nevents; + if (tvp != NULL) { + dvp.dp_timeout = tvp->tv_sec * 1000 + + (tvp->tv_usec + 999) / 1000; + } else + dvp.dp_timeout = -1; + swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp); + n = swait_private.nevents; +#elif defined(USE_SELECT) + swait_private.readset = socketmgr->read_fds; + swait_private.writeset = socketmgr->write_fds; + swait_private.maxfd = socketmgr->maxfd + 1; + + n = select(swait_private.maxfd, &swait_private.readset, + &swait_private.writeset, NULL, tvp); +#endif + + *swaitp = &swait_private; + return (n); } isc_result_t -isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) { - isc_socketmgr_t *manager = socketmgr; +isc__socketmgr_dispatch(isc_socketwait_t *swait) { + REQUIRE(swait == &swait_private); - if (manager == NULL) + if (socketmgr == NULL) return (ISC_R_NOTFOUND); - process_fds(manager, maxfd, readset, writeset); +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) + (void)process_fds(socketmgr, socketmgr->events, swait->nevents); return (ISC_R_SUCCESS); +#elif defined(USE_SELECT) + process_fds(socketmgr, swait->maxfd, &swait->readset, &swait->writeset); + return (ISC_R_SUCCESS); +#endif } #endif /* ISC_PLATFORM_USETHREADS */ diff --git a/lib/isc/unix/socket_p.h b/lib/isc/unix/socket_p.h index 76df84e87c..9881ee5f74 100644 --- a/lib/isc/unix/socket_p.h +++ b/lib/isc/unix/socket_p.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket_p.h,v 1.11 2007/06/19 23:47:18 tbox Exp $ */ +/* $Id: socket_p.h,v 1.12 2008/06/23 19:41:20 jinmei Exp $ */ #ifndef ISC_SOCKET_P_H #define ISC_SOCKET_P_H @@ -26,10 +26,7 @@ #include #endif -void -isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd); - -isc_result_t -isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd); - +typedef struct isc_socketwait isc_socketwait_t; +int isc__socketmgr_waitevents(struct timeval *, isc_socketwait_t **); +isc_result_t isc__socketmgr_dispatch(isc_socketwait_t *); #endif /* ISC_SOCKET_P_H */ diff --git a/lib/isccfg/namedconf.c b/lib/isccfg/namedconf.c index 8a195a8276..2cccd331fb 100644 --- a/lib/isccfg/namedconf.c +++ b/lib/isccfg/namedconf.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: namedconf.c,v 1.87 2008/04/03 02:01:08 marka Exp $ */ +/* $Id: namedconf.c,v 1.88 2008/06/23 19:41:20 jinmei Exp $ */ /*! \file */ @@ -547,11 +547,19 @@ static cfg_type_t cfg_type_serverid = { /*% * Port list. */ -static isc_result_t -parse_port(cfg_parser_t *pctx, const cfg_type_t *type, cfg_obj_t **ret) { - isc_result_t result; +static cfg_tuplefielddef_t porttuple_fields[] = { + { "loport", &cfg_type_uint32, 0 }, + { "hiport", &cfg_type_uint32, 0 }, + { NULL, NULL, 0 } +}; +static cfg_type_t cfg_type_porttuple = { + "porttuple", cfg_parse_tuple, cfg_print_tuple, cfg_doc_tuple, + &cfg_rep_tuple, porttuple_fields +}; - UNUSED(type); +static isc_result_t +parse_port(cfg_parser_t *pctx, cfg_obj_t **ret) { + isc_result_t result; CHECK(cfg_parse_uint32(pctx, NULL, ret)); if ((*ret)->value.uint32 > 0xffff) { @@ -559,18 +567,60 @@ parse_port(cfg_parser_t *pctx, const cfg_type_t *type, cfg_obj_t **ret) { cfg_obj_destroy(pctx, ret); result = ISC_R_RANGE; } + cleanup: return (result); } -static cfg_type_t cfg_type_port = { - "port", parse_port, NULL, cfg_doc_terminal, +static isc_result_t +parse_portrange(cfg_parser_t *pctx, const cfg_type_t *type, cfg_obj_t **ret) { + isc_result_t result; + cfg_obj_t *obj = NULL; + + UNUSED(type); + + CHECK(cfg_peektoken(pctx, ISC_LEXOPT_NUMBER | ISC_LEXOPT_CNUMBER)); + if (pctx->token.type == isc_tokentype_number) + CHECK(parse_port(pctx, ret)); + else { + CHECK(cfg_gettoken(pctx, 0)); + if (pctx->token.type != isc_tokentype_string || + strcasecmp(TOKEN_STRING(pctx), "range") != 0) { + cfg_parser_error(pctx, CFG_LOG_NEAR, + "expected integer or 'range'"); + return (ISC_R_UNEXPECTEDTOKEN); + } + CHECK(cfg_create_tuple(pctx, &cfg_type_porttuple, &obj)); + CHECK(parse_port(pctx, &obj->value.tuple[0])); + CHECK(parse_port(pctx, &obj->value.tuple[1])); + if (obj->value.tuple[0]->value.uint32 > + obj->value.tuple[1]->value.uint32) { + cfg_parser_error(pctx, CFG_LOG_NOPREP, + "low port '%u' must not be larger " + "than high port", + obj->value.tuple[0]->value.uint32); + result = ISC_R_RANGE; + goto cleanup; + } + *ret = obj; + obj = NULL; + } + + cleanup: + if (obj != NULL) + cfg_obj_destroy(pctx, &obj); + return (result); +} + +static cfg_type_t cfg_type_portrange = { + "portrange", parse_portrange, NULL, cfg_doc_terminal, NULL, NULL }; static cfg_type_t cfg_type_bracketed_portlist = { - "bracketed_sockaddrlist", cfg_parse_bracketed_list, cfg_print_bracketed_list, cfg_doc_bracketed_list, - &cfg_rep_list, &cfg_type_port + "bracketed_sockaddrlist", cfg_parse_bracketed_list, + cfg_print_bracketed_list, cfg_doc_bracketed_list, + &cfg_rep_list, &cfg_type_portrange }; /*% @@ -611,6 +661,8 @@ namedconf_or_view_clauses[] = { */ static cfg_clausedef_t options_clauses[] = { + { "use-v4-udp-ports", &cfg_type_bracketed_portlist, 0 }, + { "use-v6-udp-ports", &cfg_type_bracketed_portlist, 0 }, { "avoid-v4-udp-ports", &cfg_type_bracketed_portlist, 0 }, { "avoid-v6-udp-ports", &cfg_type_bracketed_portlist, 0 }, { "blackhole", &cfg_type_bracketed_aml, 0 }, @@ -782,8 +834,9 @@ view_clauses[] = { */ { "query-source", &cfg_type_querysource4, 0 }, { "query-source-v6", &cfg_type_querysource6, 0 }, - { "queryport-pool-ports", &cfg_type_uint32, 0 }, - { "queryport-pool-updateinterval", &cfg_type_uint32, 0 }, + { "queryport-pool-ports", &cfg_type_uint32, CFG_CLAUSEFLAG_OBSOLETE }, + { "queryport-pool-updateinterval", &cfg_type_uint32, + CFG_CLAUSEFLAG_OBSOLETE }, { "recursion", &cfg_type_boolean, 0 }, { "request-ixfr", &cfg_type_boolean, 0 }, { "request-nsid", &cfg_type_boolean, 0 }, @@ -794,7 +847,7 @@ view_clauses[] = { { "suppress-initial-notify", &cfg_type_boolean, CFG_CLAUSEFLAG_NYI }, { "topology", &cfg_type_bracketed_aml, CFG_CLAUSEFLAG_NOTIMP }, { "transfer-format", &cfg_type_transferformat, 0 }, - { "use-queryport-pool", &cfg_type_boolean, 0 }, + { "use-queryport-pool", &cfg_type_boolean, CFG_CLAUSEFLAG_OBSOLETE }, { "zero-no-soa-ttl-cache", &cfg_type_boolean, 0 }, { NULL, NULL, 0 } };