Mirror of https://gitlab.isc.org/isc-projects/bind9
The DNS Flag Day 2020 aims to remove the IP fragmentation problem from UDP DNS communication. This commit implements the required changes and simplifies the logic for picking the EDNS buffer size:

1. The defaults for `edns-udp-size`, `max-udp-size`, and `nocookie-udp-size` have been changed to `1232` (the value picked by DNS Flag Day 2020).
2. The probing heuristic that tried buffer sizes in the order 512->4096->1432->1232 has been removed; the resolver now always uses the `edns-udp-size` value.
3. Instead of merely disabling the PMTUD mechanism on UDP sockets, we now set the IP_DONTFRAG (IPV6_DONTFRAG) flag, so UDP packets never get fragmented. If the ICMP packets are lost, the UDP query simply times out and is eventually retried over TCP.
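A minimal sketch of how change 3 can be applied to a UDP socket follows. The helper below is illustrative, not the BIND code itself: IPV6_DONTFRAG is the RFC 3542 option, IP_DONTFRAG exists on e.g. FreeBSD and Solaris, and Linux expresses the same intent via IP_MTU_DISCOVER (with subtly different path-MTU-discovery semantics), hence the hedging #ifdefs.

#include <netinet/in.h>
#include <sys/socket.h>

/* Hypothetical helper: forbid IP-layer fragmentation on a UDP socket. */
static void
set_dontfrag(int fd, int family) {
        if (family == AF_INET6) {
#ifdef IPV6_DONTFRAG /* RFC 3542 */
                int on = 1;
                (void)setsockopt(fd, IPPROTO_IPV6, IPV6_DONTFRAG, &on,
                                 sizeof(on));
#endif
        } else {
#if defined(IP_DONTFRAG) /* FreeBSD, Solaris */
                int on = 1;
                (void)setsockopt(fd, IPPROTO_IP, IP_DONTFRAG, &on, sizeof(on));
#elif defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DO) /* Linux */
                /* Sets the DF bit on outgoing packets. */
                int action = IP_PMTUDISC_DO;
                (void)setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &action,
                                 sizeof(action));
#endif
        }
}

With fragmentation forbidden, an oversized reply is simply lost, the query times out, and the client retries over TCP — exactly the failure mode the 1232-byte default is meant to make rare.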
/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*! \file */

#include <inttypes.h>
#include <stdbool.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
#include <sys/sysctl.h>
#endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
#include <sys/time.h>
#include <sys/uio.h>

#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
        * */

#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdlib.h>
#include <sys/un.h>
#include <unistd.h>

#include <isc/app.h>
#include <isc/buffer.h>
#include <isc/condition.h>
#include <isc/formatcheck.h>
#include <isc/list.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/mutex.h>
#include <isc/net.h>
#include <isc/once.h>
#include <isc/platform.h>
#include <isc/print.h>
#include <isc/refcount.h>
#include <isc/region.h>
#include <isc/resource.h>
#include <isc/socket.h>
#include <isc/stats.h>
#include <isc/strerr.h>
#include <isc/string.h>
#include <isc/task.h>
#include <isc/thread.h>
#include <isc/util.h>

#ifdef HAVE_KQUEUE
#include <sys/event.h>
#endif /* ifdef HAVE_KQUEUE */
#ifdef HAVE_EPOLL_CREATE1
#include <sys/epoll.h>
#endif /* ifdef HAVE_EPOLL_CREATE1 */
#if defined(HAVE_SYS_DEVPOLL_H)
#include <sys/devpoll.h>
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif /* if defined(HAVE_SYS_DEVPOLL_H) */

#include <netinet/tcp.h>

#include "errno2result.h"

#ifdef ENABLE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif /* ifdef ENABLE_TCP_FASTOPEN */

#ifdef HAVE_JSON_C
#include <json_object.h>
#endif /* HAVE_JSON_C */

#ifdef HAVE_LIBXML2
#include <libxml/xmlwriter.h>
#define ISC_XMLCHAR (const xmlChar *)
#endif /* HAVE_LIBXML2 */

/*%
 * Choose the most preferable multiplex method.
 */
#if defined(HAVE_KQUEUE)
#define USE_KQUEUE
#elif defined(HAVE_EPOLL_CREATE1)
#define USE_EPOLL
#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
#define USE_DEVPOLL
typedef struct {
        unsigned int want_read : 1, want_write : 1;
} pollinfo_t;
#else /* if defined(HAVE_KQUEUE) */
#define USE_SELECT
#endif /* HAVE_KQUEUE */

/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

/*%
 * Maximum number of allowable open sockets. This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system
 * default of) FD_SETSIZE descriptors, and the default size should in fact be
 * fine in the vast majority of cases. This constant should therefore be
 * increased only when absolutely necessary and possible, i.e., the server is
 * exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible). Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative-only DNS server.
 */
#ifndef ISC_SOCKET_MAXSOCKETS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXSOCKETS 4096
#endif /* TUNE_LARGE */
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif /* USE_KQUEUE... */
#endif /* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified at run time.
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif /* __APPLE__ */
#endif /* USE_SELECT */

#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable a workaround for a Solaris /dev/poll
 * kernel bug: the DP_POLL ioctl could keep sleeping even if socket I/O is
 * possible for some of the specified FDs. The idea is based on the
 * observation that it's likely for a busy server to keep receiving packets.
 * It specifically works as follows: the socket watcher is first initialized
 * with the state of "poll_idle". While it's in the idle state it keeps
 * sleeping until a socket event occurs. When it wakes up for a socket I/O
 * event, it moves to the poll_active state, and sets the poll timeout to a
 * short period (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If a timeout occurs in
 * this state, the watcher goes to the poll_checking state with the same
 * timeout period. In this state, the watcher tries to detect whether this is
 * a break during intermittent events or the kernel bug has been triggered.
 * If the next polling reports an event within the short period, the previous
 * timeout is likely to be a kernel bug, and so the watcher goes back to the
 * active state. Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen it with threads, this workaround is used only when threads are
 * enabled.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif /* ISC_SOCKET_USE_POLLWATCH */

/*%
 * Per-FD lock buckets; we shuffle them around a bit as FDs come in herds.
 */
#define FDLOCK_BITS  10
#define FDLOCK_COUNT (1 << FDLOCK_BITS)
#define FDLOCK_ID(fd)                                   \
        (((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
         (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
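/*
 * FDLOCK_ID() swaps the high and low halves of the fd's low FDLOCK_BITS
 * bits, so that a herd of consecutive descriptors is scattered across
 * distant lock buckets instead of contending on neighbouring ones; e.g.
 * with FDLOCK_BITS == 10, FDLOCK_ID(4) == 128 and FDLOCK_ID(5) == 160.
 */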

/*%
 * Maximum number of events communicated with the kernel. There should
 * normally be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS 2048
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXEVENTS 64
#endif /* TUNE_LARGE */
#endif /* ifndef ISC_SOCKET_MAXEVENTS */
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
        * */

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t. This is here so it can be easily changed if needed.
 */
#ifndef socklen_t
#define socklen_t unsigned int
#endif /* ifndef socklen_t */

/*%
 * Define what the possible "soft" errors can be. These are non-fatal returns
 * of various network-related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0. This is broken, but we have to
 * work around it here.
 */
#define SOFT_ERROR(e)                                             \
        ((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
         (e) == EINTR || (e) == 0)

#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90) -- Function entry/exit and other tracing.
 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
 * DLVL(60) -- Socket data send/receive
 * DLVL(50) -- Event tracing, including receiving/sending completion events.
 * DLVL(20) -- Socket creation/destruction.
 */
#define TRACE_LEVEL       90
#define CORRECTNESS_LEVEL 70
#define IOEVENT_LEVEL     60
#define EVENT_LEVEL       50
#define CREATION_LEVEL    20

#define TRACE       DLVL(TRACE_LEVEL)
#define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
#define IOEVENT     DLVL(IOEVENT_LEVEL)
#define EVENT       DLVL(EVENT_LEVEL)
#define CREATION    DLVL(CREATION_LEVEL)

typedef isc_event_t intev_t;

#define SOCKET_MAGIC    ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)

/*!
 * IPv6 control information. If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */

/*%
 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
 * a setsockopt()-like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */
#endif /* ifdef SO_TIMESTAMP */

#if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
#define SET_RCVBUF
#endif

#if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
#define SET_SNDBUF
#endif

/*%
 * Instead of calculating the cmsgbuf lengths every time, we take a
 * rule-of-thumb approach: sizes are taken from x86_64 Linux and multiplied
 * by 2, so everything should fit. Those sizes are not large enough to cause
 * any concern.
 */
#if defined(USE_CMSG)
#define CMSG_SP_IN6PKT 40
#else /* if defined(USE_CMSG) */
#define CMSG_SP_IN6PKT 0
#endif /* if defined(USE_CMSG) */

#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
#define CMSG_SP_TIMESTAMP 32
#else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
#define CMSG_SP_TIMESTAMP 0
#endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */

#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
#define CMSG_SP_TCTOS 24
#else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
#define CMSG_SP_TCTOS 0
#endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */

#define CMSG_SP_INT 24

/* Align cmsg buffers to be safe on SPARC etc. */
#define RECVCMSGBUFLEN                                                       \
        ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
                          1,                                                 \
                  sizeof(void *))
#define SENDCMSGBUFLEN                                                    \
        ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
                  sizeof(void *))

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;
typedef struct isc__socketthread isc__socketthread_t;

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

struct isc__socket {
        /* Not locked. */
        isc_socket_t common;
        isc__socketmgr_t *manager;
        isc_mutex_t lock;
        isc_sockettype_t type;
        const isc_statscounter_t *statsindex;
        isc_refcount_t references;

        /* Locked by socket lock. */
        ISC_LINK(isc__socket_t) link;
        int fd;
        int pf;
        int threadid;
        char name[16];
        void *tag;

        ISC_LIST(isc_socketevent_t) send_list;
        ISC_LIST(isc_socketevent_t) recv_list;
        ISC_LIST(isc_socket_newconnev_t) accept_list;
        ISC_LIST(isc_socket_connev_t) connect_list;

        isc_sockaddr_t peer_address; /* remote address */

        unsigned int listener : 1,             /* listener socket */
                connected : 1, connecting : 1, /* connect pending */
                bound : 1,                     /* bound to local addr */
                dupped : 1, active : 1,        /* currently active */
                pktdscp : 1;                   /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
        unsigned char overflow; /* used for MSG_TRUNC fake */
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

        unsigned int dscp;
};

#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

struct isc__socketmgr {
        /* Not locked. */
        isc_socketmgr_t common;
        isc_mem_t *mctx;
        isc_mutex_t lock;
        isc_stats_t *stats;
        int nthreads;
        isc__socketthread_t *threads;
        unsigned int maxsocks;
        /* Locked by manager lock. */
        ISC_LIST(isc__socket_t) socklist;
        int reserved; /* unlocked */
        isc_condition_t shutdown_ok;
        size_t maxudp;
};

struct isc__socketthread {
        isc__socketmgr_t *manager;
        int threadid;
        isc_thread_t thread;
        int pipe_fds[2];
        isc_mutex_t *fdlock;
        /* Locked by fdlock. */
        isc__socket_t **fds;
        int *fdstate;
#ifdef USE_KQUEUE
        int kqueue_fd;
        int nevents;
        struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
        int epoll_fd;
        int nevents;
        struct epoll_event *events;
        uint32_t *epoll_events;
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
        int devpoll_fd;
        isc_resourcevalue_t open_max;
        unsigned int calls;
        int nevents;
        struct pollfd *events;
        pollinfo_t *fdpollinfo;
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
        int fd_bufsize;
        fd_set *read_fds;
        fd_set *read_fds_copy;
        fd_set *write_fds;
        fd_set *write_fds_copy;
        int maxfd;
#endif /* USE_SELECT */
};

#define CLOSED        0 /* this one must be zero */
#define MANAGED       1
#define CLOSE_PENDING 2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_PLATFORM_RECVOVERFLOW
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
#else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

static isc_result_t
socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
              isc_socket_t **socketp, isc_socket_t *dup_socket);
static void
send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void
send_senddone_event(isc__socket_t *, isc_socketevent_t **);
static void
send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
static void
free_socket(isc__socket_t **);
static isc_result_t
allocate_socket(isc__socketmgr_t *, isc_sockettype_t, isc__socket_t **);
static void
destroy(isc__socket_t **);
static void
internal_accept(isc__socket_t *);
static void
internal_connect(isc__socket_t *);
static void
internal_recv(isc__socket_t *);
static void
internal_send(isc__socket_t *);
static void
process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
static void
build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
                  struct iovec *, size_t *);
static void
build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
                  struct iovec *, size_t *);
static bool
process_ctlfd(isc__socketthread_t *thread);
static void
setdscp(isc__socket_t *sock, isc_dscp_t dscp);

#define SELECT_POKE_SHUTDOWN (-1)
#define SELECT_POKE_NOTHING  (-2)
#define SELECT_POKE_READ     (-3)
#define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE    (-4)
#define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE    (-5)

/*%
 * Shortcut index arrays to get access to statistics counters.
 */
enum { STATID_OPEN = 0,
       STATID_OPENFAIL = 1,
       STATID_CLOSE = 2,
       STATID_BINDFAIL = 3,
       STATID_CONNECTFAIL = 4,
       STATID_CONNECT = 5,
       STATID_ACCEPTFAIL = 6,
       STATID_ACCEPT = 7,
       STATID_SENDFAIL = 8,
       STATID_RECVFAIL = 9,
       STATID_ACTIVE = 10 };
static const isc_statscounter_t udp4statsindex[] = {
        isc_sockstatscounter_udp4open,
        isc_sockstatscounter_udp4openfail,
        isc_sockstatscounter_udp4close,
        isc_sockstatscounter_udp4bindfail,
        isc_sockstatscounter_udp4connectfail,
        isc_sockstatscounter_udp4connect,
        -1,
        -1,
        isc_sockstatscounter_udp4sendfail,
        isc_sockstatscounter_udp4recvfail,
        isc_sockstatscounter_udp4active
};
static const isc_statscounter_t udp6statsindex[] = {
        isc_sockstatscounter_udp6open,
        isc_sockstatscounter_udp6openfail,
        isc_sockstatscounter_udp6close,
        isc_sockstatscounter_udp6bindfail,
        isc_sockstatscounter_udp6connectfail,
        isc_sockstatscounter_udp6connect,
        -1,
        -1,
        isc_sockstatscounter_udp6sendfail,
        isc_sockstatscounter_udp6recvfail,
        isc_sockstatscounter_udp6active
};
static const isc_statscounter_t tcp4statsindex[] = {
        isc_sockstatscounter_tcp4open, isc_sockstatscounter_tcp4openfail,
        isc_sockstatscounter_tcp4close, isc_sockstatscounter_tcp4bindfail,
        isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
        isc_sockstatscounter_tcp4acceptfail, isc_sockstatscounter_tcp4accept,
        isc_sockstatscounter_tcp4sendfail, isc_sockstatscounter_tcp4recvfail,
        isc_sockstatscounter_tcp4active
};
static const isc_statscounter_t tcp6statsindex[] = {
        isc_sockstatscounter_tcp6open, isc_sockstatscounter_tcp6openfail,
        isc_sockstatscounter_tcp6close, isc_sockstatscounter_tcp6bindfail,
        isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
        isc_sockstatscounter_tcp6acceptfail, isc_sockstatscounter_tcp6accept,
        isc_sockstatscounter_tcp6sendfail, isc_sockstatscounter_tcp6recvfail,
        isc_sockstatscounter_tcp6active
};
static const isc_statscounter_t unixstatsindex[] = {
        isc_sockstatscounter_unixopen, isc_sockstatscounter_unixopenfail,
        isc_sockstatscounter_unixclose, isc_sockstatscounter_unixbindfail,
        isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
        isc_sockstatscounter_unixacceptfail, isc_sockstatscounter_unixaccept,
        isc_sockstatscounter_unixsendfail, isc_sockstatscounter_unixrecvfail,
        isc_sockstatscounter_unixactive
};
static const isc_statscounter_t rawstatsindex[] = {
        isc_sockstatscounter_rawopen,
        isc_sockstatscounter_rawopenfail,
        isc_sockstatscounter_rawclose,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        isc_sockstatscounter_rawrecvfail,
        isc_sockstatscounter_rawactive
};

static int
gen_threadid(isc__socket_t *sock);

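/*
 * Pick the watcher thread for a socket: a simple static spread by file
 * descriptor, so a given fd is always handled by the same thread.
 */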
static int
gen_threadid(isc__socket_t *sock) {
        return (sock->fd % sock->manager->nthreads);
}

static void
manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
            isc_logmodule_t *module, int level, const char *fmt, ...)
        ISC_FORMAT_PRINTF(5, 6);
static void
manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
            isc_logmodule_t *module, int level, const char *fmt, ...) {
        char msgbuf[2048];
        va_list ap;

        if (!isc_log_wouldlog(isc_lctx, level)) {
                return;
        }

        va_start(ap, fmt);
        vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
        va_end(ap);

        isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
                      sockmgr, msgbuf);
}

static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
           isc_logmodule_t *module, int level, const char *fmt, ...)
        ISC_FORMAT_PRINTF(5, 6);
static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
           isc_logmodule_t *module, int level, const char *fmt, ...) {
        char msgbuf[2048];
        va_list ap;

        if (!isc_log_wouldlog(isc_lctx, level)) {
                return;
        }

        va_start(ap, fmt);
        vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
        va_end(ap);

        isc_log_write(isc_lctx, category, module, level,
                      "sockmgr %p thread %d: %s", thread->manager,
                      thread->threadid, msgbuf);
}

static void
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
           isc_logcategory_t *category, isc_logmodule_t *module, int level,
           const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
static void
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
           isc_logcategory_t *category, isc_logmodule_t *module, int level,
           const char *fmt, ...) {
        char msgbuf[2048];
        char peerbuf[ISC_SOCKADDR_FORMATSIZE];
        va_list ap;

        if (!isc_log_wouldlog(isc_lctx, level)) {
                return;
        }

        va_start(ap, fmt);
        vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
        va_end(ap);

        if (address == NULL) {
                isc_log_write(isc_lctx, category, module, level,
                              "socket %p: %s", sock, msgbuf);
        } else {
                isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
                isc_log_write(isc_lctx, category, module, level,
                              "socket %p %s: %s", sock, peerbuf, msgbuf);
        }
}

/*%
 * Increment socket-related statistics counters.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
        REQUIRE(counterid != -1);

        if (stats != NULL) {
                isc_stats_increment(stats, counterid);
        }
}

/*%
 * Decrement socket-related statistics counters.
 */
static inline void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
        REQUIRE(counterid != -1);

        if (stats != NULL) {
                isc_stats_decrement(stats, counterid);
        }
}

static inline isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
        isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
        struct kevent evchange;

        memset(&evchange, 0, sizeof(evchange));
        if (msg == SELECT_POKE_READ) {
                evchange.filter = EVFILT_READ;
        } else {
                evchange.filter = EVFILT_WRITE;
        }
        evchange.flags = EV_ADD;
        evchange.ident = fd;
        if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
                result = isc__errno2result(errno);
        }

        return (result);
#elif defined(USE_EPOLL)
        struct epoll_event event;
        uint32_t oldevents;
        int ret;
        int op;

        oldevents = thread->epoll_events[fd];
        if (msg == SELECT_POKE_READ) {
                thread->epoll_events[fd] |= EPOLLIN;
        } else {
                thread->epoll_events[fd] |= EPOLLOUT;
        }

        event.events = thread->epoll_events[fd];
        memset(&event.data, 0, sizeof(event.data));
        event.data.fd = fd;

        op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
        if (thread->fds[fd] != NULL) {
                LOCK(&thread->fds[fd]->lock);
        }
        ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
        if (thread->fds[fd] != NULL) {
                UNLOCK(&thread->fds[fd]->lock);
        }
        if (ret == -1) {
                if (errno == EEXIST) {
                        UNEXPECTED_ERROR(__FILE__, __LINE__,
                                         "epoll_ctl(ADD/MOD) returned "
                                         "EEXIST for fd %d",
                                         fd);
                }
                result = isc__errno2result(errno);
        }

        return (result);
#elif defined(USE_DEVPOLL)
        struct pollfd pfd;

        memset(&pfd, 0, sizeof(pfd));
        if (msg == SELECT_POKE_READ) {
                pfd.events = POLLIN;
        } else {
                pfd.events = POLLOUT;
        }
        pfd.fd = fd;
        pfd.revents = 0;
        if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
                result = isc__errno2result(errno);
        } else {
                if (msg == SELECT_POKE_READ) {
                        thread->fdpollinfo[fd].want_read = 1;
                } else {
                        thread->fdpollinfo[fd].want_write = 1;
                }
        }

        return (result);
#elif defined(USE_SELECT)
        LOCK(&thread->manager->lock);
        if (msg == SELECT_POKE_READ) {
                FD_SET(fd, thread->read_fds);
        }
        if (msg == SELECT_POKE_WRITE) {
                FD_SET(fd, thread->write_fds);
        }
        UNLOCK(&thread->manager->lock);

        return (result);
#endif /* ifdef USE_KQUEUE */
}

static inline isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
        isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
        struct kevent evchange;

        memset(&evchange, 0, sizeof(evchange));
        if (msg == SELECT_POKE_READ) {
                evchange.filter = EVFILT_READ;
        } else {
                evchange.filter = EVFILT_WRITE;
        }
        evchange.flags = EV_DELETE;
        evchange.ident = fd;
        if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
                result = isc__errno2result(errno);
        }

        return (result);
#elif defined(USE_EPOLL)
        struct epoll_event event;
        int ret;
        int op;

        if (msg == SELECT_POKE_READ) {
                thread->epoll_events[fd] &= ~(EPOLLIN);
        } else {
                thread->epoll_events[fd] &= ~(EPOLLOUT);
        }

        event.events = thread->epoll_events[fd];
        memset(&event.data, 0, sizeof(event.data));
        event.data.fd = fd;

        op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
        ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
        if (ret == -1 && errno != ENOENT) {
                char strbuf[ISC_STRERRORSIZE];
                strerror_r(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
                                 fd, strbuf);
                result = ISC_R_UNEXPECTED;
        }
        return (result);
#elif defined(USE_DEVPOLL)
        struct pollfd pfds[2];
        size_t writelen = sizeof(pfds[0]);
        int lockid = FDLOCK_ID(fd);

        memset(pfds, 0, sizeof(pfds));
        pfds[0].events = POLLREMOVE;
        pfds[0].fd = fd;

        /*
         * Canceling read or write polling via /dev/poll is tricky. Since it
         * only provides a way of canceling per FD, we may need to re-poll the
         * socket for the other operation.
         */
        if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
                pfds[1].events = POLLOUT;
                pfds[1].fd = fd;
                writelen += sizeof(pfds[1]);
        }
        if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
                pfds[1].events = POLLIN;
                pfds[1].fd = fd;
                writelen += sizeof(pfds[1]);
        }

        if (write(thread->devpoll_fd, pfds, writelen) == -1) {
                result = isc__errno2result(errno);
        } else {
                if (msg == SELECT_POKE_READ) {
                        thread->fdpollinfo[fd].want_read = 0;
                } else {
                        thread->fdpollinfo[fd].want_write = 0;
                }
        }

        return (result);
#elif defined(USE_SELECT)
        LOCK(&thread->manager->lock);
        if (msg == SELECT_POKE_READ) {
                FD_CLR(fd, thread->read_fds);
        } else if (msg == SELECT_POKE_WRITE) {
                FD_CLR(fd, thread->write_fds);
        }
        UNLOCK(&thread->manager->lock);

        return (result);
#endif /* ifdef USE_KQUEUE */
}

/*
 * A poke message was received; perform a proper watch/unwatch
 * on the fd provided.
 */
static void
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
        isc_result_t result;
        int lockid = FDLOCK_ID(fd);

        /*
         * This is a wakeup on a socket. If the socket is not in the
         * process of being closed, start watching it for either reads
         * or writes.
         */

        INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);

        if (msg == SELECT_POKE_CLOSE) {
                LOCK(&thread->fdlock[lockid]);
                INSIST(thread->fdstate[fd] == CLOSE_PENDING);
                thread->fdstate[fd] = CLOSED;
                (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
                (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
                (void)close(fd);
                UNLOCK(&thread->fdlock[lockid]);
                return;
        }

        LOCK(&thread->fdlock[lockid]);
        if (thread->fdstate[fd] == CLOSE_PENDING) {
                /*
                 * We accept (and ignore) any error from unwatch_fd() as we are
                 * closing the socket, hoping it doesn't leave dangling state
                 * in the kernel.
                 * Note that unwatch_fd() must be called after releasing the
                 * fdlock; otherwise it could cause deadlock due to a lock
                 * order reversal.
                 */
                (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
                (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
                UNLOCK(&thread->fdlock[lockid]);
                return;
        }
        if (thread->fdstate[fd] != MANAGED) {
                UNLOCK(&thread->fdlock[lockid]);
                return;
        }

        /*
         * Set requested bit.
         */
        result = watch_fd(thread, fd, msg);
        if (result != ISC_R_SUCCESS) {
                /*
                 * XXXJT: what should we do? Ignoring the failure of watching
                 * a socket will make the application dysfunctional, but there
                 * seems to be no reasonable recovery process.
                 */
                isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
                              ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
                              "failed to start watching FD (%d): %s", fd,
                              isc_result_totext(result));
        }
        UNLOCK(&thread->fdlock[lockid]);
}

/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete. That is, we
 * will not get partial writes.
 */
static void
select_poke(isc__socketmgr_t *mgr, int threadid, int fd, int msg) {
        int cc;
        int buf[2];
        char strbuf[ISC_STRERRORSIZE];

        buf[0] = fd;
        buf[1] = msg;

        do {
                cc = write(mgr->threads[threadid].pipe_fds[1], buf,
                           sizeof(buf));
#ifdef ENOSR
                /*
                 * Treat ENOSR as EAGAIN but loop slowly as it is
                 * unlikely to clear fast.
                 */
                if (cc < 0 && errno == ENOSR) {
                        sleep(1);
                        errno = EAGAIN;
                }
#endif /* ifdef ENOSR */
        } while (cc < 0 && SOFT_ERROR(errno));

        if (cc < 0) {
                strerror_r(errno, strbuf, sizeof(strbuf));
                FATAL_ERROR(__FILE__, __LINE__,
                            "write() failed during watcher poke: %s", strbuf);
        }

        INSIST(cc == sizeof(buf));
}

/*
 * Read a message on the internal fd.
 */
static void
select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
        int buf[2];
        int cc;
        char strbuf[ISC_STRERRORSIZE];

        cc = read(thread->pipe_fds[0], buf, sizeof(buf));
        if (cc < 0) {
                *msg = SELECT_POKE_NOTHING;
                *fd = -1; /* Silence compiler. */
                if (SOFT_ERROR(errno)) {
                        return;
                }

                strerror_r(errno, strbuf, sizeof(strbuf));
                FATAL_ERROR(__FILE__, __LINE__,
                            "read() failed during watcher poke: %s", strbuf);
        }
        INSIST(cc == sizeof(buf));

        *fd = buf[0];
        *msg = buf[1];
}

/*
 * Make a fd non-blocking.
 */
static isc_result_t
make_nonblock(int fd) {
        int ret;
        char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
        int on = 1;
#else /* ifdef USE_FIONBIO_IOCTL */
        int flags;
#endif /* ifdef USE_FIONBIO_IOCTL */

#ifdef USE_FIONBIO_IOCTL
        ret = ioctl(fd, FIONBIO, (char *)&on);
#else /* ifdef USE_FIONBIO_IOCTL */
        flags = fcntl(fd, F_GETFL, 0);
        flags |= O_NONBLOCK;
        ret = fcntl(fd, F_SETFL, flags);
#endif /* ifdef USE_FIONBIO_IOCTL */

        if (ret == -1) {
                strerror_r(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
                                 "ioctl(%d, FIONBIO, &on): %s", fd,
#else /* ifdef USE_FIONBIO_IOCTL */
                                 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif /* ifdef USE_FIONBIO_IOCTL */
                                 strbuf);

                return (ISC_R_UNEXPECTED);
        }

        return (ISC_R_SUCCESS);
}

#ifdef USE_CMSG
/*
 * Not all OSes support the advanced CMSG macros CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions for these macros.
 * Note that cmsg_space() could run slowly on OSes that do not have
 * CMSG_SPACE.
 */
static inline socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
        return (CMSG_LEN(len));
#else /* ifdef CMSG_LEN */
        socklen_t hdrlen;

        /*
         * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
         * is correct.
         */
        hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
        return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}

static inline socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
        return (CMSG_SPACE(len));
#else /* ifdef CMSG_SPACE */
        struct msghdr msg;
        struct cmsghdr *cmsgp;
        /*
         * XXX: The buffer length is an ad-hoc value, but should be enough
         * in a practical sense.
         */
        char dummybuf[sizeof(struct cmsghdr) + 1024];

        memset(&msg, 0, sizeof(msg));
        msg.msg_control = dummybuf;
        msg.msg_controllen = sizeof(dummybuf);

        cmsgp = (struct cmsghdr *)dummybuf;
        cmsgp->cmsg_len = cmsg_len(len);

        cmsgp = CMSG_NXTHDR(&msg, cmsgp);
        if (cmsgp != NULL) {
                return ((char *)cmsgp - (char *)msg.msg_control);
        } else {
                return (0);
        }
#endif /* ifdef CMSG_SPACE */
}
#endif /* USE_CMSG */

/*
 * Process control messages received on a socket.
 */
static void
process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
        struct cmsghdr *cmsgp;
        struct in6_pktinfo *pktinfop;
#ifdef SO_TIMESTAMP
        void *timevalp;
#endif /* ifdef SO_TIMESTAMP */
#endif /* ifdef USE_CMSG */

        /*
         * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
         * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
         * They are all here, outside of the CPP tests, because it is
         * more consistent with the usual ISC coding style.
         */
        UNUSED(sock);
        UNUSED(msg);
        UNUSED(dev);

#ifdef MSG_TRUNC
        if ((msg->msg_flags & MSG_TRUNC) != 0) {
                dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
        }
#endif /* ifdef MSG_TRUNC */

#ifdef MSG_CTRUNC
        if ((msg->msg_flags & MSG_CTRUNC) != 0) {
                dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
        }
#endif /* ifdef MSG_CTRUNC */

#ifndef USE_CMSG
        return;
#else /* ifndef USE_CMSG */
        if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
                return;
        }

#ifdef SO_TIMESTAMP
        timevalp = NULL;
#endif /* ifdef SO_TIMESTAMP */
        pktinfop = NULL;

        cmsgp = CMSG_FIRSTHDR(msg);
        while (cmsgp != NULL) {
                socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);

                if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
                    cmsgp->cmsg_type == IPV6_PKTINFO) {
                        pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
                        memmove(&dev->pktinfo, pktinfop,
                                sizeof(struct in6_pktinfo));
                        dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
                        socket_log(sock, NULL, TRACE,
                                   "interface received on ifindex %u",
                                   dev->pktinfo.ipi6_ifindex);
                        if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
                                dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
                        }
                        goto next;
                }

#ifdef SO_TIMESTAMP
                if (cmsgp->cmsg_level == SOL_SOCKET &&
                    cmsgp->cmsg_type == SCM_TIMESTAMP) {
                        struct timeval tv;
                        timevalp = CMSG_DATA(cmsgp);
                        memmove(&tv, timevalp, sizeof(tv));
                        dev->timestamp.seconds = tv.tv_sec;
                        dev->timestamp.nanoseconds = tv.tv_usec * 1000;
                        dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
                        goto next;
                }
#endif /* ifdef SO_TIMESTAMP */

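                /*
                 * In both the IPv6 TCLASS and IPv4 TOS cases below, the
                 * DSCP value sits in the upper six bits of the octet, so
                 * the ">>= 2" shifts off the two ECN bits.
                 */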
#ifdef IPV6_TCLASS
                if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
                    cmsgp->cmsg_type == IPV6_TCLASS) {
                        dev->dscp = *(int *)CMSG_DATA(cmsgp);
                        dev->dscp >>= 2;
                        dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
                        goto next;
                }
#endif /* ifdef IPV6_TCLASS */

#ifdef IP_TOS
                if (cmsgp->cmsg_level == IPPROTO_IP &&
                    (cmsgp->cmsg_type == IP_TOS
#ifdef IP_RECVTOS
                     || cmsgp->cmsg_type == IP_RECVTOS
#endif /* ifdef IP_RECVTOS */
                     ))
                {
                        dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
                        dev->dscp >>= 2;
                        dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
                        goto next;
                }
#endif /* ifdef IP_TOS */
        next:
                cmsgp = CMSG_NXTHDR(msg, cmsgp);
        }
#endif /* USE_CMSG */
}

/*
 * Construct an iov array and attach it to the msghdr passed in. This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
                  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
        unsigned int iovcount;
        size_t write_count;
        struct cmsghdr *cmsgp;

        memset(msg, 0, sizeof(*msg));

        if (!sock->connected) {
                msg->msg_name = (void *)&dev->address.type.sa;
                msg->msg_namelen = dev->address.length;
        } else {
                msg->msg_name = NULL;
                msg->msg_namelen = 0;
        }

        write_count = dev->region.length - dev->n;
        iov[0].iov_base = (void *)(dev->region.base + dev->n);
        iov[0].iov_len = write_count;
        iovcount = 1;

        msg->msg_iov = iov;
        msg->msg_iovlen = iovcount;
        msg->msg_control = NULL;
        msg->msg_controllen = 0;
        msg->msg_flags = 0;
#if defined(USE_CMSG)

        if ((sock->type == isc_sockettype_udp) &&
            ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
        {
                struct in6_pktinfo *pktinfop;

                socket_log(sock, NULL, TRACE,
                           "sendto pktinfo data, ifindex %u",
                           dev->pktinfo.ipi6_ifindex);

                msg->msg_control = (void *)cmsgbuf;
                msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
                INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

                cmsgp = (struct cmsghdr *)cmsgbuf;
                cmsgp->cmsg_level = IPPROTO_IPV6;
                cmsgp->cmsg_type = IPV6_PKTINFO;
                cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
                pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
                memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
        }

#if defined(IPV6_USE_MIN_MTU)
        if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
            ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
        {
                int use_min_mtu = 1; /* -1, 0, 1 */

                cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
                msg->msg_control = (void *)cmsgbuf;
                msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
                INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

                cmsgp->cmsg_level = IPPROTO_IPV6;
                cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
                cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
                memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
        }
#endif /* if defined(IPV6_USE_MIN_MTU) */

        if (isc_dscp_check_value > -1) {
                if (sock->type == isc_sockettype_udp) {
                        INSIST((int)dev->dscp == isc_dscp_check_value);
                } else if (sock->type == isc_sockettype_tcp) {
                        INSIST((int)sock->dscp == isc_dscp_check_value);
                }
        }

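        /*
         * Two delivery strategies for the DSCP value follow: sockets
         * marked pktdscp attach it to each packet as ancillary data,
         * while the rest set it with setsockopt() only when it differs
         * from the cached sock->dscp, avoiding a syscall per send.
         */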
#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
        if ((sock->type == isc_sockettype_udp) &&
            ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
        {
                int dscp = (dev->dscp << 2) & 0xff;

                INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
                if (sock->pf == AF_INET && sock->pktdscp) {
                        cmsgp = (struct cmsghdr *)(cmsgbuf +
                                                   msg->msg_controllen);
                        msg->msg_control = (void *)cmsgbuf;
                        msg->msg_controllen += cmsg_space(sizeof(dscp));
                        INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

                        cmsgp->cmsg_level = IPPROTO_IP;
                        cmsgp->cmsg_type = IP_TOS;
                        cmsgp->cmsg_len = cmsg_len(sizeof(char));
                        *(unsigned char *)CMSG_DATA(cmsgp) = dscp;
                } else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
                        if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
                                       (void *)&dscp, sizeof(int)) < 0) {
                                char strbuf[ISC_STRERRORSIZE];
                                strerror_r(errno, strbuf, sizeof(strbuf));
                                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                                 "setsockopt(%d, IP_TOS, %.02x)"
                                                 " failed: %s",
                                                 sock->fd, dscp >> 2, strbuf);
                        } else {
                                sock->dscp = dscp;
                        }
                }
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
                if (sock->pf == AF_INET6 && sock->pktdscp) {
                        cmsgp = (struct cmsghdr *)(cmsgbuf +
                                                   msg->msg_controllen);
                        msg->msg_control = (void *)cmsgbuf;
                        msg->msg_controllen += cmsg_space(sizeof(dscp));
                        INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

                        cmsgp->cmsg_level = IPPROTO_IPV6;
                        cmsgp->cmsg_type = IPV6_TCLASS;
                        cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
                        memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
                } else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
                        if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
                                       (void *)&dscp, sizeof(int)) < 0)
                        {
                                char strbuf[ISC_STRERRORSIZE];
                                strerror_r(errno, strbuf, sizeof(strbuf));
                                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                                 "setsockopt(%d, IPV6_TCLASS, "
                                                 "%.02x) failed: %s",
                                                 sock->fd, dscp >> 2, strbuf);
                        } else {
                                sock->dscp = dscp;
                        }
                }
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
                if (msg->msg_controllen != 0 &&
                    msg->msg_controllen < SENDCMSGBUFLEN) {
                        memset(cmsgbuf + msg->msg_controllen, 0,
                               SENDCMSGBUFLEN - msg->msg_controllen);
                }
        }
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
        * defined(IPV6_TCLASS)) \
        * */
#endif /* USE_CMSG */

        if (write_countp != NULL) {
                *write_countp = write_count;
        }
}

/*
 * Construct an iov array and attach it to the msghdr passed in. This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
static void
build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
                  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
        unsigned int iovcount;
        size_t read_count;

        memset(msg, 0, sizeof(struct msghdr));

        if (sock->type == isc_sockettype_udp) {
                memset(&dev->address, 0, sizeof(dev->address));
                msg->msg_name = (void *)&dev->address.type.sa;
                msg->msg_namelen = sizeof(dev->address.type);
        } else { /* TCP */
                msg->msg_name = NULL;
                msg->msg_namelen = 0;
                dev->address = sock->peer_address;
        }

        read_count = dev->region.length - dev->n;
        iov[0].iov_base = (void *)(dev->region.base + dev->n);
        iov[0].iov_len = read_count;
        iovcount = 1;

        /*
         * If needed, set up to receive that one extra byte.
         */
#ifdef ISC_PLATFORM_RECVOVERFLOW
        if (sock->type == isc_sockettype_udp) {
                INSIST(iovcount < MAXSCATTERGATHER_RECV);
                iov[iovcount].iov_base = (void *)(&sock->overflow);
                iov[iovcount].iov_len = 1;
                iovcount++;
        }
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

        msg->msg_iov = iov;
        msg->msg_iovlen = iovcount;

#if defined(USE_CMSG)
        msg->msg_control = cmsgbuf;
        msg->msg_controllen = RECVCMSGBUFLEN;
#else /* if defined(USE_CMSG) */
        msg->msg_control = NULL;
        msg->msg_controllen = 0;
#endif /* USE_CMSG */
        msg->msg_flags = 0;

        if (read_countp != NULL) {
                *read_countp = read_count;
        }
}

static void
set_dev_address(const isc_sockaddr_t *address, isc__socket_t *sock,
                isc_socketevent_t *dev) {
        if (sock->type == isc_sockettype_udp) {
                if (address != NULL) {
                        dev->address = *address;
                } else {
                        dev->address = sock->peer_address;
                }
        } else if (sock->type == isc_sockettype_tcp) {
                INSIST(address == NULL);
                dev->address = sock->peer_address;
        }
}

static void
destroy_socketevent(isc_event_t *event) {
        isc_socketevent_t *ev = (isc_socketevent_t *)event;

        (ev->destroy)(event);
}

static isc_socketevent_t *
allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
                     isc_taskaction_t action, void *arg) {
        isc_socketevent_t *ev;

        ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
                                                     action, arg, sizeof(*ev));

        ev->result = ISC_R_UNSET;
        ISC_LINK_INIT(ev, ev_link);
        ev->region.base = NULL;
        ev->n = 0;
        ev->offset = 0;
        ev->attributes = 0;
        ev->destroy = ev->ev_destroy;
        ev->ev_destroy = destroy_socketevent;
        ev->dscp = 0;

        return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        printf("MSGHDR %p\n", msg);
        printf("\tname %p, namelen %ld\n", msg->msg_name,
               (long)msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       msg->msg_iov[i].iov_base,
                       (long)msg->msg_iov[i].iov_len);
        printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
               (long)msg->msg_controllen);
}
#endif /* if defined(ISC_SOCKET_DEBUG) */

#define DOIO_SUCCESS 0 /* i/o ok, event sent */
#define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
#define DOIO_HARD    2 /* i/o error, event sent */
#define DOIO_EOF     3 /* EOF, no event sent */

static int
doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
        int cc;
        struct iovec iov[MAXSCATTERGATHER_RECV];
        size_t read_count;
        struct msghdr msghdr;
        int recv_errno;
        char strbuf[ISC_STRERRORSIZE];
        char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

        build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
        dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

        cc = recvmsg(sock->fd, &msghdr, 0);
        recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
        dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

        if (cc < 0) {
                if (SOFT_ERROR(recv_errno)) {
                        return (DOIO_SOFT);
                }

                if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
                        strerror_r(recv_errno, strbuf, sizeof(strbuf));
                        socket_log(sock, NULL, IOEVENT,
                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
                                   sock->fd, cc, recv_errno, strbuf);
                }

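/*
 * Map system errnos to ISC results. SOFT_OR_HARD treats the error as
 * hard only on a connected socket; on an unconnected UDP socket such
 * errors typically stem from an earlier packet exchange and should not
 * fail the pending receive, so the operation is retried instead.
 */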
#define SOFT_OR_HARD(_system, _isc)                                   \
        if (recv_errno == _system) {                                  \
                if (sock->connected) {                                \
                        dev->result = _isc;                           \
                        inc_stats(sock->manager->stats,               \
                                  sock->statsindex[STATID_RECVFAIL]); \
                        return (DOIO_HARD);                           \
                }                                                     \
                return (DOIO_SOFT);                                   \
        }
#define ALWAYS_HARD(_system, _isc)                            \
        if (recv_errno == _system) {                          \
                dev->result = _isc;                           \
                inc_stats(sock->manager->stats,               \
                          sock->statsindex[STATID_RECVFAIL]); \
                return (DOIO_HARD);                           \
        }

                SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
                SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
                SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
                SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
                SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
                /*
                 * Older operating systems may still return EPROTO in some
                 * situations, for example when receiving ICMP/ICMPv6 errors.
                 * A real-life scenario is when ICMPv6 returns code 5 or 6.
                 * These codes were introduced in RFC 4443 (March 2006), which
                 * obsoletes RFC 1885.  Unfortunately, not all operating
                 * systems have caught up with the new standard (as of 2020),
                 * and thus a generic protocol error is returned.
                 */
                SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
                /* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
                SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
                SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

                dev->result = isc__errno2result(recv_errno);
                inc_stats(sock->manager->stats,
                          sock->statsindex[STATID_RECVFAIL]);
                return (DOIO_HARD);
        }

        /*
         * On TCP and UNIX sockets, zero-length reads indicate EOF,
         * while on UDP sockets, zero-length reads are perfectly valid,
         * although strange.
         */
        switch (sock->type) {
        case isc_sockettype_tcp:
        case isc_sockettype_unix:
                if (cc == 0) {
                        return (DOIO_EOF);
                }
                break;
        case isc_sockettype_udp:
        case isc_sockettype_raw:
                break;
        default:
                INSIST(0);
                ISC_UNREACHABLE();
        }

        if (sock->type == isc_sockettype_udp) {
                dev->address.length = msghdr.msg_namelen;
                if (isc_sockaddr_getport(&dev->address) == 0) {
                        if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
                                socket_log(sock, &dev->address, IOEVENT,
                                           "dropping source port zero packet");
                        }
                        return (DOIO_SOFT);
                }
                /*
                 * Simulate a firewall blocking UDP responses bigger than
                 * 'maxudp' bytes.
                 */
                if (sock->manager->maxudp != 0 &&
                    cc > (int)sock->manager->maxudp) {
                        return (DOIO_SOFT);
                }
        }

        socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

        /*
         * Overflow bit detection. If we received MORE bytes than we should,
         * this indicates an overflow situation. Set the flag in the
         * dev entry and adjust how much we read by one.
         */
#ifdef ISC_PLATFORM_RECVOVERFLOW
        if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
                dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
                cc--;
        }
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

        /*
         * If there are control messages attached, run through them and pull
         * out the interesting bits.
         */
        process_cmsg(sock, &msghdr, dev);

        /*
         * Update the buffers (if any) and the i/o count.
         */
        dev->n += cc;

        /*
         * If we read less than we expected, update counters,
         * and let the upper layer poke the descriptor.
         */
        if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
                return (DOIO_SOFT);
        }

        /*
         * Full reads are posted, or partials if partials are ok.
         */
        dev->result = ISC_R_SUCCESS;
        return (DOIO_SUCCESS);
}

/*
 * Returns:
 *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
 *                      ISC_R_SUCCESS.
 *
 *      DOIO_HARD       A hard or unexpected I/O error was encountered.
 *                      dev->result contains the appropriate error.
 *
 *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
 *                      event was sent.  The operation should be retried.
 *
 *      No other return values are possible.
 */
static int
doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
        int cc;
        struct iovec iov[MAXSCATTERGATHER_SEND];
        size_t write_count;
        struct msghdr msghdr;
        char addrbuf[ISC_SOCKADDR_FORMATSIZE];
        int attempts = 0;
        int send_errno;
        char strbuf[ISC_STRERRORSIZE];
        char cmsgbuf[SENDCMSGBUFLEN] = { 0 };

        build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);

resend:
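        /*
         * Simulate a firewall dropping UDP packets bigger than 'maxudp':
         * pretend the whole datagram was sent without calling sendmsg()
         * (the receive path applies the same maxudp limit).
         */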
if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
|
|
write_count > sock->manager->maxudp)
|
|
{
|
|
cc = write_count;
|
|
} else {
|
|
cc = sendmsg(sock->fd, &msghdr, 0);
|
|
}
|
|
send_errno = errno;
|
|
|
|
/*
|
|
* Check for error or block condition.
|
|
*/
|
|
if (cc < 0) {
|
|
if (send_errno == EINTR && ++attempts < NRETRIES) {
|
|
goto resend;
|
|
}
|
|
|
|
if (SOFT_ERROR(send_errno)) {
|
|
if (errno == EWOULDBLOCK || errno == EAGAIN) {
|
|
dev->result = ISC_R_WOULDBLOCK;
|
|
}
|
|
return (DOIO_SOFT);
|
|
}
|
|
|
|
#define SOFT_OR_HARD(_system, _isc) \
|
|
if (send_errno == _system) { \
|
|
if (sock->connected) { \
|
|
dev->result = _isc; \
|
|
inc_stats(sock->manager->stats, \
|
|
sock->statsindex[STATID_SENDFAIL]); \
|
|
return (DOIO_HARD); \
|
|
} \
|
|
return (DOIO_SOFT); \
|
|
}
|
|
#define ALWAYS_HARD(_system, _isc) \
|
|
if (send_errno == _system) { \
|
|
dev->result = _isc; \
|
|
inc_stats(sock->manager->stats, \
|
|
sock->statsindex[STATID_SENDFAIL]); \
|
|
return (DOIO_HARD); \
|
|
}
|
|
|
|
SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
|
|
ALWAYS_HARD(EACCES, ISC_R_NOPERM);
|
|
ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
|
|
ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
|
|
ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
|
|
#ifdef EHOSTDOWN
|
|
ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
|
|
#endif /* ifdef EHOSTDOWN */
|
|
ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
|
|
SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
|
|
ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
|
|
ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
|
|
ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
|
|
|
|
#undef SOFT_OR_HARD
|
|
#undef ALWAYS_HARD
|
|
|
|
/*
|
|
* The other error types depend on whether or not the
|
|
* socket is UDP or TCP. If it is UDP, some errors
|
|
* that we expect to be fatal under TCP are merely
|
|
* annoying, and are really soft errors.
|
|
*
|
|
* However, these soft errors are still returned as
|
|
* a status.
|
|
*/
|
|
isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
|
|
strerror_r(send_errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
|
|
addrbuf, strbuf);
|
|
dev->result = isc__errno2result(send_errno);
|
|
inc_stats(sock->manager->stats,
|
|
sock->statsindex[STATID_SENDFAIL]);
|
|
return (DOIO_HARD);
|
|
}
|
|
|
|
if (cc == 0) {
|
|
inc_stats(sock->manager->stats,
|
|
sock->statsindex[STATID_SENDFAIL]);
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"doio_send: send() returned 0");
|
|
}
|
|
|
|
/*
|
|
* If we write less than we expected, update counters, poke.
|
|
*/
|
|
dev->n += cc;
|
|
if ((size_t)cc != write_count) {
|
|
return (DOIO_SOFT);
|
|
}
|
|
|
|
/*
|
|
* Exactly what we wanted to write. We're done with this
|
|
* entry. Post its completion event.
|
|
*/
|
|
dev->result = ISC_R_SUCCESS;
|
|
return (DOIO_SUCCESS);
|
|
}
|
|
|
|
/*
|
|
* Kill.
|
|
*
|
|
* Caller must ensure that the socket is not locked and no external
|
|
* references exist.
|
|
*/
|
|
static void
|
|
socketclose(isc__socketthread_t *thread, isc__socket_t *sock, int fd) {
|
|
int lockid = FDLOCK_ID(fd);
|
|
/*
|
|
* No one has this socket open, so the watcher doesn't have to be
|
|
* poked, and the socket doesn't have to be locked.
|
|
*/
|
|
LOCK(&thread->fdlock[lockid]);
|
|
thread->fds[fd] = NULL;
|
|
thread->fdstate[fd] = CLOSE_PENDING;
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
select_poke(thread->manager, thread->threadid, fd, SELECT_POKE_CLOSE);
|
|
|
|
inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);
|
|
|
|
LOCK(&sock->lock);
|
|
if (sock->active == 1) {
|
|
dec_stats(thread->manager->stats,
|
|
sock->statsindex[STATID_ACTIVE]);
|
|
sock->active = 0;
|
|
}
|
|
UNLOCK(&sock->lock);
|
|
|
|
/*
|
|
* update manager->maxfd here (XXX: this should be implemented more
|
|
* efficiently)
|
|
*/
|
|
#ifdef USE_SELECT
|
|
LOCK(&thread->manager->lock);
|
|
if (thread->maxfd == fd) {
|
|
int i;
|
|
|
|
thread->maxfd = 0;
|
|
for (i = fd - 1; i >= 0; i--) {
|
|
lockid = FDLOCK_ID(i);
|
|
|
|
LOCK(&thread->fdlock[lockid]);
|
|
if (thread->fdstate[i] == MANAGED) {
|
|
thread->maxfd = i;
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
break;
|
|
}
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
}
|
|
if (thread->maxfd < thread->pipe_fds[0]) {
|
|
thread->maxfd = thread->pipe_fds[0];
|
|
}
|
|
}
|
|
|
|
UNLOCK(&thread->manager->lock);
|
|
#endif /* USE_SELECT */
|
|
}
|
|
|
|
static void
|
|
destroy(isc__socket_t **sockp) {
|
|
int fd = 0;
|
|
isc__socket_t *sock = *sockp;
|
|
isc__socketmgr_t *manager = sock->manager;
|
|
isc__socketthread_t *thread = NULL;
|
|
|
|
socket_log(sock, NULL, CREATION, "destroying");
|
|
|
|
isc_refcount_destroy(&sock->references);
|
|
|
|
LOCK(&sock->lock);
|
|
INSIST(ISC_LIST_EMPTY(sock->connect_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->accept_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->recv_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->send_list));
|
|
INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
|
|
|
|
if (sock->fd >= 0) {
|
|
fd = sock->fd;
|
|
thread = &manager->threads[sock->threadid];
|
|
sock->fd = -1;
|
|
sock->threadid = -1;
|
|
}
|
|
UNLOCK(&sock->lock);
|
|
|
|
if (fd > 0) {
|
|
socketclose(thread, sock, fd);
|
|
}
|
|
|
|
LOCK(&manager->lock);
|
|
|
|
ISC_LIST_UNLINK(manager->socklist, sock, link);
|
|
|
|
if (ISC_LIST_EMPTY(manager->socklist)) {
|
|
SIGNAL(&manager->shutdown_ok);
|
|
}
|
|
|
|
/* can't unlock manager as its memory context is still used */
|
|
free_socket(sockp);
|
|
|
|
UNLOCK(&manager->lock);
|
|
}
|
|
|
|
static isc_result_t
|
|
allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
|
|
isc__socket_t **socketp) {
|
|
isc__socket_t *sock;
|
|
|
|
sock = isc_mem_get(manager->mctx, sizeof(*sock));
|
|
|
|
sock->common.magic = 0;
|
|
sock->common.impmagic = 0;
|
|
isc_refcount_init(&sock->references, 0);
|
|
|
|
sock->manager = manager;
|
|
sock->type = type;
|
|
sock->fd = -1;
|
|
sock->threadid = -1;
|
|
sock->dscp = 0; /* TOS/TCLASS is zero until set. */
|
|
sock->dupped = 0;
|
|
sock->statsindex = NULL;
|
|
sock->active = 0;
|
|
|
|
ISC_LINK_INIT(sock, link);
|
|
|
|
memset(sock->name, 0, sizeof(sock->name));
|
|
sock->tag = NULL;
|
|
|
|
/*
|
|
* Set up list of readers and writers to be initially empty.
|
|
*/
|
|
ISC_LIST_INIT(sock->recv_list);
|
|
ISC_LIST_INIT(sock->send_list);
|
|
ISC_LIST_INIT(sock->accept_list);
|
|
ISC_LIST_INIT(sock->connect_list);
|
|
|
|
sock->listener = 0;
|
|
sock->connected = 0;
|
|
sock->connecting = 0;
|
|
sock->bound = 0;
|
|
sock->pktdscp = 0;
|
|
|
|
/*
|
|
* Initialize the lock.
|
|
*/
|
|
isc_mutex_init(&sock->lock);
|
|
|
|
sock->common.magic = ISCAPI_SOCKET_MAGIC;
|
|
sock->common.impmagic = SOCKET_MAGIC;
|
|
*socketp = sock;
|
|
|
|
return (ISC_R_SUCCESS);
|
|
}
|
|
|
|
/*
 * This routine requires that the various lists be empty, that the
 * reference count be 1, and that the magic number is valid.  The other
 * socket bits, like the lock, must be initialized as well.  The fd
 * associated must be marked as closed, by setting it to -1 on close,
 * or this routine will also close the socket.
 */
|
|
static void
|
|
free_socket(isc__socket_t **socketp) {
|
|
isc__socket_t *sock = *socketp;
|
|
*socketp = NULL;
|
|
|
|
INSIST(VALID_SOCKET(sock));
|
|
isc_refcount_destroy(&sock->references);
|
|
LOCK(&sock->lock);
|
|
INSIST(!sock->connecting);
|
|
INSIST(ISC_LIST_EMPTY(sock->recv_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->send_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->accept_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->connect_list));
|
|
INSIST(!ISC_LINK_LINKED(sock, link));
|
|
UNLOCK(&sock->lock);
|
|
|
|
sock->common.magic = 0;
|
|
sock->common.impmagic = 0;
|
|
|
|
isc_mutex_destroy(&sock->lock);
|
|
|
|
isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
|
|
}
|
|
|
|
#if defined(SET_RCVBUF)
|
|
static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
|
|
static int rcvbuf = ISC_RECV_BUFFER_SIZE;
|
|
|
|
static void
|
|
set_rcvbuf(void) {
|
|
int fd;
|
|
int max = rcvbuf, min;
|
|
socklen_t len;
|
|
|
|
fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
|
|
if (fd == -1) {
|
|
switch (errno) {
|
|
case EPROTONOSUPPORT:
|
|
case EPFNOSUPPORT:
|
|
case EAFNOSUPPORT:
|
|
/*
 * Linux 2.2 (and maybe others) returns EINVAL instead of
 * EAFNOSUPPORT.
 */
|
|
case EINVAL:
|
|
fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
|
|
break;
|
|
}
|
|
}
|
|
if (fd == -1) {
|
|
return;
|
|
}
|
|
|
|
len = sizeof(min);
|
|
if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
|
|
min < rcvbuf)
|
|
{
|
|
again:
|
|
if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
|
|
sizeof(rcvbuf)) == -1)
|
|
{
|
|
if (errno == ENOBUFS && rcvbuf > min) {
|
|
max = rcvbuf - 1;
|
|
rcvbuf = (rcvbuf + min) / 2;
|
|
goto again;
|
|
} else {
|
|
rcvbuf = min;
|
|
goto cleanup;
|
|
}
|
|
} else {
|
|
min = rcvbuf;
|
|
}
|
|
if (min != max) {
|
|
rcvbuf = max;
|
|
goto again;
|
|
}
|
|
}
|
|
cleanup:
|
|
close(fd);
|
|
}
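
/*
 * The probe above walks the requested SO_RCVBUF size down toward the
 * kernel-imposed limit.  A simplified, self-contained sketch of the
 * same halving technique (hypothetical helper, not part of this file):
 */
#if 0
static int
probe_rcvbuf(int fd, int want) {
	int cur = want, min;
	socklen_t len = sizeof(min);

	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) != 0) {
		return (-1);
	}
	while (cur > min &&
	       setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&cur,
			  sizeof(cur)) == -1 &&
	       errno == ENOBUFS)
	{
		cur = (cur + min) / 2; /* halve toward the known-good size */
	}
	return (cur);
}
#endif
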
#endif /* if defined(SET_RCVBUF) */
|
|
|
|
#if defined(SET_SNDBUF)
|
|
static isc_once_t sndbuf_once = ISC_ONCE_INIT;
|
|
static int sndbuf = ISC_SEND_BUFFER_SIZE;
|
|
|
|
static void
|
|
set_sndbuf(void) {
|
|
int fd;
|
|
int max = sndbuf, min;
|
|
socklen_t len;
|
|
|
|
fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
|
|
#if defined(ISC_PLATFORM_HAVEIPV6)
|
|
if (fd == -1) {
|
|
switch (errno) {
|
|
case EPROTONOSUPPORT:
|
|
case EPFNOSUPPORT:
|
|
case EAFNOSUPPORT:
|
|
/*
 * Linux 2.2 (and maybe others) returns EINVAL instead of
 * EAFNOSUPPORT.
 */
|
|
case EINVAL:
|
|
fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
|
|
break;
|
|
}
|
|
}
|
|
#endif /* if defined(ISC_PLATFORM_HAVEIPV6) */
|
|
if (fd == -1) {
|
|
return;
|
|
}
|
|
|
|
len = sizeof(min);
|
|
if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
|
|
min < sndbuf)
|
|
{
|
|
again:
|
|
if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
|
|
sizeof(sndbuf)) == -1)
|
|
{
|
|
if (errno == ENOBUFS && sndbuf > min) {
|
|
max = sndbuf - 1;
|
|
sndbuf = (sndbuf + min) / 2;
|
|
goto again;
|
|
} else {
|
|
sndbuf = min;
|
|
goto cleanup;
|
|
}
|
|
} else {
|
|
min = sndbuf;
|
|
}
|
|
if (min != max) {
|
|
sndbuf = max;
|
|
goto again;
|
|
}
|
|
}
|
|
cleanup:
|
|
close(fd);
|
|
}
|
|
#endif /* if defined(SET_SNDBUF) */
|
|
|
|
static void
use_min_mtu(isc__socket_t *sock) {
#if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
	UNUSED(sock);
#endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
#ifdef IPV6_USE_MIN_MTU
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
#endif /* ifdef IPV6_USE_MIN_MTU */
#if defined(IPV6_MTU)
	/*
	 * Use minimum MTU on IPv6 sockets.
	 */
	if (sock->pf == AF_INET6) {
		int mtu = 1280;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
				 sizeof(mtu));
	}
#endif /* if defined(IPV6_MTU) */
}
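
/*
 * For reference, the effect of use_min_mtu() on a plain IPv6 UDP
 * socket; a sketch assuming the platform defines both options
 * (IPV6_USE_MIN_MTU is from RFC 3542, IPV6_MTU is Linux-specific):
 */
#if 0
int fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
int on = 1;
int mtu = 1280; /* the IPv6 minimum link MTU (RFC 8200) */

(void)setsockopt(fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU, &on, sizeof(on));
(void)setsockopt(fd, IPPROTO_IPV6, IPV6_MTU, &mtu, sizeof(mtu));
#endif
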
static void
set_tcp_maxseg(isc__socket_t *sock, int size) {
#ifdef TCP_MAXSEG
	if (sock->type == isc_sockettype_tcp) {
		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
				 (void *)&size, sizeof(size));
	}
#endif /* ifdef TCP_MAXSEG */
}
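
/*
 * Callers below pass set_tcp_maxseg() the value 1280 - 20 - 40 = 1220:
 * the IPv6 minimum link MTU (1280) minus a 20-octet TCP header and a
 * 40-octet IPv6 header, i.e. the largest segment guaranteed to fit in
 * an unfragmented IPv6 packet.
 */
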
static void
set_ip_dontfrag(isc__socket_t *sock) {
	/*
	 * Set the Don't Fragment flag on IP packets.
	 */
	if (sock->pf == AF_INET6) {
#if defined(IPV6_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
				 &(int){ 1 }, sizeof(int));
#endif
#if defined(IPV6_MTU_DISCOVER)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_DO }, sizeof(int));
#endif
	} else if (sock->pf == AF_INET) {
#if defined(IP_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
				 &(int){ 1 }, sizeof(int));
#endif
#if defined(IP_MTU_DISCOVER)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_DO }, sizeof(int));
#endif
	}
}
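
/*
 * What set_ip_dontfrag() amounts to on the two main platform families;
 * a sketch assuming Linux for the *_MTU_DISCOVER branch (BSDs take the
 * IP_DONTFRAG/IPV6_DONTFRAG branch instead):
 */
#if 0
int fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);

/* BSD-style: a direct "don't fragment" switch (RFC 3542 for IPv6). */
(void)setsockopt(fd, IPPROTO_IPV6, IPV6_DONTFRAG, &(int){ 1 },
		 sizeof(int));

/* Linux-style: PMTUD mode IP_PMTUDISC_DO also sets DF on egress. */
(void)setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
		 &(int){ IP_PMTUDISC_DO }, sizeof(int));
#endif
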
static isc_result_t
|
|
opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
|
|
isc__socket_t *dup_socket) {
|
|
isc_result_t result;
|
|
char strbuf[ISC_STRERRORSIZE];
|
|
const char *err = "socket";
|
|
int tries = 0;
|
|
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
|
|
int on = 1;
|
|
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
|
|
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
|
|
socklen_t optlen;
|
|
int size = 0;
|
|
#endif
|
|
|
|
again:
|
|
if (dup_socket == NULL) {
|
|
switch (sock->type) {
|
|
case isc_sockettype_udp:
|
|
sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
|
|
break;
|
|
case isc_sockettype_tcp:
|
|
sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
|
|
break;
|
|
case isc_sockettype_unix:
|
|
sock->fd = socket(sock->pf, SOCK_STREAM, 0);
|
|
break;
|
|
case isc_sockettype_raw:
|
|
errno = EPFNOSUPPORT;
|
|
/*
 * PF_ROUTE is an alias for PF_NETLINK on Linux.
 */
|
|
#if defined(PF_ROUTE)
|
|
if (sock->fd == -1 && sock->pf == PF_ROUTE) {
|
|
#ifdef NETLINK_ROUTE
|
|
sock->fd = socket(sock->pf, SOCK_RAW,
|
|
NETLINK_ROUTE);
|
|
#else /* ifdef NETLINK_ROUTE */
|
|
sock->fd = socket(sock->pf, SOCK_RAW, 0);
|
|
#endif /* ifdef NETLINK_ROUTE */
|
|
if (sock->fd != -1) {
|
|
#ifdef NETLINK_ROUTE
|
|
struct sockaddr_nl sa;
|
|
int n;
|
|
|
|
/*
|
|
* Do an implicit bind.
|
|
*/
|
|
memset(&sa, 0, sizeof(sa));
|
|
sa.nl_family = AF_NETLINK;
|
|
sa.nl_groups = RTMGRP_IPV4_IFADDR |
|
|
RTMGRP_IPV6_IFADDR;
|
|
n = bind(sock->fd,
|
|
(struct sockaddr *)&sa,
|
|
sizeof(sa));
|
|
if (n < 0) {
|
|
close(sock->fd);
|
|
sock->fd = -1;
|
|
}
|
|
#endif /* ifdef NETLINK_ROUTE */
|
|
sock->bound = 1;
|
|
}
|
|
}
|
|
#endif /* if defined(PF_ROUTE) */
|
|
break;
|
|
}
|
|
} else {
|
|
sock->fd = dup(dup_socket->fd);
|
|
sock->dupped = 1;
|
|
sock->bound = dup_socket->bound;
|
|
}
|
|
if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
|
|
goto again;
|
|
}
|
|
|
|
#ifdef F_DUPFD
|
|
/*
|
|
* Leave a space for stdio and TCP to work in.
|
|
*/
|
|
if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
|
|
sock->fd >= 0 && sock->fd < manager->reserved)
|
|
{
|
|
int newfd, tmp;
|
|
newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
|
|
tmp = errno;
|
|
(void)close(sock->fd);
|
|
errno = tmp;
|
|
sock->fd = newfd;
|
|
err = "isc_socket_create: fcntl/reserved";
|
|
} else if (sock->fd >= 0 && sock->fd < 20) {
|
|
int newfd, tmp;
|
|
newfd = fcntl(sock->fd, F_DUPFD, 20);
|
|
tmp = errno;
|
|
(void)close(sock->fd);
|
|
errno = tmp;
|
|
sock->fd = newfd;
|
|
err = "isc_socket_create: fcntl";
|
|
}
|
|
#endif /* ifdef F_DUPFD */
|
|
|
|
if (sock->fd >= (int)manager->maxsocks) {
|
|
(void)close(sock->fd);
|
|
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
|
|
"socket: file descriptor exceeds limit (%d/%u)",
|
|
sock->fd, manager->maxsocks);
|
|
inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
|
|
return (ISC_R_NORESOURCES);
|
|
}
|
|
|
|
if (sock->fd < 0) {
|
|
switch (errno) {
|
|
case EMFILE:
|
|
case ENFILE:
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
|
|
"%s: %s", err, strbuf);
|
|
/* fallthrough */
|
|
case ENOBUFS:
|
|
inc_stats(manager->stats,
|
|
sock->statsindex[STATID_OPENFAIL]);
|
|
return (ISC_R_NORESOURCES);
|
|
|
|
case EPROTONOSUPPORT:
|
|
case EPFNOSUPPORT:
|
|
case EAFNOSUPPORT:
|
|
/*
 * Linux 2.2 (and maybe others) returns EINVAL instead of
 * EAFNOSUPPORT.
 */
|
|
case EINVAL:
|
|
inc_stats(manager->stats,
|
|
sock->statsindex[STATID_OPENFAIL]);
|
|
return (ISC_R_FAMILYNOSUPPORT);
|
|
|
|
default:
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
|
|
err, strbuf);
|
|
inc_stats(manager->stats,
|
|
sock->statsindex[STATID_OPENFAIL]);
|
|
return (ISC_R_UNEXPECTED);
|
|
}
|
|
}
|
|
|
|
if (dup_socket != NULL) {
|
|
goto setup_done;
|
|
}
|
|
|
|
result = make_nonblock(sock->fd);
|
|
if (result != ISC_R_SUCCESS) {
|
|
(void)close(sock->fd);
|
|
inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
|
|
return (result);
|
|
}
|
|
|
|
#ifdef SO_NOSIGPIPE
|
|
if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
|
|
sizeof(on)) < 0) {
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, SO_NOSIGPIPE) failed: %s",
|
|
sock->fd, strbuf);
|
|
/* Press on... */
|
|
}
|
|
#endif /* ifdef SO_NOSIGPIPE */
|
|
|
|
/*
|
|
* Use minimum mtu if possible.
|
|
*/
|
|
if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
|
|
use_min_mtu(sock);
|
|
set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
|
|
}
|
|
|
|
#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
|
|
if (sock->type == isc_sockettype_udp) {
|
|
#if defined(USE_CMSG)
|
|
#if defined(SO_TIMESTAMP)
|
|
if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
|
|
sizeof(on)) < 0 &&
|
|
errno != ENOPROTOOPT)
|
|
{
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, SO_TIMESTAMP) failed: "
|
|
"%s",
|
|
sock->fd, strbuf);
|
|
/* Press on... */
|
|
}
|
|
#endif /* SO_TIMESTAMP */
|
|
|
|
#ifdef IPV6_RECVPKTINFO
|
|
/* RFC 3542 */
|
|
if ((sock->pf == AF_INET6) &&
|
|
(setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
|
|
(void *)&on, sizeof(on)) < 0))
|
|
{
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, IPV6_RECVPKTINFO) "
|
|
"failed: %s",
|
|
sock->fd, strbuf);
|
|
}
|
|
#else /* ifdef IPV6_RECVPKTINFO */
|
|
/* RFC 2292 */
|
|
if ((sock->pf == AF_INET6) &&
|
|
(setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
|
|
(void *)&on, sizeof(on)) < 0))
|
|
{
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, IPV6_PKTINFO) failed: "
|
|
"%s",
|
|
sock->fd, strbuf);
|
|
}
|
|
#endif /* IPV6_RECVPKTINFO */
|
|
#endif /* defined(USE_CMSG) */
|
|
|
|
set_ip_dontfrag(sock);
|
|
|
|
#if defined(SET_RCVBUF)
|
|
optlen = sizeof(size);
|
|
if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
|
|
&optlen) == 0 &&
|
|
size < rcvbuf)
|
|
{
|
|
RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
|
|
ISC_R_SUCCESS);
|
|
if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
|
|
(void *)&rcvbuf, sizeof(rcvbuf)) == -1)
|
|
{
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, SO_RCVBUF, "
|
|
"%d) failed: %s",
|
|
sock->fd, rcvbuf, strbuf);
|
|
}
|
|
}
|
|
#endif /* if defined(SET_RCVBUF) */
|
|
|
|
#if defined(SET_SNDBUF)
|
|
optlen = sizeof(size);
|
|
if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
|
|
&optlen) == 0 &&
|
|
size < sndbuf)
|
|
{
|
|
RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
|
|
ISC_R_SUCCESS);
|
|
if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
|
|
(void *)&sndbuf, sizeof(sndbuf)) == -1)
|
|
{
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, SO_SNDBUF, "
|
|
"%d) failed: %s",
|
|
sock->fd, sndbuf, strbuf);
|
|
}
|
|
}
|
|
#endif /* if defined(SET_SNDBUF) */
|
|
}
|
|
#ifdef IPV6_RECVTCLASS
|
|
if ((sock->pf == AF_INET6) &&
|
|
(setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
|
|
sizeof(on)) < 0))
|
|
{
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, IPV6_RECVTCLASS) "
|
|
"failed: %s",
|
|
sock->fd, strbuf);
|
|
}
|
|
#endif /* ifdef IPV6_RECVTCLASS */
|
|
#ifdef IP_RECVTOS
|
|
if ((sock->pf == AF_INET) &&
|
|
(setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
|
|
sizeof(on)) < 0))
|
|
{
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"setsockopt(%d, IP_RECVTOS) "
|
|
"failed: %s",
|
|
sock->fd, strbuf);
|
|
}
|
|
#endif /* ifdef IP_RECVTOS */
|
|
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */
|
|
|
|
setup_done:
|
|
inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
|
|
if (sock->active == 0) {
|
|
inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
|
|
sock->active = 1;
|
|
}
|
|
|
|
return (ISC_R_SUCCESS);
|
|
}
|
|
|
|
/*
 * Create a 'type' socket or duplicate an existing socket, managed
 * by 'manager'.  Events are posted to the task and action supplied
 * with each subsequent I/O request.  The new socket is returned in
 * 'socketp'.
 */
|
|
static isc_result_t
|
|
socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
|
|
isc_socket_t **socketp, isc_socket_t *dup_socket) {
|
|
isc__socket_t *sock = NULL;
|
|
isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
|
|
isc__socketthread_t *thread;
|
|
isc_result_t result;
|
|
int lockid;
|
|
|
|
REQUIRE(VALID_MANAGER(manager));
|
|
REQUIRE(socketp != NULL && *socketp == NULL);
|
|
|
|
result = allocate_socket(manager, type, &sock);
|
|
if (result != ISC_R_SUCCESS) {
|
|
return (result);
|
|
}
|
|
|
|
switch (sock->type) {
|
|
case isc_sockettype_udp:
|
|
sock->statsindex = (pf == AF_INET) ? udp4statsindex
|
|
: udp6statsindex;
|
|
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
|
|
sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
|
|
break;
|
|
case isc_sockettype_tcp:
|
|
sock->statsindex = (pf == AF_INET) ? tcp4statsindex
|
|
: tcp6statsindex;
|
|
break;
|
|
case isc_sockettype_unix:
|
|
sock->statsindex = unixstatsindex;
|
|
break;
|
|
case isc_sockettype_raw:
|
|
sock->statsindex = rawstatsindex;
|
|
break;
|
|
default:
|
|
INSIST(0);
|
|
ISC_UNREACHABLE();
|
|
}
|
|
|
|
sock->pf = pf;
|
|
|
|
result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
|
|
if (result != ISC_R_SUCCESS) {
|
|
free_socket(&sock);
|
|
return (result);
|
|
}
|
|
|
|
if (sock->fd == -1) {
|
|
abort();
|
|
}
|
|
sock->threadid = gen_threadid(sock);
|
|
isc_refcount_increment0(&sock->references);
|
|
thread = &manager->threads[sock->threadid];
|
|
*socketp = (isc_socket_t *)sock;
|
|
|
|
/*
|
|
* Note we don't have to lock the socket like we normally would because
|
|
* there are no external references to it yet.
|
|
*/
|
|
|
|
lockid = FDLOCK_ID(sock->fd);
|
|
LOCK(&thread->fdlock[lockid]);
|
|
thread->fds[sock->fd] = sock;
|
|
thread->fdstate[sock->fd] = MANAGED;
|
|
#if defined(USE_EPOLL)
|
|
thread->epoll_events[sock->fd] = 0;
|
|
#endif /* if defined(USE_EPOLL) */
|
|
#ifdef USE_DEVPOLL
|
|
INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
|
|
thread->fdpollinfo[sock->fd].want_write == 0);
|
|
#endif /* ifdef USE_DEVPOLL */
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
|
|
LOCK(&manager->lock);
|
|
ISC_LIST_APPEND(manager->socklist, sock, link);
|
|
#ifdef USE_SELECT
|
|
if (thread->maxfd < sock->fd) {
|
|
thread->maxfd = sock->fd;
|
|
}
|
|
#endif /* ifdef USE_SELECT */
|
|
UNLOCK(&manager->lock);
|
|
|
|
socket_log(sock, NULL, CREATION,
|
|
dup_socket != NULL ? "dupped" : "created");
|
|
|
|
return (ISC_R_SUCCESS);
|
|
}
|
|
|
|
/*%
 * Create a new 'type' socket managed by 'manager'.  Events are
 * posted to the task and action supplied with each subsequent I/O
 * request.  The new socket is returned in 'socketp'.
 */
|
|
isc_result_t
|
|
isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
|
|
isc_socket_t **socketp) {
|
|
return (socket_create(manager0, pf, type, socketp, NULL));
|
|
}
|
|
|
|
/*%
|
|
* Duplicate an existing socket. The new socket is returned
|
|
* in 'socketp'.
|
|
*/
|
|
isc_result_t
|
|
isc_socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
|
|
isc__socket_t *sock = (isc__socket_t *)sock0;
|
|
|
|
REQUIRE(VALID_SOCKET(sock));
|
|
REQUIRE(socketp != NULL && *socketp == NULL);
|
|
|
|
return (socket_create((isc_socketmgr_t *)sock->manager, sock->pf,
|
|
sock->type, socketp, sock0));
|
|
}
|
|
|
|
isc_result_t
|
|
isc_socket_open(isc_socket_t *sock0) {
|
|
isc_result_t result;
|
|
isc__socket_t *sock = (isc__socket_t *)sock0;
|
|
isc__socketthread_t *thread;
|
|
|
|
REQUIRE(VALID_SOCKET(sock));
|
|
|
|
LOCK(&sock->lock);
|
|
|
|
REQUIRE(isc_refcount_current(&sock->references) >= 1);
|
|
REQUIRE(sock->fd == -1);
|
|
REQUIRE(sock->threadid == -1);
|
|
|
|
result = opensocket(sock->manager, sock, NULL);
|
|
|
|
UNLOCK(&sock->lock);
|
|
|
|
if (result != ISC_R_SUCCESS) {
|
|
sock->fd = -1;
|
|
} else {
|
|
sock->threadid = gen_threadid(sock);
|
|
thread = &sock->manager->threads[sock->threadid];
|
|
int lockid = FDLOCK_ID(sock->fd);
|
|
|
|
LOCK(&thread->fdlock[lockid]);
|
|
thread->fds[sock->fd] = sock;
|
|
thread->fdstate[sock->fd] = MANAGED;
|
|
#if defined(USE_EPOLL)
|
|
thread->epoll_events[sock->fd] = 0;
|
|
#endif /* if defined(USE_EPOLL) */
|
|
#ifdef USE_DEVPOLL
|
|
INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
|
|
thread->fdpollinfo[sock->fd].want_write == 0);
|
|
#endif /* ifdef USE_DEVPOLL */
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
|
|
#ifdef USE_SELECT
|
|
LOCK(&sock->manager->lock);
|
|
if (thread->maxfd < sock->fd) {
|
|
thread->maxfd = sock->fd;
|
|
}
|
|
UNLOCK(&sock->manager->lock);
|
|
#endif /* ifdef USE_SELECT */
|
|
}
|
|
|
|
return (result);
|
|
}
|
|
|
|
/*
 * Attach to a socket.  Caller must explicitly detach when it is done.
 */
void
isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(socketp != NULL && *socketp == NULL);

	int old_refs = isc_refcount_increment(&sock->references);
	REQUIRE(old_refs > 0);

	*socketp = (isc_socket_t *)sock;
}

/*
 * Dereference a socket.  If this is the last reference to it, clean things
 * up by destroying the socket.
 */
void
isc_socket_detach(isc_socket_t **socketp) {
	isc__socket_t *sock;

	REQUIRE(socketp != NULL);
	sock = (isc__socket_t *)*socketp;
	REQUIRE(VALID_SOCKET(sock));
	if (isc_refcount_decrement(&sock->references) == 1) {
		destroy(&sock);
	}

	*socketp = NULL;
}
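
/*
 * Typical reference handling with the two functions above; a minimal
 * usage sketch with a hypothetical caller-owned pointer:
 */
#if 0
isc_socket_t *ref = NULL;

isc_socket_attach(sock, &ref); /* take our own reference */
/* ... use 'ref' ... */
isc_socket_detach(&ref); /* drop it; 'ref' is NULL afterwards */
#endif
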
isc_result_t
|
|
isc_socket_close(isc_socket_t *sock0) {
|
|
isc__socket_t *sock = (isc__socket_t *)sock0;
|
|
int fd;
|
|
isc__socketmgr_t *manager;
|
|
isc__socketthread_t *thread;
|
|
|
|
REQUIRE(VALID_SOCKET(sock));
|
|
|
|
LOCK(&sock->lock);
|
|
|
|
REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
|
|
|
|
INSIST(!sock->connecting);
|
|
INSIST(ISC_LIST_EMPTY(sock->recv_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->send_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->accept_list));
|
|
INSIST(ISC_LIST_EMPTY(sock->connect_list));
|
|
|
|
manager = sock->manager;
|
|
thread = &manager->threads[sock->threadid];
|
|
fd = sock->fd;
|
|
sock->fd = -1;
|
|
sock->threadid = -1;
|
|
|
|
sock->dupped = 0;
|
|
memset(sock->name, 0, sizeof(sock->name));
|
|
sock->tag = NULL;
|
|
sock->listener = 0;
|
|
sock->connected = 0;
|
|
sock->connecting = 0;
|
|
sock->bound = 0;
|
|
isc_sockaddr_any(&sock->peer_address);
|
|
|
|
UNLOCK(&sock->lock);
|
|
|
|
socketclose(thread, sock, fd);
|
|
|
|
return (ISC_R_SUCCESS);
|
|
}
|
|
|
|
/*
|
|
* Dequeue an item off the given socket's read queue, set the result code
|
|
* in the done event to the one provided, and send it to the task it was
|
|
* destined for.
|
|
*
|
|
* If the event to be sent is on a list, remove it before sending. If
|
|
* asked to, send and detach from the socket as well.
|
|
*
|
|
* Caller must have the socket locked if the event is attached to the socket.
|
|
*/
|
|
static void
|
|
send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
|
|
isc_task_t *task;
|
|
|
|
task = (*dev)->ev_sender;
|
|
|
|
(*dev)->ev_sender = sock;
|
|
|
|
if (ISC_LINK_LINKED(*dev, ev_link)) {
|
|
ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
|
|
}
|
|
|
|
if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
|
|
isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
|
|
sock->threadid);
|
|
} else {
|
|
isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* See comments for send_recvdone_event() above.
|
|
*
|
|
* Caller must have the socket locked if the event is attached to the socket.
|
|
*/
|
|
static void
|
|
send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
|
|
isc_task_t *task;
|
|
|
|
INSIST(dev != NULL && *dev != NULL);
|
|
|
|
task = (*dev)->ev_sender;
|
|
(*dev)->ev_sender = sock;
|
|
|
|
if (ISC_LINK_LINKED(*dev, ev_link)) {
|
|
ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
|
|
}
|
|
|
|
if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
|
|
isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
|
|
sock->threadid);
|
|
} else {
|
|
isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* See comments for send_recvdone_event() above.
|
|
*
|
|
* Caller must have the socket locked if the event is attached to the socket.
|
|
*/
|
|
static void
|
|
send_connectdone_event(isc__socket_t *sock, isc_socket_connev_t **dev) {
|
|
isc_task_t *task;
|
|
|
|
INSIST(dev != NULL && *dev != NULL);
|
|
|
|
task = (*dev)->ev_sender;
|
|
(*dev)->ev_sender = sock;
|
|
|
|
if (ISC_LINK_LINKED(*dev, ev_link)) {
|
|
ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
|
|
}
|
|
|
|
isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
|
|
}
|
|
|
|
/*
|
|
* Call accept() on a socket, to get the new file descriptor. The listen
|
|
* socket is used as a prototype to create a new isc_socket_t. The new
|
|
* socket has one outstanding reference. The task receiving the event
|
|
* will be detached from just after the event is delivered.
|
|
*
|
|
* On entry to this function, the event delivered is the internal
|
|
* readable event, and the first item on the accept_list should be
|
|
* the done event we want to send. If the list is empty, this is a no-op,
|
|
* so just unlock and return.
|
|
*/
|
|
static void
|
|
internal_accept(isc__socket_t *sock) {
|
|
isc__socketmgr_t *manager;
|
|
isc__socketthread_t *thread, *nthread;
|
|
isc_socket_newconnev_t *dev;
|
|
isc_task_t *task;
|
|
socklen_t addrlen;
|
|
int fd;
|
|
isc_result_t result = ISC_R_SUCCESS;
|
|
char strbuf[ISC_STRERRORSIZE];
|
|
const char *err = "accept";
|
|
|
|
INSIST(VALID_SOCKET(sock));
|
|
REQUIRE(sock->fd >= 0);
|
|
|
|
socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");
|
|
|
|
manager = sock->manager;
|
|
INSIST(VALID_MANAGER(manager));
|
|
thread = &manager->threads[sock->threadid];
|
|
|
|
INSIST(sock->listener);
|
|
|
|
/*
|
|
* Get the first item off the accept list.
|
|
* If it is empty, unlock the socket and return.
|
|
*/
|
|
dev = ISC_LIST_HEAD(sock->accept_list);
|
|
if (dev == NULL) {
|
|
unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
|
|
UNLOCK(&sock->lock);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Try to accept the new connection. If the accept fails with
|
|
* EAGAIN or EINTR, simply poke the watcher to watch this socket
|
|
* again. Also ignore ECONNRESET, which has been reported to
|
|
* be spuriously returned on Linux 2.2.19 although it is not
|
|
* a documented error for accept(). ECONNABORTED has been
|
|
* reported for Solaris 8. The rest are thrown in not because
|
|
* we have seen them but because they are ignored by other
|
|
* daemons such as BIND 8 and Apache.
|
|
*/
|
|
|
|
addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
|
|
memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
|
|
fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
|
|
(void *)&addrlen);
|
|
|
|
#ifdef F_DUPFD
|
|
/*
|
|
* Leave a space for stdio to work in.
|
|
*/
|
|
if (fd >= 0 && fd < 20) {
|
|
int newfd, tmp;
|
|
newfd = fcntl(fd, F_DUPFD, 20);
|
|
tmp = errno;
|
|
(void)close(fd);
|
|
errno = tmp;
|
|
fd = newfd;
|
|
err = "accept/fcntl";
|
|
}
|
|
#endif /* ifdef F_DUPFD */
|
|
|
|
if (fd < 0) {
|
|
if (SOFT_ERROR(errno)) {
|
|
goto soft_error;
|
|
}
|
|
switch (errno) {
|
|
case ENFILE:
|
|
case EMFILE:
|
|
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
|
|
"%s: too many open file descriptors",
|
|
err);
|
|
goto soft_error;
|
|
|
|
case ENOBUFS:
|
|
case ENOMEM:
|
|
case ECONNRESET:
|
|
case ECONNABORTED:
|
|
case EHOSTUNREACH:
|
|
case EHOSTDOWN:
|
|
case ENETUNREACH:
|
|
case ENETDOWN:
|
|
case ECONNREFUSED:
|
|
#ifdef EPROTO
|
|
case EPROTO:
|
|
#endif /* ifdef EPROTO */
|
|
#ifdef ENONET
|
|
case ENONET:
|
|
#endif /* ifdef ENONET */
|
|
goto soft_error;
|
|
default:
|
|
break;
|
|
}
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"internal_accept: %s() failed: %s", err,
|
|
strbuf);
|
|
fd = -1;
|
|
result = ISC_R_UNEXPECTED;
|
|
} else {
|
|
if (addrlen == 0U) {
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"internal_accept(): "
|
|
"accept() failed to return "
|
|
"remote address");
|
|
|
|
(void)close(fd);
|
|
goto soft_error;
|
|
} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
|
|
sock->pf) {
|
|
UNEXPECTED_ERROR(
|
|
__FILE__, __LINE__,
|
|
"internal_accept(): "
|
|
"accept() returned peer address "
|
|
"family %u (expected %u)",
|
|
NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
|
|
sock->pf);
|
|
(void)close(fd);
|
|
goto soft_error;
|
|
} else if (fd >= (int)manager->maxsocks) {
|
|
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
|
|
"accept: file descriptor exceeds limit "
|
|
"(%d/%u)",
|
|
fd, manager->maxsocks);
|
|
(void)close(fd);
|
|
goto soft_error;
|
|
}
|
|
}
|
|
|
|
if (fd != -1) {
|
|
NEWCONNSOCK(dev)->peer_address.length = addrlen;
|
|
NEWCONNSOCK(dev)->pf = sock->pf;
|
|
}
|
|
|
|
/*
|
|
* Pull off the done event.
|
|
*/
|
|
ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
|
|
|
|
/*
|
|
* Poke watcher if there are more pending accepts.
|
|
*/
|
|
if (ISC_LIST_EMPTY(sock->accept_list)) {
|
|
unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
|
|
}
|
|
|
|
if (fd != -1) {
|
|
result = make_nonblock(fd);
|
|
if (result != ISC_R_SUCCESS) {
|
|
(void)close(fd);
|
|
fd = -1;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* We need to unlock sock->lock now to be able to lock manager->lock
|
|
* without risking a deadlock with xmlstats.
|
|
*/
|
|
UNLOCK(&sock->lock);
|
|
|
|
/*
 * fd == -1 means the new socket was not created.
 */
|
|
if (fd != -1) {
|
|
int lockid = FDLOCK_ID(fd);
|
|
|
|
NEWCONNSOCK(dev)->fd = fd;
|
|
NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
|
|
NEWCONNSOCK(dev)->bound = 1;
|
|
NEWCONNSOCK(dev)->connected = 1;
|
|
nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];
|
|
|
|
/*
|
|
* We already hold a lock on one fdlock in accepting thread,
|
|
* we need to make sure that we don't double lock.
|
|
*/
|
|
bool same_bucket = (sock->threadid ==
|
|
NEWCONNSOCK(dev)->threadid) &&
|
|
(FDLOCK_ID(sock->fd) == lockid);
|
|
|
|
/*
|
|
* Use minimum mtu if possible.
|
|
*/
|
|
use_min_mtu(NEWCONNSOCK(dev));
|
|
set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);
|
|
|
|
/*
|
|
* Ensure DSCP settings are inherited across accept.
|
|
*/
|
|
setdscp(NEWCONNSOCK(dev), sock->dscp);
|
|
|
|
/*
|
|
* Save away the remote address
|
|
*/
|
|
dev->address = NEWCONNSOCK(dev)->peer_address;
|
|
|
|
if (NEWCONNSOCK(dev)->active == 0) {
|
|
inc_stats(manager->stats,
|
|
NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
|
|
NEWCONNSOCK(dev)->active = 1;
|
|
}
|
|
|
|
if (!same_bucket) {
|
|
LOCK(&nthread->fdlock[lockid]);
|
|
}
|
|
nthread->fds[fd] = NEWCONNSOCK(dev);
|
|
nthread->fdstate[fd] = MANAGED;
|
|
#if defined(USE_EPOLL)
|
|
nthread->epoll_events[fd] = 0;
|
|
#endif /* if defined(USE_EPOLL) */
|
|
if (!same_bucket) {
|
|
UNLOCK(&nthread->fdlock[lockid]);
|
|
}
|
|
|
|
LOCK(&manager->lock);
|
|
|
|
#ifdef USE_SELECT
|
|
if (nthread->maxfd < fd) {
|
|
nthread->maxfd = fd;
|
|
}
|
|
#endif /* ifdef USE_SELECT */
|
|
|
|
socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
|
|
"accepted connection, new socket %p",
|
|
dev->newsocket);
|
|
|
|
ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
|
|
|
|
UNLOCK(&manager->lock);
|
|
|
|
inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
|
|
} else {
|
|
inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
|
|
isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
|
|
free_socket((isc__socket_t **)&dev->newsocket);
|
|
}
|
|
|
|
/*
|
|
* Fill in the done event details and send it off.
|
|
*/
|
|
dev->result = result;
|
|
task = dev->ev_sender;
|
|
dev->ev_sender = sock;
|
|
|
|
isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
|
|
return;
|
|
|
|
soft_error:
|
|
watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
|
|
UNLOCK(&sock->lock);
|
|
|
|
inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
|
|
return;
|
|
}
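
/*
 * The errno classification above, reduced to its skeleton; a sketch
 * only (the full list of "soft" errors is in the switch in
 * internal_accept()):
 */
#if 0
int fd = accept(listenfd, (struct sockaddr *)&ss, &sslen);
if (fd < 0) {
	switch (errno) {
	case EAGAIN:
	case EINTR:
	case ECONNRESET:
	case ECONNABORTED:
		/* transient: keep the listener watched, retry later */
		break;
	default:
		/* hard: log it and report ISC_R_UNEXPECTED */
		break;
	}
}
#endif
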
static void
|
|
internal_recv(isc__socket_t *sock) {
|
|
isc_socketevent_t *dev;
|
|
|
|
INSIST(VALID_SOCKET(sock));
|
|
REQUIRE(sock->fd >= 0);
|
|
|
|
dev = ISC_LIST_HEAD(sock->recv_list);
|
|
if (dev == NULL) {
|
|
goto finish;
|
|
}
|
|
|
|
socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
|
|
dev, dev->ev_sender);
|
|
|
|
/*
|
|
* Try to do as much I/O as possible on this socket. There are no
|
|
* limits here, currently.
|
|
*/
|
|
while (dev != NULL) {
|
|
switch (doio_recv(sock, dev)) {
|
|
case DOIO_SOFT:
|
|
goto finish;
|
|
|
|
case DOIO_EOF:
|
|
/*
|
|
* read of 0 means the remote end was closed.
|
|
* Run through the event queue and dispatch all
|
|
* the events with an EOF result code.
|
|
*/
|
|
do {
|
|
dev->result = ISC_R_EOF;
|
|
send_recvdone_event(sock, &dev);
|
|
dev = ISC_LIST_HEAD(sock->recv_list);
|
|
} while (dev != NULL);
|
|
goto finish;
|
|
|
|
case DOIO_SUCCESS:
|
|
case DOIO_HARD:
|
|
send_recvdone_event(sock, &dev);
|
|
break;
|
|
}
|
|
|
|
dev = ISC_LIST_HEAD(sock->recv_list);
|
|
}
|
|
|
|
finish:
|
|
if (ISC_LIST_EMPTY(sock->recv_list)) {
|
|
unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
|
|
SELECT_POKE_READ);
|
|
}
|
|
}
|
|
|
|
static void
|
|
internal_send(isc__socket_t *sock) {
|
|
isc_socketevent_t *dev;
|
|
|
|
INSIST(VALID_SOCKET(sock));
|
|
REQUIRE(sock->fd >= 0);
|
|
|
|
dev = ISC_LIST_HEAD(sock->send_list);
|
|
if (dev == NULL) {
|
|
goto finish;
|
|
}
|
|
socket_log(sock, NULL, IOEVENT, "internal_send: event %p -> task %p", dev,
|
|
dev->ev_sender);
|
|
|
|
/*
|
|
* Try to do as much I/O as possible on this socket. There are no
|
|
* limits here, currently.
|
|
*/
|
|
while (dev != NULL) {
|
|
switch (doio_send(sock, dev)) {
|
|
case DOIO_SOFT:
|
|
goto finish;
|
|
|
|
case DOIO_HARD:
|
|
case DOIO_SUCCESS:
|
|
send_senddone_event(sock, &dev);
|
|
break;
|
|
}
|
|
|
|
dev = ISC_LIST_HEAD(sock->send_list);
|
|
}
|
|
|
|
finish:
|
|
if (ISC_LIST_EMPTY(sock->send_list)) {
|
|
unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
|
|
SELECT_POKE_WRITE);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Process read/writes on each fd here. Avoid locking
|
|
* and unlocking twice if both reads and writes are possible.
|
|
*/
|
|
static void
|
|
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
|
|
isc__socket_t *sock;
|
|
int lockid = FDLOCK_ID(fd);
|
|
|
|
/*
|
|
* If the socket is going to be closed, don't do more I/O.
|
|
*/
|
|
LOCK(&thread->fdlock[lockid]);
|
|
if (thread->fdstate[fd] == CLOSE_PENDING) {
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
|
|
(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
|
|
(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
|
|
return;
|
|
}
|
|
|
|
sock = thread->fds[fd];
|
|
if (sock == NULL) {
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
return;
|
|
}
|
|
|
|
LOCK(&sock->lock);
|
|
|
|
if (sock->fd < 0) {
|
|
/*
|
|
* Sock is being closed - the final external reference
|
|
* is gone but it was not yet removed from event loop
|
|
* and fdstate[]/fds[] as destroy() is waiting on
|
|
* thread->fdlock[lockid] or sock->lock that we're holding.
|
|
* Just release the locks and bail.
|
|
*/
|
|
UNLOCK(&sock->lock);
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
return;
|
|
}
|
|
|
|
REQUIRE(readable || writeable);
|
|
if (writeable) {
|
|
if (sock->connecting) {
|
|
internal_connect(sock);
|
|
} else {
|
|
internal_send(sock);
|
|
}
|
|
}
|
|
|
|
if (readable) {
|
|
if (sock->listener) {
|
|
internal_accept(sock); /* unlocks sock */
|
|
} else {
|
|
internal_recv(sock);
|
|
UNLOCK(&sock->lock);
|
|
}
|
|
} else {
|
|
UNLOCK(&sock->lock);
|
|
}
|
|
|
|
UNLOCK(&thread->fdlock[lockid]);
|
|
|
|
/*
|
|
* Socket destruction might be pending, it will resume
|
|
* after releasing fdlock and sock->lock.
|
|
*/
|
|
}
|
|
|
|
/*
|
|
* process_fds is different for different event loops
|
|
* it takes the events from event loops and for each FD
|
|
* launches process_fd
|
|
*/
|
|
#ifdef USE_KQUEUE
|
|
static bool
|
|
process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
|
|
int i;
|
|
bool readable, writable;
|
|
bool done = false;
|
|
bool have_ctlevent = false;
|
|
if (nevents == thread->nevents) {
|
|
/*
|
|
* This is not an error, but something unexpected. If this
|
|
* happens, it may indicate the need for increasing
|
|
* ISC_SOCKET_MAXEVENTS.
|
|
*/
|
|
thread_log(thread, ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
|
|
"maximum number of FD events (%d) received",
|
|
nevents);
|
|
}
|
|
|
|
for (i = 0; i < nevents; i++) {
|
|
REQUIRE(events[i].ident < thread->manager->maxsocks);
|
|
if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
|
|
have_ctlevent = true;
|
|
continue;
|
|
}
|
|
readable = (events[i].filter == EVFILT_READ);
|
|
writable = (events[i].filter == EVFILT_WRITE);
|
|
process_fd(thread, events[i].ident, readable, writable);
|
|
}
|
|
|
|
if (have_ctlevent) {
|
|
done = process_ctlfd(thread);
|
|
}
|
|
|
|
return (done);
|
|
}
|
|
#elif defined(USE_EPOLL)
|
|
static bool
|
|
process_fds(isc__socketthread_t *thread, struct epoll_event *events,
|
|
int nevents) {
|
|
int i;
|
|
bool done = false;
|
|
bool have_ctlevent = false;
|
|
|
|
if (nevents == thread->nevents) {
|
|
thread_log(thread, ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
|
|
"maximum number of FD events (%d) received",
|
|
nevents);
|
|
}
|
|
|
|
for (i = 0; i < nevents; i++) {
|
|
REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
|
|
if (events[i].data.fd == thread->pipe_fds[0]) {
|
|
have_ctlevent = true;
|
|
continue;
|
|
}
|
|
if ((events[i].events & EPOLLERR) != 0 ||
|
|
(events[i].events & EPOLLHUP) != 0) {
|
|
/*
|
|
* epoll does not set IN/OUT bits on an erroneous
|
|
* condition, so we need to try both anyway. This is a
|
|
* bit inefficient, but should be okay for such rare
|
|
* events. Note also that the read or write attempt
|
|
* won't block because we use non-blocking sockets.
|
|
*/
|
|
int fd = events[i].data.fd;
|
|
events[i].events |= thread->epoll_events[fd];
|
|
}
|
|
process_fd(thread, events[i].data.fd,
|
|
(events[i].events & EPOLLIN) != 0,
|
|
(events[i].events & EPOLLOUT) != 0);
|
|
}
|
|
|
|
if (have_ctlevent) {
|
|
done = process_ctlfd(thread);
|
|
}
|
|
|
|
return (done);
|
|
}
|
|
#elif defined(USE_DEVPOLL)
|
|
static bool
|
|
process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
|
|
int i;
|
|
bool done = false;
|
|
bool have_ctlevent = false;
|
|
|
|
if (nevents == thread->nevents) {
|
|
thread_log(thread, ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
|
|
"maximum number of FD events (%d) received",
|
|
nevents);
|
|
}
|
|
|
|
for (i = 0; i < nevents; i++) {
|
|
REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
|
|
if (events[i].fd == thread->pipe_fds[0]) {
|
|
have_ctlevent = true;
|
|
continue;
|
|
}
|
|
process_fd(thread, events[i].fd,
|
|
(events[i].events & POLLIN) != 0,
|
|
(events[i].events & POLLOUT) != 0);
|
|
}
|
|
|
|
if (have_ctlevent) {
|
|
done = process_ctlfd(thread);
|
|
}
|
|
|
|
return (done);
|
|
}
|
|
#elif defined(USE_SELECT)
|
|
static void
|
|
process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
|
|
fd_set *writefds) {
|
|
int i;
|
|
|
|
REQUIRE(maxfd <= (int)thread->manager->maxsocks);
|
|
|
|
for (i = 0; i < maxfd; i++) {
|
|
if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
|
|
continue;
|
|
}
|
|
process_fd(thread, i, FD_ISSET(i, readfds),
|
|
FD_ISSET(i, writefds));
|
|
}
|
|
}
|
|
#endif /* ifdef USE_KQUEUE */
|
|
|
|
static bool
|
|
process_ctlfd(isc__socketthread_t *thread) {
|
|
int msg, fd;
|
|
|
|
for (;;) {
|
|
select_readmsg(thread, &fd, &msg);
|
|
|
|
thread_log(thread, IOEVENT,
|
|
"watcher got message %d for socket %d", msg, fd);
|
|
|
|
/*
|
|
* Nothing to read?
|
|
*/
|
|
if (msg == SELECT_POKE_NOTHING) {
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Handle shutdown message. We really should
|
|
* jump out of this loop right away, but
|
|
* it doesn't matter if we have to do a little
|
|
* more work first.
|
|
*/
|
|
if (msg == SELECT_POKE_SHUTDOWN) {
|
|
return (true);
|
|
}
|
|
|
|
/*
|
|
* This is a wakeup on a socket. Look
|
|
* at the event queue for both read and write,
|
|
* and decide if we need to watch on it now
|
|
* or not.
|
|
*/
|
|
wakeup_socket(thread, fd, msg);
|
|
}
|
|
|
|
return (false);
|
|
}
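
/*
 * The control-channel read above pairs with select_poke(), defined
 * earlier in this file: a (fd, msg) pair is written down the pipe to
 * wake the watcher.  Roughly, assuming that wire format:
 */
#if 0
int buf[2] = { fd, msg };

/* poking side */
(void)write(thread->pipe_fds[1], buf, sizeof(buf));

/* watcher side, inside select_readmsg() */
(void)read(thread->pipe_fds[0], buf, sizeof(buf));
#endif
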
/*
|
|
* This is the thread that will loop forever, always in a select or poll
|
|
* call.
|
|
*
|
|
* When select returns something to do, do whatever's necessary and post
|
|
* an event to the task that was requesting the action.
|
|
*/
|
|
static isc_threadresult_t
|
|
netthread(void *uap) {
|
|
isc__socketthread_t *thread = uap;
|
|
isc__socketmgr_t *manager = thread->manager;
|
|
(void)manager;
|
|
bool done;
|
|
int cc;
|
|
if (manager->nthreads > 1) {
|
|
isc_thread_setaffinity(thread->threadid);
|
|
}
|
|
#ifdef USE_KQUEUE
|
|
const char *fnname = "kevent()";
|
|
#elif defined(USE_EPOLL)
|
|
const char *fnname = "epoll_wait()";
|
|
#elif defined(USE_DEVPOLL)
|
|
isc_result_t result;
|
|
const char *fnname = "ioctl(DP_POLL)";
|
|
struct dvpoll dvp;
|
|
int pass;
|
|
#if defined(ISC_SOCKET_USE_POLLWATCH)
|
|
pollstate_t pollstate = poll_idle;
|
|
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
|
|
#elif defined(USE_SELECT)
|
|
const char *fnname = "select()";
|
|
int maxfd;
|
|
int ctlfd;
|
|
#endif /* ifdef USE_KQUEUE */
|
|
char strbuf[ISC_STRERRORSIZE];
|
|
|
|
#if defined(USE_SELECT)
|
|
/*
|
|
* Get the control fd here. This will never change.
|
|
*/
|
|
ctlfd = thread->pipe_fds[0];
|
|
#endif /* if defined(USE_SELECT) */
|
|
done = false;
|
|
while (!done) {
|
|
do {
|
|
#ifdef USE_KQUEUE
|
|
cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
|
|
thread->nevents, NULL);
|
|
#elif defined(USE_EPOLL)
|
|
cc = epoll_wait(thread->epoll_fd, thread->events,
|
|
thread->nevents, -1);
|
|
#elif defined(USE_DEVPOLL)
|
|
/*
|
|
* Re-probe every thousand calls.
|
|
*/
|
|
if (thread->calls++ > 1000U) {
|
|
result = isc_resource_getcurlimit(
|
|
isc_resource_openfiles,
|
|
&thread->open_max);
|
|
if (result != ISC_R_SUCCESS) {
|
|
thread->open_max = 64;
|
|
}
|
|
thread->calls = 0;
|
|
}
|
|
for (pass = 0; pass < 2; pass++) {
|
|
dvp.dp_fds = thread->events;
|
|
dvp.dp_nfds = thread->nevents;
|
|
if (dvp.dp_nfds >= thread->open_max) {
|
|
dvp.dp_nfds = thread->open_max - 1;
|
|
}
|
|
#ifndef ISC_SOCKET_USE_POLLWATCH
|
|
dvp.dp_timeout = -1;
|
|
#else /* ifndef ISC_SOCKET_USE_POLLWATCH */
|
|
if (pollstate == poll_idle) {
|
|
dvp.dp_timeout = -1;
|
|
} else {
|
|
dvp.dp_timeout =
|
|
ISC_SOCKET_POLLWATCH_TIMEOUT;
|
|
}
|
|
#endif /* ISC_SOCKET_USE_POLLWATCH */
|
|
cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
|
|
if (cc == -1 && errno == EINVAL) {
|
|
/*
|
|
* {OPEN_MAX} may have dropped. Look
|
|
* up the current value and try again.
|
|
*/
|
|
result = isc_resource_getcurlimit(
|
|
isc_resource_openfiles,
|
|
&thread->open_max);
|
|
if (result != ISC_R_SUCCESS) {
|
|
thread->open_max = 64;
|
|
}
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
#elif defined(USE_SELECT)
|
|
/*
 * With select() there is only ever one watcher thread, so
 * taking the manager lock here is safe and cheap.
 */
|
|
LOCK(&manager->lock);
|
|
memmove(thread->read_fds_copy, thread->read_fds,
|
|
thread->fd_bufsize);
|
|
memmove(thread->write_fds_copy, thread->write_fds,
|
|
thread->fd_bufsize);
|
|
maxfd = thread->maxfd + 1;
|
|
UNLOCK(&manager->lock);
|
|
|
|
cc = select(maxfd, thread->read_fds_copy,
|
|
thread->write_fds_copy, NULL, NULL);
|
|
#endif /* USE_KQUEUE */
|
|
|
|
if (cc < 0 && !SOFT_ERROR(errno)) {
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
|
|
fnname, strbuf);
|
|
}
|
|
|
|
#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
|
|
if (cc == 0) {
|
|
if (pollstate == poll_active) {
|
|
pollstate = poll_checking;
|
|
} else if (pollstate == poll_checking) {
|
|
pollstate = poll_idle;
|
|
}
|
|
} else if (cc > 0) {
|
|
if (pollstate == poll_checking) {
|
|
/*
|
|
* XXX: We'd like to use a more
|
|
* verbose log level as it's actually an
|
|
* unexpected event, but the kernel bug
|
|
* reportedly happens pretty frequently
|
|
* (and it can also be a false positive)
|
|
* so it would be just too noisy.
|
|
*/
|
|
thread_log(thread,
|
|
ISC_LOGCATEGORY_GENERAL,
|
|
ISC_LOGMODULE_SOCKET,
|
|
ISC_LOG_DEBUG(1),
|
|
"unexpected POLL timeout");
|
|
}
|
|
pollstate = poll_active;
|
|
}
|
|
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
|
|
} while (cc < 0);
|
|
|
|
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
|
|
done = process_fds(thread, thread->events, cc);
|
|
#elif defined(USE_SELECT)
|
|
process_fds(thread, maxfd, thread->read_fds_copy,
|
|
thread->write_fds_copy);
|
|
|
|
/*
|
|
* Process reads on internal, control fd.
|
|
*/
|
|
if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
|
|
done = process_ctlfd(thread);
|
|
}
|
|
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) */
|
|
}
|
|
|
|
thread_log(thread, TRACE, "watcher exiting");
|
|
return ((isc_threadresult_t)0);
|
|
}
|
|
|
|
void
|
|
isc_socketmgr_setreserved(isc_socketmgr_t *manager0, uint32_t reserved) {
|
|
isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
|
|
|
|
REQUIRE(VALID_MANAGER(manager));
|
|
|
|
manager->reserved = reserved;
|
|
}
|
|
|
|
void
|
|
isc_socketmgr_maxudp(isc_socketmgr_t *manager0, unsigned int maxudp) {
|
|
isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
|
|
|
|
REQUIRE(VALID_MANAGER(manager));
|
|
|
|
manager->maxudp = maxudp;
|
|
}
|
|
|
|
/*
 * Set up a socket thread; thread->manager and thread->threadid must
 * already be filled in.
 */
|
|
|
|
static isc_result_t
|
|
setup_thread(isc__socketthread_t *thread) {
|
|
isc_result_t result = ISC_R_SUCCESS;
|
|
int i;
|
|
char strbuf[ISC_STRERRORSIZE];
|
|
|
|
REQUIRE(thread != NULL);
|
|
REQUIRE(VALID_MANAGER(thread->manager));
|
|
REQUIRE(thread->threadid >= 0 &&
|
|
thread->threadid < thread->manager->nthreads);
|
|
|
|
thread->fds = isc_mem_get(thread->manager->mctx,
|
|
thread->manager->maxsocks *
|
|
sizeof(isc__socket_t *));
|
|
|
|
memset(thread->fds, 0,
|
|
thread->manager->maxsocks * sizeof(isc_socket_t *));
|
|
|
|
thread->fdstate = isc_mem_get(thread->manager->mctx,
|
|
thread->manager->maxsocks * sizeof(int));
|
|
|
|
memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
|
|
|
|
thread->fdlock = isc_mem_get(thread->manager->mctx,
|
|
FDLOCK_COUNT * sizeof(isc_mutex_t));
|
|
|
|
for (i = 0; i < FDLOCK_COUNT; i++) {
|
|
isc_mutex_init(&thread->fdlock[i]);
|
|
}
|
|
|
|
if (pipe(thread->pipe_fds) != 0) {
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
|
|
strbuf);
|
|
return (ISC_R_UNEXPECTED);
|
|
}
|
|
RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
|
|
|
|
#ifdef USE_KQUEUE
|
|
thread->nevents = ISC_SOCKET_MAXEVENTS;
|
|
thread->events = isc_mem_get(thread->manager->mctx,
|
|
sizeof(struct kevent) * thread->nevents);
|
|
|
|
thread->kqueue_fd = kqueue();
|
|
if (thread->kqueue_fd == -1) {
|
|
result = isc__errno2result(errno);
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
|
|
strbuf);
|
|
isc_mem_put(thread->manager->mctx, thread->events,
|
|
sizeof(struct kevent) * thread->nevents);
|
|
return (result);
|
|
}
|
|
|
|
result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
|
|
if (result != ISC_R_SUCCESS) {
|
|
close(thread->kqueue_fd);
|
|
isc_mem_put(thread->manager->mctx, thread->events,
|
|
sizeof(struct kevent) * thread->nevents);
|
|
}
|
|
return (result);
|
|
|
|
#elif defined(USE_EPOLL)
|
|
thread->nevents = ISC_SOCKET_MAXEVENTS;
|
|
thread->epoll_events =
|
|
isc_mem_get(thread->manager->mctx,
|
|
(thread->manager->maxsocks * sizeof(uint32_t)));
|
|
|
|
memset(thread->epoll_events, 0,
|
|
thread->manager->maxsocks * sizeof(uint32_t));
|
|
|
|
thread->events =
|
|
isc_mem_get(thread->manager->mctx,
|
|
sizeof(struct epoll_event) * thread->nevents);
|
|
|
|
thread->epoll_fd = epoll_create(thread->nevents);
|
|
if (thread->epoll_fd == -1) {
|
|
result = isc__errno2result(errno);
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
|
|
strbuf);
|
|
return (result);
|
|
}
|
|
|
|
result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
|
|
return (result);
|
|
|
|
#elif defined(USE_DEVPOLL)
|
|
thread->nevents = ISC_SOCKET_MAXEVENTS;
|
|
result = isc_resource_getcurlimit(isc_resource_openfiles,
|
|
&thread->open_max);
|
|
if (result != ISC_R_SUCCESS) {
|
|
thread->open_max = 64;
|
|
}
|
|
thread->calls = 0;
|
|
thread->events = isc_mem_get(thread->manager->mctx,
|
|
sizeof(struct pollfd) * thread->nevents);
|
|
|
|
/*
|
|
* Note: fdpollinfo should be able to support all possible FDs, so
|
|
* it must have maxsocks entries (not nevents).
|
|
*/
|
|
thread->fdpollinfo =
|
|
isc_mem_get(thread->manager->mctx,
|
|
sizeof(pollinfo_t) * thread->manager->maxsocks);
|
|
memset(thread->fdpollinfo, 0,
|
|
sizeof(pollinfo_t) * thread->manager->maxsocks);
|
|
thread->devpoll_fd = open("/dev/poll", O_RDWR);
|
|
if (thread->devpoll_fd == -1) {
|
|
result = isc__errno2result(errno);
|
|
strerror_r(errno, strbuf, sizeof(strbuf));
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__,
|
|
"open(/dev/poll) failed: %s", strbuf);
|
|
isc_mem_put(thread->manager->mctx, thread->events,
|
|
sizeof(struct pollfd) * thread->nevents);
|
|
isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
|
|
sizeof(pollinfo_t) * thread->manager->maxsocks);
|
|
return (result);
|
|
}
|
|
result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
|
|
if (result != ISC_R_SUCCESS) {
|
|
close(thread->devpoll_fd);
|
|
isc_mem_put(thread->manager->mctx, thread->events,
|
|
sizeof(struct pollfd) * thread->nevents);
|
|
isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
|
|
sizeof(pollinfo_t) * thread->manager->maxsocks);
|
|
return (result);
|
|
}
|
|
|
|
return (ISC_R_SUCCESS);
|
|
#elif defined(USE_SELECT)
|
|
UNUSED(result);
|
|
|
|
#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
|
|
/*
|
|
* Note: this code should also cover the case of MAXSOCKETS <=
|
|
* FD_SETSIZE, but we separate the cases to avoid possible portability
|
|
* issues regarding howmany() and the actual representation of fd_set.
|
|
*/
|
|
thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
|
|
sizeof(fd_mask);
|
|
#else /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
|
|
thread->fd_bufsize = sizeof(fd_set);
|
|
#endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
|
|
|
|
thread->read_fds = isc_mem_get(thread->manager->mctx,
|
|
thread->fd_bufsize);
|
|
thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
|
|
thread->fd_bufsize);
|
|
thread->write_fds = isc_mem_get(thread->manager->mctx,
|
|
thread->fd_bufsize);
|
|
thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
|
|
thread->fd_bufsize);
|
|
memset(thread->read_fds, 0, thread->fd_bufsize);
|
|
memset(thread->write_fds, 0, thread->fd_bufsize);
|
|
|
|
(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
|
|
thread->maxfd = thread->pipe_fds[0];
|
|
|
|
return (ISC_R_SUCCESS);
|
|
#endif /* USE_KQUEUE */
|
|
}
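
/*
 * Worked example for the select() sizing above: with maxsocks = 4096
 * and 64-bit fd_mask words (NFDBITS == 64), fd_bufsize comes out to
 * howmany(4096, 64) * sizeof(fd_mask) = 64 * 8 = 512 bytes per fd set.
 */
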
static void
|
|
cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
|
|
isc_result_t result;
|
|
int i;
|
|
|
|
result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
|
|
if (result != ISC_R_SUCCESS) {
|
|
UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
|
|
}
|
|
#ifdef USE_KQUEUE
|
|
close(thread->kqueue_fd);
|
|
isc_mem_put(mctx, thread->events,
|
|
sizeof(struct kevent) * thread->nevents);
|
|
#elif defined(USE_EPOLL)
|
|
close(thread->epoll_fd);
|
|
|
|
isc_mem_put(mctx, thread->events,
|
|
sizeof(struct epoll_event) * thread->nevents);
|
|
#elif defined(USE_DEVPOLL)
|
|
close(thread->devpoll_fd);
|
|
isc_mem_put(mctx, thread->events,
|
|
sizeof(struct pollfd) * thread->nevents);
|
|
isc_mem_put(mctx, thread->fdpollinfo,
|
|
sizeof(pollinfo_t) * thread->manager->maxsocks);
|
|
#elif defined(USE_SELECT)
|
|
if (thread->read_fds != NULL) {
|
|
isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
|
|
}
|
|
if (thread->read_fds_copy != NULL) {
|
|
isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
|
|
}
|
|
if (thread->write_fds != NULL) {
|
|
isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
|
|
}
|
|
if (thread->write_fds_copy != NULL) {
|
|
isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
|
|
}
|
|
#endif /* USE_KQUEUE */
|
|
for (i = 0; i < (int)thread->manager->maxsocks; i++) {
|
|
if (thread->fdstate[i] == CLOSE_PENDING) {
|
|
/* no need to lock */
|
|
(void)close(i);
|
|
}
|
|
}
|
|
|
|
#if defined(USE_EPOLL)
|
|
isc_mem_put(thread->manager->mctx, thread->epoll_events,
|
|
thread->manager->maxsocks * sizeof(uint32_t));
|
|
#endif /* if defined(USE_EPOLL) */
|
|
isc_mem_put(thread->manager->mctx, thread->fds,
|
|
thread->manager->maxsocks * sizeof(isc__socket_t *));
|
|
isc_mem_put(thread->manager->mctx, thread->fdstate,
|
|
thread->manager->maxsocks * sizeof(int));
|
|
|
|
for (i = 0; i < FDLOCK_COUNT; i++) {
|
|
isc_mutex_destroy(&thread->fdlock[i]);
|
|
}
|
|
isc_mem_put(thread->manager->mctx, thread->fdlock,
|
|
FDLOCK_COUNT * sizeof(isc_mutex_t));
|
|
}
|
|
|
|
isc_result_t
|
|
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
|
|
return (isc_socketmgr_create2(mctx, managerp, 0, 1));
|
|
}
|
|
|
|
isc_result_t
|
|
isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
|
|
unsigned int maxsocks, int nthreads) {
|
|
int i;
|
|
isc__socketmgr_t *manager;
|
|
|
|
REQUIRE(managerp != NULL && *managerp == NULL);
|
|
|
|
if (maxsocks == 0) {
|
|
maxsocks = ISC_SOCKET_MAXSOCKETS;
|
|
}
|
|
|
|
manager = isc_mem_get(mctx, sizeof(*manager));
|
|
|
|
/* zero-clear so that necessary cleanup on failure will be easy */
|
|
memset(manager, 0, sizeof(*manager));
|
|
manager->maxsocks = maxsocks;
|
|
manager->reserved = 0;
|
|
manager->maxudp = 0;
|
|
manager->nthreads = nthreads;
|
|
manager->stats = NULL;
|
|
|
|
manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
|
|
manager->common.impmagic = SOCKET_MANAGER_MAGIC;
|
|
manager->mctx = NULL;
|
|
ISC_LIST_INIT(manager->socklist);
|
|
isc_mutex_init(&manager->lock);
|
|
isc_condition_init(&manager->shutdown_ok);
|
|
|
|
/*
|
|
* Start up the select/poll thread.
|
|
*/
|
|
manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
|
|
manager->nthreads);
|
|
isc_mem_attach(mctx, &manager->mctx);
|
|
|
|
for (i = 0; i < manager->nthreads; i++) {
|
|
manager->threads[i].manager = manager;
|
|
manager->threads[i].threadid = i;
|
|
setup_thread(&manager->threads[i]);
|
|
isc_thread_create(netthread, &manager->threads[i],
|
|
&manager->threads[i].thread);
|
|
char tname[1024];
|
|
snprintf(tname, sizeof(tname), "isc-socket-%d", i);
|
|
isc_thread_setname(manager->threads[i].thread, tname);
|
|
}
|
|
|
|
*managerp = (isc_socketmgr_t *)manager;
|
|
|
|
return (ISC_R_SUCCESS);
|
|
}
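
/*
 * Minimal manager lifecycle using the functions above; a usage sketch
 * with error handling elided ('mctx' is assumed to be a valid memory
 * context):
 */
#if 0
isc_socketmgr_t *mgr = NULL;
isc_socket_t *udp = NULL;

RUNTIME_CHECK(isc_socketmgr_create2(mctx, &mgr, 0, 4) == ISC_R_SUCCESS);
RUNTIME_CHECK(isc_socket_create(mgr, AF_INET, isc_sockettype_udp,
				&udp) == ISC_R_SUCCESS);
/* ... bind and perform task-based I/O ... */
isc_socket_detach(&udp);
isc_socketmgr_destroy(&mgr);
#endif
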
isc_result_t
|
|
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
|
|
isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
|
|
REQUIRE(VALID_MANAGER(manager));
|
|
REQUIRE(nsockp != NULL);
|
|
|
|
*nsockp = manager->maxsocks;
|
|
|
|
return (ISC_R_SUCCESS);
|
|
}
|
|
|
|
void
|
|
isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
|
|
isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
|
|
|
|
REQUIRE(VALID_MANAGER(manager));
|
|
REQUIRE(ISC_LIST_EMPTY(manager->socklist));
|
|
REQUIRE(manager->stats == NULL);
|
|
REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
|
|
|
|
isc_stats_attach(stats, &manager->stats);
|
|
}
|
|
|
|
void
|
|
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
|
|
isc__socketmgr_t *manager;
|
|
|
|
/*
|
|
* Destroy a socket manager.
|
|
*/
|
|
|
|
REQUIRE(managerp != NULL);
|
|
manager = (isc__socketmgr_t *)*managerp;
|
|
REQUIRE(VALID_MANAGER(manager));
|
|
|
|
LOCK(&manager->lock);
|
|
|
|
/*
|
|
* Wait for all sockets to be destroyed.
|
|
*/
|
|
while (!ISC_LIST_EMPTY(manager->socklist)) {
|
|
manager_log(manager, CREATION, "sockets exist");
|
|
WAIT(&manager->shutdown_ok, &manager->lock);
|
|
}
|
|
|
|
UNLOCK(&manager->lock);
|
|
|
|
/*
 * Poke all the watcher threads with a shutdown message so that
 * they drop out of their select/poll loops.
 */
|
|
for (int i = 0; i < manager->nthreads; i++) {
|
|
select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
|
|
}
|
|
|
|
/*
 * Wait for each watcher thread to exit and clean it up.
 */
|
|
for (int i = 0; i < manager->nthreads; i++) {
|
|
isc_thread_join(manager->threads[i].thread, NULL);
|
|
cleanup_thread(manager->mctx, &manager->threads[i]);
|
|
}
|
|
/*
|
|
* Clean up.
|
|
*/
|
|
isc_mem_put(manager->mctx, manager->threads,
|
|
sizeof(isc__socketthread_t) * manager->nthreads);
|
|
(void)isc_condition_destroy(&manager->shutdown_ok);
|
|
|
|
if (manager->stats != NULL) {
|
|
isc_stats_detach(&manager->stats);
|
|
}
|
|
isc_mutex_destroy(&manager->lock);
|
|
manager->common.magic = 0;
|
|
manager->common.impmagic = 0;
|
|
isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
|
|
|
|
*managerp = NULL;
|
|
}

static isc_result_t
socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task.
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}

isc_result_t
isc_socket_recv(isc_socket_t *sock0, isc_region_t *region, unsigned int minimum,
		isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socketevent_t *dev;
	isc__socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
				   action, arg);
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}

	return (isc_socket_recv2(sock0, region, minimum, task, dev, 0));
}

isc_result_t
isc_socket_recv2(isc_socket_t *sock0, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
		 isc_socketevent_t *event, unsigned int flags) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	event->ev_sender = sock;
	event->result = ISC_R_UNSET;
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes = 0;

	/*
	 * UDP sockets are always partial read.
	 */
	if (sock->type == isc_sockettype_udp) {
		event->minimum = 1;
	} else {
		if (minimum == 0) {
			event->minimum = region->length;
		} else {
			event->minimum = minimum;
		}
	}

	return (socket_recv(sock, event, task, flags));
}
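
/*
 * A hedged sketch of the receive pattern served by socket_recv() above
 * (illustrative only; 'sock', 'task' and the buffer are assumed to
 * exist, and 'process' is a placeholder for the caller's handler):
 *
 *	static void
 *	recv_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socketevent_t *sev = (isc_socketevent_t *)event;
 *		UNUSED(task);
 *		if (sev->result == ISC_R_SUCCESS) {
 *			process(sev->region.base, sev->n);
 *		}
 *		isc_event_free(&event);
 *	}
 *
 *	unsigned char buf[4096];
 *	isc_region_t region = { .base = buf, .length = sizeof(buf) };
 *	isc_result_t result;
 *
 *	result = isc_socket_recv(sock, &region, 1, task, recv_done, NULL);
 *
 * For UDP the minimum is forced to 1 by isc_socket_recv2() regardless of
 * the argument, since datagrams are always a partial read.
 */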

static isc_result_t
socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}

isc_result_t
isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
		isc_taskaction_t action, void *arg) {
	/*
	 * REQUIRE() checking is performed in isc_socket_sendto().
	 */
	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
}

isc_result_t
isc_socket_sendto(isc_socket_t *sock0, isc_region_t *region, isc_task_t *task,
		  isc_taskaction_t action, void *arg,
		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socketevent_t *dev;
	isc__socketmgr_t *manager;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(region != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	INSIST(sock->bound);

	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
				   action, arg);
	if (dev == NULL) {
		return (ISC_R_NOMEMORY);
	}

	dev->region = *region;

	return (socket_send(sock, dev, task, address, pktinfo, 0));
}
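
/*
 * Sketch of a UDP send using the queueing path above (not compiled in;
 * 'sock', 'task', 'msg', 'msglen' and 'dest' are assumed, and send_done
 * is a caller callback analogous to recv_done):
 *
 *	isc_region_t region = { .base = msg, .length = msglen };
 *	isc_result_t result;
 *
 *	result = isc_socket_sendto(sock, &region, task, send_done, NULL,
 *				   &dest, NULL);
 *
 * A NULL address falls back on the socket's connected peer address in
 * set_dev_address(); the pktinfo argument is only meaningful for IPv6.
 */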

isc_result_t
isc_socket_sendto2(isc_socket_t *sock0, isc_region_t *region, isc_task_t *task,
		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		   isc_socketevent_t *event, unsigned int flags) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
		0);
	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
		REQUIRE(sock->type == isc_sockettype_udp);
	}
	event->ev_sender = sock;
	event->result = ISC_R_UNSET;
	event->region = *region;
	event->n = 0;
	event->offset = 0;
	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;

	return (socket_send(sock, event, task, address, pktinfo, flags));
}

void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
#ifndef _WIN32
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			if (active) { /* We exited cleanly last time */
				break;
			}
			/* FALLTHROUGH */
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
#else /* ifndef _WIN32 */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifndef _WIN32 */
}
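
/*
 * Typical use, as a sketch ('addr' holds an AF_UNIX isc_sockaddr_t for a
 * control-channel path): before binding, probe for and remove a stale
 * socket file left behind by a crashed process, and after shutdown
 * remove our own file:
 *
 *	isc_socket_cleanunix(&addr, false);	// startup: unlink only if dead
 *	...
 *	isc_socket_cleanunix(&addr, true);	// shutdown: unlink our socket
 *
 * With active == false the code above connect()s first and only unlinks
 * on ECONNREFUSED/ECONNRESET, so a live socket is left alone.
 */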

isc_result_t
isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
		    uint32_t owner, uint32_t group) {
#ifndef _WIN32
	isc_result_t result = ISC_R_SUCCESS;
	char strbuf[ISC_STRERRORSIZE];
	char path[sizeof(sockaddr->type.sunix.sun_path)];
#ifdef NEED_SECURE_DIRECTORY
	char *slash;
#endif /* ifdef NEED_SECURE_DIRECTORY */

	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));

#ifdef NEED_SECURE_DIRECTORY
	slash = strrchr(path, '/');
	if (slash != NULL) {
		if (slash != path) {
			*slash = '\0';
		} else {
			strlcpy(path, "/", sizeof(path));
		}
	} else {
		strlcpy(path, ".", sizeof(path));
	}
#endif /* ifdef NEED_SECURE_DIRECTORY */

	if (chmod(path, perm) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chmod(%s, %d): %s", path,
			      perm, strbuf);
		result = ISC_R_FAILURE;
	}
	if (chown(path, owner, group) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "isc_socket_permunix: chown(%s, %d, %d): %s",
			      path, owner, group, strbuf);
		result = ISC_R_FAILURE;
	}
	return (result);
#else /* ifndef _WIN32 */
	UNUSED(sockaddr);
	UNUSED(perm);
	UNUSED(owner);
	UNUSED(group);
	return (ISC_R_NOTIMPLEMENTED);
#endif /* ifndef _WIN32 */
}
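
/*
 * Sketch (not compiled in): tighten a freshly bound control socket so
 * that only the intended user and group may connect; the 0/0 owner and
 * group here are merely illustrative uid/gid values:
 *
 *	result = isc_socket_permunix(&addr, 0640, 0, 0);
 *
 * On platforms with NEED_SECURE_DIRECTORY the mode is applied to the
 * containing directory rather than the socket file itself, as the
 * strrchr() handling above shows.
 */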

isc_result_t
isc_socket_bind(isc_socket_t *sock0, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);
	INSIST(!sock->dupped);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif /* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
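
/*
 * A bind sketch (illustrative; 'socketmgr' setup is assumed and port
 * 5300 is an arbitrary example).  ISC_SOCKET_REUSEADDRESS only takes
 * effect for a non-zero port, per the check above:
 *
 *	isc_socket_t *sock = NULL;
 *	isc_sockaddr_t local;
 *	struct in_addr ina;
 *
 *	ina.s_addr = INADDR_ANY;
 *	isc_sockaddr_fromin(&local, &ina, 5300);
 *	RUNTIME_CHECK(isc_socket_create(socketmgr, AF_INET,
 *					isc_sockettype_udp, &sock) ==
 *		      ISC_R_SUCCESS);
 *	RUNTIME_CHECK(isc_socket_bind(sock, &local,
 *				      ISC_SOCKET_REUSEADDRESS) ==
 *		      ISC_R_SUCCESS);
 */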

/*
 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
 */
#undef ENABLE_ACCEPTFILTER

isc_result_t
isc_socket_filter(isc_socket_t *sock0, const char *filter) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}

/*
 * Try enabling TCP Fast Open for a given socket if the OS supports it.
 */
static void
set_tcp_fastopen(isc__socket_t *sock, unsigned int backlog) {
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

	/*
	 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while
	 * also shipping a default kernel without TFO support, so we
	 * special-case it by performing an additional runtime check for TFO
	 * support using sysctl to prevent setsockopt() errors from being
	 * logged.
	 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

#ifdef __APPLE__
	backlog = 1;
#else /* ifdef __APPLE__ */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}

/*
 * Set up to listen on a given socket.  We do this by creating an internal
 * event that will be dispatched when the socket has read activity.  The
 * watcher will send the internal event to the task when there is a new
 * connection.
 *
 * Unlike in read, we don't preallocate a done event here.  Every time there
 * is a new connection we'll have to allocate a new one anyway, so we might
 * as well keep things simple rather than having to track them.
 */
isc_result_t
isc_socket_listen(isc_socket_t *sock0, unsigned int backlog) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(!sock->listener);
	REQUIRE(sock->bound);
	REQUIRE(sock->type == isc_sockettype_tcp ||
		sock->type == isc_sockettype_unix);

	if (backlog == 0) {
		backlog = SOMAXCONN;
	}

	if (listen(sock->fd, (int)backlog) < 0) {
		UNLOCK(&sock->lock);
		strerror_r(errno, strbuf, sizeof(strbuf));

		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);

		return (ISC_R_UNEXPECTED);
	}

	set_tcp_fastopen(sock, backlog);

	sock->listener = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
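
/*
 * Listen sketch (illustrative; 'sock' is a bound TCP socket such as the
 * one from the bind example above):
 *
 *	RUNTIME_CHECK(isc_socket_listen(sock, 0) == ISC_R_SUCCESS);
 *
 * A backlog of 0 is replaced with SOMAXCONN above, and TCP Fast Open is
 * then enabled opportunistically via set_tcp_fastopen().
 */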

/*
 * This should try to do aggressive accept() XXXMLG
 */
isc_result_t
isc_socket_accept(isc_socket_t *sock0, isc_task_t *task,
		  isc_taskaction_t action, void *arg) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_newconnev_t *dev;
	isc__socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc__socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = (isc_socket_t *)nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time that waking it up now or later won't matter all
	 * that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
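
/*
 * Accept sketch (illustrative; accept_done is a caller callback and
 * 'start_reading' is a placeholder for the caller's next step).  The
 * new socket arrives in the ISC_SOCKEVENT_NEWCONN event:
 *
 *	static void
 *	accept_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socket_newconnev_t *nev =
 *			(isc_socket_newconnev_t *)event;
 *		UNUSED(task);
 *		if (nev->result == ISC_R_SUCCESS) {
 *			start_reading(nev->newsocket);
 *		}
 *		isc_event_free(&event);
 *	}
 *
 *	RUNTIME_CHECK(isc_socket_accept(sock, task, accept_done, NULL) ==
 *		      ISC_R_SUCCESS);
 *
 * On failure the event's 'result' field explains why; 'newsocket' is
 * only usable on success.
 */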

isc_result_t
isc_socket_connect(isc_socket_t *sock0, const isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc__socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	if (isc_sockaddr_ismulticast(addr)) {
		return (ISC_R_MULTICAST);
	}

	LOCK(&sock->lock);

	dev = (isc_socket_connev_t *)isc_event_allocate(
		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	if (sock->connecting) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		goto queue;
	}

	if (sock->connected) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		return (ISC_R_SUCCESS);
	}

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * The socket is nonblocking and the connection cannot be
		 * completed immediately.  It is possible to select(2) or
		 * poll(2) for completion by selecting the socket for writing.
		 * After select(2) indicates writability, use getsockopt(2) to
		 * read the SO_ERROR option at level SOL_SOCKET to determine
		 * whether connect() completed successfully (SO_ERROR is zero)
		 * or unsuccessfully (SO_ERROR is one of the usual error codes
		 * listed here, explaining the reason for the failure).
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			goto queue;
		}

		switch (errno) {
#define ERROR_MATCH(a, b)        \
	case a:                  \
		dev->result = b; \
		goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		sock->connected = 0;

		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		sock->connected = 0;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time that waking it up now or later won't matter all
	 * that much.
	 */
	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
	if (do_poke && !sock->connecting) {
		sock->connecting = 1;
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_CONNECT);
	}

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
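
/*
 * Connect sketch (illustrative; connect_done is a caller callback that
 * receives an ISC_SOCKEVENT_CONNECT event whose 'result' field carries
 * one of the ISC_R_* translations made above):
 *
 *	RUNTIME_CHECK(isc_socket_connect(sock, &peer, task, connect_done,
 *					 NULL) == ISC_R_SUCCESS);
 *
 * Note that even synchronous failures such as ECONNREFUSED are reported
 * through the event (the err_exit path above), not the return value.
 */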

/*
 * Called when a socket with a pending connect() finishes.
 */
static void
internal_connect(isc__socket_t *sock) {
	isc_socket_connev_t *dev;
	int cc;
	isc_result_t result;
	socklen_t optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	/*
	 * Get the first item off the connect list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->connect_list);
	if (dev == NULL) {
		INSIST(!sock->connecting);
		goto finish;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
		       (void *)&optlen) != 0)
	{
		cc = errno;
	} else {
		errno = cc;
	}

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b)    \
	case a:              \
		result = b;  \
		break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	do {
		dev->result = result;
		send_connectdone_event(sock, &dev);
		dev = ISC_LIST_HEAD(sock->connect_list);
	} while (dev != NULL);

finish:
	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
		   SELECT_POKE_CONNECT);
}

isc_result_t
isc_socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	isc_result_t result;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (sock->connected) {
		*addressp = sock->peer_address;
		result = ISC_R_SUCCESS;
	} else {
		result = ISC_R_NOTCONNECTED;
	}

	UNLOCK(&sock->lock);

	return (result);
}

isc_result_t
isc_socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
	socklen_t len;
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addressp != NULL);

	LOCK(&sock->lock);

	if (!sock->bound) {
		result = ISC_R_NOTBOUND;
		goto out;
	}

	result = ISC_R_SUCCESS;

	len = sizeof(addressp->type);
	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
		result = ISC_R_UNEXPECTED;
		goto out;
	}
	addressp->length = (unsigned int)len;

out:
	UNLOCK(&sock->lock);

	return (result);
}

/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 */
void
isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list)) {
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list)) {
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				isc_refcount_decrementz(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc__socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
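
/*
 * Cancellation sketch: on shutdown, flush every pending event on the
 * socket for every task; each queued event is delivered with its result
 * set to ISC_R_CANCELED, as implemented above:
 *
 *	isc_socket_cancel(sock, NULL, ISC_SOCKCANCEL_ALL);
 *
 * Passing a specific task instead of NULL cancels only that task's
 * requests, and the 'how' bitmask can restrict this to one event type.
 */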

isc_sockettype_t
isc_socket_gettype(isc_socket_t *sock0) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));

	return (sock->type);
}

void
isc_socket_ipv6only(isc_socket_t *sock0, bool yes) {
	isc__socket_t *sock = (isc__socket_t *)sock0;
#if defined(IPV6_V6ONLY)
	int onoff = yes ? 1 : 0;
#else /* if defined(IPV6_V6ONLY) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) */

	REQUIRE(VALID_SOCKET(sock));
	INSIST(!sock->dupped);

#ifdef IPV6_V6ONLY
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* ifdef IPV6_V6ONLY */
}

static void
setdscp(isc__socket_t *sock, isc_dscp_t dscp) {
#if defined(IP_TOS) || defined(IPV6_TCLASS)
	int value = dscp << 2;
#endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */

	sock->dscp = dscp;

#ifdef IP_TOS
	if (sock->pf == AF_INET) {
		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
			       sizeof(value)) < 0) {
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IP_TOS, %.02x) "
					 "failed: %s",
					 sock->fd, value >> 2, strbuf);
		}
	}
#endif /* ifdef IP_TOS */
#ifdef IPV6_TCLASS
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
			       (void *)&value, sizeof(value)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
					 "failed: %s",
					 sock->fd, value >> 2, strbuf);
		}
	}
#endif /* ifdef IPV6_TCLASS */
}

void
isc_socket_dscp(isc_socket_t *sock0, isc_dscp_t dscp) {
	isc__socket_t *sock = (isc__socket_t *)sock0;

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40);

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}
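
/*
 * DSCP sketch: mark outgoing datagrams as Expedited Forwarding (DSCP 46,
 * an illustrative value well below the REQUIRE()d 0x40 limit):
 *
 *	isc_socket_dscp(sock, 46);
 *
 * The value lands in the upper six bits of IP_TOS/IPV6_TCLASS, hence the
 * 'dscp << 2' in setdscp() above.
 */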

isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
		       isc_taskaction_t action, void *arg) {
	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
}

void
isc_socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
	isc__socket_t *sock = (isc__socket_t *)socket0;

	/*
	 * Name 'sock'.
	 */

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);
	strlcpy(sock->name, name, sizeof(sock->name));
	sock->tag = tag;
	UNLOCK(&sock->lock);
}

const char *
isc_socket_getname(isc_socket_t *socket0) {
	isc__socket_t *sock = (isc__socket_t *)socket0;

	return (sock->name);
}

void *
isc_socket_gettag(isc_socket_t *socket0) {
	isc__socket_t *sock = (isc__socket_t *)socket0;

	return (sock->tag);
}

int
isc_socket_getfd(isc_socket_t *socket0) {
	isc__socket_t *sock = (isc__socket_t *)socket0;

	return (sock->fd);
}

static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
static bool hasreuseport = false;

static void
init_hasreuseport(void) {
	/*
	 * SO_REUSEPORT works very differently on *BSD and on Linux (because
	 * why not).  We only want to use it on Linux, if it's available.
	 * On BSD we want to dup() sockets instead of re-binding them.
	 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
	int sock, yes = 1;
	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0) {
		sock = socket(AF_INET6, SOCK_DGRAM, 0);
		if (sock < 0) {
			return;
		}
	}
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) < 0) {
		close(sock);
		return;
#if defined(__FreeBSD_kernel__)
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
			      sizeof(yes)) < 0)
#else /* if defined(__FreeBSD_kernel__) */
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
			      sizeof(yes)) < 0)
#endif /* if defined(__FreeBSD_kernel__) */
	{
		close(sock);
		return;
	}
	hasreuseport = true;
	close(sock);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}

bool
isc_socket_hasreuseport(void) {
	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
		      ISC_R_SUCCESS);
	return (hasreuseport);
}
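
/*
 * Sketch of how a caller can branch on the probe above when spreading
 * UDP listeners across threads (illustrative only):
 *
 *	if (isc_socket_hasreuseport()) {
 *		// bind one socket per thread; the kernel load-balances
 *	} else {
 *		// bind once and isc_socket_dup() for the other threads
 *	}
 *
 * isc_socket_dup() is the BSD-friendly fallback mentioned in the comment
 * inside init_hasreuseport().
 */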

#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
static const char *
_socktype(isc_sockettype_t type) {
	switch (type) {
	case isc_sockettype_udp:
		return ("udp");
	case isc_sockettype_tcp:
		return ("tcp");
	case isc_sockettype_unix:
		return ("unix");
	default:
		return ("not-initialized");
	}
}
#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */

#ifdef HAVE_LIBXML2
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, void *writer0) {
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */

#ifdef HAVE_JSON_C
#define CHECKMEM(m)                              \
	do {                                     \
		if (m == NULL) {                 \
			result = ISC_R_NOMEMORY; \
			goto error;              \
		}                                \
	} while (0)

isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t *mgr0, void *stats0) {
	isc_result_t result = ISC_R_SUCCESS;
	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
	isc__socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	json_object *obj, *array = json_object_new_array();
	json_object *stats = (json_object *)stats0;

	/*
	 * Return directly rather than taking the error path: the manager
	 * lock is not held yet, and the error path unlocks it.
	 */
	if (array == NULL) {
		return (ISC_R_NOMEMORY);
	}

	LOCK(&mgr->lock);

	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		json_object *states, *entry = json_object_new_object();
		char buf[255];

		/*
		 * Take the socket lock before the first CHECKMEM() so that
		 * the error path's UNLOCK(&sock->lock) is always balanced.
		 */
		LOCK(&sock->lock);

		CHECKMEM(entry);
		json_object_array_add(array, entry);

		snprintf(buf, sizeof(buf), "%p", sock);
		obj = json_object_new_string(buf);
		CHECKMEM(obj);
		json_object_object_add(entry, "id", obj);

		if (sock->name[0] != 0) {
			obj = json_object_new_string(sock->name);
			CHECKMEM(obj);
			json_object_object_add(entry, "name", obj);
		}

		obj = json_object_new_int(
			(int)isc_refcount_current(&sock->references));
		CHECKMEM(obj);
		json_object_object_add(entry, "references", obj);

		obj = json_object_new_string(_socktype(sock->type));
		CHECKMEM(obj);
		json_object_object_add(entry, "type", obj);

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "peer-address", obj);
		}

		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			obj = json_object_new_string(peerbuf);
			CHECKMEM(obj);
			json_object_object_add(entry, "local-address", obj);
		}

		states = json_object_new_array();
		CHECKMEM(states);
		json_object_object_add(entry, "states", states);

		if (sock->listener) {
			obj = json_object_new_string("listener");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connected) {
			obj = json_object_new_string("connected");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->connecting) {
			obj = json_object_new_string("connecting");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		if (sock->bound) {
			obj = json_object_new_string("bound");
			CHECKMEM(obj);
			json_object_array_add(states, obj);
		}

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}

	json_object_object_add(stats, "sockets", array);
	array = NULL;
	result = ISC_R_SUCCESS;

error:
	if (array != NULL) {
		json_object_put(array);
	}

	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (result);
}
#endif /* HAVE_JSON_C */

isc_result_t
isc_socketmgr_createinctx(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	isc_result_t result;

	result = isc_socketmgr_create(mctx, managerp);

	return (result);
}