diff --git a/postfix/.indent.pro b/postfix/.indent.pro index bc5fa9c63..ea48c48d4 100644 --- a/postfix/.indent.pro +++ b/postfix/.indent.pro @@ -94,6 +94,7 @@ -TDSN_BUF -TDSN_SPLIT -TDSN_STAT +-TEVENT_MASK -TEXPAND_ATTR -TFILE -TFORWARD_INFO diff --git a/postfix/HISTORY b/postfix/HISTORY index 9557aef4a..613550be2 100644 --- a/postfix/HISTORY +++ b/postfix/HISTORY @@ -13179,6 +13179,55 @@ Apologies for any names omitted. more quickly release unused file handles. Files: global/mail_params.h, proto/postconf.5.html +20070202 + + Catch-up: FreeBSD kqueue support. File: util/events.c. + +20070205 + + System-V poll(2) support. This is now the preferred method + to test a single file descriptor on sufficiently recent + versions of FreeBSD, NetBSD, OpenBSD, Solaris and Linux; + other systems will be added as evidence becomes available + of usable poll(2) implementations. Files: util/read_wait.c, + util/write_wait.c, util/readable.c, util/writable.c. + + Streamlined the event_enable_read/write implementation to + speed up smtp-source performance, by eliminating expensive + kqueue/devpoll/epoll system calls when only the application + call-back information changes. On FreeBSD, smtp-sink/source + tests now run 5% faster than with the old select(2) based + implementation. File: util/events.c. + +20070206 + + Catch-up: Solaris /dev/poll support. File: util/events.c. + + Bugfix (introduced 20060823): initial state was not in state + machine, causing memory access outside the lookup table. + File: smtpstone/smtp-sink.c. + +20070210 + + Catch-up: Linux epoll support. File: util/events.c. + +20070211 + + Polished the kqueue/devpoll/epoll support; this is now + enabled by default on sufficiently recent versions of + FreeBSD, NetBSD, OpenBSD, Solaris and Linux; other systems + will be added as evidence becomes available of usable + implementations. File: util/events.c. 
+ +20070212 + + Further polish: removed some typos from new code in the + events.c handler, undid some unnecessary changes to the + {read,write}{_wait,able}.c modules, and addressed Victor's + paranoia for multi-client servers with a thousand clients + while linked with third-party libraries that can't handle + file descriptors >= FD_SETSIZE. + Wish list: Update message content length when adding/removing headers. @@ -13199,10 +13248,6 @@ Wish list: Make postmap header/body aware so people can test multi-line header checks. - Eliminate Linux 1024 select() file handle bottleneck and - eliminate select()/poll() scaling problems by implementing - kqueue(2) and epoll(2) support. - REDIRECT should override original recipient info, and probably override DSN as well. diff --git a/postfix/RELEASE_NOTES b/postfix/RELEASE_NOTES index 31d0925a8..8eecb2154 100644 --- a/postfix/RELEASE_NOTES +++ b/postfix/RELEASE_NOTES @@ -17,6 +17,18 @@ Incompatibility with Postfix 2.2 and earlier If you upgrade from Postfix 2.2 or earlier, read RELEASE_NOTES-2.3 before proceeding. +Major changes with Postfix snapshot 20070212-event +================================================== + +Better support for systems that run thousands of Postfix processes. +Postfix now supports FreeBSD kqueue(2), Solaris poll(7d) and Linux +epoll(4) as more scalable alternatives to the traditional select(2) +system call, and uses poll(2) when examining a single file descriptor +for readability or writability. These features are supported on +sufficiently recent versions of FreeBSD, NetBSD, OpenBSD, Solaris +and Linux; support for other systems will be added as evidence +becomes available that usable implementations exist. + Incompatibility with Postfix snapshot 20070201 ============================================== diff --git a/postfix/makedefs b/postfix/makedefs index 724bea453..4e80525dc 100644 --- a/postfix/makedefs +++ b/postfix/makedefs @@ -25,6 +25,14 @@ # \fIinclude\fR directory. 
# The following directives are special: # .RS +# .IP \fB-DNO_DEVPOLL\fR +# Do not build with Solaris /dev/poll support. +# By default, /dev/poll support is compiled in on platforms that +# are known to support it. +# .IP \fB-DNO_EPOLL\fR +# Do not build with Linux EPOLL support. +# By default, EPOLL support is compiled in on platforms that +# are known to support it. # .IP \fB-DNO_IPV6\fR # Do not build with IPv6 support. # By default, IPv6 support is compiled in on platforms that @@ -158,9 +166,9 @@ case "$SYSTEM.$RELEASE" in case $RELEASE in 5.[0-4]) CCARGS="$CCARGS -DMISSING_USLEEP -DNO_POSIX_REGEXP";; esac - # Solaris 8 added IPv6 + # Solaris 8 added IPv6 and /dev/poll case $RELEASE in - 5.[0-7]|5.[0-7].*) CCARGS="$CCARGS -DNO_IPV6";; + 5.[0-7]|5.[0-7].*) CCARGS="$CCARGS -DNO_IPV6 -DNO_DEVPOLL";; esac # Solaris 9 added closefrom(), futimesat() and /dev/*random case $RELEASE in @@ -262,9 +270,14 @@ case "$SYSTEM.$RELEASE" in } done done + # Kernel 2.4 added IPv6 case "$RELEASE" in 2.[0-3].*) CCARGS="$CCARGS -DNO_IPV6";; esac + # Kernel 2.6 added EPOLL + case "$RELEASE" in + 2.[0-5].*) CCARGS="$CCARGS -DNO_EPOLL";; + esac ;; GNU.0*|GNU/kFreeBSD.[56]*) SYSTYPE=GNU0 diff --git a/postfix/src/global/mail_version.h b/postfix/src/global/mail_version.h index 20e12eb06..8742a07dd 100644 --- a/postfix/src/global/mail_version.h +++ b/postfix/src/global/mail_version.h @@ -20,7 +20,7 @@ * Patches change both the patchlevel and the release date. Snapshots have no * patchlevel; they change the release date only. */ -#define MAIL_RELEASE_DATE "20070202" +#define MAIL_RELEASE_DATE "20070212" #define MAIL_VERSION_NUMBER "2.4" #ifdef SNAPSHOT diff --git a/postfix/src/global/post_mail.c b/postfix/src/global/post_mail.c index 56a94654c..22deb4fdb 100644 --- a/postfix/src/global/post_mail.c +++ b/postfix/src/global/post_mail.c @@ -270,21 +270,6 @@ static void post_mail_open_event(int event, char *context) switch (event) { - /* - * Connection established. 
Request notification when the server sends - * the initial response. This intermediate case is necessary for some - * versions of LINUX and perhaps Solaris, where UNIX-domain - * connect(2) blocks until the server performs an accept(2). - */ - case EVENT_WRITE: - if (msg_verbose) - msg_info("%s: write event", myname); - event_disable_readwrite(vstream_fileno(state->stream)); - non_blocking(vstream_fileno(state->stream), BLOCKING); - event_enable_read(vstream_fileno(state->stream), - post_mail_open_event, (char *) state); - return; - /* * Initial server reply. Stop the watchdog timer, disable further * read events that end up calling this function, and notify the @@ -295,6 +280,7 @@ static void post_mail_open_event(int event, char *context) msg_info("%s: read event", myname); event_cancel_timer(post_mail_open_event, context); event_disable_readwrite(vstream_fileno(state->stream)); + non_blocking(vstream_fileno(state->stream), BLOCKING); post_mail_init(state->stream, state->sender, state->recipient, state->filter_class, state->trace_flags, state->queue_id); @@ -371,7 +357,7 @@ void post_mail_fopen_async(const char *sender, const char *recipient, * same interface as all successes. */ if (stream != 0) { - event_enable_write(vstream_fileno(stream), post_mail_open_event, + event_enable_read(vstream_fileno(stream), post_mail_open_event, (void *) state); event_request_timer(post_mail_open_event, (void *) state, var_daemon_timeout); diff --git a/postfix/src/master/multi_server.c b/postfix/src/master/multi_server.c index 1330b23e6..c54c8a19c 100644 --- a/postfix/src/master/multi_server.c +++ b/postfix/src/master/multi_server.c @@ -149,6 +149,7 @@ #include #include +#include /* select() */ #include #include #include @@ -163,6 +164,10 @@ #endif #include +#ifdef USE_SYS_SELECT_H +#include /* select() */ +#endif + /* Utility library. 
*/ #include @@ -328,6 +333,20 @@ static void multi_server_wakeup(int fd) VSTREAM *stream; char *tmp; +#ifndef USE_SELECT_EVENTS + int new_fd; + + /* + * Leave some handles < FD_SETSIZE for DBMS libraries, in the unlikely + * case of a multi-server with a thousand clients. + */ + if (fd < FD_SETSIZE / 8) { + if ((new_fd = fcntl(fd, F_DUPFD, FD_SETSIZE / 8)) < 0) + msg_fatal("fcntl F_DUPFD: %m"); + (void) close(fd); + fd = new_fd; + } +#endif if (msg_verbose) msg_info("connection established fd %d", fd); non_blocking(fd, BLOCKING); diff --git a/postfix/src/oqmgr/qmgr_transport.c b/postfix/src/oqmgr/qmgr_transport.c index 7fa50208c..7ff31c57f 100644 --- a/postfix/src/oqmgr/qmgr_transport.c +++ b/postfix/src/oqmgr/qmgr_transport.c @@ -236,9 +236,8 @@ static void qmgr_transport_event(int unused_event, char *context) event_cancel_timer(qmgr_transport_abort, context); /* - * Disable further read events that end up calling this function, turn - * off the Linux connect() workaround, and free up this pending - * connection pipeline slot. + * Disable further read events that end up calling this function, and + * free up this pending connection pipeline slot. */ if (alloc->stream) { event_disable_readwrite(vstream_fileno(alloc->stream)); diff --git a/postfix/src/qmgr/qmgr_transport.c b/postfix/src/qmgr/qmgr_transport.c index edcd9b384..de04bd192 100644 --- a/postfix/src/qmgr/qmgr_transport.c +++ b/postfix/src/qmgr/qmgr_transport.c @@ -241,9 +241,8 @@ static void qmgr_transport_event(int unused_event, char *context) event_cancel_timer(qmgr_transport_abort, context); /* - * Disable further read events that end up calling this function, turn - * off the Linux connect() workaround, and free up this pending - * connection pipeline slot. + * Disable further read events that end up calling this function, and + * free up this pending connection pipeline slot. 
*/ if (alloc->stream) { event_disable_readwrite(vstream_fileno(alloc->stream)); diff --git a/postfix/src/smtpstone/smtp-sink.c b/postfix/src/smtpstone/smtp-sink.c index feb8e13bb..b18278fa9 100644 --- a/postfix/src/smtpstone/smtp-sink.c +++ b/postfix/src/smtpstone/smtp-sink.c @@ -975,6 +975,7 @@ static int command_read(SINK_STATE *state) static struct cmd_trans cmd_trans[] = { ST_ANY, '\r', ST_CR, ST_CR, '\n', ST_CR_LF, + 0, 0, 0, }; struct cmd_trans *cp; char *ptr; @@ -986,6 +987,8 @@ static int command_read(SINK_STATE *state) #define NEXT_CHAR(state) \ (PUSH_BACK_PEEK(state) ? PUSH_BACK_GET(state) : VSTREAM_GETC(state->stream)) + if (state->data_state == ST_CR_LF) + state->data_state = ST_ANY; /* XXX */ for (;;) { if ((ch = NEXT_CHAR(state)) == VSTREAM_EOF) return (-1); @@ -1005,7 +1008,8 @@ static int command_read(SINK_STATE *state) * first state. */ for (cp = cmd_trans; cp->state != state->data_state; cp++) - /* void */ ; + if (cp->want == 0) + msg_panic("command_read: unknown state: %d", state->data_state); if (ch == cp->want) state->data_state = cp->next_state; else if (ch == cmd_trans[0].want) diff --git a/postfix/src/smtpstone/smtp-source.c b/postfix/src/smtpstone/smtp-source.c index 23f5e3b7e..9a736e14a 100644 --- a/postfix/src/smtpstone/smtp-source.c +++ b/postfix/src/smtpstone/smtp-source.c @@ -467,6 +467,7 @@ static void connect_done(int unused_event, char *context) fail_connect(session); } else { non_blocking(fd, BLOCKING); + /* Disable write events. */ event_disable_readwrite(fd); event_enable_read(fd, read_banner, (char *) session); dequeue_connect(session); @@ -520,7 +521,6 @@ static void send_helo(SESSION *session) /* * Prepare for the next event. */ - event_disable_readwrite(vstream_fileno(session->stream)); event_enable_read(vstream_fileno(session->stream), helo_done, (char *) session); } @@ -562,7 +562,6 @@ static void send_mail(SESSION *session) /* * Prepare for the next event. 
*/ - event_disable_readwrite(vstream_fileno(session->stream)); event_enable_read(vstream_fileno(session->stream), mail_done, (char *) session); } @@ -613,7 +612,6 @@ static void send_rcpt(int unused_event, char *context) /* * Prepare for the next event. */ - event_disable_readwrite(vstream_fileno(session->stream)); event_enable_read(vstream_fileno(session->stream), rcpt_done, (char *) session); } @@ -660,7 +658,6 @@ static void send_data(int unused_event, char *context) /* * Prepare for the next event. */ - event_disable_readwrite(vstream_fileno(session->stream)); event_enable_read(vstream_fileno(session->stream), data_done, (char *) session); } @@ -736,7 +733,6 @@ static void data_done(int unused_event, char *context) /* * Prepare for the next event. */ - event_disable_readwrite(vstream_fileno(session->stream)); event_enable_read(vstream_fileno(session->stream), dot_done, (char *) session); } @@ -775,7 +771,6 @@ static void dot_done(int unused_event, char *context) static void send_quit(SESSION *session) { command(session->stream, "QUIT"); - event_disable_readwrite(vstream_fileno(session->stream)); event_enable_read(vstream_fileno(session->stream), quit_done, (char *) session); } diff --git a/postfix/src/util/events.c b/postfix/src/util/events.c index c7b741dfa..e12f4bcda 100644 --- a/postfix/src/util/events.c +++ b/postfix/src/util/events.c @@ -77,7 +77,10 @@ /* partial reads or writes. /* An I/O channel cannot handle more than one request at the /* same time. The application is allowed to enable an event that -/* is already enabled (same channel, callback and context). +/* is already enabled (same channel, same read or write operation, +/* but perhaps a different callback or context). On systems with +/* kernel-based event filters this is preferred usage, because +/* each disable and enable request would cost a system call. /* /* The manifest constants EVENT_NULL_CONTEXT and EVENT_NULL_TYPE /* provide convenient null values. 
@@ -93,7 +96,7 @@ /* .IP EVENT_WRITE /* write event, /* .IP EVENT_XCPT -/* exception. +/* exception (actually, any event other than read or write). /* .RE /* .IP context /* Application context given to event_enable_read() (event_enable_write()). @@ -135,6 +138,7 @@ #include #include /* offsetof() */ #include /* bzero() prototype for 44BSD */ +#include /* INT_MAX */ #ifdef USE_SYS_SELECT_H #include @@ -148,9 +152,86 @@ #include "ring.h" #include "events.h" +#if (defined(USE_KQUEUE_EVENTS) && defined(USE_DEVPOLL_EVENTS)) \ + || (defined(USE_KQUEUE_EVENTS) && defined(USE_EPOLL_EVENTS)) \ + || (defined(USE_KQUEUE_EVENTS) && defined(USE_SELECT_EVENTS)) \ + || (defined(USE_DEVPOLL_EVENTS) && defined(USE_EPOLL_EVENTS)) \ + || (defined(USE_DEVPOLL_EVENTS) && defined(USE_SELECT_EVENTS)) \ + || (defined(USE_EPOLL_EVENTS) && defined(USE_SELECT_EVENTS)) +#error "don't define multiple USE_KQUEUE/DEVPOLL/EPOLL/SELECT_EVENTS" +#endif + +#if !defined(USE_KQUEUE_EVENTS) && !defined(USE_DEVPOLL_EVENTS) \ + && !defined(USE_EPOLL_EVENTS) && !defined(USE_SELECT_EVENTS) +#error "define one of USE_KQUEUE/DEVPOLL/EPOLL/SELECT_EVENTS" +#endif + /* - * I/O events. We pre-allocate one data structure per file descriptor. XXX - * For now use FD_SETSIZE as defined along with the fd-set type. + * Traditional BSD-style select(2). Works everywhere, but has a built-in + * upper bound on the number of file descriptors, and that limit is hard to + * change on Linux. Is sometimes emulated with SYSV-style poll(2) which + * doesn't have the file descriptor limit, but unfortunately does not help + * to improve the performance of servers with lots of connections. 
+ */ +#define EVENT_ALLOC_INCR 10 + +#ifdef USE_SELECT_EVENTS +typedef fd_set EVENT_MASK; + +#define EVENT_MASK_BYTE_COUNT(mask) sizeof(*(mask)) +#define EVENT_MASK_ZERO(mask) FD_ZERO(mask) +#define EVENT_MASK_SET(fd, mask) FD_SET((fd), (mask)) +#define EVENT_MASK_ISSET(fd, mask) FD_ISSET((fd), (mask)) +#define EVENT_MASK_CLR(fd, mask) FD_CLR((fd), (mask)) +#else + + /* + * Kernel-based event filters (kqueue, /dev/poll, epoll). We use the + * following file descriptor mask structure which is expanded on the fly. + */ +typedef struct { + char *data; /* bit mask */ + size_t data_len; /* data byte count */ +} EVENT_MASK; + + /* Bits per byte, byte in vector, bit offset in byte, bytes per set. */ +#define EVENT_MASK_NBBY (8) +#define EVENT_MASK_FD_BYTE(fd, mask) \ + (((unsigned char *) (mask)->data)[(fd) / EVENT_MASK_NBBY]) +#define EVENT_MASK_FD_BIT(fd) (1 << ((fd) % EVENT_MASK_NBBY)) +#define EVENT_MASK_BYTES_NEEDED(len) \ + (((len) + (EVENT_MASK_NBBY -1)) / EVENT_MASK_NBBY) +#define EVENT_MASK_BYTE_COUNT(mask) ((mask)->data_len) + + /* Memory management. */ +#define EVENT_MASK_ALLOC(mask, bit_len) do { \ + size_t _byte_len = EVENT_MASK_BYTES_NEEDED(bit_len); \ + (mask)->data = mymalloc(_byte_len); \ + memset((mask)->data, 0, _byte_len); \ + (mask)->data_len = _byte_len; \ + } while (0) +#define EVENT_MASK_REALLOC(mask, bit_len) do { \ + size_t _byte_len = EVENT_MASK_BYTES_NEEDED(bit_len); \ + size_t _old_len = (mask)->data_len; \ + (mask)->data = myrealloc((mask)->data, _byte_len); \ + memset((mask)->data + _old_len, 0, _byte_len - _old_len); \ + (mask)->data_len = _byte_len; \ + } while (0) +#define EVENT_MASK_FREE(mask) myfree((mask)->data) + + /* Set operations, modeled after FD_ZERO/SET/ISSET/CLR. 
*/ +#define EVENT_MASK_ZERO(mask) \ + memset((mask)->data, 0, (mask)->data_len) +#define EVENT_MASK_SET(fd, mask) \ + (EVENT_MASK_FD_BYTE((fd), (mask)) |= EVENT_MASK_FD_BIT(fd)) +#define EVENT_MASK_ISSET(fd, mask) \ + (EVENT_MASK_FD_BYTE((fd), (mask)) & EVENT_MASK_FD_BIT(fd)) +#define EVENT_MASK_CLR(fd, mask) \ + (EVENT_MASK_FD_BYTE((fd), (mask)) &= ~EVENT_MASK_FD_BIT(fd)) +#endif + + /* + * I/O events. */ typedef struct EVENT_FDTABLE EVENT_FDTABLE; @@ -158,12 +239,242 @@ struct EVENT_FDTABLE { EVENT_NOTIFY_RDWR callback; char *context; }; -static fd_set event_rmask; /* enabled read events */ -static fd_set event_wmask; /* enabled write events */ -static fd_set event_xmask; /* for bad news mostly */ -static int event_fdsize; /* number of file descriptors */ +static EVENT_MASK event_rmask; /* enabled read events */ +static EVENT_MASK event_wmask; /* enabled write events */ +static EVENT_MASK event_xmask; /* for bad news mostly */ +static int event_fdlimit; /* per-process open file limit */ static EVENT_FDTABLE *event_fdtable; /* one slot per file descriptor */ -static int event_max_fd; /* highest fd number seen */ +static int event_fdslots; /* number of file descriptor slots */ +static int event_max_fd = -1; /* highest fd number seen */ + + /* + * FreeBSD kqueue supports no system call to find out what descriptors are + * registered in the kernel-based filter. To implement our own sanity checks + * we maintain our own descriptor bitmask. + * + * FreeBSD kqueue does support application context pointers. Unfortunately, + * changing that information would cost a system call, and some of the + * competitors don't support application context. To keep the implementation + * simple we maintain our own table with call-back information. + * + * FreeBSD kqueue silently unregisters a descriptor from its filter when the + * descriptor is closed, so our information could get out of sync with the + * kernel. 
But that will never happen, because we have to meticulously + * unregister a file descriptor before it is closed, to avoid errors on + * systems that are built with USE_SELECT_EVENTS. + */ +#ifdef USE_KQUEUE_EVENTS +#include + + /* + * Some early FreeBSD implementations don't have the EV_SET macro. + */ +#ifndef EV_SET +#define EV_SET(kp, id, fi, fl, ffl, da, ud) do { \ + (kp)->ident = (id); \ + (kp)->filter = (fi); \ + (kp)->flags = (fl); \ + (kp)->fflags = (ffl); \ + (kp)->data = (da); \ + (kp)->udata = (ud); \ + } while(0) +#endif + + /* + * Macros to initialize the kernel-based filter; see event_init(). + */ +static int event_kq; /* handle to event filter */ + +#define EVENT_REG_INIT_HANDLE(er, n) do { \ + er = event_kq = kqueue(); \ + } while (0) +#define EVENT_REG_INIT_TEXT "kqueue" + + /* + * Macros to update the kernel-based filter; see event_enable_read(), + * event_enable_write() and event_disable_readwrite(). + */ +#define EVENT_REG_FD_OP(er, fh, ev, op) do { \ + struct kevent dummy; \ + EV_SET(&dummy, (fh), (ev), (op), 0, 0, 0); \ + (er) = kevent(event_kq, &dummy, 1, 0, 0, 0); \ + } while (0) + +#define EVENT_REG_ADD_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EV_ADD) +#define EVENT_REG_ADD_READ(e, f) EVENT_REG_ADD_OP((e), (f), EVFILT_READ) +#define EVENT_REG_ADD_WRITE(e, f) EVENT_REG_ADD_OP((e), (f), EVFILT_WRITE) +#define EVENT_REG_ADD_TEXT "kevent EV_ADD" + +#define EVENT_REG_DEL_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EV_DELETE) +#define EVENT_REG_DEL_READ(e, f) EVENT_REG_DEL_OP((e), (f), EVFILT_READ) +#define EVENT_REG_DEL_WRITE(e, f) EVENT_REG_DEL_OP((e), (f), EVFILT_WRITE) +#define EVENT_REG_DEL_TEXT "kevent EV_DELETE" + + /* + * Macros to retrieve event buffers from the kernel; see event_loop(). 
+ */ +typedef struct kevent EVENT_BUFFER; + +#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \ + struct timespec ts; \ + struct timespec *tsp; \ + if ((delay) < 0) { \ + tsp = 0; \ + } else { \ + tsp = &ts; \ + ts.tv_nsec = 0; \ + ts.tv_sec = (delay); \ + } \ + (event_count) = kevent(event_kq, (struct kevent *) 0, 0, (event_buf), \ + (buflen), (tsp)); \ + } while (0) + + /* + * Macros to process event buffers from the kernel; see event_loop(). + */ +#define EVENT_GET_FD(bp) ((bp)->ident) +#define EVENT_GET_TYPE(bp) ((bp)->filter) +#define EVENT_TEST_READ(bp) (EVENT_GET_TYPE(bp) == EVFILT_READ) +#define EVENT_TEST_WRITE(bp) (EVENT_GET_TYPE(bp) == EVFILT_WRITE) + +#endif + + /* + * Solaris /dev/poll does not support application context, so we have to + * maintain our own. This has the benefit of avoiding an expensive system + * call just to change a call-back function or argument. + * + * Solaris /dev/poll does have a way to query if a specific descriptor is + * registered. However, we maintain a descriptor mask anyway because a) it + * avoids having to make an expensive system call to find out if something + * is registered, b) some USE_MUMBLE_EVENTS implementations need a + * descriptor bitmask anyway and c) we use the bitmask already to implement + * sanity checks. + */ +#ifdef USE_DEVPOLL_EVENTS +#include +#include + + /* + * Macros to initialize the kernel-based filter; see event_init(). + */ +static int event_pollfd; /* handle to file descriptor set */ + +#define EVENT_REG_INIT_HANDLE(er, n) do { \ + er = event_pollfd = open("/dev/poll", O_RDWR); \ + } while (0) +#define EVENT_REG_INIT_TEXT "open /dev/poll" + + /* + * Macros to update the kernel-based filter; see event_enable_read(), + * event_enable_write() and event_disable_readwrite(). + */ +#define EVENT_REG_FD_OP(er, fh, ev) do { \ + struct pollfd dummy; \ + dummy.fd = (fh); \ + dummy.events = (ev); \ + (er) = write(event_pollfd, (char *) &dummy, \ + sizeof(dummy)) != sizeof(dummy) ? 
-1 : 0; \ + } while (0) + +#define EVENT_REG_ADD_READ(e, f) EVENT_REG_FD_OP((e), (f), POLLIN) +#define EVENT_REG_ADD_WRITE(e, f) EVENT_REG_FD_OP((e), (f), POLLOUT) +#define EVENT_REG_ADD_TEXT "write /dev/poll" + +#define EVENT_REG_DEL_BOTH(e, f) EVENT_REG_FD_OP((e), (f), POLLREMOVE) +#define EVENT_REG_DEL_TEXT "write /dev/poll" + + /* + * Macros to retrieve event buffers from the kernel; see event_loop(). + */ +typedef struct pollfd EVENT_BUFFER; + +#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \ + struct dvpoll dvpoll; \ + dvpoll.dp_fds = (event_buf); \ + dvpoll.dp_nfds = (buflen); \ + dvpoll.dp_timeout = (delay) < 0 ? -1 : (delay) * 1000; \ + (event_count) = ioctl(event_pollfd, DP_POLL, &dvpoll); \ + } while (0) + + /* + * Macros to process event buffers from the kernel; see event_loop(). + */ +#define EVENT_GET_FD(bp) ((bp)->fd) +#define EVENT_GET_TYPE(bp) ((bp)->revents) +#define EVENT_TEST_READ(bp) (EVENT_GET_TYPE(bp) & POLLIN) +#define EVENT_TEST_WRITE(bp) (EVENT_GET_TYPE(bp) & POLLOUT) + +#endif + + /* + * Linux epoll supports no system call to find out what descriptors are + * registered in the kernel-based filter. To implement our own sanity checks + * we maintain our own descriptor bitmask. + * + * Linux epoll does support application context pointers. Unfortunately, + * changing that information would cost a system call, and some of the + * competitors don't support application context. To keep the implementation + * simple we maintain our own table with call-back information. + * + * Linux epoll silently unregisters a descriptor from its filter when the + * descriptor is closed, so our information could get out of sync with the + * kernel. But that will never happen, because we have to meticulously + * unregister a file descriptor before it is closed, to avoid errors on systems that are built with USE_SELECT_EVENTS. + */ +#ifdef USE_EPOLL_EVENTS +#include <sys/epoll.h> + + /* + * Macros to initialize the kernel-based filter; see event_init(). 
+ */ +static int event_epollfd; /* epoll handle */ + +#define EVENT_REG_INIT_HANDLE(er, n) do { \ + er = event_epollfd = epoll_create(n); \ + } while (0) +#define EVENT_REG_INIT_TEXT "epoll_create" + + /* + * Macros to update the kernel-based filter; see event_enable_read(), + * event_enable_write() and event_disable_readwrite(). + */ +#define EVENT_REG_FD_OP(er, fh, ev, op) do { \ + struct epoll_event dummy; \ + dummy.events = (ev); \ + dummy.data.fd = (fh); \ + (er) = epoll_ctl(event_epollfd, (op), (fh), &dummy); \ + } while (0) + +#define EVENT_REG_ADD_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EPOLL_CTL_ADD) +#define EVENT_REG_ADD_READ(e, f) EVENT_REG_ADD_OP((e), (f), EPOLLIN) +#define EVENT_REG_ADD_WRITE(e, f) EVENT_REG_ADD_OP((e), (f), EPOLLOUT) +#define EVENT_REG_ADD_TEXT "epoll_ctl EPOLL_CTL_ADD" + +#define EVENT_REG_DEL_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EPOLL_CTL_DEL) +#define EVENT_REG_DEL_READ(e, f) EVENT_REG_DEL_OP((e), (f), EPOLLIN) +#define EVENT_REG_DEL_WRITE(e, f) EVENT_REG_DEL_OP((e), (f), EPOLLOUT) +#define EVENT_REG_DEL_TEXT "epoll_ctl(EPOLL_CTL_DEL)" + + /* + * Macros to retrieve event buffers from the kernel; see event_loop(). + */ +typedef struct epoll_event EVENT_BUFFER; + +#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \ + (event_count) = epoll_wait(event_epollfd, (event_buf), (buflen), \ + (delay) < 0 ? -1 : (delay) * 1000); \ + } while (0) + + /* + * Macros to process event buffers from the kernel; see event_loop(). + */ +#define EVENT_GET_FD(bp) ((bp)->data.fd) +#define EVENT_GET_TYPE(bp) ((bp)->events) +#define EVENT_TEST_READ(bp) (EVENT_GET_TYPE(bp) & EPOLLIN) +#define EVENT_TEST_WRITE(bp) (EVENT_GET_TYPE(bp) & EPOLLOUT) + +#endif /* * Timer events. Timer requests are kept sorted, in a circular list. 
We use @@ -201,21 +512,29 @@ static time_t event_present; /* cached time of day */ static void event_init(void) { EVENT_FDTABLE *fdp; + int err; if (!EVENT_INIT_NEEDED()) msg_panic("event_init: repeated call"); /* - * Initialize the file descriptor table. XXX It should be possible to - * adjust (or at least extend) the table size on the fly. + * Initialize the file descriptor masks and the call-back table. Where + * possible we extend these data structures on the fly. With select(2) + * based implementations we can only handle FD_SETSIZE open files. */ - if ((event_fdsize = open_limit(FD_SETSIZE)) < 0) +#ifdef USE_SELECT_EVENTS + if ((event_fdlimit = open_limit(FD_SETSIZE)) < 0) msg_fatal("unable to determine open file limit"); - if (event_fdsize < FD_SETSIZE / 2 && event_fdsize < 256) - msg_warn("could allocate space for only %d open files", event_fdsize); +#else + if ((event_fdlimit = open_limit(INT_MAX)) < 0) + msg_fatal("unable to determine open file limit"); +#endif + if (event_fdlimit < FD_SETSIZE / 2 && event_fdlimit < 256) + msg_warn("could allocate space for only %d open files", event_fdlimit); + event_fdslots = EVENT_ALLOC_INCR; event_fdtable = (EVENT_FDTABLE *) - mymalloc(sizeof(EVENT_FDTABLE) * event_fdsize); - for (fdp = event_fdtable; fdp < event_fdtable + event_fdsize; fdp++) { + mymalloc(sizeof(EVENT_FDTABLE) * event_fdslots); + for (fdp = event_fdtable; fdp < event_fdtable + event_fdslots; fdp++) { fdp->callback = 0; fdp->context = 0; } @@ -223,9 +542,22 @@ static void event_init(void) /* * Initialize the I/O event request masks. */ - FD_ZERO(&event_rmask); - FD_ZERO(&event_wmask); - FD_ZERO(&event_xmask); +#ifdef USE_SELECT_EVENTS + EVENT_MASK_ZERO(&event_rmask); + EVENT_MASK_ZERO(&event_wmask); + EVENT_MASK_ZERO(&event_xmask); +#else + EVENT_MASK_ALLOC(&event_rmask, event_fdslots); + EVENT_MASK_ALLOC(&event_wmask, event_fdslots); + EVENT_MASK_ALLOC(&event_xmask, event_fdslots); + + /* + * Initialize the kernel-based filter. 
+ */ + EVENT_REG_INIT_HANDLE(err, event_fdslots); + if (err < 0) + msg_fatal("%s: %m", EVENT_REG_INIT_TEXT); +#endif /* * Initialize timer stuff. @@ -240,6 +572,43 @@ static void event_init(void) msg_panic("event_init: unable to initialize"); } +/* event_extend - make room for more descriptor slots */ + +static void event_extend(int fd) +{ + const char *myname = "event_extend"; + int old_slots = event_fdslots; + int new_slots = (event_fdslots > fd / 2 ? + 2 * old_slots : fd + EVENT_ALLOC_INCR); + EVENT_FDTABLE *fdp; + int err; + + if (msg_verbose > 2) + msg_info("%s: fd %d", myname, fd); + event_fdtable = (EVENT_FDTABLE *) + myrealloc((char *) event_fdtable, sizeof(EVENT_FDTABLE) * new_slots); + event_fdslots = new_slots; + for (fdp = event_fdtable + old_slots; + fdp < event_fdtable + new_slots; fdp++) { + fdp->callback = 0; + fdp->context = 0; + } + + /* + * Initialize the I/O event request masks. + */ +#ifndef USE_SELECT_EVENTS + EVENT_MASK_REALLOC(&event_rmask, new_slots); + EVENT_MASK_REALLOC(&event_wmask, new_slots); + EVENT_MASK_REALLOC(&event_xmask, new_slots); +#endif +#ifdef EVENT_REG_UPD_HANDLE + EVENT_REG_UPD_HANDLE(err, new_slots); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_UPD_TEXT); +#endif +} + /* event_time - look up cached time of day */ time_t event_time(void) @@ -254,18 +623,19 @@ time_t event_time(void) void event_drain(int time_limit) { - fd_set zero_mask; + EVENT_MASK zero_mask; time_t max_time; if (EVENT_INIT_NEEDED()) return; - FD_ZERO(&zero_mask); + EVENT_MASK_ZERO(&zero_mask); (void) time(&event_present); max_time = event_present + time_limit; while (event_present < max_time && (event_timer_head.pred != &event_timer_head - || memcmp(&zero_mask, &event_xmask, sizeof(zero_mask)) != 0)) + || memcmp(&zero_mask, &event_xmask, + EVENT_MASK_BYTE_COUNT(&zero_mask)) != 0)) event_loop(1); } @@ -275,6 +645,7 @@ void event_enable_read(int fd, EVENT_NOTIFY_RDWR callback, char *context) { const char *myname = "event_enable_read"; 
EVENT_FDTABLE *fdp; + int err; if (EVENT_INIT_NEEDED()) event_init(); @@ -282,30 +653,45 @@ void event_enable_read(int fd, EVENT_NOTIFY_RDWR callback, char *context) /* * Sanity checks. */ - if (fd < 0 || fd >= event_fdsize) + if (fd < 0 || fd >= event_fdlimit) msg_panic("%s: bad file descriptor: %d", myname, fd); if (msg_verbose > 2) msg_info("%s: fd %d", myname, fd); + if (fd >= event_fdslots) + event_extend(fd); + /* - * Disallow multiple requests on the same file descriptor. Allow - * duplicates of the same request. + * Disallow mixed (i.e. read and write) requests on the same descriptor. */ - fdp = event_fdtable + fd; - if (FD_ISSET(fd, &event_xmask)) { - if (FD_ISSET(fd, &event_rmask) - && fdp->callback == callback - && fdp->context == context) - return; - msg_panic("%s: fd %d: multiple I/O request", myname, fd); + if (EVENT_MASK_ISSET(fd, &event_wmask)) + msg_panic("%s: fd %d: read/write I/O request", myname, fd); + + /* + * Postfix 2.4 allows multiple event_enable_read() calls on the same + * descriptor without requiring event_disable_readwrite() calls between + * them. With kernel-based filters (kqueue, /dev/poll, epoll) it's + * wasteful to make system calls when we change only application + * call-back information. It has a noticeable effect on smtp-source + * performance. 
+ */ + if (EVENT_MASK_ISSET(fd, &event_rmask) == 0) { + EVENT_MASK_SET(fd, &event_xmask); + EVENT_MASK_SET(fd, &event_rmask); + if (event_max_fd < fd) + event_max_fd = fd; +#ifndef USE_SELECT_EVENTS + EVENT_REG_ADD_READ(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_ADD_TEXT); +#endif + } + fdp = event_fdtable + fd; + if (fdp->callback != callback || fdp->context != context) { + fdp->callback = callback; + fdp->context = context; } - FD_SET(fd, &event_xmask); - FD_SET(fd, &event_rmask); - fdp->callback = callback; - fdp->context = context; - if (event_max_fd < fd) - event_max_fd = fd; } /* event_enable_write - enable write events */ @@ -314,6 +700,7 @@ void event_enable_write(int fd, EVENT_NOTIFY_RDWR callback, char *context) { const char *myname = "event_enable_write"; EVENT_FDTABLE *fdp; + int err; if (EVENT_INIT_NEEDED()) event_init(); @@ -321,30 +708,45 @@ void event_enable_write(int fd, EVENT_NOTIFY_RDWR callback, char *context) /* * Sanity checks. */ - if (fd < 0 || fd >= event_fdsize) + if (fd < 0 || fd >= event_fdlimit) msg_panic("%s: bad file descriptor: %d", myname, fd); if (msg_verbose > 2) msg_info("%s: fd %d", myname, fd); + if (fd >= event_fdslots) + event_extend(fd); + /* - * Disallow multiple requests on the same file descriptor. Allow - * duplicates of the same request. + * Disallow mixed (i.e. read and write) requests on the same descriptor. */ - fdp = event_fdtable + fd; - if (FD_ISSET(fd, &event_xmask)) { - if (FD_ISSET(fd, &event_wmask) - && fdp->callback == callback - && fdp->context == context) - return; - msg_panic("%s: fd %d: multiple I/O request", myname, fd); + if (EVENT_MASK_ISSET(fd, &event_rmask)) + msg_panic("%s: fd %d: read/write I/O request", myname, fd); + + /* + * Postfix 2.4 allows multiple event_enable_write() calls on the same + * descriptor without requiring event_disable_readwrite() calls between + * them. 
With kernel-based filters (kqueue, /dev/poll, epoll) it's + * incredibly wasteful to make unregister and register system calls when + * we change only application call-back information. It has a noticeable + * effect on smtp-source performance. + */ + if (EVENT_MASK_ISSET(fd, &event_wmask) == 0) { + EVENT_MASK_SET(fd, &event_xmask); + EVENT_MASK_SET(fd, &event_wmask); + if (event_max_fd < fd) + event_max_fd = fd; +#ifndef USE_SELECT_EVENTS + EVENT_REG_ADD_WRITE(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_ADD_TEXT); +#endif + } + fdp = event_fdtable + fd; + if (fdp->callback != callback || fdp->context != context) { + fdp->callback = callback; + fdp->context = context; } - FD_SET(fd, &event_xmask); - FD_SET(fd, &event_wmask); - fdp->callback = callback; - fdp->context = context; - if (event_max_fd < fd) - event_max_fd = fd; } /* event_disable_readwrite - disable request for read or write events */ @@ -353,6 +755,7 @@ void event_disable_readwrite(int fd) { const char *myname = "event_disable_readwrite"; EVENT_FDTABLE *fdp; + int err; if (EVENT_INIT_NEEDED()) event_init(); @@ -360,7 +763,7 @@ void event_disable_readwrite(int fd) /* * Sanity checks. */ - if (fd < 0 || fd >= event_fdsize) + if (fd < 0 || fd >= event_fdlimit) msg_panic("%s: bad file descriptor: %d", myname, fd); if (msg_verbose > 2) @@ -370,9 +773,32 @@ void event_disable_readwrite(int fd) * Don't complain when there is nothing to cancel. The request may have * been canceled from another thread. */ - FD_CLR(fd, &event_xmask); - FD_CLR(fd, &event_rmask); - FD_CLR(fd, &event_wmask); + if (fd >= event_fdslots) + return; +#ifndef USE_SELECT_EVENTS +#ifdef EVENT_REG_DEL_BOTH + /* XXX Can't seem to disable READ and WRITE events selectively. 
*/ + if (EVENT_MASK_ISSET(fd, &event_rmask) + || EVENT_MASK_ISSET(fd, &event_wmask)) { + EVENT_REG_DEL_BOTH(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT); + } +#else + if (EVENT_MASK_ISSET(fd, &event_rmask)) { + EVENT_REG_DEL_READ(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT); + } else if (EVENT_MASK_ISSET(fd, &event_wmask)) { + EVENT_REG_DEL_WRITE(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT); + } +#endif /* EVENT_REG_DEL_BOTH */ +#endif /* USE_SELECT_EVENTS */ + EVENT_MASK_CLR(fd, &event_xmask); + EVENT_MASK_CLR(fd, &event_rmask); + EVENT_MASK_CLR(fd, &event_wmask); fdp = event_fdtable + fd; fdp->callback = 0; fdp->context = 0; @@ -481,11 +907,20 @@ void event_loop(int delay) { const char *myname = "event_loop"; static int nested; + +#ifdef USE_SELECT_EVENTS fd_set rmask; fd_set wmask; fd_set xmask; struct timeval tv; struct timeval *tvp; + +#else + EVENT_BUFFER event_buf[100]; + EVENT_BUFFER *bp; + int event_count; + +#endif EVENT_TIMER *timer; int fd; EVENT_FDTABLE *fdp; @@ -529,6 +964,7 @@ void event_loop(int delay) * Negative delay means: wait until something happens. Zero delay means: * poll. Positive delay means: wait at most this long. */ +#ifdef USE_SELECT_EVENTS if (select_delay < 0) { tvp = 0; } else { @@ -551,6 +987,16 @@ void event_loop(int delay) msg_fatal("event_loop: select: %m"); return; } +#else + EVENT_BUFFER_READ(event_count, event_buf, + sizeof(event_buf) / sizeof(event_buf[0]), + select_delay); + if (event_count < 0) { + if (errno != EINTR) + msg_fatal("event_loop: kevent: %m"); + return; + } +#endif /* * Before entering the application call-back routines, make sure we @@ -587,26 +1033,56 @@ void event_loop(int delay) * wanted. We do not change the event request masks. It is up to the * application to determine when a read or write is complete. 
*/ - for (fd = 0, fdp = event_fdtable; fd <= event_max_fd; fd++, fdp++) { +#ifdef USE_SELECT_EVENTS + for (fd = 0; fd <= event_max_fd; fd++) { if (FD_ISSET(fd, &event_xmask)) { + /* In case event_fdtable is updated. */ + fdp = event_fdtable + fd; if (FD_ISSET(fd, &xmask)) { if (msg_verbose > 2) - msg_info("%s: exception %d 0x%lx 0x%lx", myname, + msg_info("%s: exception fd=%d act=0x%lx 0x%lx", myname, fd, (long) fdp->callback, (long) fdp->context); fdp->callback(EVENT_XCPT, fdp->context); } else if (FD_ISSET(fd, &wmask)) { if (msg_verbose > 2) - msg_info("%s: write %d 0x%lx 0x%lx", myname, + msg_info("%s: write fd=%d act=0x%lx 0x%lx", myname, fd, (long) fdp->callback, (long) fdp->context); fdp->callback(EVENT_WRITE, fdp->context); } else if (FD_ISSET(fd, &rmask)) { if (msg_verbose > 2) - msg_info("%s: read %d 0x%lx 0x%lx", myname, + msg_info("%s: read fd=%d act=0x%lx 0x%lx", myname, fd, (long) fdp->callback, (long) fdp->context); fdp->callback(EVENT_READ, fdp->context); } } } +#else + for (bp = event_buf; bp < event_buf + event_count; bp++) { + fd = EVENT_GET_FD(bp); + if (fd < 0 || fd > event_max_fd) + msg_panic("%s: bad file descriptor: %d", myname, fd); + if (EVENT_MASK_ISSET(fd, &event_xmask)) { + fdp = event_fdtable + fd; + if (EVENT_TEST_READ(bp)) { + if (msg_verbose > 2) + msg_info("%s: read fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, (long) fdp->context); + fdp->callback(EVENT_READ, fdp->context); + } else if (EVENT_TEST_WRITE(bp)) { + if (msg_verbose > 2) + msg_info("%s: write fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, + (long) fdp->context); + fdp->callback(EVENT_WRITE, fdp->context); + } else { + if (msg_verbose > 2) + msg_info("%s: other fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, (long) fdp->context); + fdp->callback(EVENT_XCPT, fdp->context); + } + } + } +#endif nested--; } @@ -640,8 +1116,10 @@ static void echo(int unused_event, char *unused_context) printf("Result: %s", buf); } -int main(void) +int 
main(int argc, char **argv) { + if (argv[1]) + msg_verbose = atoi(argv[1]); event_request_timer(timer_event, "3 first", 3); event_request_timer(timer_event, "3 second", 3); event_request_timer(timer_event, "4 first", 4); diff --git a/postfix/src/util/read_wait.c b/postfix/src/util/read_wait.c index b1ec3528c..c94c1a06f 100644 --- a/postfix/src/util/read_wait.c +++ b/postfix/src/util/read_wait.c @@ -44,6 +44,10 @@ #include #include +#ifdef USE_SYSV_POLL +#include +#endif + #ifdef USE_SYS_SELECT_H #include #endif @@ -57,6 +61,7 @@ int read_wait(int fd, int timeout) { +#ifndef USE_SYSV_POLL fd_set read_fds; fd_set except_fds; struct timeval tv; @@ -99,4 +104,32 @@ int read_wait(int fd, int timeout) return (0); } } +#else + + /* + * System-V poll() is optimal for polling a few descriptors. + */ + struct pollfd pollfd; + +#define WAIT_FOR_EVENT (-1) + + pollfd.fd = fd; + pollfd.events = POLLIN; + for (;;) { + switch (poll(&pollfd, 1, timeout < 0 ? + WAIT_FOR_EVENT : timeout * 1000)) { + case -1: + if (errno != EINTR) + msg_fatal("poll: %m"); + continue; + case 0: + errno = ETIMEDOUT; + return (-1); + default: + if (pollfd.revents & POLLNVAL) + msg_fatal("poll: %m"); + return (0); + } + } +#endif } diff --git a/postfix/src/util/readable.c b/postfix/src/util/readable.c index a3af0daf0..7d297403b 100644 --- a/postfix/src/util/readable.c +++ b/postfix/src/util/readable.c @@ -37,6 +37,10 @@ #include #include +#ifdef USE_SYSV_POLL +#include +#endif + #ifdef USE_SYS_SELECT_H #include #endif @@ -50,6 +54,7 @@ int readable(int fd) { +#ifndef USE_SYSV_POLL struct timeval tv; fd_set read_fds; fd_set except_fds; @@ -85,4 +90,30 @@ int readable(int fd) return (0); } } +#else + + /* + * System-V poll() is optimal for polling a few descriptors. 
+ */ + struct pollfd pollfd; + +#define DONT_WAIT_FOR_EVENT 0 + + pollfd.fd = fd; + pollfd.events = POLLIN; + for (;;) { + switch (poll(&pollfd, 1, DONT_WAIT_FOR_EVENT)) { + case -1: + if (errno != EINTR) + msg_fatal("poll: %m"); + continue; + case 0: + return (0); + default: + if (pollfd.revents & POLLNVAL) + msg_fatal("poll: %m"); + return (1); + } + } +#endif } diff --git a/postfix/src/util/sys_defs.h b/postfix/src/util/sys_defs.h index 63e9e3c90..6d4b8e4c2 100644 --- a/postfix/src/util/sys_defs.h +++ b/postfix/src/util/sys_defs.h @@ -151,6 +151,18 @@ # define HAVE_GETIFADDRS #endif +#if (defined(__FreeBSD_version) && __FreeBSD_version >= 300000) \ + || (defined(__NetBSD_Version__) && __NetBSD_Version__ >= 103000000) \ + || (defined(OpenBSD) && OpenBSD >= 199700) /* OpenBSD 2.0?? */ +# define USE_SYSV_POLL +#endif + +#if (defined(__FreeBSD_version) && __FreeBSD_version >= 410000) \ + || (defined(__NetBSD_Version__) && __NetBSD_Version__ >= 200000000) \ + || (defined(OpenBSD) && OpenBSD >= 200105) /* OpenBSD 2.9 */ +# define USE_KQUEUE_EVENTS +#endif + #endif /* @@ -393,6 +405,10 @@ extern int opterr; #ifndef NO_FUTIMESAT # define HAS_FUTIMESAT #endif +#define USE_SYSV_POLL +#ifndef NO_DEVPOLL +# define USE_DEVPOLL_EVENTS +#endif /* * Allow build environment to override paths. @@ -705,6 +721,10 @@ extern int initgroups(const char *, int); # define CANT_WRITE_BEFORE_SENDING_FD #endif #define HAS_DEV_URANDOM /* introduced in 1.1 */ +#ifndef NO_EPOLL +# define USE_EPOLL_EVENTS /* introduced in 2.5 */ +#endif +#define USE_SYSV_POLL #endif #ifdef LINUX1 @@ -1213,6 +1233,15 @@ extern int dup2_pass_on_exec(int oldd, int newd); extern const char *inet_ntop(int, const void *, char *, size_t); extern int inet_pton(int, const char *, void *); +#endif + + /* + * Defaults for systems without kqueue, /dev/poll or epoll support. + * master/multi-server.c relies on this. 
+ */ +#if !defined(USE_KQUEUE_EVENTS) && !defined(USE_DEVPOLL_EVENTS) \ + && !defined(USE_EPOLL_EVENTS) +#define USE_SELECT_EVENTS #endif /* diff --git a/postfix/src/util/writable.c b/postfix/src/util/writable.c index 52c6d4c7f..f37eb2229 100644 --- a/postfix/src/util/writable.c +++ b/postfix/src/util/writable.c @@ -37,6 +37,10 @@ #include #include +#ifdef USE_SYSV_POLL +#include +#endif + #ifdef USE_SYS_SELECT_H #include #endif @@ -50,6 +54,7 @@ int writable(int fd) { +#ifndef USE_SYSV_POLL struct timeval tv; fd_set write_fds; fd_set except_fds; @@ -85,4 +90,30 @@ int writable(int fd) return (0); } } +#else + + /* + * System-V poll() is optimal for polling a few descriptors. + */ + struct pollfd pollfd; + +#define DONT_WAIT_FOR_EVENT 0 + + pollfd.fd = fd; + pollfd.events = POLLOUT; + for (;;) { + switch (poll(&pollfd, 1, DONT_WAIT_FOR_EVENT)) { + case -1: + if (errno != EINTR) + msg_fatal("poll: %m"); + continue; + case 0: + return (0); + default: + if (pollfd.revents & POLLNVAL) + msg_fatal("poll: %m"); + return (1); + } + } +#endif } diff --git a/postfix/src/util/write_wait.c b/postfix/src/util/write_wait.c index a17c0e595..1a42c126e 100644 --- a/postfix/src/util/write_wait.c +++ b/postfix/src/util/write_wait.c @@ -44,6 +44,10 @@ #include #include +#ifdef USE_SYSV_POLL +#include +#endif + #ifdef USE_SYS_SELECT_H #include #endif @@ -57,6 +61,7 @@ int write_wait(int fd, int timeout) { +#ifndef USE_SYSV_POLL fd_set write_fds; fd_set except_fds; struct timeval tv; @@ -99,4 +104,32 @@ int write_wait(int fd, int timeout) return (0); } } +#else + + /* + * System-V poll() is optimal for polling a few descriptors. + */ + struct pollfd pollfd; + +#define WAIT_FOR_EVENT (-1) + + pollfd.fd = fd; + pollfd.events = POLLOUT; + for (;;) { + switch (poll(&pollfd, 1, timeout < 0 ? 
+ WAIT_FOR_EVENT : timeout * 1000)) { + case -1: + if (errno != EINTR) + msg_fatal("poll: %m"); + continue; + case 0: + errno = ETIMEDOUT; + return (-1); + default: + if (pollfd.revents & POLLNVAL) + msg_fatal("poll: %m"); + return (0); + } + } +#endif }