ovs/lib/daemon-unix.c

/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "daemon.h"
#include "daemon-private.h"
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <unistd.h>
#include "command-line.h"
#include "fatal-signal.h"
#include "dirs.h"
#include "lockfile.h"
#include "ovs-thread.h"
#include "process.h"
#include "socket-util.h"
#include "timeval.h"
#include "util.h"
#include "openvswitch/vlog.h"

/* Declares the vlog module ("daemon_unix") that the VLOG_* calls in this file
 * log under. */
VLOG_DEFINE_THIS_MODULE(daemon_unix);

/* --detach: Should we run in the background?
 *
 * 'detach' is set by set_detach() and consulted by daemonize_start() and
 * daemonize_post_detach().  'detached' is set by the first call to
 * daemonize_complete() so that later calls do not notify the parent again. */
bool detach;                    /* Was --detach specified? */
static bool detached;           /* Have we already detached? */

/* --pidfile: Name of pidfile (null if none).
 *
 * daemonize_complete() passes this to free(), so whoever assigns it must
 * store a heap-allocated string. */
char *pidfile;

/* Device and inode of pidfile, so we can avoid reopening it.
 *
 * Recorded by make_pidfile(); compared against stat() results in
 * read_pidfile__() to recognize this process's own pidfile.  Both are zero
 * until a pidfile has been created. */
static dev_t pidfile_dev;
static ino_t pidfile_ino;

/* --overwrite-pidfile: Create pidfile even if one already exists and is
 * locked?  Set by ignore_existing_pidfile(). */
static bool overwrite_pidfile;

/* --no-chdir: Should we chdir to "/"?  Cleared by set_no_chdir(). */
static bool chdir_ = true;

/* File descriptor used by daemonize_start() and daemonize_complete().
 *
 * Holds the fd handed out by fork_and_wait_for_startup() in the child, or -1
 * when there is no parent waiting to be notified. */
static int daemonize_fd = -1;

/* --monitor: Should a supervisory process monitor the daemon and restart it if
 * it dies due to an error signal?  Set by daemon_set_monitor(). */
static bool monitor;

/* Forward declarations for static helpers that are used before their
 * definitions later in this file. */
static void check_already_running(void);
static int lock_pidfile(FILE *, int command);
static pid_t fork_and_clean_up(void);
static void daemonize_post_detach(void);

/* Builds the pidfile name that set_pidfile() would use for 'name'.
 *
 * A null 'name' selects the default, "<ovs_rundir()>/<program_name>.pid".
 * Any other 'name' is handed to abs_file_name() together with ovs_rundir().
 *
 * The returned string is owned by the caller, who must free it. */
char *
make_pidfile_name(const char *name)
{
    if (name) {
        return abs_file_name(ovs_rundir(), name);
    }
    return xasprintf("%s/%s.pid", ovs_rundir(), program_name);
}

/* Requests that the daemon keep its current working directory.
 *
 * Clears 'chdir_', which daemonize_post_detach() checks before calling
 * chdir("/").  Corresponds to the --no-chdir option. */
void
set_no_chdir(void)
{
    chdir_ = false;
}

/* Normally, daemonize() or daemonize_start() will terminate the program with
 * a message if a locked pidfile already exists.  If this function is called,
 * an existing pidfile will be replaced instead.
 *
 * Concretely this sets 'overwrite_pidfile', which makes make_pidfile() use a
 * per-process temporary file name and skip check_already_running().
 *
 * NOTE(review): earlier documentation promised that the replacement happens
 * "with a warning"; no such warning is emitted by the code in this file --
 * confirm whether one is logged elsewhere. */
void
ignore_existing_pidfile(void)
{
    overwrite_pidfile = true;
}

/* Sets up a following call to daemonize() or daemonize_start() to detach from
 * the foreground session, running this process in the background.
 *
 * Only records the request in 'detach'; the fork itself happens in
 * daemonize_start(). */
void
set_detach(void)
{
    detach = true;
}

/* Sets up a following call to daemonize() or daemonize_start() to fork a
 * supervisory process to monitor the daemon and restart it if it dies due to
 * an error signal.
 *
 * Only records the request in 'monitor'; daemonize_start() does the fork and
 * monitor_daemon() implements the supervision loop. */
void
daemon_set_monitor(void)
{
    monitor = true;
}

/* Creates the configured pidfile and stores the running process's pid in it.
 * Ensures that the pidfile will be deleted when the process exits.
 *
 * The caller must only invoke this when 'pidfile' is non-null
 * (daemonize_start() checks this).
 *
 * The pid is first written to a locked temporary file, which is then renamed
 * over 'pidfile'.  Without --overwrite-pidfile all processes share a single
 * temporary name that doubles as a lock, and check_already_running() is
 * consulted before the rename.  With --overwrite-pidfile a per-pid temporary
 * name is used and the existing pidfile is replaced unconditionally.
 *
 * Any failure is fatal (VLOG_FATAL).  On success the stream is deliberately
 * left open so that the fcntl() lock stays held, and the file's device and
 * inode are remembered in 'pidfile_dev' and 'pidfile_ino'. */
static void
make_pidfile(void)
{
    long int pid = getpid();
    struct stat s;
    char *tmpfile;
    FILE *file;
    int error;

    /* Create a temporary pidfile. */
    if (overwrite_pidfile) {
        tmpfile = xasprintf("%s.tmp%ld", pidfile, pid);
        fatal_signal_add_file_to_unlink(tmpfile);
    } else {
        /* Everyone shares the same file which will be treated as a lock.  To
         * avoid some uncomfortable race conditions, we can't set up the fatal
         * signal unlink until we've acquired it. */
        tmpfile = xasprintf("%s.tmp", pidfile);
    }

    /* Open without truncating: the contents are cleared (ftruncate() below)
     * only once we hold the lock. */
    file = fopen(tmpfile, "a+");
    if (!file) {
        VLOG_FATAL("%s: create failed (%s)", tmpfile, ovs_strerror(errno));
    }

    error = lock_pidfile(file, F_SETLK);
    if (error) {
        /* Looks like we failed to acquire the lock.  Note that, if we failed
         * for some other reason (and '!overwrite_pidfile'), we will have
         * left 'tmpfile' as garbage in the file system. */
        VLOG_FATAL("%s: fcntl(F_SETLK) failed (%s)", tmpfile,
                   ovs_strerror(error));
    }

    if (!overwrite_pidfile) {
        /* We acquired the lock.  Make sure to clean up on exit, and verify
         * that we're allowed to create the actual pidfile. */
        fatal_signal_add_file_to_unlink(tmpfile);
        check_already_running();
    }

    /* Remember which file we hold locked; stored in 'pidfile_dev' and
     * 'pidfile_ino' once everything else has succeeded. */
    if (fstat(fileno(file), &s) == -1) {
        VLOG_FATAL("%s: fstat failed (%s)", tmpfile, ovs_strerror(errno));
    }

    if (ftruncate(fileno(file), 0) == -1) {
        VLOG_FATAL("%s: truncate failed (%s)", tmpfile, ovs_strerror(errno));
    }

    fprintf(file, "%ld\n", pid);
    if (fflush(file) == EOF) {
        VLOG_FATAL("%s: write failed (%s)", tmpfile, ovs_strerror(errno));
    }

    /* Capture errno right away: the unlink-list bookkeeping below runs before
     * a failure is reported and must not be able to overwrite it. */
    error = rename(tmpfile, pidfile) ? errno : 0;

    /* Due to a race, 'tmpfile' may be owned by a different process, so we
     * shouldn't delete it on exit. */
    fatal_signal_remove_file_to_unlink(tmpfile);

    if (error) {
        VLOG_FATAL("failed to rename \"%s\" to \"%s\" (%s)",
                   tmpfile, pidfile, ovs_strerror(error));
    }

    /* Ensure that the pidfile will get deleted on exit. */
    fatal_signal_add_file_to_unlink(pidfile);

    /* Clean up.
     *
     * We don't close 'file' because its file descriptor must remain open to
     * hold the lock. */
    pidfile_dev = s.st_dev;
    pidfile_ino = s.st_ino;
    free(tmpfile);
}

/* Forks via xfork() and returns xfork()'s result: the child's pid in the
 * parent, 0 in the child.  xfork() is expected to log and exit rather than
 * return if the fork fails.
 *
 * Before returning, performs the post-fork housekeeping that is useful when
 * the child is not going to exec a new program: the parent calls
 * fatal_signal_fork() and the child calls lockfile_postfork(). */
static pid_t
fork_and_clean_up(void)
{
    const pid_t child = xfork();

    if (child == 0) {
        /* Child side of the fork. */
        lockfile_postfork();
    } else if (child > 0) {
        /* Parent side of the fork. */
        fatal_signal_fork();
    }

    return child;
}

/* Forks, then:
 *
 *   - In the parent, waits for the child to signal that it has completed its
 *     startup sequence.  Then stores -1 in '*fdp' and returns the child's
 *     pid in '*child_pid' argument.
 *
 *   - In the child, stores a fd in '*fdp' and returns 0 through '*child_pid'
 *     argument.  The caller should pass the fd to fork_notify_startup() after
 *     it finishes its startup sequence.
 *
 * The signal is a single byte sent over a pipe: the child keeps the write
 * end, the parent blocks reading the read end.
 *
 * Returns 0 on success.  If something goes wrong and child process was not
 * able to signal its readiness by calling fork_notify_startup(), then this
 * function returns -1. However, even in case of failure it still sets child
 * process id in '*child_pid'.
 *
 * Two failure cases do not return at all: if the child exited with a nonzero
 * exit code, the parent exits with that same code; if waitpid() itself fails,
 * the error is fatal. */
static int
fork_and_wait_for_startup(int *fdp, pid_t *child_pid)
{
    int fds[2];
    pid_t pid;
    int ret = 0;

    xpipe(fds);

    pid = fork_and_clean_up();
    if (pid > 0) {
        /* Running in parent process. */
        size_t bytes_read;
        char c;

        close(fds[1]);
        if (read_fully(fds[0], &c, 1, &bytes_read) != 0) {
            /* No byte arrived, so the child never called
             * fork_notify_startup().  Reap it to find out why. */
            int retval;
            int status;

            do {
                retval = waitpid(pid, &status, 0);
            } while (retval == -1 && errno == EINTR);

            if (retval == pid) {
                if (WIFEXITED(status) && WEXITSTATUS(status)) {
                    /* Child exited with an error.  Convey the same error
                     * to our parent process as a courtesy. */
                    exit(WEXITSTATUS(status));
                } else {
                    /* process_status_msg() returns a string that we own
                     * (monitor_daemon() frees it the same way). */
                    char *status_msg = process_status_msg(status);
                    VLOG_ERR("fork child died before signaling startup (%s)",
                             status_msg);
                    free(status_msg);
                    ret = -1;
                }
            } else if (retval < 0) {
                VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
            } else {
                OVS_NOT_REACHED();
            }
        }
        close(fds[0]);
        *fdp = -1;
    } else if (!pid) {
        /* Running in child process. */
        close(fds[0]);
        *fdp = fds[1];
    }
    *child_pid = pid;
    return ret;
}

/* Tells the process blocked in fork_and_wait_for_startup() that startup has
 * finished, by writing one byte to 'fd' and then closing it.
 *
 * 'fd' is the descriptor that fork_and_wait_for_startup() stored in the
 * child; passing -1 means nobody is waiting and makes this a no-op.  A failed
 * write is fatal. */
static void
fork_notify_startup(int fd)
{
    size_t n_written;
    int err;

    if (fd == -1) {
        return;
    }

    /* The empty string literal supplies the single (NUL) byte to send. */
    err = write_fully(fd, "", 1, &n_written);
    if (err) {
        VLOG_FATAL("pipe write failed (%s)", ovs_strerror(err));
    }

    close(fd);
}

/* Decides whether the monitor should restart a daemon whose waitpid() status
 * is 'status'.
 *
 * Returns true only if the process was terminated by one of the "error"
 * signals listed below; returns false for a normal exit and for any other
 * signal. */
static bool
should_restart(int status)
{
    if (!WIFSIGNALED(status)) {
        return false;
    }

    switch (WTERMSIG(status)) {
    /* This list of signals is documented in daemon.man.  If you change the
     * list, update the documentation too. */
    case SIGABRT:
    case SIGALRM:
    case SIGBUS:
    case SIGFPE:
    case SIGILL:
    case SIGPIPE:
    case SIGSEGV:
    case SIGXCPU:
    case SIGXFSZ:
        return true;

    default:
        return false;
    }
}

/* Supervises the daemon process 'daemon_pid' on behalf of --monitor.
 *
 * Runs in the monitor process.  Waits for the daemon to terminate, then:
 *
 *   - If should_restart() reports that it was killed by an error signal,
 *     forks a replacement daemon and goes back to waiting.  Restarts are
 *     throttled to one every 10 seconds, and once a daemon has dumped core
 *     further core dumps are disabled.
 *
 *   - Otherwise logs the exit and terminates the monitor with exit(0).
 *
 * If a replacement dies before signaling startup, the restart is retried on
 * the next iteration (subject to the same throttle).
 *
 * This function returns only in a newly forked replacement daemon, after
 * restoring the process title and subprogram name.  It never returns in the
 * monitor process. */
static void
monitor_daemon(pid_t daemon_pid)
{
    /* XXX Should log daemon's stderr output at startup time. */
    time_t last_restart;
    char *status_msg;
    int crashes;
    bool child_ready = true;

    /* 'status' and 'retval' live outside the loop on purpose.  When a restart
     * attempt fails ('child_ready' is false) the next iteration skips
     * waitpid() and re-evaluates the most recent wait status; if they were
     * declared inside the loop they would be indeterminate on that path. */
    int status = 0;
    int retval = -1;

    set_subprogram_name("monitor");
    status_msg = xstrdup("healthy");
    last_restart = TIME_MIN;
    crashes = 0;
    for (;;) {
        proctitle_set("monitoring pid %lu (%s)",
                      (unsigned long int) daemon_pid, status_msg);

        if (child_ready) {
            do {
                retval = waitpid(daemon_pid, &status, 0);
            } while (retval == -1 && errno == EINTR);
            if (retval == -1) {
                VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
            }
        }

        if (!child_ready || retval == daemon_pid) {
            char *s = process_status_msg(status);
            if (should_restart(status)) {
                free(status_msg);
                status_msg = xasprintf("%d crashes: pid %lu died, %s",
                                       ++crashes,
                                       (unsigned long int) daemon_pid, s);
                free(s);

                if (WCOREDUMP(status)) {
                    /* Disable further core dumps to save disk space. */
                    struct rlimit r;

                    r.rlim_cur = 0;
                    r.rlim_max = 0;
                    if (setrlimit(RLIMIT_CORE, &r) == -1) {
                        VLOG_WARN("failed to disable core dumps: %s",
                                  ovs_strerror(errno));
                    }
                }

                /* Throttle restarts to no more than once every 10 seconds. */
                if (time(NULL) < last_restart + 10) {
                    VLOG_WARN("%s, waiting until 10 seconds since last "
                              "restart", status_msg);
                    for (;;) {
                        time_t now = time(NULL);
                        time_t wakeup = last_restart + 10;
                        if (now >= wakeup) {
                            break;
                        }
                        xsleep(wakeup - now);
                    }
                }
                last_restart = time(NULL);

                VLOG_ERR("%s, restarting", status_msg);
                child_ready = !fork_and_wait_for_startup(&daemonize_fd,
                                                         &daemon_pid);
                if (child_ready && !daemon_pid) {
                    /* Child process needs to break out of monitoring
                     * loop. */
                    break;
                }
            } else {
                VLOG_INFO("pid %lu died, %s, exiting",
                          (unsigned long int) daemon_pid, s);
                free(s);
                exit(0);
            }
        }
    }
    free(status_msg);

    /* Running in new daemon process. */
    proctitle_restore();
    set_subprogram_name("");
}

/* If daemonization is configured, then starts daemonization, by forking and
 * returning in the child process.  The parent process hangs around until the
 * child lets it know either that it completed startup successfully (by calling
 * daemonize_complete()) or that it failed to start up (by exiting with a
 * nonzero exit code).
 *
 * With --detach, one fork separates the foreground parent from the
 * background process, which then calls setsid().  With --monitor, a further
 * fork separates the monitor from the daemon proper.  In the process that
 * goes on to be the daemon, the pidfile (if configured) is created and
 * vlog_init() is called before returning.
 *
 * Must be called while the process is still single-threaded. */
void
daemonize_start(void)
{
    assert_single_threaded();
    daemonize_fd = -1;

    if (detach) {
        pid_t pid;

        if (fork_and_wait_for_startup(&daemonize_fd, &pid)) {
            VLOG_FATAL("could not detach from foreground session");
        }
        if (pid > 0) {
            /* Running in parent process. */
            exit(0);
        }

        /* Running in daemon or monitor process. */
        setsid();
    }

    if (monitor) {
        /* The next fork overwrites 'daemonize_fd' with the daemon's own
         * notification fd, so keep the one that leads back to the detach
         * parent (-1 if we did not detach). */
        int saved_daemonize_fd = daemonize_fd;
        pid_t daemon_pid;

        if (fork_and_wait_for_startup(&daemonize_fd, &daemon_pid)) {
            VLOG_FATAL("could not initiate process monitoring");
        }
        if (daemon_pid > 0) {
            /* Running in monitor process. */
            fork_notify_startup(saved_daemonize_fd);
            close_standard_fds();
            /* monitor_daemon() returns only in a restarted daemon child; the
             * monitor process itself exits inside it.  So when it returns we
             * fall through to the daemon setup below. */
            monitor_daemon(daemon_pid);
        }
        /* Running in daemon process. */
    }

    forbid_forking("running in daemon process");

    if (pidfile) {
        make_pidfile();
    }

    /* Make sure that the unixctl commands for vlog get registered in a
     * daemon, even before the first log message. */
    vlog_init();
}

/* If daemonization is configured, then this function notifies the parent
 * process that the child process has completed startup successfully.  It also
 * calls daemonize_post_detach().
 *
 * The pidfile name is released as well, leaving 'pidfile' null.
 *
 * Calling this function more than once has no additional effect. */
void
daemonize_complete(void)
{
    /* free() accepts a null pointer, so no guard is needed. */
    free(pidfile);
    pidfile = NULL;

    if (detached) {
        return;
    }
    detached = true;

    fork_notify_startup(daemonize_fd);
    daemonize_fd = -1;
    daemonize_post_detach();
}

/* Finishes traditional Unix daemonization once startup has been reported.
 *
 * When --detach is in effect, changes the working directory to the root
 * (unless --no-chdir disabled that) and closes the standard file descriptors.
 * Does nothing otherwise.  Joining a new session is not done here; it happens
 * earlier, via setsid() in daemonize_start().
 *
 * It only makes sense to call this function as part of an implementation of a
 * special daemon subprocess.  A normal daemon should just call
 * daemonize_complete(). */
static void
daemonize_post_detach(void)
{
    if (!detach) {
        return;
    }

    if (chdir_) {
        /* Best effort: a failed chdir is deliberately ignored. */
        ignore(chdir("/"));
    }
    close_standard_fds();
}

/* Prints a summary of the daemon-related command-line options to stdout.
 *
 * The default pidfile location shown is built from ovs_rundir() and
 * 'program_name', matching what make_pidfile_name(NULL) produces.
 *
 * --monitor is listed because this file implements it (see 'monitor' and
 * daemon_set_monitor()). */
void
daemon_usage(void)
{
    printf(
        "\nDaemon options:\n"
        "  --detach                run in background as daemon\n"
        "  --monitor               creates a process to monitor this daemon\n"
        "  --no-chdir              do not chdir to '/'\n"
        "  --pidfile[=FILE]        create pidfile (default: %s/%s.pid)\n"
        "  --overwrite-pidfile     with --pidfile, start even if already "
                                   "running\n",
        ovs_rundir(), program_name);
}

/* Issues the fcntl() locking request 'command' (e.g. F_SETLK or F_GETLK) on
 * the descriptor underlying 'file', describing a write lock over the whole
 * file (start 0, length 0 relative to SEEK_SET).
 *
 * '*lck' is filled in with that request before the call and is left as
 * fcntl() returned it, so after F_GETLK the caller can inspect the result.
 *
 * Retries for as long as fcntl() fails with EINTR.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
lock_pidfile__(FILE *file, int command, struct flock *lck)
{
    const int fd = fileno(file);

    lck->l_type = F_WRLCK;
    lck->l_whence = SEEK_SET;
    lck->l_start = 0;
    lck->l_len = 0;
    lck->l_pid = 0;

    for (;;) {
        if (fcntl(fd, command, lck) != -1) {
            return 0;
        }
        if (errno != EINTR) {
            return errno;
        }
    }
}

/* Convenience wrapper around lock_pidfile__() for callers that do not need
 * the resulting 'struct flock': performs 'command' on 'file' using a
 * throwaway lock description.
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
lock_pidfile(FILE *file, int command)
{
    struct flock scratch;
    int error = lock_pidfile__(file, command, &scratch);

    return error;
}

/* Reads the pid stored in 'pidfile' and checks, via the fcntl() lock on the
 * file, that the recorded process is the one holding it.
 *
 * Returns:
 *
 *   - A positive pid if 'pidfile' is this process's own pidfile (getpid() is
 *     returned without opening the file), or if the file is locked by the
 *     same pid that is written in it.
 *
 *   - 0, only when 'delete_if_stale' is true: either the file does not exist,
 *     or it was stale (not locked by anyone) and has just been unlinked.
 *
 *   - A negative errno value otherwise, including -ESRCH for a stale pidfile
 *     when 'delete_if_stale' is false and for an empty or mismatching file,
 *     and -EALREADY when another process won the race to delete it.
 *
 * Note that the 'pidfile' parameter shadows the file-scope variable of the
 * same name. */
static pid_t
read_pidfile__(const char *pidfile, bool delete_if_stale)
{
    struct stat s, s2;
    struct flock lck;
    char line[128];
    FILE *file;
    int error;

    if ((pidfile_ino || pidfile_dev)
        && !stat(pidfile, &s)
        && s.st_ino == pidfile_ino && s.st_dev == pidfile_dev) {
        /* It's our own pidfile.  We can't afford to open it, because closing
         * *any* fd for a file that a process has locked also releases all the
         * locks on that file.
         *
         * Fortunately, we know the associated pid anyhow: */
        return getpid();
    }

    /* Opened read-write, presumably because the write lock that may be taken
     * below needs a descriptor that is open for writing. */
    file = fopen(pidfile, "r+");
    if (!file) {
        if (errno == ENOENT && delete_if_stale) {
            /* No pidfile at all: nothing is running and nothing to delete. */
            return 0;
        }
        error = errno;
        VLOG_WARN("%s: open: %s", pidfile, ovs_strerror(error));
        goto error;
    }

    /* Ask who, if anyone, holds a conflicting lock.  On return 'lck' either
     * has l_type F_UNLCK or describes the holder (l_pid is used below). */
    error = lock_pidfile__(file, F_GETLK, &lck);
    if (error) {
        VLOG_WARN("%s: fcntl: %s", pidfile, ovs_strerror(error));
        goto error;
    }
    if (lck.l_type == F_UNLCK) {
        /* pidfile exists but it isn't locked by anyone.  We need to delete it
         * so that a new pidfile can go in its place.  But just calling
         * unlink(pidfile) makes a nasty race: what if someone else unlinks it
         * before we do and then replaces it by a valid pidfile?  We'd unlink
         * their valid pidfile.  We do a little dance to avoid the race, by
         * locking the invalid pidfile.  Only one process can have the invalid
         * pidfile locked, and only that process has the right to unlink it. */
        if (!delete_if_stale) {
            error = ESRCH;
            VLOG_DBG("%s: pid file is stale", pidfile);
            goto error;
        }

        /* Get the lock. */
        error = lock_pidfile(file, F_SETLK);
        if (error) {
            /* We lost a race with someone else doing the same thing. */
            VLOG_WARN("%s: lost race to lock pidfile", pidfile);
            goto error;
        }

        /* Is the file we have locked still named 'pidfile'? */
        if (stat(pidfile, &s) || fstat(fileno(file), &s2)
            || s.st_ino != s2.st_ino || s.st_dev != s2.st_dev) {
            /* No.  We lost a race with someone else who got the lock before
             * us, deleted the pidfile, and closed it (releasing the lock). */
            error = EALREADY;
            VLOG_WARN("%s: lost race to delete pidfile", pidfile);
            goto error;
        }

        /* We won the right to delete the stale pidfile. */
        if (unlink(pidfile)) {
            error = errno;
            VLOG_WARN("%s: failed to delete stale pidfile (%s)",
                      pidfile, ovs_strerror(error));
            goto error;
        }
        VLOG_DBG("%s: deleted stale pidfile", pidfile);
        fclose(file);
        return 0;
    }

    /* The file is locked by some process; read the pid it claims. */
    if (!fgets(line, sizeof line, file)) {
        if (ferror(file)) {
            error = errno;
            VLOG_WARN("%s: read: %s", pidfile, ovs_strerror(error));
        } else {
            error = ESRCH;
            VLOG_WARN("%s: read: unexpected end of file", pidfile);
        }
        goto error;
    }

    if (lck.l_pid != strtoul(line, NULL, 10)) {
        /* The process that has the pidfile locked is not the process that
         * created it.  It must be stale, with the process that has it locked
         * preparing to delete it. */
        error = ESRCH;
        VLOG_WARN("%s: stale pidfile for pid %s being deleted by pid %ld",
                  pidfile, line, (long int) lck.l_pid);
        goto error;
    }

    fclose(file);
    return lck.l_pid;

error:
    /* Common failure exit: 'error' holds a positive errno value; 'file' is
     * null only if fopen() failed. */
    if (file) {
        fclose(file);
    }
    return -error;
}

/* Opens and reads a PID from 'pidfile'.  Returns the positive PID if
 * successful, otherwise a negative errno value.
 *
 * This is read_pidfile__() with stale-file deletion turned off, so the file
 * is never unlinked: a missing pidfile yields the negated fopen() error and
 * a stale (unlocked) one yields -ESRCH.  If 'pidfile' is this process's own
 * pidfile, the current pid is returned without opening the file. */
pid_t
read_pidfile(const char *pidfile)
{
    return read_pidfile__(pidfile, false);
}

/* Refuses to continue if the configured 'pidfile' belongs to a live process.
 *
 * Calls read_pidfile__() with stale-file deletion enabled, so a stale pidfile
 * is removed as a side effect.  A positive result (another process holds the
 * pidfile) and a negative result (the check itself failed) are both fatal; a
 * result of 0 lets the caller proceed. */
static void
check_already_running(void)
{
    long int pid = read_pidfile__(pidfile, true);

    if (pid < 0) {
        VLOG_FATAL("%s: pidfile check failed (%s), aborting",
                   pidfile, ovs_strerror(-pid));
    }
    if (pid > 0) {
        VLOG_FATAL("%s: already running as pid %ld, aborting", pidfile, pid);
    }
}


/* stub functions for non-windows platform. */

/* Stub for non-Windows platforms: does nothing.  Both arguments are ignored
 * (hence OVS_UNUSED) and are never modified. */
void
service_start(int *argc OVS_UNUSED, char **argv[] OVS_UNUSED)
{
}

/* Stub for non-Windows platforms: does nothing. */
void
service_stop(void)
{
}

/* Stub for non-Windows platforms: always returns false. */
bool
should_service_stop(void)
{
    return false;
}
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								/*
-												ovs-thread: Add support for various thread-related assertions.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-06-19 13:07:35 -07:00
+								 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								 *
-												Update primary code license to Apache 2.0.

											
										
										
											2009-06-15 15:11:30 -07:00
+								 * Licensed under the Apache License, Version 2.0 (the "License");
 								 * you may not use this file except in compliance with the License.
 								 * You may obtain a copy of the License at:
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								 *
-												Update primary code license to Apache 2.0.

											
										
										
											2009-06-15 15:11:30 -07:00
+								 *     http://www.apache.org/licenses/LICENSE-2.0
 								 *
 								 * Unless required by applicable law or agreed to in writing, software
 								 * distributed under the License is distributed on an "AS IS" BASIS,
 								 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								 * See the License for the specific language governing permissions and
 								 * limitations under the License.
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								 */
 								#include <config.h>
 								#include "daemon.h"
-												daemon: Move some common code to daemon.c

We have some common code between daemon-unix.c and
daemon-windows.c. Move them to daemon.c

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-04-23 14:22:38 -07:00
+								#include "daemon-private.h"
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								#include <errno.h>
 								#include <fcntl.h>
-												Add some missing "#include"s.

These are required to build on FreeBSD 8.0.

											
										
										
											2010-05-26 10:37:39 -07:00
+								#include <signal.h>
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								#include <stdlib.h>
 								#include <string.h>
-												Add some missing "#include"s.

These are required to build on FreeBSD 8.0.

											
										
										
											2010-05-26 10:37:39 -07:00
+								#include <sys/resource.h>
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
+								#include <sys/wait.h>
-												lib: Remove warnings in daemon.c

On some platforms compilation of daemon.c results in implicit
declaration of function fstat and stat warnings.

											
										
										
											2010-10-14 22:59:11 +00:00
+								#include <sys/stat.h>
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								#include <unistd.h>
-												daemon: Make --monitor process change its process title.

When --monitor is used, administrators sometimes become confused about the
presence of two copies of each process.  This commit attempts to clarify
the situation by making the monitoring process change its process name, as
seen in /proc/$pid/cmdline and in "ps", to clearly indicate what is going
on.

CC: Dan Wendlandt <dan@nicira.com>

											
										
										
											2010-01-19 15:00:56 -08:00
+								#include "command-line.h"
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								#include "fatal-signal.h"
 								#include "dirs.h"
-												Implement library for lockfiles and use it in cfg code.

This is useful because the upcoming configuration database also needs a
lockfile implementation.

Also adds tests.

											
										
										
											2009-10-14 16:52:04 -07:00
+								#include "lockfile.h"
-												ovs-thread: Add support for various thread-related assertions.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-06-19 13:07:35 -07:00
+								#include "ovs-thread.h"
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								#include "process.h"
-												daemon: Don't ignore failed write to pipe.

If the write to the pipe fails here then the parent will think that the
child failed to start up, so the child should oblige it by bailing out.

											
										
										
											2009-12-18 13:46:33 -08:00
+								#include "socket-util.h"
-												Make sure that time advances in a daemon between calls to time_refresh().

Open vSwitch uses an interval timer signal to tell it that its cached idea
of the current time has expired.  However, this didn't work in a daemon
detached from the foreground session (invoked with --detach) because a
child created with fork() does not inherit the parent's interval timer and
we did not re-set it after calling fork().

This commit fixes the problem by setting the interval timer back up after
calling fork() from daemonize().

This fix is based on code inspection (which was then verified to be correct
through testing).  It may not fix any actual problems in practice, because
time_refresh() is called every time through the poll loop, and the poll
loop typically runs more quickly than the periodic timer fires (1 ms or so
average in ovs-vswitchd, vs. 100 ms timer interval).

											
										
										
											2009-10-15 10:39:10 -07:00
+								#include "timeval.h"
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								#include "util.h"
-												lib: Move vlog.h to <openvswitch/vlog.h>

A new function vlog_insert_module() is introduced to avoid using
list_insert() from the vlog.h header.

Signed-off-by: Thomas Graf <tgraf@noironetworks.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-12-15 14:10:38 +01:00
+								#include "openvswitch/vlog.h"
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
-												daemon: Rename daemon.c as daemon-unix.c

An upcoming commit re-introduces daemon.c to have
common functions across daemon-unix.c and daemon-windows.c

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-04-23 09:03:38 -07:00
+								VLOG_DEFINE_THIS_MODULE(daemon_unix);
-												vlog: Introduce VLOG_DEFINE_THIS_MODULE for declaring vlog module in use.

Adding a macro to define the vlog module in use adds a level of
indirection, which makes it easier to change how the vlog module must be
defined.  A followup commit needs to do that, so getting these widespread
changes out of the way first should make that commit easier to review.

											
										
										
											2010-07-16 11:02:49 -07:00
-												daemon: Improve comments.

Elsewhere we put the name of command-line options that control global
variables in the comment, so do so here as well.

Also fix a comment typo.

											
										
										
											2010-08-22 23:13:35 -07:00
+								/* --detach: Should we run in the background? */
-												daemon: Move some common code to daemon.c

We have some common code between daemon-unix.c and
daemon-windows.c. Move them to daemon.c

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-04-23 14:22:38 -07:00
+								bool detach;                    /* Was --detach specified? */
-												daemon: Factor out code into new function daemonize_post_detach().

This code will have another user in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:59 -07:00
+								static bool detached;           /* Have we already detached? */
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
-												daemon: Improve comments.

Elsewhere we put the name of command-line options that control global
variables in the comment, so do so here as well.

Also fix a comment typo.

											
										
										
											2010-08-22 23:13:35 -07:00
+								/* --pidfile: Name of pidfile (null if none). */
-												daemon: Move some common code to daemon.c

We have some common code between daemon-unix.c and
daemon-windows.c. Move them to daemon.c

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-04-23 14:22:38 -07:00
+								char *pidfile;
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
-												daemon: Fix behavior of read_pidfile() for our own pidfile.

Opening a file descriptor and then closing it always discards any locks
held on the underlying file, even if the file is still open as another file
descriptor.  This meant that calling read_pidfile() on the process's own
pidfile would discard the lock and make other OVS processes think that the
process had died.  This commit fixes the problem.

											
										
										
											2010-09-23 09:39:47 -07:00
+								/* Device and inode of pidfile, so we can avoid reopening it. */
 								static dev_t pidfile_dev;
 								static ino_t pidfile_ino;
-												daemon: Improve comments.

Elsewhere we put the name of command-line options that control global
variables in the comment, so do so here as well.

Also fix a comment typo.

											
										
										
											2010-08-22 23:13:35 -07:00
+								/* --overwrite-pidfile: Create pidfile even if one already exists and is
 								   locked? */
-												daemon: Remove short options from daemon library

The daemon library provides a few short options, but these then take
away their availability from programs that wish to use the library.
Since the daemon options are generally going to be called from a script
(which doesn't care how much typing is involved), we'll only provide
long options.

											
										
										
											2009-08-05 14:20:24 -07:00
+								static bool overwrite_pidfile;
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
-												daemon: Improve comments.

Elsewhere we put the name of command-line options that control global
variables in the comment, so do so here as well.

Also fix a comment typo.

											
										
										
											2010-08-22 23:13:35 -07:00
+								/* --no-chdir: Should we chdir to "/"? */
-												daemon: Provide option to not chdir to root

By default, Open vSwitch daemons change their working directories to the
root directory.  This commit provides a --no-chdir option to prevent this
behavior.

											
										
										
											2009-08-04 22:41:46 -07:00
+								static bool chdir_ = true;
-												daemon: Refactor code.

This commit should not change behavior, but it paves the way for
implementing --monitor in the following commit.

											
										
										
											2010-01-15 15:29:52 -08:00
+								/* File descriptor used by daemonize_start() and daemonize_complete(). */
 								static int daemonize_fd = -1;
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								/* --monitor: Should a supervisory process monitor the daemon and restart it if
 								 * it dies due to an error signal? */
 								static bool monitor;
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								static void check_already_running(void);
 								static int lock_pidfile(FILE *, int command);
-												daemon: Cleanup some functions.

Some functions are unused and some functions can be
declared as static.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-10 08:33:15 -08:00
+								static pid_t fork_and_clean_up(void);
 								static void daemonize_post_detach(void);
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								/* Returns the file name that would be used for a pidfile if 'name' were
 								 * provided to set_pidfile().  The caller must free the returned string. */
-												daemon: Move some common code to daemon.c

We have some common code between daemon-unix.c and
daemon-windows.c.  Move it to daemon.c.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-04-23 14:22:38 -07:00
+								char *
-												treewide: Remove trailing whitespace

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Jesse Gross <jesse@nicira.com>

											
										
										
											2010-08-30 00:24:53 -07:00
+								make_pidfile_name(const char *name)
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								{
-												util: New functions get_cwd(), abs_file_name().

These will be used further in an upcoming commit.

											
										
										
											2010-03-16 15:06:11 -07:00
+								    return (!name
-												Make installation directories overridable at runtime.

This makes it possible to run tests that need access to installation
directories, such as the rundir, without having access to the actual
installation directories (/var/run is generally not world-writable), by
setting environment variables.  This is not a good way to do things in
general--usually it would be better to choose the correct directories
at configure time--so for now this is undocumented.

											
										
										
											2010-11-29 12:28:26 -08:00
+								            ? xasprintf("%s/%s.pid", ovs_rundir(), program_name)
 								            : abs_file_name(ovs_rundir(), name));
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								}
-												daemon: Provide option to not chdir to root

By default, Open vSwitch daemons change their working directories to the
root directory.  This commit provides a --no-chdir option to prevent this
behavior.

											
										
										
											2009-08-04 22:41:46 -07:00
+								/* Requests that the daemon not chdir to "/" (the --no-chdir option):
 								 * clears 'chdir_', which defaults to true. */
 								void
 								set_no_chdir(void)
 								{
 								    chdir_ = false;
 								}
-												daemon: Integrate checking for an existing pidfile into daemonize_start().

Until now, it has been the responsibility of an individual daemon to call
die_if_already_running() at an appropriate time.  A long time ago, this
had to happen *before* daemonizing, because once the process daemonized
itself there was no way to report failure to the process that originally
started the daemon.  With the introduction of daemonize_start(), this is
now possible, but we haven't been taking advantage of it.

Therefore, this commit integrates the die_if_already_running() call into
daemonize_start() and deletes the calls to it from individual daemons.

											
										
										
											2011-03-31 09:44:30 -07:00
+								/* Normally, daemonize() or daemonize_start() will terminate the program with
 								 * a message if a locked pidfile already exists.  If this function is called,
 								 * an existing pidfile will be replaced, with a warning. */
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								void
 								ignore_existing_pidfile(void)
 								{
-												daemon: Remove short options from daemon library

The daemon library provides a few short options, but these then take
away their availability from programs that wish to use the library.
Since the daemon options are generally going to be called from a script
(which doesn't care how much typing is involved), we'll only provide
long options.

											
										
										
											2009-08-05 14:20:24 -07:00
+								    overwrite_pidfile = true;
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								}
 								/* Records that --detach was requested by setting 'detach', so that a
 								 * following call to daemonize() detaches from the foreground session and
 								 * runs this process in the background. */
 								void
 								set_detach(void)
 								{
 								    detach = true;
 								}
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								/* Records that --monitor was requested by setting 'monitor', so that a
 								 * following call to daemonize() forks a supervisory process to monitor the
 								 * daemon and restart it if it dies due to an error signal. */
 								void
 								daemon_set_monitor(void)
 								{
 								    monitor = true;
 								}
-												daemon: Improve comments.

Elsewhere we put the name of command-line options that control global
variables in the comment, so do so here as well.

Also fix a comment typo.

											
										
										
											2010-08-22 23:13:35 -07:00
+								/* If a pidfile has been configured, creates it and stores the running
 								 * process's pid in it.  Ensures that the pidfile will be deleted when the
 								 * process exits. */
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								static void
 								make_pidfile(void)
 								{
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    long int pid = getpid();
 								    struct stat s;
 								    char *tmpfile;
 								    FILE *file;
 								    int error;
 								    /* Create a temporary pidfile. */
-												daemon: Avoid the link() syscall.

make_pidfile() depends on the link() system call to atomically
create pidfiles when multiple daemons are started concurrently.
However, this system call isn't available on ESX so an alternative
strategy is necessary.  Fortunately, the approach this patch takes
is cleaner than the original code.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-14 18:34:14 -08:00
+								    if (overwrite_pidfile) {
 								        tmpfile = xasprintf("%s.tmp%ld", pidfile, pid);
 								        fatal_signal_add_file_to_unlink(tmpfile);
 								    } else {
 								        /* Everyone shares the same file which will be treated as a lock.  To
 								         * avoid some uncomfortable race conditions, we can't set up the fatal
 								         * signal unlink until we've acquired it. */
 								        tmpfile = xasprintf("%s.tmp", pidfile);
 								    }
 								    file = fopen(tmpfile, "a+");
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    if (!file) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_FATAL("%s: create failed (%s)", tmpfile, ovs_strerror(errno));
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    }
-												daemon: Avoid the link() syscall.

make_pidfile() depends on the link() system call to atomically
create pidfiles when multiple daemons are started concurrently.
However, this system call isn't available on ESX so an alternative
strategy is necessary.  Fortunately, the approach this patch takes
is cleaner than the original code.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-14 18:34:14 -08:00
+								    error = lock_pidfile(file, F_SETLK);
 								    if (error) {
 								        /* Looks like we failed to acquire the lock.  Note that, if we failed
 								         * for some other reason (and '!overwrite_pidfile'), we will have
 								         * left 'tmpfile' as garbage in the file system. */
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_FATAL("%s: fcntl(F_SETLK) failed (%s)", tmpfile,
 								                   ovs_strerror(error));
-												daemon: Avoid the link() syscall.

make_pidfile() depends on the link() system call to atomically
create pidfiles when multiple daemons are started concurrently.
However, this system call isn't available on ESX so an alternative
strategy is necessary.  Fortunately, the approach this patch takes
is cleaner than the original code.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-14 18:34:14 -08:00
+								    }
 								    if (!overwrite_pidfile) {
 								        /* We acquired the lock.  Make sure to clean up on exit, and verify
 								         * that we're allowed to create the actual pidfile. */
 								        fatal_signal_add_file_to_unlink(tmpfile);
 								        check_already_running();
 								    }
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    if (fstat(fileno(file), &s) == -1) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_FATAL("%s: fstat failed (%s)", tmpfile, ovs_strerror(errno));
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    }
-												daemon: Avoid the link() syscall.

make_pidfile() depends on the link() system call to atomically
create pidfiles when multiple daemons are started concurrently.
However, this system call isn't available on ESX so an alternative
strategy is necessary.  Fortunately, the approach this patch takes
is cleaner than the original code.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-14 18:34:14 -08:00
+								    if (ftruncate(fileno(file), 0) == -1) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_FATAL("%s: truncate failed (%s)", tmpfile, ovs_strerror(errno));
-												daemon: Avoid the link() syscall.

make_pidfile() depends on the link() system call to atomically
create pidfiles when multiple daemons are started concurrently.
However, this system call isn't available on ESX so an alternative
strategy is necessary.  Fortunately, the approach this patch takes
is cleaner than the original code.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-14 18:34:14 -08:00
+								    }
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    fprintf(file, "%ld\n", pid);
 								    if (fflush(file) == EOF) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_FATAL("%s: write failed (%s)", tmpfile, ovs_strerror(errno));
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    }
-												daemon: Avoid the link() syscall.

make_pidfile() depends on the link() system call to atomically
create pidfiles when multiple daemons are started concurrently.
However, this system call isn't available on ESX so an alternative
strategy is necessary.  Fortunately, the approach this patch takes
is cleaner than the original code.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-14 18:34:14 -08:00
+								    error = rename(tmpfile, pidfile);
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
-												daemon: Avoid the link() syscall.

make_pidfile() depends on the link() system call to atomically
create pidfiles when multiple daemons are started concurrently.
However, this system call isn't available on ESX so an alternative
strategy is necessary.  Fortunately, the approach this patch takes
is cleaner than the original code.

Signed-off-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2012-11-14 18:34:14 -08:00
+								    /* Due to a race, 'tmpfile' may be owned by a different process, so we
 								     * shouldn't delete it on exit. */
 								    fatal_signal_remove_file_to_unlink(tmpfile);
 								    if (error < 0) {
 								        VLOG_FATAL("failed to rename \"%s\" to \"%s\" (%s)",
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                   tmpfile, pidfile, ovs_strerror(errno));
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    }
 								    /* Ensure that the pidfile will get deleted on exit. */
 								    fatal_signal_add_file_to_unlink(pidfile);
 								    /* Clean up.
 								     *
 								     * We don't close 'file' because its file descriptor must remain open to
 								     * hold the lock. */
 								    pidfile_dev = s.st_dev;
 								    pidfile_ino = s.st_ino;
 								    free(tmpfile);
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								}
-												daemon: Factor out code into new function fork_and_wait_for_startup().

This function will be useful in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:13 -07:00
+								/* Calls fork() and on success returns its return value.  On failure, logs an
 								 * error and exits unsuccessfully.
 								 *
 								 * Post-fork, but before returning, this function calls a few other functions
 								 * that are generally useful if the child isn't planning to exec a new
 								 * process. */
-												daemon: Cleanup some functions.

Some functions are unused and some functions can be
declared as static.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-10 08:33:15 -08:00
+								static pid_t
-												daemon: Factor out code into new function fork_and_wait_for_startup().

This function will be useful in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:13 -07:00
+								fork_and_clean_up(void)
 								{
-												ovs-thread: Add support for various thread-related assertions.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-06-19 13:07:35 -07:00
+								    pid_t pid = xfork();
-												daemon: Factor out code into new function fork_and_wait_for_startup().

This function will be useful in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:13 -07:00
+								    if (pid > 0) {
 								        /* Running in parent process. */
 								        fatal_signal_fork();
 								    } else if (!pid) {
 								        /* Running in child process. */
 								        lockfile_postfork();
 								    }
 								    return pid;
 								}
-												daemon: Add comment.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-14 14:21:18 -07:00
+								/* Forks, then:
 								 *
 								 *   - In the parent, waits for the child to signal that it has completed its
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call the fork_notify_startup() function.  If such a situation arises,
then the parent process (the one monitoring the child process) would also
terminate with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting the child process
if it was able to start up at least once in the past.  However, if the
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce, use the following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								 *     startup sequence.  Then stores -1 in '*fdp' and returns the child's
 								 *     pid in '*child_pid' argument.
-												daemon: Add comment.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-14 14:21:18 -07:00
+								 *
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								 *   - In the child, stores a fd in '*fdp' and returns 0 through '*child_pid'
 								 *     argument.  The caller should pass the fd to fork_notify_startup() after
 								 *     it finishes its startup sequence.
-												daemon: Add comment.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-14 14:21:18 -07:00
+								 *
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								 * Returns 0 on success.  If something goes wrong and child process was not
 								 * able to signal its readiness by calling fork_notify_startup(), then this
 								 * function returns -1. However, even in case of failure it still sets child
 								 * process id in '*child_pid'. */
 								static int
 								fork_and_wait_for_startup(int *fdp, pid_t *child_pid)
 								{
 								    int fds[2];
 								    pid_t pid;
 								    int ret = 0;

 								    /* The child reports a successful startup by writing one byte into
 								     * this pipe (see fork_notify_startup()). */
 								    xpipe(fds);

 								    pid = fork_and_clean_up();
 								    if (pid > 0) {
 								        /* Running in parent process. */
 								        size_t bytes_read;
 								        char c;

 								        /* Drop our copy of the write end so that only the child holds
 								         * it while we wait for the startup byte. */
 								        close(fds[1]);
 								        if (read_fully(fds[0], &c, 1, &bytes_read) != 0) {
 								            /* The startup byte never arrived.  Reap the child to find
 								             * out what happened to it, retrying if interrupted. */
 								            int retval;
 								            int status;

 								            do {
 								                retval = waitpid(pid, &status, 0);
 								            } while (retval == -1 && errno == EINTR);

 								            if (retval == pid) {
 								                if (WIFEXITED(status) && WEXITSTATUS(status)) {
 								                    /* Child exited with an error.  Convey the same error
 								                     * to our parent process as a courtesy. */
 								                    exit(WEXITSTATUS(status));
 								                } else {
 								                    char *status_msg = process_status_msg(status);

 								                    VLOG_ERR("fork child died before signaling startup (%s)",
 								                             status_msg);
 								                    /* This path returns to the caller instead of
 								                     * terminating the process, so the message must be
 								                     * released here. */
 								                    free(status_msg);
 								                    ret = -1;
 								                }
 								            } else if (retval < 0) {
 								                VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
 								            } else {
 								                OVS_NOT_REACHED();
 								            }
 								        }
 								        close(fds[0]);
 								        *fdp = -1;
 								    } else if (!pid) {
 								        /* Running in child process.  Keep only the write end; the caller
 								         * hands it to fork_notify_startup() once startup is complete. */
 								        close(fds[0]);
 								        *fdp = fds[1];
 								    }

 								    /* Reported on success and on failure alike. */
 								    *child_pid = pid;
 								    return ret;
 								}
 								/* Reports a completed startup sequence over 'fd', the descriptor that
 								 * fork_and_wait_for_startup() handed to the child, by writing a single byte
 								 * to it and then closing it.  Does nothing if 'fd' is -1.  A failed write is
 								 * logged as fatal. */
 								static void
 								fork_notify_startup(int fd)
 								{
 								    size_t n_written;
 								    int err;

 								    if (fd == -1) {
 								        /* No parent is waiting for a notification. */
 								        return;
 								    }

 								    /* The byte's value is irrelevant; only its arrival matters. */
 								    err = write_fully(fd, "", 1, &n_written);
 								    if (err) {
 								        VLOG_FATAL("pipe write failed (%s)", ovs_strerror(err));
 								    }
 								    close(fd);
 								}
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								static bool
 								should_restart(int status)
 								{
 								    if (WIFSIGNALED(status)) {
 								        static const int error_signals[] = {
-												daemon: Precisely document signals that cause the monitor to restart.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Joe Stringer <joestringer@nicira.com>

											
										
										
											2013-10-11 16:52:50 -07:00
+								            /* This list of signals is documented in daemon.man.  If you
 								             * change the list, update the documentation too. */
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								            SIGABRT, SIGALRM, SIGBUS, SIGFPE, SIGILL, SIGPIPE, SIGSEGV,
 								            SIGXCPU, SIGXFSZ
 								        };
 								        size_t i;
 								        for (i = 0; i < ARRAY_SIZE(error_signals); i++) {
 								            if (error_signals[i] == WTERMSIG(status)) {
 								                return true;
 								            }
 								        }
 								    }
 								    return false;
 								}
 								static void
 								monitor_daemon(pid_t daemon_pid)
 								{
 								    /* XXX Should log daemon's stderr output at startup time. */
-												daemon: Throttle max respawning rate.

If a monitored daemon dies quickly at startup, the system can waste a lot
of CPU time continually restarting it.  This commit prevents a given
daemon from restarting more than once every 10 seconds.

											
										
										
											2010-05-12 10:02:23 -07:00
+								    time_t last_restart;
-												daemon: Make --monitor process change its process title.

When --monitor is used, administrators sometimes become confused about the
presence of two copies of each process.  This commit attempts to clarify
the situation by making the monitoring process change its process name, as
seen in /proc/$pid/cmdline and in "ps", to clearly indicate what is going
on.

CC: Dan Wendlandt <dan@nicira.com>

											
										
										
											2010-01-19 15:00:56 -08:00
+								    char *status_msg;
-												daemon: Report number of crashes on monitor process command line.

											
										
										
											2010-09-21 14:27:02 -07:00
+								    int crashes;
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								    bool child_ready = true;
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
-												util: Make subprogram_name thread-specific.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-12 14:18:01 -07:00
+								    set_subprogram_name("monitor");
-												daemon: Make --monitor process change its process title.

When --monitor is used, administrators sometimes become confused about the
presence of two copies of each process.  This commit attempts to clarify
the situation by making the monitoring process change its process name, as
seen in /proc/$pid/cmdline and in "ps", to clearly indicate what is going
on.

CC: Dan Wendlandt <dan@nicira.com>

											
										
										
											2010-01-19 15:00:56 -08:00
+								    status_msg = xstrdup("healthy");
-												daemon: Throttle max respawning rate.

If a monitored daemon dies quickly at startup, the system can waste a lot
of CPU time continually restarting it.  This commit prevents a given
daemon from restarting more than once every 10 seconds.

											
										
										
											2010-05-12 10:02:23 -07:00
+								    last_restart = TIME_MIN;
-												daemon: Report number of crashes on monitor process command line.

											
										
										
											2010-09-21 14:27:02 -07:00
+								    crashes = 0;
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								    for (;;) {
 								        int retval;
 								        int status;
-												lib: Move addition of program_name to proctitle_set

Signed-off-by: Ed Maste <emaste@adaranet.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-10-11 20:49:38 +00:00
+								        proctitle_set("monitoring pid %lu (%s)",
 								                      (unsigned long int) daemon_pid, status_msg);
-												daemon: Make --monitor process change its process title.

When --monitor is used, administrators sometimes become confused about the
presence of two copies of each process.  This commit attempts to clarify
the situation by making the monitoring process change its process name, as
seen in /proc/$pid/cmdline and in "ps", to clearly indicate what is going
on.

CC: Dan Wendlandt <dan@nicira.com>

											
										
										
											2010-01-19 15:00:56 -08:00
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								        if (child_ready) {
 								            do {
 								                retval = waitpid(daemon_pid, &status, 0);
 								            } while (retval == -1 && errno == EINTR);
 								            if (retval == -1) {
 								                VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
 								            }
 								        }
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								        if (!child_ready || retval == daemon_pid) {
-												daemon: Make --monitor process change its process title.

When --monitor is used, administrators sometimes become confused about the
presence of two copies of each process.  This commit attempts to clarify
the situation by making the monitoring process change its process name, as
seen in /proc/$pid/cmdline and in "ps", to clearly indicate what is going
on.

CC: Dan Wendlandt <dan@nicira.com>

											
										
										
											2010-01-19 15:00:56 -08:00
+								            char *s = process_status_msg(status);
 								            if (should_restart(status)) {
-												daemon: Don't call a normal exit from the monitor a "crash".

When the monitored child is killed with SIGTERM, the monitoring process
currently logs a message like "1 crashes: pid 12345 died, killed by
signal 15 (Terminated), exiting".  This counts the SIGTERM as a crash, even
though it's intentional.

This commit changes the log message to omit the "%d crashes" part on normal
termination.

											
										
										
											2010-10-27 09:29:08 -07:00
+								                free(status_msg);
 								                status_msg = xasprintf("%d crashes: pid %lu died, %s",
 								                                       ++crashes,
 								                                       (unsigned long int) daemon_pid, s);
 								                free(s);
-												daemon: Allow monitored daemon to dump core no more than once.

If the monitored daemon dumps core frequently, then this can quickly
exhaust the host's disk space.  This commit limits core dumps to at most
one per monitored session (typically, once per boot).

											
										
										
											2010-05-11 10:56:10 -07:00
+								                if (WCOREDUMP(status)) {
 								                    /* Disable further core dumps to save disk space. */
 								                    struct rlimit r;
 								                    r.rlim_cur = 0;
 								                    r.rlim_max = 0;
 								                    if (setrlimit(RLIMIT_CORE, &r) == -1) {
 								                        VLOG_WARN("failed to disable core dumps: %s",
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                                  ovs_strerror(errno));
-												daemon: Allow monitored daemon to dump core no more than once.

If the monitored daemon dumps core frequently, then this can quickly
exhaust the host's disk space.  This commit limits core dumps to at most
one per monitored session (typically, once per boot).

											
										
										
											2010-05-11 10:56:10 -07:00
+								                    }
 								                }
-												daemon: Throttle max respawning rate.

If a monitored daemon dies quickly at startup, the system can waste a lot
of CPU time continually restarting it.  This commit prevents a given
daemon from restarting more than once every 10 seconds.

											
										
										
											2010-05-12 10:02:23 -07:00
+								                /* Throttle restarts to no more than once every 10 seconds. */
 								                if (time(NULL) < last_restart + 10) {
 								                    VLOG_WARN("%s, waiting until 10 seconds since last "
 								                              "restart", status_msg);
 								                    for (;;) {
 								                        time_t now = time(NULL);
 								                        time_t wakeup = last_restart + 10;
 								                        if (now >= wakeup) {
 								                            break;
 								                        }
-												utils: Introduce xsleep for RCU quiescent state

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-03-21 09:20:42 -07:00
+								                        xsleep(wakeup - now);
-												daemon: Throttle max respawning rate.

If a monitored daemon dies quickly at startup, the system can waste a lot
of CPU time continually restarting it.  This commit prevents a given
daemon from restarting more than once every 10 seconds.

											
										
										
											2010-05-12 10:02:23 -07:00
+								                    }
 								                }
 								                last_restart = time(NULL);
-												daemon: Make --monitor process change its process title.

When --monitor is used, administrators sometimes become confused about the
presence of two copies of each process.  This commit attempts to clarify
the situation by making the monitoring process change its process name, as
seen in /proc/$pid/cmdline and in "ps", to clearly indicate what is going
on.

CC: Dan Wendlandt <dan@nicira.com>

											
										
										
											2010-01-19 15:00:56 -08:00
+								                VLOG_ERR("%s, restarting", status_msg);
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								                child_ready = !fork_and_wait_for_startup(&daemonize_fd,
 								                                                         &daemon_pid);
 								                if (child_ready && !daemon_pid) {
 								                    /* Child process needs to break out of monitoring
 								                     * loop. */
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								                    break;
 								                }
 								            } else {
-												daemon: Don't call a normal exit from the monitor a "crash".

When the monitored child is killed with SIGTERM, the monitoring process
currently logs a message like "1 crashes: pid 12345 died, killed by
signal 15 (Terminated), exiting".  This counts the SIGTERM as a crash, even
though it's intentional.

This commit changes the log message to omit the "%d crashes" part on normal
termination.

											
										
										
											2010-10-27 09:29:08 -07:00
+								                VLOG_INFO("pid %lu died, %s, exiting",
 								                          (unsigned long int) daemon_pid, s);
 								                free(s);
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								                exit(0);
 								            }
 								        }
 								    }
-												daemon: Fix memory leak in --monitor implementation.

This leaked a small amount of memory each time a daemon process was
created.  It is only important if a daemon is otherwise very buggy.

Found with valgrind.

											
										
										
											2010-02-02 14:36:19 -08:00
+								    free(status_msg);
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
 								    /* Running in new daemon process. */
-												daemon: Make --monitor process change its process title.

When --monitor is used, administrators sometimes become confused about the
presence of two copies of each process.  This commit attempts to clarify
the situation by making the monitoring process change its process name, as
seen in /proc/$pid/cmdline and in "ps", to clearly indicate what is going
on.

CC: Dan Wendlandt <dan@nicira.com>

											
										
										
											2010-01-19 15:00:56 -08:00
+								    proctitle_restore();
-												util: Make subprogram_name thread-specific.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-07-12 14:18:01 -07:00
+								    set_subprogram_name("");
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								}
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
+								/* If daemonization is configured, then starts daemonization, by forking and
 								 * returning in the child process.  The parent process hangs around until the
 								 * child lets it know either that it completed startup successfully (by calling
 								 * daemonize_complete()) or that it failed to start up (by exiting with a
 								 * nonzero exit code). */
 								void
 								daemonize_start(void)
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								{
-												ovs-thread: Add support for various thread-related assertions.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>

											
										
										
											2013-06-19 13:07:35 -07:00
+								    assert_single_threaded();
-												daemon: Refactor code.

This commit should not change behavior, but it paves the way for
implementing --monitor in the following commit.

											
										
										
											2010-01-15 15:29:52 -08:00
+								    daemonize_fd = -1;
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
-												daemon: Refactor code.

This commit should not change behavior, but it paves the way for
implementing --monitor in the following commit.

											
										
										
											2010-01-15 15:29:52 -08:00
+								    if (detach) {
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								        pid_t pid;
 								        if (fork_and_wait_for_startup(&daemonize_fd, &pid)) {
 								            VLOG_FATAL("could not detach from foreground session");
 								        }
 								        if (pid > 0) {
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
+								            /* Running in parent process. */
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								            exit(0);
 								        }
-												daemon: Start monitor process, not daemon process, in new session.

To keep control+C and other signals in the initiating session from killing
the monitor process, we need to put the monitor process into its own
session.  However, until this point, we've only done that for the daemon
processes that the monitor started, which means that control+C would kill
the monitor but not the daemons that it launched.

I don't know of a benefit to putting the monitor and daemon processes in
different sessions, as opposed to one new session for both of them, so
this change does the latter.

daemonize_post_detach() is called from one additional context where we'd
want to be in a new session, the worker_start() function, but that function
is documented as to be called after daemonize_start(), in which case we
will (after this commit) already have called setsid(), so no additional
change is required there.

Bug #14280.
Reported-by: Gordon Good <ggood@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-12-13 14:01:23 -08:00
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								        /* Running in daemon or monitor process. */
-												daemon: Start monitor process, not daemon process, in new session.

To keep control+C and other signals in the initiating session from killing
the monitor process, we need to put the monitor process into its own
session.  However, until this point, we've only done that for the daemon
processes that the monitor started, which means that control+C would kill
the monitor but not the daemons that it launched.

I don't know of a benefit to putting the monitor and daemon processes in
different sessions, as opposed to one new session for both of them, so
this change does the latter.

daemonize_post_detach() is called from one additional context where we'd
want to be in a new session, the worker_start() function, but that function
is documented as to be called after daemonize_start(), in which case we
will (after this commit) already have called setsid(), so no additional
change is required there.

Bug #14280.
Reported-by: Gordon Good <ggood@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-12-13 14:01:23 -08:00
+								        setsid();
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								    }
 								    if (monitor) {
 								        int saved_daemonize_fd = daemonize_fd;
 								        pid_t daemon_pid;
-												daemon: restart child process if it died before signaling its readiness

The child process (the one being monitored) could die before it was able
to call fork_notify_startup() function.  If such situation arises, then
parent process (the one monitoring child process) would also terminate
with a fatal log message:

...|EMER|fork child died before signaling startup (killed (...))

This patch changes that behavior by always restarting child process
if it was able to start up at least once in the past.  However, if
child was not able to start up even once, then the monitor process
would still terminate, because that would most likely indicate a
persistent programming or system error.

To reproduce use following script:

while : ; do kill -SIGSEGV `cat /var/run/openvswitch/ovs-vswitchd.pid`; done

Signed-Off-By: Ansis Atteka <aatteka@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
VMware-BZ: 1273550

											
										
										
											2014-07-07 17:11:39 -07:00
+								        if (fork_and_wait_for_startup(&daemonize_fd, &daemon_pid)) {
 								            VLOG_FATAL("could not initiate process monitoring");
 								        }
-												daemon: Add support for process monitoring and restart.

											
										
										
											2010-01-15 12:13:46 -08:00
+								        if (daemon_pid > 0) {
 								            /* Running in monitor process. */
 								            fork_notify_startup(saved_daemonize_fd);
 								            close_standard_fds();
 								            monitor_daemon(daemon_pid);
 								        }
-												daemon: Refactor code.

This commit should not change behavior, but it paves the way for
implementing --monitor in the following commit.

											
										
										
											2010-01-15 15:29:52 -08:00
+								        /* Running in daemon process. */
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								    }
-												daemon: Refactor code.

This commit should not change behavior, but it paves the way for
implementing --monitor in the following commit.

											
										
										
											2010-01-15 15:29:52 -08:00
-												worker: Delete library.

It had no remaining users.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-04-25 15:03:27 -07:00
+								    forbid_forking("running in daemon process");
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    if (pidfile) {
 								        make_pidfile();
 								    }
-												daemon: Make sure that vlog is initialized when a process daemonizes.

If a process daemonizes itself, then it should be possible to control that
process's log levels with "ovs-appctl vlog/set" and related commands.  The
vlog_init() function registers those commands.  But vlog_init() doesn't
normally get called until the first log message is issued.  This can take a
while, especially for ovs-controller, where I first noticed the problem.

This commit fixes the problem by calling vlog_init() from
daemonize_start(), which always gets called as a process daemonizes.

											
										
										
											2010-08-12 09:47:33 -07:00
 								    /* Make sure that the unixctl commands for vlog get registered in a
 								     * daemon, even before the first log message. */
 								    vlog_init();
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								}
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
+								/* If daemonization is configured, then this function notifies the parent
-												daemon: Factor out code into new function daemonize_post_detach().

This code will have another user in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:59 -07:00
+								 * process that the child process has completed startup successfully.  It also
 								 * calls daemonize_post_detach().
-												ovs-vswitchd: Complete daemonization only after initial configuration.

Otherwise when we add support for saving and restoring configuration
of internal devices around kernel module unload and reload, there's
no easy way for the "restore" code to tell when all the interfaces
should be set up and ready for configuration.

											
										
										
											2011-01-28 12:44:00 -08:00
+								 *
 								 * Calling this function more than once has no additional effect. */
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
+								void
 								daemonize_complete(void)
 								{
-												worker: Prevent worker from being responsible for pidfile deletion.

Currently we are creating the worker process after creation of the pidfile.
This means that the responsibility of deleting the pidfile after process
termination rests with the worker process.

When we restart openvswitch using the startup scripts, we SIGTERM the main
process and once it is cleaned up, we start ovs-vswitchd again. This results
in a race condition. The new ovs-vswitchd will create a pidfile because it is
unlocked. But, if the old worker process exits after the start of new
ovs-vswitchd, it will simply delete the pidfile underneath the new ovs-vswitchd.
This will eventually result in multiple ovs-vswitchd daemons.

This patch gives the responsibility of deleting the pidfile to the main
process.

Bug #16669.
Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>

											
										
										
											2013-04-28 19:25:55 -07:00
+								    if (pidfile) {
 								        free(pidfile);
 								        pidfile = NULL;
 								    }
-												daemon: Factor out code into new function daemonize_post_detach().

This code will have another user in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:59 -07:00
+								    if (!detached) {
 								        detached = true;
 								        fork_notify_startup(daemonize_fd);
 								        daemonize_fd = -1;
 								        daemonize_post_detach();
 								    }
 								}
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
-												daemon: Factor out code into new function daemonize_post_detach().

This code will have another user in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:59 -07:00
+								/* If daemonization is configured, then this function does traditional Unix
 								 * daemonization behavior: chdir to the root (if not disabled) and close the
 								 * standard file descriptors.  (Joining a new session is done earlier, by
 								 * the setsid() call in daemonize_start().)
 								 *
 								 * It only makes sense to call this function as part of an implementation of a
 								 * special daemon subprocess.  A normal daemon should just call
 								 * daemonize_complete(). */
-												daemon: Cleanup some functions.

Some functions are unused and some functions can be
declared as static.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-10 08:33:15 -08:00
+								static void
-												daemon: Factor out code into new function daemonize_post_detach().

This code will have another user in an upcoming commit.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2012-05-21 11:08:59 -07:00
+								daemonize_post_detach(void)
 								{
-												daemon: Refactor code.

This commit should not change behavior, but it paves the way for
implementing --monitor in the following commit.

											
										
										
											2010-01-15 15:29:52 -08:00
+								    if (detach) {
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
+								        if (chdir_) {
 								            ignore(chdir("/"));
 								        }
-												daemon: Refactor code.

This commit should not change behavior, but it paves the way for
implementing --monitor in the following commit.

											
										
										
											2010-01-15 15:29:52 -08:00
+								        close_standard_fds();
-												daemon: Allow daemon child process to report success or failure to parent.

There are conflicting pressures in startup of a daemon process:

    * The parent process should exit with an error code if the daemon
      cannot start up successfully.

    * Some startup actions must be performed in the child process, not in
      the parent.  The most obvious of these are file locking, since
      child processes do not inherit locks, and anything that requires
      knowing the child process's PID (e.g. unixctl sockets).

Until now, this conflict has usually been handled by giving up part of the
first property, i.e. in some cases the parent process would exit
successfully and the child immediately afterward exit with a failure code.

This commit introduces a better approach, by allowing daemons to perform
startup work in the child and only then signal the parent that they have
successfully started.  If the child instead exits without signaling
success, the parent passes this exit code along to its own parent.

This commit also modifies the daemons that can usefully take advantage of
this new feature to do so.

											
										
										
											2009-12-17 10:56:01 -08:00
+								    }
 								}
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								void
 								daemon_usage(void)
 								{
 								    printf(
 								        "\nDaemon options:\n"
-												daemon: Remove short options from daemon library

The daemon library provides a few short options, but these then take
away their availability from programs that wish to use the library.
Since the daemon options are generally going to be called from a script
(which doesn't care how much typing is involved), we'll only provide
long options.

											
										
										
											2009-08-05 14:20:24 -07:00
+								        "  --detach                run in background as daemon\n"
-												daemon: Provide option to not chdir to root

By default, Open vSwitch daemons change their working directories to the
root directory.  This commit provides a --no-chdir option to prevent this
behavior.

											
										
										
											2009-08-04 22:41:46 -07:00
+								        "  --no-chdir              do not chdir to '/'\n"
-												daemon: Remove short options from daemon library

The daemon library provides a few short options, but these then take
away their availability from programs that wish to use the library.
Since the daemon options are generally going to be called from a script
(which doesn't care how much typing is involved), we'll only provide
long options.

											
										
										
											2009-08-05 14:20:24 -07:00
+								        "  --pidfile[=FILE]        create pidfile (default: %s/%s.pid)\n"
 								        "  --overwrite-pidfile     with --pidfile, start even if already "
 								                                   "running\n",
-												Make installation directories overridable at runtime.

This makes it possible to run tests that need access to installation
directories, such as the rundir, without having access to the actual
installation directories (/var/run is generally not world-writable), by
setting environment variables.  This is not a good way to do things in
general--usually it would be better to choose the correct directories
at configure time--so for now this is undocumented.

											
										
										
											2010-11-29 12:28:26 -08:00
+								        ovs_rundir(), program_name);
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								}
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								/* Fills in '*lck' as a write lock (F_WRLCK) with l_whence SEEK_SET, l_start 0
 								 * and l_len 0, then applies fcntl() 'command' to the descriptor underlying
 								 * 'file', retrying for as long as the call fails with EINTR.
 								 *
 								 * Returns 0 on success, otherwise the errno value left by the failed
 								 * fcntl().  '*lck' is handed to fcntl() as-is, so for commands that write
 								 * back into it (e.g. F_GETLK, per POSIX) the result is visible to the
 								 * caller. */
 								static int
 								lock_pidfile__(FILE *file, int command, struct flock *lck)
 								{
 								    /* Per POSIX, l_len == 0 extends the region to end of file, so together
 								     * with l_start == 0 and SEEK_SET this describes the whole file. */
 								    lck->l_type = F_WRLCK;
 								    lck->l_whence = SEEK_SET;
 								    lck->l_start = 0;
 								    lck->l_len = 0;
 								    lck->l_pid = 0;
 								    for (;;) {
 								        int failure;
 								        if (fcntl(fileno(file), command, lck) != -1) {
 								            return 0;
 								        }
 								        /* Capture errno right away, before anything can overwrite it. */
 								        failure = errno;
 								        if (failure != EINTR) {
 								            return failure;
 								        }
 								        /* Interrupted by a signal: issue the same request again. */
 								    }
 								}
 								/* Convenience wrapper around lock_pidfile__() for callers that have no use
 								 * for the 'struct flock' afterwards: applies fcntl() 'command' to 'file' with
 								 * a throwaway lock description and returns lock_pidfile__()'s result (0 on
 								 * success, otherwise an errno value). */
 								static int
 								lock_pidfile(FILE *file, int command)
 								{
 								    struct flock scratch;
 								    const int error = lock_pidfile__(file, command, &scratch);
 								    return error;
 								}
-												daemon: Avoid redundant code in already_running().

This function substantially duplicated read_pidfile(), so reuse that
code instead.

											
										
										
											2011-03-29 09:44:55 -07:00
+								static pid_t
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								read_pidfile__(const char *pidfile, bool delete_if_stale)
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								{
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    struct stat s, s2;
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								    struct flock lck;
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    char line[128];
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								    FILE *file;
 								    int error;
-												daemon: Fix behavior of read_pidfile() for our own pidfile.

Opening a file descriptor and then closing it always discards any locks
held on the underlying file, even if the file is still open as another file
descriptor.  This meant that calling read_pidfile() on the process's own
pidfile would discard the lock and make other OVS processes think that the
process had died.  This commit fixes the problem.

											
										
										
											2010-09-23 09:39:47 -07:00
+								    if ((pidfile_ino || pidfile_dev)
 								        && !stat(pidfile, &s)
 								        && s.st_ino == pidfile_ino && s.st_dev == pidfile_dev) {
 								        /* It's our own pidfile.  We can't afford to open it, because closing
 								         * *any* fd for a file that a process has locked also releases all the
 								         * locks on that file.
 								         *
 								         * Fortunately, we know the associated pid anyhow: */
 								        return getpid();
 								    }
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    file = fopen(pidfile, "r+");
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								    if (!file) {
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								        if (errno == ENOENT && delete_if_stale) {
-												daemon: Avoid redundant code in already_running().

This function substantially duplicated read_pidfile(), so reuse that
code instead.

											
										
										
											2011-03-29 09:44:55 -07:00
+								            return 0;
 								        }
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								        error = errno;
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_WARN("%s: open: %s", pidfile, ovs_strerror(error));
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								        goto error;
 								    }
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    error = lock_pidfile__(file, F_GETLK, &lck);
 								    if (error) {
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								        VLOG_WARN("%s: fcntl: %s", pidfile, ovs_strerror(error));
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								        goto error;
 								    }
 								    if (lck.l_type == F_UNLCK) {
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								        /* pidfile exists but it isn't locked by anyone.  We need to delete it
 								         * so that a new pidfile can go in its place.  But just calling
 								         * unlink(pidfile) makes a nasty race: what if someone else unlinks it
 								         * before we do and then replaces it by a valid pidfile?  We'd unlink
 								         * their valid pidfile.  We do a little dance to avoid the race, by
 								         * locking the invalid pidfile.  Only one process can have the invalid
 								         * pidfile locked, and only that process has the right to unlink it. */
 								        if (!delete_if_stale) {
 								            error = ESRCH;
-												daemon: Reduce log level of "pid file is stale" message.

This message will appear repeatedly when ovs-vswitchd is running, if there
is any stale pidfile in /var/run/openvswitch, because ovs-vswitchd reads
all of the pidfiles in that directory periodically to update statistics.

											
										
										
											2011-04-05 12:17:08 -07:00
+								            VLOG_DBG("%s: pid file is stale", pidfile);
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								            goto error;
 								        }
 								        /* Get the lock. */
 								        error = lock_pidfile(file, F_SETLK);
 								        if (error) {
 								            /* We lost a race with someone else doing the same thing. */
 								            VLOG_WARN("%s: lost race to lock pidfile", pidfile);
 								            goto error;
 								        }
 								        /* Is the file we have locked still named 'pidfile'? */
 								        if (stat(pidfile, &s) || fstat(fileno(file), &s2)
 								            || s.st_ino != s2.st_ino || s.st_dev != s2.st_dev) {
 								            /* No.  We lost a race with someone else who got the lock before
 								             * us, deleted the pidfile, and closed it (releasing the lock). */
 								            error = EALREADY;
 								            VLOG_WARN("%s: lost race to delete pidfile", pidfile);
 								            goto error;
 								        }
 								        /* We won the right to delete the stale pidfile. */
 								        if (unlink(pidfile)) {
 								            error = errno;
 								            VLOG_WARN("%s: failed to delete stale pidfile (%s)",
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                      pidfile, ovs_strerror(error));
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								            goto error;
 								        }
 								        VLOG_DBG("%s: deleted stale pidfile", pidfile);
 								        fclose(file);
 								        return 0;
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								    }
 								    if (!fgets(line, sizeof line, file)) {
 								        if (ferror(file)) {
 								            error = errno;
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								            VLOG_WARN("%s: read: %s", pidfile, ovs_strerror(error));
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								        } else {
 								            error = ESRCH;
 								            VLOG_WARN("%s: read: unexpected end of file", pidfile);
 								        }
 								        goto error;
 								    }
 								    if (lck.l_pid != strtoul(line, NULL, 10)) {
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								        /* The process that has the pidfile locked is not the process that
 								         * created it.  It must be stale, with the process that has it locked
 								         * preparing to delete it. */
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								        error = ESRCH;
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								        VLOG_WARN("%s: stale pidfile for pid %s being deleted by pid %ld",
 								                  pidfile, line, (long int) lck.l_pid);
-												Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.

											
										
										
											2009-07-08 13:19:16 -07:00
+								        goto error;
 								    }
 								    fclose(file);
 								    return lck.l_pid;
 								error:
 								    if (file) {
 								        fclose(file);
 								    }
 								    return -error;
 								}
-												daemon: Avoid redundant code in already_running().

This function substantially duplicated read_pidfile(), so reuse that
code instead.

											
										
										
											2011-03-29 09:44:55 -07:00
 								/* Opens and reads a PID from 'pidfile'.  Returns the positive PID if
 								 * successful, otherwise a negative errno value.
 								 *
 								 * This is a thin wrapper: it forwards 'pidfile' to read_pidfile__() with
 								 * 'false' as the second argument and returns that function's result
 								 * unchanged.  (The second argument is presumably 'delete_if_stale', i.e.
 								 * this entry point never removes a stale pidfile -- confirm against the
 								 * read_pidfile__() signature.) */
 								pid_t
 								read_pidfile(const char *pidfile)
 								{
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    return read_pidfile__(pidfile, false);
-												daemon: Avoid redundant code in already_running().

This function substantially duplicated read_pidfile(), so reuse that
code instead.

											
										
										
											2011-03-29 09:44:55 -07:00
+								}
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our test cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								/* Checks whether a process with the given 'pidfile' is already running and,
 								 * if so, aborts.  If 'pidfile' is stale, deletes it.
 								 *
 								 * Takes no arguments; it operates on the file-scope 'pidfile' variable,
 								 * which it passes to read_pidfile__() with 'true' as the second argument
 								 * (presumably 'delete_if_stale' -- confirm against that function's
 								 * signature).
 								 *
 								 * The result is interpreted as follows:
 								 *   - positive: the PID of a running instance, reported via VLOG_FATAL;
 								 *   - negative: a negated errno value from the pidfile check, also
 								 *     reported via VLOG_FATAL;
 								 *   - zero: neither branch is taken and the function simply returns. */
 								static void
 								check_already_running(void)
-												daemon: Avoid redundant code in already_running().

This function substantially duplicated read_pidfile(), so reuse that
code instead.

											
										
										
											2011-03-29 09:44:55 -07:00
+								{
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    long int pid = read_pidfile__(pidfile, true);
 								    if (pid > 0) {
 								        VLOG_FATAL("%s: already running as pid %ld, aborting", pidfile, pid);
 								    } else if (pid < 0) {
 								        VLOG_FATAL("%s: pidfile check failed (%s), aborting",
-												Replace all uses of strerror() by ovs_strerror(), for thread safety.

Signed-off-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2013-06-24 10:54:49 -07:00
+								                   pidfile, ovs_strerror(-pid));
-												daemon: Avoid races on pidfile creation.

Until now, if two copies of one OVS daemon started up at the same time,
then due to races in pidfile creation it was possible for both of them to
start successfully, instead of just one.  This was made worse when a
previous copy of the daemon had died abruptly, leaving a stale pidfile.

This commit implements a new pidfile creation and removal protocol that I
believe closes these races.  Now, a pidfile is asserted with "link" instead
of "rename", which prevents the race on creation, and a stale pidfile may
only be deleted by a process after it has taken a lock on it.

This may solve mysterious problems seen occasionally on vswitch restart.
I'm still puzzled by these problems, however, because I don't see anything
in our tests cases that would actually cause two copies of a daemon to
start at the same time, which as far as I can see is a necessary
precondition for the problem.

											
										
										
											2011-04-04 10:59:19 -07:00
+								    }
-												daemon: Avoid redundant code in already_running().

This function substantially duplicated read_pidfile(), so reuse that
code instead.

											
										
										
											2011-03-29 09:44:55 -07:00
+								}
-												daemon-windows: Ability to handle windows service calls.

The following code does not add any users yet.

The envisioned workflow that this piece of code should work with is:
* Create a windows service through a startup script with
a tool like 'sc'
ex:  sc create ovsdb-server binpath=
 "C:\openvswitch\usr\sbin\ovsdb-server.exe -vconsole:off
-vsyslog:off -vfile:info --remote=ptcp:6632:127.0.0.1 --log-file
--service-monitor --service"

* Start the service from the startup script.
ex: sc start ovsdb-server

* Terminate the service during shutdown process.
ex: sc stop ovsdb-server

* Abrupt termination will restart the service.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>

											
										
										
											2014-01-16 16:16:24 -08:00
 								/* Stub functions for non-Windows platforms. */
 								/* Service-start hook.  In this file it is an empty stub: both 'argc' and
 								 * 'argv' are marked OVS_UNUSED and are neither read nor modified. */
 								void
 								service_start(int *argc OVS_UNUSED, char **argv[] OVS_UNUSED)
 								{
 								}
 								/* Service-stop hook.  In this file it is an empty stub that does nothing. */
 								void
 								service_stop(void)
 								{
 								}
 								/* Reports whether the service has been asked to stop.  In this file it is a
 								 * stub that always returns false. */
 								bool
 								should_service_stop(void)
 								{
 								    return false;
 								}