diff options
Diffstat (limited to 'src/monitor/monitor.c')
-rw-r--r-- | src/monitor/monitor.c | 336 |
1 files changed, 159 insertions, 177 deletions
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index dc6f03d5..d2717fae 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -20,6 +20,7 @@ */ #include "util/util.h" +#include "util/child_common.h" #include <sys/types.h> #include <sys/wait.h> #include <sys/time.h> @@ -70,9 +71,16 @@ int cmdline_debug_microseconds; struct svc_spy; +enum mt_svc_type { + MT_SVC_SERVICE, + MT_SVC_PROVIDER +}; + struct mt_svc { struct mt_svc *prev; struct mt_svc *next; + enum mt_svc_type type; + struct sbus_connection *conn; struct svc_spy *conn_spy; @@ -96,6 +104,10 @@ struct mt_svc { int debug_level; struct tevent_timer *ping_ev; + + struct sss_child_ctx *child_ctx; + + struct tevent_timer *sigkill_ev; }; struct config_file_callback { @@ -132,6 +144,7 @@ struct mt_ctx { bool services_started; struct netlink_ctx *nlctx; const char *conf_path; + struct sss_sigchild_ctx *sigchld_ctx; }; static int start_service(struct mt_svc *mt_svc); @@ -142,18 +155,19 @@ static int service_send_ping(struct mt_svc *svc); static int service_signal_reset_offline(struct mt_svc *svc); static void ping_check(DBusPendingCall *pending, void *data); -static int service_check_alive(struct mt_svc *svc); - static void set_tasks_checker(struct mt_svc *srv); -static void set_global_checker(struct mt_ctx *ctx); static int monitor_kill_service (struct mt_svc *svc); static int get_service_config(struct mt_ctx *ctx, const char *name, struct mt_svc **svc_cfg); static int get_provider_config(struct mt_ctx *ctx, const char *name, struct mt_svc **svc_cfg); -static int add_new_service(struct mt_ctx *ctx, const char *name); -static int add_new_provider(struct mt_ctx *ctx, const char *name); +static int add_new_service(struct mt_ctx *ctx, + const char *name, + int restarts); +static int add_new_provider(struct mt_ctx *ctx, + const char *name, + int restarts); static int mark_service_as_started(struct mt_svc *svc); @@ -396,7 +410,7 @@ static int mark_service_as_started(struct mt_svc *svc) DEBUG(4, ("Now starting services!\n")); /* then start all services */ for (i = 0; ctx->services[i]; i++) { - add_new_service(ctx, ctx->services[i]); + add_new_service(ctx, ctx->services[i], 0); } } @@ -423,7 +437,7 @@ static void services_startup_timeout(struct tevent_context *ev, DEBUG(4, ("Now starting services!\n")); /* then start all services */ for (i = 0; ctx->services[i]; i++) { - add_new_service(ctx, ctx->services[i]); + add_new_service(ctx, ctx->services[i], 0); } } } @@ -479,98 +493,37 @@ static int monitor_dbus_init(struct mt_ctx *ctx) return ret; } -static void svc_try_restart(struct mt_svc *svc, time_t now) -{ - int ret; - - DLIST_REMOVE(svc->mt_ctx->svc_list, svc); - if (svc->last_restart != 0) { - if ((now - svc->last_restart) > 30) { /* TODO: get val from config */ - /* it was long ago reset restart threshold */ - svc->restarts = 0; - } - } - - /* restart the process */ - if (svc->restarts > 3) { /* TODO: get val from config */ - DEBUG(0, ("Process [%s], definitely stopped!\n", svc->name)); - talloc_free(svc); - return; - } - - /* Shut down the current ping timer so it will restart - * cleanly in start_service() - */ - talloc_free(svc->ping_ev); - - ret = start_service(svc); - if (ret != EOK) { - DEBUG(0,("Failed to restart service '%s'\n", svc->name)); - talloc_free(svc); - return; - } - - svc->restarts++; - svc->last_restart = now; - return; -} - static void tasks_check_handler(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *ptr) { struct mt_svc *svc = talloc_get_type(ptr, struct mt_svc); - time_t now = time(NULL); - bool process_alive = true; int ret; - ret = service_check_alive(svc); + ret = service_send_ping(svc); switch (ret) { case EOK: /* all fine */ break; - case ECHILD: - DEBUG(1,("Process (%s) is stopped!\n", svc->name)); - process_alive = false; + case ENXIO: + DEBUG(1,("Child (%s) not responding! (yet)\n", svc->name)); break; default: - /* TODO: should we tear down it ? */ - DEBUG(1,("Checking for service %s(%d) failed!!\n", - svc->name, svc->pid)); + /* TODO: should we tear it down ? */ + DEBUG(1,("Sending a message to service (%s) failed!!\n", svc->name)); break; } - if (process_alive) { - ret = service_send_ping(svc); - switch (ret) { - case EOK: - /* all fine */ - break; - - case ENXIO: - DEBUG(1,("Child (%s) not responding! (yet)\n", svc->name)); - break; - - default: - /* TODO: should we tear it down ? */ - DEBUG(1,("Sending a message to service (%s) failed!!\n", svc->name)); - break; - } + if (svc->failed_pongs >= 3) { + /* too long since we last heard of this process */ + DEBUG(SSSDBG_CRIT_FAILURE, + ("Killing service [%s], not responding to pings!\n", + svc->name)); - if (svc->failed_pongs >= 3) { - /* too long since we last heard of this process */ - DEBUG(SSSDBG_CRIT_FAILURE, - ("Killing service [%s], not responding to pings!\n", - svc->name)); - monitor_kill_service(svc); - process_alive = false; - } - } - - if (!process_alive) { - svc_try_restart(svc, now); + /* Kill the service. The SIGCHLD handler will restart it */ + monitor_kill_service(svc); return; } @@ -595,75 +548,52 @@ static void set_tasks_checker(struct mt_svc *svc) svc->ping_ev = te; } -static void global_checks_handler(struct tevent_context *ev, - struct tevent_timer *te, - struct timeval t, void *ptr) +static void mt_svc_sigkill(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *ptr); +static int monitor_kill_service (struct mt_svc *svc) { - struct mt_ctx *ctx = talloc_get_type(ptr, struct mt_ctx); - struct mt_svc *svc; - int status; - pid_t pid; - - if (!ctx->check_children) { - goto done; - } - - errno = 0; - pid = waitpid(0, &status, WNOHANG); - if (pid == 0) { - goto done; - } + int ret; + struct timeval tv; - if (pid == -1) { - DEBUG(0, ("waitpid returned -1 (errno:%d[%s])\n", - errno, strerror(errno))); - goto done; + ret = kill(svc->pid, SIGTERM); + if (ret != EOK) { + DEBUG(SSSDBG_FATAL_FAILURE, + ("Sending signal to child (%s:%d) failed! " + "Ignore and pretend child is dead.\n", + svc->name, svc->pid)); + talloc_free(svc); } - /* let's see if it is a known service, and try to restart it */ - for (svc = ctx->svc_list; svc; svc = svc->next) { - if (svc->pid == pid) { - time_t now = time(NULL); - DEBUG(1, ("Service [%s] did exit\n", svc->name)); - svc_try_restart(svc, now); - goto done; - } - } - if (svc == NULL) { - DEBUG(0, ("Unknown child (%d) did exit\n", pid)); - } + /* Set up a timer to send SIGKILL if this process + * doesn't exit within sixty seconds + */ + tv = tevent_timeval_current_ofs(60, 0); + svc->sigkill_ev = tevent_add_timer(svc->mt_ctx->ev, svc, tv, + mt_svc_sigkill, svc); -done: - set_global_checker(ctx); + return ret; } -static void set_global_checker(struct mt_ctx *ctx) +static void mt_svc_sigkill(struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *ptr) { - struct tevent_timer *te = NULL; - struct timeval tv; + int ret; + struct mt_svc *svc = talloc_get_type(ptr, struct mt_svc); - gettimeofday(&tv, NULL); - tv.tv_sec += 1; /* once a second */ - tv.tv_usec = 0; - te = tevent_add_timer(ctx->ev, ctx, tv, global_checks_handler, ctx); - if (te == NULL) { - DEBUG(0, ("failed to add global checker event! PANIC TIME!\n")); - /* FIXME: is this right ? shoulkd we try to clean up first ?*/ - exit(-1); - } -} + DEBUG(SSSDBG_FATAL_FAILURE, + ("[%s][%d] is not responding to SIGTERM. Sending SIGKILL.\n", + svc->name, svc->pid)); -static int monitor_kill_service (struct mt_svc *svc) -{ - int ret; - ret = kill(svc->pid, SIGTERM); + ret = kill(svc->pid, SIGKILL); if (ret != EOK) { - DEBUG(0,("Sending signal to child (%s:%d) failed! " - "Ignore and pretend child is dead.\n", - svc->name, svc->pid)); + DEBUG(SSSDBG_FATAL_FAILURE, + ("Sending signal to child (%s:%d) failed! " + "Ignore and pretend child is dead.\n", + svc->name, svc->pid)); + talloc_free(svc); } - - return ret; } static void reload_reply(DBusPendingCall *pending, void *data) @@ -910,6 +840,7 @@ static int get_service_config(struct mt_ctx *ctx, const char *name, int ret; char *path; struct mt_svc *svc; + time_t now = time(NULL); *svc_cfg = NULL; @@ -918,6 +849,7 @@ static int get_service_config(struct mt_ctx *ctx, const char *name, return ENOMEM; } svc->mt_ctx = ctx; + svc->type = MT_SVC_SERVICE; talloc_set_destructor((TALLOC_CTX *)svc, svc_destructor); @@ -1013,13 +945,17 @@ static int get_service_config(struct mt_ctx *ctx, const char *name, svc->ping_time = MONITOR_DEF_PING_TIME; } + svc->last_restart = now; + *svc_cfg = svc; talloc_free(path); return EOK; } -static int add_new_service(struct mt_ctx *ctx, const char *name) +static int add_new_service(struct mt_ctx *ctx, + const char *name, + int restarts) { int ret; struct mt_svc *svc; @@ -1028,6 +964,7 @@ static int add_new_service(struct mt_ctx *ctx, const char *name) if (ret != EOK) { return ret; } + svc->restarts = restarts; ret = start_service(svc); if (ret != EOK) { @@ -1044,6 +981,7 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name, int ret; char *path; struct mt_svc *svc; + time_t now = time(NULL); *svc_cfg = NULL; @@ -1052,6 +990,7 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name, return ENOMEM; } svc->mt_ctx = ctx; + svc->type = MT_SVC_PROVIDER; talloc_set_destructor((TALLOC_CTX *)svc, svc_destructor); @@ -1165,11 +1104,15 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name, } } + svc->last_restart = now; + *svc_cfg = svc; return EOK; } -static int add_new_provider(struct mt_ctx *ctx, const char *name) +static int add_new_provider(struct mt_ctx *ctx, + const char *name, + int restarts) { int ret; struct mt_svc *svc; @@ -1180,6 +1123,7 @@ static int add_new_provider(struct mt_ctx *ctx, const char *name) name)); return ret; } + svc->restarts = restarts; if (strcasecmp(svc->provider, "local") == 0) { /* The LOCAL provider requires no back-end currently @@ -2020,6 +1964,10 @@ int monitor_process_init(struct mt_ctx *ctx, return EIO; } + /* Set up the SIGCHLD handler */ + ret = sss_sigchld_init(ctx, ctx->ev, &ctx->sigchld_ctx); + if (ret != EOK) return ret; + #if 0 This feature is incomplete and can leave the SSSD in a bad state if the config file is changed while the SSSD is running. @@ -2071,7 +2019,7 @@ int monitor_process_init(struct mt_ctx *ctx, /* start providers */ num_providers = 0; for (dom = ctx->domains; dom; dom = dom->next) { - ret = add_new_provider(ctx, dom->name); + ret = add_new_provider(ctx, dom->name, 0); if (ret != EOK && ret != ENOENT) { return ret; } @@ -2097,13 +2045,10 @@ int monitor_process_init(struct mt_ctx *ctx, /* No providers start services immediately * Normally this means only LOCAL is configured */ for (i = 0; ctx->services[i]; i++) { - add_new_service(ctx, ctx->services[i]); + add_new_service(ctx, ctx->services[i], 0); } } - /* now start checking for global events */ - set_global_checker(ctx); - return EOK; } @@ -2283,39 +2228,6 @@ done: dbus_message_unref(reply); } - - -/* service_check_alive - * This function checks if the service child is still alive - */ -static int service_check_alive(struct mt_svc *svc) -{ - int status; - pid_t pid; - - DEBUG(4,("Checking service %s(%d) is still alive\n", svc->name, svc->pid)); - - pid = waitpid(svc->pid, &status, WNOHANG); - if (pid == 0) { - return EOK; - } - - if (pid != svc->pid) { - DEBUG(1, ("bad return (%d) from waitpid() waiting for %d\n", - pid, svc->pid)); - /* TODO: what do we do now ? */ - return EINVAL; - } - - if (WIFEXITED(status)) { /* children exited on it's own */ - /* TODO: check configuration to see if it was removed - * from the list of process to run */ - DEBUG(0,("Process [%s] exited\n", svc->name)); - } - - return ECHILD; -} - static void service_startup_handler(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *ptr); @@ -2349,10 +2261,12 @@ static int start_service(struct mt_svc *svc) return EOK; } +static void mt_svc_exit_handler(int pid, int wait_status, void *pvt); static void service_startup_handler(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *ptr) { + errno_t ret; struct mt_svc *mt_svc; char **args; @@ -2372,6 +2286,22 @@ static void service_startup_handler(struct tevent_context *ev, /* Parent */ mt_svc->mt_ctx->check_children = true; mt_svc->failed_pongs = 0; + + /* Handle process exit */ + ret = sss_child_register(mt_svc, + mt_svc->mt_ctx->sigchld_ctx, + mt_svc->pid, + mt_svc_exit_handler, + mt_svc, + &mt_svc->child_ctx); + if (ret != EOK) { + DEBUG(SSSDBG_FATAL_FAILURE, + ("Could not register sigchld handler.\n")); + /* Should we exit here? For now, we'll hope this + * child never dies, because we can't restart it. + */ + } + DLIST_ADD(mt_svc->mt_ctx->svc_list, mt_svc); set_tasks_checker(mt_svc); @@ -2393,6 +2323,58 @@ static void service_startup_handler(struct tevent_context *ev, _exit(1); } +static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) +{ + struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc); + time_t now = time(NULL); + + if WIFEXITED(wait_status) { + DEBUG(SSSDBG_OP_FAILURE, + ("Child [%s] exited with code [%d]\n", + svc->name, WEXITSTATUS(wait_status))); + } else if WIFSIGNALED(wait_status) { + DEBUG(SSSDBG_OP_FAILURE, + ("Child [%s] terminated with signal [%d]\n", + svc->name, WTERMSIG(wait_status))); + } else { + DEBUG(0, ("Child [%s] did not exit cleanly\n", svc->name)); + /* Forcibly kill this child, just in case */ + kill(svc->pid, SIGKILL); + + /* Return and let us get caught by another + * call to the SIGCHLD handler + */ + return; + } + + if ((now - svc->last_restart) > 30) { /* TODO: get val from config */ + svc->restarts = 0; + } + + /* Restart the service */ + if (svc->restarts > 2) { /* TODO: get val from config */ + DEBUG(SSSDBG_FATAL_FAILURE, + ("Process [%s], definitely stopped!\n", svc->name)); + talloc_free(svc); + return; + } + + if (svc->type == MT_SVC_SERVICE) { + add_new_service(svc->mt_ctx, svc->name, svc->restarts + 1); + } else if (svc->type == MT_SVC_PROVIDER) { + add_new_provider(svc->mt_ctx, svc->name, svc->restarts + 1); + } else { + /* Invalid type? */ + DEBUG(SSSDBG_CRIT_FAILURE, + ("BUG: Invalid child process type [%d]\n", svc->type)); + } + + /* Free the old service (which will also remove it + * from the child list) + */ + talloc_free(svc); +} + int main(int argc, const char *argv[]) { int opt; |