Skip to content

Commit

Permalink
multipathd: move systemd watchdog handling into daemon
Browse files Browse the repository at this point in the history
Only multipathd needs to take care of notifying systemd. There's
no need to track this information in struct config, or to limit our
checker interval to it, as checkerloop() wakes up every second anyway.

While at it, fix the watchdog enablement logic:

- the watchdog should only be active if WATCHDOG_PID is either unset,
  or matches the daemon's PID, and if WATCHDOG_USEC is not 0.
- the watchdog should trigger twice per systemd-set interval.
- if WatchdogSec= is set to an unreasonable value, make a smarter
  choice than just disabling the watchdog, and print a more meaningful
  error message.

Use timestamp comparison to make sure the watchdog is triggered even
if a checkerloop iteration takes more than a second.

Signed-off-by: Martin Wilck <[email protected]>
Reviewed-by: Benjamin Marzinski <[email protected]>
  • Loading branch information
mwilck committed Nov 21, 2024
1 parent 2f778d6 commit 950d580
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 40 deletions.
25 changes: 0 additions & 25 deletions libmultipath/config.c
Original file line number Diff line number Diff line change
Expand Up @@ -858,27 +858,6 @@ process_config_dir(struct config *conf, char *dir)
pthread_cleanup_pop(1);
}

#ifdef USE_SYSTEMD
static void set_max_checkint_from_watchdog(struct config *conf)
{
char *envp = getenv("WATCHDOG_USEC");
unsigned long checkint;

if (envp && sscanf(envp, "%lu", &checkint) == 1) {
/* Value is in microseconds */
checkint /= 1000000;
if (checkint < 1 || checkint > UINT_MAX) {
condlog(1, "invalid value for WatchdogSec: \"%s\"", envp);
return;
}
if (conf->max_checkint == 0 || conf->max_checkint > checkint)
conf->max_checkint = checkint;
condlog(3, "enabling watchdog, interval %ld", checkint);
conf->use_watchdog = true;
}
}
#endif

static int init_config__ (const char *file, struct config *conf);

int init_config(const char *file)
Expand Down Expand Up @@ -916,7 +895,6 @@ int init_config__ (const char *file, struct config *conf)
conf->attribute_flags = 0;
conf->reassign_maps = DEFAULT_REASSIGN_MAPS;
conf->checkint = CHECKINT_UNDEF;
conf->use_watchdog = false;
conf->max_checkint = 0;
conf->force_sync = DEFAULT_FORCE_SYNC;
conf->partition_delim = (default_partition_delim != NULL ?
Expand Down Expand Up @@ -967,9 +945,6 @@ int init_config__ (const char *file, struct config *conf)
/*
* fill the voids left in the config file
*/
#ifdef USE_SYSTEMD
set_max_checkint_from_watchdog(conf);
#endif
if (conf->max_checkint == 0) {
if (conf->checkint == CHECKINT_UNDEF)
conf->checkint = DEFAULT_CHECKINT;
Expand Down
1 change: 0 additions & 1 deletion libmultipath/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,6 @@ struct config {
unsigned int checkint;
unsigned int max_checkint;
unsigned int adjust_int;
bool use_watchdog;
int pgfailback;
int rr_weight;
int no_path_retry;
Expand Down
70 changes: 56 additions & 14 deletions multipathd/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2153,6 +2153,61 @@ partial_retrigger_tick(vector pathvec)
}
}

#ifdef USE_SYSTEMD
static int get_watchdog_interval(void)
{
const char *envp;
long long checkint;
long pid;

envp = getenv("WATCHDOG_PID");
/* See sd_watchdog_enabled(3) */
if (envp && sscanf(envp, "%lu", &pid) == 1 && pid != daemon_pid)
return -1;

envp = getenv("WATCHDOG_USEC");
if (!envp || sscanf(envp, "%llu", &checkint) != 1 || checkint == 0)
return -1;

/*
* Value is in microseconds, and the watchdog should be triggered
* twice per interval.
*/
checkint /= 2000000;
if (checkint > INT_MAX / 2) {
condlog(1, "WatchdogSec=%lld is too high, assuming %d",
checkint * 2, INT_MAX);
checkint = INT_MAX / 2;
} else if (checkint < 1) {
condlog(1, "WatchdogSec=1 is too low, daemon will be killed by systemd!");
checkint = 1;
}

condlog(3, "enabling watchdog, interval %llds", checkint);
return checkint;
}

static void watchdog_tick(const struct timespec *time) {
static int watchdog_interval;
static struct timespec last_time;
struct timespec diff_time;

if (watchdog_interval == 0)
watchdog_interval = get_watchdog_interval();
if (watchdog_interval < 0)
return;

timespecsub(time, &last_time, &diff_time);
if (diff_time.tv_sec >= watchdog_interval) {
condlog(4, "%s: sending watchdog message", __func__);
sd_notify(0, "WATCHDOG=1");
last_time = *time;
}
}
#else
static void watchdog_tick(const struct timespec *time __attribute__((unused))) {}
#endif

static bool update_prio(struct multipath *mpp, bool refresh_all)
{
int oldpriority;
Expand Down Expand Up @@ -2931,9 +2986,6 @@ checkerloop (void *ap)
struct timespec last_time;
struct config *conf;
int foreign_tick = 0;
#ifdef USE_SYSTEMD
bool use_watchdog;
#endif

pthread_cleanup_push(rcu_unregister, NULL);
rcu_register_thread();
Expand All @@ -2944,13 +2996,6 @@ checkerloop (void *ap)
get_monotonic_time(&last_time);
last_time.tv_sec -= 1;

/* use_watchdog is set from process environment and never changes */
conf = get_multipath_config();
#ifdef USE_SYSTEMD
use_watchdog = conf->use_watchdog;
#endif
put_multipath_config(conf);

while (1) {
struct timespec diff_time, start_time, end_time;
int num_paths = 0, strict_timing;
Expand All @@ -2967,10 +3012,7 @@ checkerloop (void *ap)
(long)diff_time.tv_sec, diff_time.tv_nsec / 1000);
last_time = start_time;
ticks = diff_time.tv_sec;
#ifdef USE_SYSTEMD
if (use_watchdog)
sd_notify(0, "WATCHDOG=1");
#endif
watchdog_tick(&start_time);
while (checker_state != CHECKER_FINISHED) {
struct multipath *mpp;
int i;
Expand Down

0 comments on commit 950d580

Please sign in to comment.