From d7420d7274444d317ee1aea0f0030a986aa74bd0 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 30 Jan 2019 14:36:52 +0900 Subject: [PATCH] daemon (start|stop): verify that repmgrd starts/stops. Note this may not always be possible for "daemon stop" if we are unable to determine the repmgrd PID. --- doc/repmgr-daemon-start.sgml | 66 +++++++++++++++- doc/repmgr-daemon-stop.sgml | 38 ++++++++++ doc/repmgrd-configuration.sgml | 2 +- repmgr-action-daemon.c | 135 +++++++++++++++++++++++++++++++-- repmgr-action-standby.c | 4 +- repmgr-client-global.h | 7 +- repmgr-client.c | 14 +++- repmgr-client.h | 2 +- 8 files changed, 249 insertions(+), 19 deletions(-) diff --git a/doc/repmgr-daemon-start.sgml b/doc/repmgr-daemon-start.sgml index 0d52e426..9cd732d1 100644 --- a/doc/repmgr-daemon-start.sgml +++ b/doc/repmgr-daemon-start.sgml @@ -23,7 +23,11 @@ This command starts the repmgrd daemon on the local node. - + + By default, &repmgr; will wait for up to 15 seconds to confirm that repmgrd + started. This behaviour can be overridden by specifying a different value using the + --wait option, or disabled altogether with the --no-wait option. + @@ -65,6 +69,33 @@ + + + + + Wait for the specified number of seconds to confirm that repmgrd + started successfully. + + + Note that providing --wait=0 is the equivalent of --no-wait. + + + + + + + + + + Don't wait to confirm that repmgrd + started successfully. + + + This is equivalent to providing --wait=0. + + + + @@ -79,7 +110,12 @@ - repmgrd could be started. + The repmgrd start command (defined in + repmgrd_service_start_command) was successfully executed. + + + If the --wait option was provided, &repmgr; will confirm that + repmgrd has actually started up. @@ -94,11 +130,33 @@ + + + + + + &repmgr; was unable to connect to the local PostgreSQL node. + + + PostgreSQL must be running before repmgrd + can be started. Additionally, unless the --no-wait option was + provided, &repmgr; needs to be able to connect to the local PostgreSQL node + to determine the state of repmgrd. 
+ + + + + - repmgrd could not be started. + The repmgrd start command (defined in + repmgrd_service_start_command) was not successfully executed. + + + This can also mean that &repmgr; was unable to confirm whether repmgrd + successfully started (unless the --no-wait option was provided). @@ -109,7 +167,7 @@ See also - , + , , diff --git a/doc/repmgr-daemon-stop.sgml b/doc/repmgr-daemon-stop.sgml index a57f4cf1..6a7b497f 100644 --- a/doc/repmgr-daemon-stop.sgml +++ b/doc/repmgr-daemon-stop.sgml @@ -24,6 +24,18 @@ local node. + + By default, &repmgr; will wait for up to 15 seconds to confirm that repmgrd + stopped. This behaviour can be overridden by specifying a different value using the + --wait option, or disabled altogether with the --no-wait option. + + + + If PostgreSQL is not running on the local node, under some circumstances &repmgr; may not + be able to confirm if repmgrd has actually stopped. + + + @@ -65,6 +77,32 @@ + + + + + Wait for the specified number of seconds to confirm that repmgrd + stopped successfully. + + + Note that providing --wait=0 is the equivalent of --no-wait. + + + + + + + + + + Don't wait to confirm that repmgrd + stopped successfully. + + + This is equivalent to providing --wait=0. 
+ + + diff --git a/doc/repmgrd-configuration.sgml b/doc/repmgrd-configuration.sgml index 5dba56dd..c2a20cc2 100644 --- a/doc/repmgrd-configuration.sgml +++ b/doc/repmgrd-configuration.sgml @@ -348,7 +348,7 @@ - + repmgrd starting and stopping diff --git a/repmgr-action-daemon.c b/repmgr-action-daemon.c index 18bfd4d8..df20b561 100644 --- a/repmgr-action-daemon.c +++ b/repmgr-action-daemon.c @@ -25,7 +25,9 @@ #include "repmgr-client-global.h" #include "repmgr-action-daemon.h" - +#define REPMGR_DAEMON_STOP_START_WAIT 15 +#define REPMGR_DAEMON_STATUS_START_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully started") +#define REPMGR_DAEMON_STATUS_STOP_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully stopped") /* * Possibly also show: @@ -410,7 +412,7 @@ do_daemon_start(void) /* TODO: if PostgreSQL is not available, have repmgrd loop and retry connection */ log_error(_("unable to connect to local node")); log_detail(_("PostgreSQL must be running before \"repmgrd\" can be started")); - exit(ERR_REPMGRD_SERVICE); + exit(ERR_DB_CONN); } /* @@ -421,7 +423,17 @@ do_daemon_start(void) if (is_repmgrd_running(conn) == true) { + pid_t pid = UNKNOWN_PID; + log_error(_("repmgrd appears to be running already")); + + pid = repmgrd_get_pid(conn); + + if (pid != UNKNOWN_PID) + log_detail(_("repmgrd PID is %i"), pid); + else + log_warning(_("unable to determine repmgrd PID")); + PQfinish(conn); exit(ERR_REPMGRD_SERVICE); } @@ -457,6 +469,52 @@ do_daemon_start(void) } termPQExpBuffer(&output_buf); + + if (runtime_options.no_wait == true || runtime_options.wait == 0) + { + log_hint(REPMGR_DAEMON_STATUS_START_HINT); + } + else + { + int i = 0; + int timeout = REPMGR_DAEMON_STOP_START_WAIT; + + if (runtime_options.wait_provided) + timeout = runtime_options.wait; + + conn = establish_db_connection(config_file_options.conninfo, false); + + if (PQstatus(conn) != CONNECTION_OK) + { + log_notice(_("unable to connect to local node")); + 
log_hint(REPMGR_DAEMON_STATUS_START_HINT); + exit(ERR_DB_CONN); + } + + for (;;) + { + if (is_repmgrd_running(conn) == true) + { + log_notice(_("repmgrd was successfully started")); + PQfinish(conn); + break; + } + + if (i == timeout) + { + PQfinish(conn); + log_error(_("repmgrd does not appear to have started after %i seconds"), + timeout); + log_hint(REPMGR_DAEMON_STATUS_START_HINT); + exit(ERR_REPMGRD_SERVICE); + } + + log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd is running", + i, runtime_options.wait); + sleep(1); + i++; + } + } } @@ -468,7 +526,7 @@ void do_daemon_stop(void) bool success; pid_t pid = UNKNOWN_PID; - if (config_file_options.repmgrd_service_start_command[0] == '\0') + if (config_file_options.repmgrd_service_stop_command[0] == '\0') { log_error(_("\"repmgrd_service_stop_command\" is not set")); log_hint(_("set \"repmgrd_service_stop_command\" in \"repmgr.conf\"")); @@ -485,6 +543,9 @@ void do_daemon_stop(void) if (PQstatus(conn) != CONNECTION_OK) { + /* + * a PostgreSQL connection is not required to stop repmgrd, + */ log_warning(_("unable to connect to local node")); } else @@ -534,6 +595,66 @@ void do_daemon_stop(void) } termPQExpBuffer(&output_buf); + + if (runtime_options.no_wait == true || runtime_options.wait == 0) + { + log_hint(REPMGR_DAEMON_STATUS_STOP_HINT); + } + else + { + int i = 0; + int timeout = REPMGR_DAEMON_STOP_START_WAIT; + /* + * + */ + if (pid == UNKNOWN_PID) + { + /* + * XXX attempt to get pidfile from config + * and get contents + * ( see check_and_create_pid_file() ) + * if PID still unknown, exit here + */ + log_warning(_("unable to determine repmgrd PID")); + log_hint(REPMGR_DAEMON_STATUS_STOP_HINT); + exit(ERR_REPMGRD_SERVICE); + } + + if (runtime_options.wait_provided) + timeout = runtime_options.wait; + + for (;;) + { + if (kill(pid, 0) == -1) + { + if (errno == ESRCH) + { + log_notice(_("repmgrd was successfully stopped")); + exit(SUCCESS); + } + else + { + log_error(_("unable to determine 
status of process with PID %i"), pid); + log_detail("%s", strerror(errno)); + exit(ERR_REPMGRD_SERVICE); + } + } + + + if (i == timeout) + { + log_error(_("repmgrd does not appear to have stopped after %i seconds"), + timeout); + log_hint(REPMGR_DAEMON_STATUS_START_HINT); + exit(ERR_REPMGRD_SERVICE); + } + + log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd with PID %i is running", + i, timeout, pid); + sleep(1); + i++; + } + } } @@ -559,16 +680,20 @@ void do_daemon_help(void) printf(_("DAEMON START\n")); puts(""); - printf(_(" \"daemon start\" attempts to start repmgrd")); + printf(_(" \"daemon start\" attempts to start repmgrd\n")); puts(""); printf(_(" --dry-run check prerequisites but don't start repmgrd\n")); + printf(_(" -w/--wait wait for repmgrd to start (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT); + printf(_(" --no-wait don't wait for repmgrd to start\n")); puts(""); printf(_("DAEMON STOP\n")); puts(""); - printf(_(" \"daemon stop\" attempts to stop repmgrd")); + printf(_(" \"daemon stop\" attempts to stop repmgrd\n")); puts(""); printf(_(" --dry-run check prerequisites but don't stop repmgrd\n")); + printf(_(" -w/--wait wait for repmgrd to stop (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT); + printf(_(" --no-wait don't wait for repmgrd to stop\n")); puts(""); printf(_("DAEMON PAUSE\n")); diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 83281c4d..be829c93 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -2298,7 +2298,7 @@ do_standby_follow(void) NULL); } - if (PQstatus(follow_target_conn) == CONNECTION_OK || runtime_options.wait == false) + if (PQstatus(follow_target_conn) == CONNECTION_OK || runtime_options.wait_provided == false) { break; } @@ -2317,7 +2317,7 @@ do_standby_follow(void) log_error(_("unable to connect to target node %i"), follow_target_node_id); } - if (runtime_options.wait == true) + if (runtime_options.wait_provided == true) { if 
(follow_target_node_id == UNKNOWN_NODE_ID) { diff --git a/repmgr-client-global.h b/repmgr-client-global.h index 8146dcbe..b8d6de7c 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -35,13 +35,14 @@ typedef struct bool connection_param_provided; bool host_param_provided; bool limit_provided; + bool wait_provided; /* general configuration options */ char config_file[MAXPGPATH]; bool dry_run; bool force; char pg_bindir[MAXLEN]; /* overrides setting in repmgr.conf */ - bool wait; + int wait; bool no_wait; /* logging options */ @@ -137,9 +138,9 @@ typedef struct #define T_RUNTIME_OPTIONS_INITIALIZER { \ /* configuration metadata */ \ - false, false, false, false, \ + false, false, false, false, false, \ /* general configuration options */ \ - "", false, false, "", false, false, \ + "", false, false, "", -1, false, \ /* logging options */ \ "", false, false, false, false, \ /* output options */ \ diff --git a/repmgr-client.c b/repmgr-client.c index 79495ab5..246ba999 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -254,7 +254,11 @@ main(int argc, char **argv) /* -w/--wait */ case 'w': - runtime_options.wait = true; + runtime_options.wait_provided = true; + if (optarg != NULL) + { + runtime_options.wait = repmgr_atoi(optarg, "--wait", &cli_errors, 0); + } break; /* -W/--no-wait */ @@ -1713,17 +1717,19 @@ check_cli_parameters(const int action) /* --wait/--no-wait */ - if (runtime_options.wait == true && runtime_options.no_wait == true) + if (runtime_options.wait_provided == true && runtime_options.no_wait == true) { item_list_append_format(&cli_errors, _("both --wait and --no-wait options provided")); } else { - if (runtime_options.wait) + if (runtime_options.wait_provided) { switch (action) { + case DAEMON_START: + case DAEMON_STOP: case STANDBY_FOLLOW: break; default: @@ -1736,6 +1742,8 @@ check_cli_parameters(const int action) { switch (action) { + case DAEMON_START: + case DAEMON_STOP: case NODE_REJOIN: break; default: diff --git 
a/repmgr-client.h b/repmgr-client.h index b819ad2a..064d9d35 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -116,7 +116,7 @@ static struct option long_options[] = {"dry-run", no_argument, NULL, OPT_DRY_RUN}, {"force", no_argument, NULL, 'F'}, {"pg_bindir", required_argument, NULL, 'b'}, - {"wait", no_argument, NULL, 'w'}, + {"wait", optional_argument, NULL, 'w'}, {"no-wait", no_argument, NULL, 'W'}, /* connection options */