Add functionality to "pause" repmgrd

In some circumstances, e.g. while performing a switchover, it is essential
that repmgrd does not take any kind of failover action, as this will put
the cluster into an incorrect state.

Previously it was necessary to stop repmgrd on all nodes (or at least
those nodes which repmgrd would consider as promotion candidates); however,
this is a cumbersome and potentially risky operation, particularly if the
replication cluster contains more than a couple of servers.

To prevent this issue from occurring, this patch introduces the ability
to "pause" repmgrd on all nodes with a single command ("repmgr daemon pause")
which notifies repmgrd not to take any failover action until the node
is "unpaused" ("repmgr daemon unpause").

"repmgr daemon status" provides an overview of each node and whether repmgrd
is running, and if so whether it is paused.

"repmgr standby switchover" has been modified to automatically pause repmgrd
while carrying out the switchover.

See documentation for further details.
This commit is contained in:
Ian Barwick
2018-09-27 16:42:10 +09:00
parent fce3c02760
commit 2491b8ae52
27 changed files with 1943 additions and 121 deletions

View File

@@ -30,10 +30,15 @@
* NODE STATUS
* NODE CHECK
*
* DAEMON STATUS
* DAEMON PAUSE
* DAEMON UNPAUSE
*
* For internal use:
* NODE REJOIN
* NODE SERVICE
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
@@ -62,6 +67,7 @@
#include "repmgr-action-bdr.h"
#include "repmgr-action-node.h"
#include "repmgr-action-cluster.h"
#include "repmgr-action-daemon.h"
#include <storage/fd.h> /* for PG_TEMP_FILE_PREFIX */
@@ -438,6 +444,10 @@ main(int argc, char **argv)
runtime_options.siblings_follow = true;
break;
case OPT_REPMGRD_NO_PAUSE:
runtime_options.repmgrd_no_pause = true;
break;
/*----------------------
* "node status" options
*----------------------
@@ -900,6 +910,21 @@ main(int argc, char **argv)
else if (strcasecmp(repmgr_action, "CLEANUP") == 0)
action = CLUSTER_CLEANUP;
}
else if (strcasecmp(repmgr_command, "DAEMON") == 0)
{
if (help_option == true)
{
do_daemon_help();
exit(SUCCESS);
}
if (strcasecmp(repmgr_action, "STATUS") == 0)
action = DAEMON_STATUS;
else if (strcasecmp(repmgr_action, "PAUSE") == 0)
action = DAEMON_PAUSE;
else if (strcasecmp(repmgr_action, "UNPAUSE") == 0)
action = DAEMON_UNPAUSE;
}
else
{
valid_repmgr_command_found = false;
@@ -1298,6 +1323,17 @@ main(int argc, char **argv)
do_cluster_cleanup();
break;
/* DAEMON */
case DAEMON_STATUS:
do_daemon_status();
break;
case DAEMON_PAUSE:
do_daemon_pause();
break;
case DAEMON_UNPAUSE:
do_daemon_unpause();
break;
default:
/* An action will have been determined by this point */
break;
@@ -1744,6 +1780,18 @@ check_cli_parameters(const int action)
}
}
if (runtime_options.repmgrd_no_pause == true)
{
switch (action)
{
case STANDBY_SWITCHOVER:
break;
default:
item_list_append_format(&cli_warnings,
_("--repmgrd-no-pause will be ignored when executing %s"),
action_name(action));
}
}
if (runtime_options.config_files[0] != '\0')
{
@@ -1772,6 +1820,8 @@ check_cli_parameters(const int action)
case WITNESS_UNREGISTER:
case NODE_REJOIN:
case NODE_SERVICE:
case DAEMON_PAUSE:
case DAEMON_UNPAUSE:
break;
default:
item_list_append_format(&cli_warnings,
@@ -1851,6 +1901,14 @@ action_name(const int action)
return "CLUSTER MATRIX";
case CLUSTER_CROSSCHECK:
return "CLUSTER CROSSCHECK";
case DAEMON_STATUS:
return "DAEMON STATUS";
case DAEMON_PAUSE:
return "DAEMON PAUSE";
case DAEMON_UNPAUSE:
return "DAEMON UNPAUSE";
}
return "UNKNOWN ACTION";
@@ -1878,6 +1936,42 @@ print_error_list(ItemList *error_list, int log_level)
}
/*
 * print_status_header()
 *
 * Write a two-line table header to stdout.
 *
 * The first line holds each column's title, left-justified and padded to
 * that column's "max_length", with " | " between adjacent columns. The
 * second line is a rule of dashes matching each column's width, with "-+-"
 * at the column boundaries.
 *
 * cols:    number of entries in "headers" to print
 * headers: array of column descriptors; "title" and "max_length" are read
 */
void
print_status_header(int cols, ColHeader *headers)
{
	int			col;

	/* title row: first column gets a single leading space, others " | " */
	for (col = 0; col < cols; col++)
	{
		const char *lead = (col == 0) ? " " : " | ";

		printf("%s%-*s",
			   lead,
			   headers[col].max_length,
			   headers[col].title);
	}
	putchar('\n');

	/* rule row: one leading dash lines up with the title row's leading space */
	putchar('-');

	for (col = 0; col < cols; col++)
	{
		int			dash;

		for (dash = 0; dash < headers[col].max_length; dash++)
			putchar('-');

		/* junction between columns, or a single closing dash after the last */
		fputs((col < (cols - 1)) ? "-+-" : "-", stdout);
	}
	putchar('\n');
}
void
print_help_header(void)
{
@@ -3021,4 +3115,3 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
}
}
}