diff --git a/doc/repmgr-node-rejoin.sgml b/doc/repmgr-node-rejoin.sgml
index d7583e32..f8015ce1 100644
--- a/doc/repmgr-node-rejoin.sgml
+++ b/doc/repmgr-node-rejoin.sgml
@@ -45,6 +45,77 @@
+
+
+ Options
+
+
+
+
+
+
+ Check prerequisites but don't actually execute the rejoin.
+
+
+
+
+
+
+
+
+ Execute pg_rewind if necessary.
+
+
+ It is only necessary to provide the pg_rewind
+ if using PostgreSQL 9.3 or 9.4, and pg_rewind
+ is not installed in the PostgreSQL bin directory.
+
+
+
+
+
+
+
+
+ comma-separated list of configuration files to retain after
+ executing pg_rewind.
+
+
+ Currently pg_rewind will overwrite
+ the local node's configuration files with the files from the source node,
+ so it's advisable to use this option to ensure they are kept.
+
+
+
+
+
+
+
+
+
+ Directory to temporarily store configuration files specified with
+ ; default: /tmp.
+
+
+
+
+
+
+
+
+
+ Don't wait for the node to rejoin cluster.
+
+
+ If this option is supplied, &repmgr; will restart the node but
+ not wait for it to connect to the primary.
+
+
+
+
+
+
+
Event notifications
@@ -77,11 +148,18 @@
+
+
+ pg_rewind
+ using with "repmgr node rejoin"
+
+
Using pg_rewind
repmgr node rejoin can optionally use pg_rewind to re-integrate a
node which has diverged from the rest of the cluster, typically a failed primary.
- pg_rewind is available in PostgreSQL 9.5 and later.
+ pg_rewind is available in PostgreSQL 9.5 and later as part of the core distribution,
+ and can be installed from external sources for PostgreSQL 9.3 and 9.4.
diff --git a/doc/repmgr-standby-follow.sgml b/doc/repmgr-standby-follow.sgml
index 1bee915b..0079dc37 100644
--- a/doc/repmgr-standby-follow.sgml
+++ b/doc/repmgr-standby-follow.sgml
@@ -70,7 +70,7 @@
-
+
diff --git a/doc/switchover.sgml b/doc/switchover.sgml
index 74109e13..00e3d54a 100644
--- a/doc/switchover.sgml
+++ b/doc/switchover.sgml
@@ -180,6 +180,10 @@
+
+ pg_rewind
+ using with "repmgr standby switchover"
+
Switchover and pg_rewind
If the demotion candidate does not shut down smoothly or cleanly, there's a risk it
@@ -206,7 +210,7 @@
pg_rewind has been part of the core PostgreSQL distribution since
- version 9.5. Users of versions 9.3 and 9.4 will need to manually install; source code available here:
+ version 9.5. Users of versions 9.3 and 9.4 will need to manually install it; the source code is available here:
https://github.com/vmware/pg_rewind.
If the pg_rewind
binary is not installed in the PostgreSQL bin directory, provide
diff --git a/errcode.h b/errcode.h
index 9e4fd158..5cf39b7d 100644
--- a/errcode.h
+++ b/errcode.h
@@ -45,5 +45,6 @@
#define ERR_OUT_OF_MEMORY 21
#define ERR_SWITCHOVER_INCOMPLETE 22
#define ERR_FOLLOW_FAIL 23
+#define ERR_REJOIN_FAIL 24
#endif /* _ERRCODE_H_ */
diff --git a/repmgr-action-node.c b/repmgr-action-node.c
index 4246d1a9..4044baa7 100644
--- a/repmgr-action-node.c
+++ b/repmgr-action-node.c
@@ -1758,7 +1758,17 @@ do_node_rejoin(void)
PQfinish(upstream_conn);
/* connect to registered primary and check it's not in recovery */
- upstream_conn = establish_db_connection(primary_node_record.conninfo, true);
+ upstream_conn = establish_db_connection(primary_node_record.conninfo, false);
+
+ if (PQstatus(upstream_conn) != CONNECTION_OK)
+ {
+ log_error(_("unable to connect to current primary \"%s\" (node ID: %i)"),
+ primary_node_record.node_name,
+ primary_node_record.node_id);
+ log_detail(_("primay node conninfo is: \"%s\""),
+ primary_node_record.conninfo);
+ exit(ERR_BAD_CONFIG);
+ }
upstream_recovery_type = get_recovery_type(upstream_conn);
@@ -1992,22 +2002,99 @@ do_node_rejoin(void)
}
/*
- * XXX add checks that node actually started and connected to primary,
- * if not exit with ERR_REJOIN_FAIL
+ * Actively check that node actually started and connected to primary,
+ * if not exit with ERR_REJOIN_FAIL.
+ *
+ * This check can be overridden with -W/--no-wait, in which case a one-time
+ * check will be carried out.
*/
+ if (runtime_options.no_wait == false)
+ {
+ int i;
- create_event_notification(upstream_conn,
- &config_file_options,
- config_file_options.node_id,
- "node_rejoin",
- success,
- follow_output.data);
+ for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
+ {
+ if (is_server_available(config_file_options.conninfo))
+ {
+ log_verbose(LOG_INFO, _("demoted primary is pingable"));
+ break;
+ }
- PQfinish(upstream_conn);
+ if (i % 5 == 0)
+ {
+ log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
+ config_file_options.node_id,
+ i + 1, config_file_options.standby_reconnect_timeout);
+ }
+ else
+ {
+ log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
+ config_file_options.node_id,
+ i + 1, config_file_options.standby_reconnect_timeout);
+ }
- log_notice(_("NODE REJOIN successful"));
- log_detail("%s", follow_output.data);
+ sleep(1);
+ }
+ for (; i < config_file_options.standby_reconnect_timeout; i++)
+ {
+ success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
+
+ if (success == true)
+ {
+ log_verbose(LOG_INFO, _("node %i has attached to its upstream node"),
+ config_file_options.node_id);
+ break;
+ }
+
+ if (i % 5 == 0)
+ {
+ log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"),
+ config_file_options.node_id,
+ i + 1, config_file_options.standby_reconnect_timeout);
+ }
+ else
+ {
+ log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
+ config_file_options.node_id,
+ i + 1, config_file_options.standby_reconnect_timeout);
+ }
+
+ sleep(1);
+ }
+
+ create_event_notification(upstream_conn,
+ &config_file_options,
+ config_file_options.node_id,
+ "node_rejoin",
+ success,
+ follow_output.data);
+
+ if (success == false)
+ {
+ termPQExpBuffer(&follow_output);
+ log_notice(_("NODE REJOIN failed"));
+ exit(ERR_REJOIN_FAIL);
+ }
+ }
+ else
+ {
+ success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
+ }
+
+
+ if (success == true)
+ {
+ log_notice(_("NODE REJOIN successful"));
+ log_detail("%s", follow_output.data);
+ }
+ else
+ {
+ /*
+ * if we reach here, no record found in upstream node's pg_stat_replication */
+ log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
+ log_hint(_("you will need to manually check the node's replication status"));
+ }
termPQExpBuffer(&follow_output);
return;
@@ -2474,6 +2561,7 @@ do_node_help(void)
" after executing \"pg_rewind\"\n"));
printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \
" (default: /tmp)\n"));
+ printf(_(" -W/--no-wait don't wait for the node to rejoin cluster\n"));
puts("");
printf(_("NODE SERVICE\n"));
diff --git a/repmgr-client-global.h b/repmgr-client-global.h
index 3f7e41b1..8139b063 100644
--- a/repmgr-client-global.h
+++ b/repmgr-client-global.h
@@ -42,6 +42,7 @@ typedef struct
bool force;
char pg_bindir[MAXLEN]; /* overrides setting in repmgr.conf */
bool wait;
+ bool no_wait;
/* logging options */
char log_level[MAXLEN]; /* overrides setting in repmgr.conf */
@@ -134,7 +135,7 @@ typedef struct
/* configuration metadata */ \
false, false, false, false, \
/* general configuration options */ \
- "", false, false, "", false, \
+ "", false, false, "", false, false, \
/* logging options */ \
"", false, false, false, \
/* output options */ \
diff --git a/repmgr-client.c b/repmgr-client.c
index a9d4cf98..d4239dd1 100644
--- a/repmgr-client.c
+++ b/repmgr-client.c
@@ -178,7 +178,7 @@ main(int argc, char **argv)
strncpy(runtime_options.username, pw->pw_name, MAXLEN);
}
- while ((c = getopt_long(argc, argv, "?Vb:f:FWd:h:p:U:R:S:D:ck:L:tvC:", long_options,
+ while ((c = getopt_long(argc, argv, "?Vb:f:FwWd:h:p:U:R:S:D:ck:L:tvC:", long_options,
&optindex)) != -1)
{
/*
@@ -243,11 +243,16 @@ main(int argc, char **argv)
strncpy(runtime_options.replication_user, optarg, MAXLEN);
break;
- /* -W/--wait */
- case 'W':
+ /* -w/--wait */
+ case 'w':
runtime_options.wait = true;
break;
+ /* -W/--no-wait */
+ case 'W':
+ runtime_options.no_wait = true;
+ break;
+
/*----------------------------
* database connection options
*----------------------------
@@ -1571,6 +1576,41 @@ check_cli_parameters(const int action)
}
}
+ /* --wait/--no-wait */
+
+ if (runtime_options.wait == true && runtime_options.no_wait == true)
+ {
+ item_list_append_format(&cli_errors,
+ _("both --wait and --no-wait options provided"));
+ }
+ else
+ {
+ if (runtime_options.wait)
+ {
+ switch (action)
+ {
+ case STANDBY_FOLLOW:
+ break;
+ default:
+ item_list_append_format(&cli_warnings,
+ _("--wait will be ignored when executing %s"),
+ action_name(action));
+ }
+ }
+ else if (runtime_options.wait)
+ {
+ switch (action)
+ {
+ case NODE_REJOIN:
+ break;
+ default:
+ item_list_append_format(&cli_warnings,
+ _("--no-wait will be ignored when executing %s"),
+ action_name(action));
+ }
+ }
+ }
+
/* repmgr node service --action */
if (runtime_options.action[0] != '\0')
{
diff --git a/repmgr-client.h b/repmgr-client.h
index b3ad5518..f8bcc3b7 100644
--- a/repmgr-client.h
+++ b/repmgr-client.h
@@ -86,6 +86,7 @@
#define OPT_REPL_CONN 1037
#define OPT_REMOTE_NODE_ID 1038
#define OPT_RECOVERY_CONF_ONLY 1039
+#define OPT_NO_WAIT 1040
/* deprecated since 3.3 */
#define OPT_DATA_DIR 999
@@ -104,7 +105,8 @@ static struct option long_options[] =
{"dry-run", no_argument, NULL, OPT_DRY_RUN},
{"force", no_argument, NULL, 'F'},
{"pg_bindir", required_argument, NULL, 'b'},
- {"wait", no_argument, NULL, 'W'},
+ {"wait", no_argument, NULL, 'w'},
+ {"no-wait", no_argument, NULL, 'W'},
/* connection options */
{"dbname", required_argument, NULL, 'd'},