standby clone: check upstream connections after data copy operation

With long-running copy operations, it's possible that the connection(s) to
the primary/source server will have been lost by the time the copy
completes, so recheck their availability before attempting to reuse them.
This commit is contained in:
Ian Barwick
2019-02-26 14:33:54 +09:00
parent 897e3bee14
commit 0578053875
5 changed files with 44 additions and 1 deletions

View File

@@ -18,6 +18,7 @@
it will actually be possible to stream from the target node (Ian)
repmgr: "standby switchover": improve handling of connection URIs when
executing "node rejoin" on the demotion candidate; GitHub #525 (Ian)
repmgr: check for stale connections during "standby clone" (Ian)
repmgr: fix long node ID display in "cluster show" (Ian)
repmgr: check for primary server before executing "witness register";
GitHub #538 (Ian)

View File

@@ -4272,6 +4272,25 @@ connection_ping(PGconn *conn)
}
/*
 * connection_ping_reconnect()
 *
 * Ping the server via "conn" and, if the connection is not in state
 * CONNECTION_OK afterwards, log the libpq error, reset the connection
 * with PQreset() and ping a second time.
 *
 * Returns the status from the most recent connection_ping() call; no
 * further retries are made if the second ping also fails.
 */
ExecStatusType
connection_ping_reconnect(PGconn *conn)
{
	ExecStatusType status;

	/* the first ping must happen before the connection state is inspected */
	status = connection_ping(conn);

	if (PQstatus(conn) != CONNECTION_OK)
	{
		log_warning(_("connection error, attempting to reset"));
		/* report the error before PQreset() discards the old connection */
		log_detail("%s", PQerrorMessage(conn));

		PQreset(conn);

		/* the value returned to the caller reflects the state after the reset */
		status = connection_ping(conn);
	}

	log_verbose(LOG_DEBUG, "connection_ping_reconnect(): result is %s", PQresStatus(status));

	return status;
}
/* ==================== */
/* monitoring functions */

View File

@@ -516,6 +516,7 @@ int wait_connection_availability(PGconn *conn, long long timeout);
bool is_server_available(const char *conninfo);
bool is_server_available_params(t_conninfo_param_list *param_list);
ExecStatusType connection_ping(PGconn *conn);
ExecStatusType connection_ping_reconnect(PGconn *conn);
/* monitoring functions */
void

View File

@@ -185,6 +185,14 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
</para>
</listitem>
<listitem>
<para>
&repmgr;: when executing <link linkend="repmgr-standby-clone"><command>repmgr standby clone</command></link>,
recheck primary/upstream connection(s) after the data copy operation is complete, as these may
have gone away.
</para>
</listitem>
<listitem>
<para>
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,

View File

@@ -605,7 +605,6 @@ do_standby_clone(void)
log_error(_("unknown clone mode"));
}
/* If the backup failed then exit */
if (r != SUCCESS)
{
@@ -5794,6 +5793,12 @@ run_basebackup(t_node_info *node_record)
if (r != 0)
return ERR_BAD_BASEBACKUP;
/* check connections are still available */
(void)connection_ping_reconnect(primary_conn);
if (source_conn != primary_conn)
(void)connection_ping_reconnect(source_conn);
/*
* If replication slots in use, check the created slot is on the correct
* node; the slot will initially get created on the source node, and will
@@ -6396,6 +6401,15 @@ stop_backup:
RecordStatus record_status = RECORD_NOT_FOUND;
PGconn *upstream_conn = NULL;
/* check connections are still available */
(void)connection_ping_reconnect(primary_conn);
if (source_conn != primary_conn)
(void)connection_ping_reconnect(source_conn);
(void)connection_ping_reconnect(source_conn);
record_status = get_node_record(source_conn, upstream_node_id, &upstream_node_record);
if (record_status != RECORD_FOUND)