From 39234afcbf3921c4920ceec854e538f33e083898 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 26 Feb 2019 14:33:54 +0900 Subject: [PATCH] standby clone: check upstream connections after data copy operation With long-running copy operations, it's possible the connection(s) to the primary/source server may go away for some reason, so recheck their availability before attempting to reuse. --- HISTORY | 1 + dbutils.c | 19 +++++++++++++++++++ dbutils.h | 1 + doc/appendix-release-notes.sgml | 8 ++++++++ repmgr-action-standby.c | 16 +++++++++++++++- 5 files changed, 44 insertions(+), 1 deletion(-) diff --git a/HISTORY b/HISTORY index f384e151..6a629bf2 100644 --- a/HISTORY +++ b/HISTORY @@ -18,6 +18,7 @@ it will actually be possible to stream from the target node (Ian) repmgr: "standby switchover": improve handling of connection URIs when executing "node rejoin" on the demotion candidate; GitHub #525 (Ian) + repmgr: check for stale connections during "standby clone" (Ian) repmgr: fix long node ID display in "cluster show" (Ian) repmgr: check for primary server before executing "witness register"; GitHub #538 (Ian) diff --git a/dbutils.c b/dbutils.c index d5e12ca7..45db88e9 100644 --- a/dbutils.c +++ b/dbutils.c @@ -4272,6 +4272,25 @@ connection_ping(PGconn *conn) } +ExecStatusType +connection_ping_reconnect(PGconn *conn) +{ /* Ping conn; if its status is then not CONNECTION_OK, log the libpq error message, PQreset() the connection and ping once more. Returns the status of the last ping performed. */ + ExecStatusType ping_result = connection_ping(conn); + + if (PQstatus(conn) != CONNECTION_OK) + { + log_warning(_("connection error, attempting to reset")); + log_detail("%s", PQerrorMessage(conn)); + PQreset(conn); /* libpq call that attempts to re-establish the connection; its outcome is not checked here, only indirectly via the ping below */ + ping_result = connection_ping(conn); /* overwrite the first result so the caller sees the post-reset status */ + } + + log_verbose(LOG_DEBUG, "connection_ping_reconnect(): result is %s", PQresStatus(ping_result)); + + return ping_result; +} + + /* ==================== */ /* monitoring functions */ diff --git a/dbutils.h b/dbutils.h index b9934102..16b1ad9c 100644 --- a/dbutils.h +++ b/dbutils.h @@ -515,6 +515,7 @@ int wait_connection_availability(PGconn *conn, long long timeout); bool 
is_server_available(const char *conninfo); bool is_server_available_params(t_conninfo_param_list *param_list); ExecStatusType connection_ping(PGconn *conn); +ExecStatusType connection_ping_reconnect(PGconn *conn); /* ping conn; on a bad connection status, reset it and ping again */ /* monitoring functions */ void diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 8aeda485..926748e7 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -185,6 +185,14 @@ REPMGRD_OPTS="--daemonize=false" + + + &repmgr;: when executing repmgr standby clone, + recheck primary/upstream connection(s) after the data copy operation is complete, as these may + have gone away. + + + &repmgr;: when executing repmgr standby switchover, diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 1394eac6..ebe75ec9 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -605,7 +605,6 @@ do_standby_clone(void) log_error(_("unknown clone mode")); } - /* If the backup failed then exit */ if (r != SUCCESS) { @@ -5794,6 +5793,12 @@ run_basebackup(t_node_info *node_record) if (r != 0) return ERR_BAD_BASEBACKUP; + /* check connections are still available */ + (void)connection_ping_reconnect(primary_conn); + + if (source_conn != primary_conn) + (void)connection_ping_reconnect(source_conn); + /* * If replication slots in use, check the created slot is on the correct * node; the slot will initially get created on the source node, and will @@ -6396,6 +6401,15 @@ stop_backup: RecordStatus record_status = RECORD_NOT_FOUND; PGconn *upstream_conn = NULL; + + /* check connections are still available */ + (void)connection_ping_reconnect(primary_conn); + + if (source_conn != primary_conn) + (void)connection_ping_reconnect(source_conn); + + (void)connection_ping_reconnect(source_conn); /* NOTE(review): source_conn is either primary_conn (pinged above) or was pinged by the conditional call, so this unconditional ping looks redundant; the run_basebackup hunk has no such extra call - confirm whether it is intended */ + record_status = get_node_record(source_conn, upstream_node_id, &upstream_node_record); if (record_status != RECORD_FOUND)