From fcd111ac4c1324081d166227acadaedee4eef647 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 24 Aug 2017 22:44:03 +0900 Subject: [PATCH] Improve logging output during failover process --- dbutils.c | 29 +++++++++++++++++++++++++++++ dbutils.h | 1 + repmgr--4.0.sql | 6 +++++- repmgr-action-standby.c | 6 ++---- repmgr.c | 39 ++++++++++++++++++++++++++++++++------- repmgrd-physical.c | 4 ++-- repmgrd.c | 12 +++++++++--- 7 files changed, 80 insertions(+), 17 deletions(-) diff --git a/dbutils.c b/dbutils.c index 5221c761..ec3d01b4 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1525,6 +1525,34 @@ identify_system(PGconn *repl_conn, t_system_identification *identification) } +bool +repmgrd_set_local_node_id(PGconn *conn, int local_node_id) +{ + PQExpBufferData query; + PGresult *res = NULL; + + initPQExpBuffer(&query); + + appendPQExpBuffer( + &query, + " SELECT repmgr.set_local_node_id(%i)", + local_node_id); + + res = PQexec(conn, query.data); + termPQExpBuffer(&query); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + return false; + } + + PQclear(res); + return true; +} + + + /* ================ */ /* result functions */ /* ================ */ @@ -3599,6 +3627,7 @@ announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_no return retval; } + void notify_follow_primary(PGconn *conn, int primary_node_id) { diff --git a/dbutils.h b/dbutils.h index 37b596cf..a58ed7ad 100644 --- a/dbutils.h +++ b/dbutils.h @@ -365,6 +365,7 @@ int get_primary_node_id(PGconn *conn); bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason); int get_ready_archive_files(PGconn *conn, const char *data_directory); bool identify_system(PGconn *repl_conn, t_system_identification *identification); +bool repmgrd_set_local_node_id(PGconn *conn, int local_node_id); /* extension functions */ ExtensionStatus get_repmgr_extension_status(PGconn *conn); diff --git a/repmgr--4.0.sql b/repmgr--4.0.sql index e260890d..497e8b5a 100644 --- a/repmgr--4.0.sql +++ b/repmgr--4.0.sql @@ -56,6 +56,11 @@ LEFT JOIN repmgr.nodes un /* monitoring functions */ +CREATE FUNCTION set_local_node_id(INT) + RETURNS VOID + AS '$libdir/repmgr', 'set_local_node_id' + LANGUAGE C STRICT; + CREATE FUNCTION standby_set_last_updated() RETURNS TIMESTAMP WITH TIME ZONE AS '$libdir/repmgr', 'standby_set_last_updated' @@ -66,7 +71,6 @@ CREATE FUNCTION standby_get_last_updated() AS '$libdir/repmgr', 'standby_get_last_updated' LANGUAGE C STRICT; - /* failover functions */ CREATE FUNCTION request_vote(INT,INT) diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index c8519c96..0f469517 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -1141,8 +1141,6 @@ _do_standby_promote_internal(const char *data_dir) PGconn *conn = NULL; RecoveryType recovery_type = RECTYPE_UNKNOWN; - log_notice(_("promoting standby")); - /* * Promote standby to primary. * @@ -1153,8 +1151,8 @@ _do_standby_promote_internal(const char *data_dir) get_server_action(ACTION_PROMOTE, script, (char *)data_dir); - log_notice(_("promoting server using '%s'"), - script); + log_notice(_("promoting standby")); + log_detail(_("promoting server using '%s'"), script); r = system(script); if (r != 0) diff --git a/repmgr.c b/repmgr.c index ac0d6661..1f8cc3a4 100644 --- a/repmgr.c +++ b/repmgr.c @@ -46,6 +46,7 @@ typedef struct repmgrdSharedState { LWLockId lock; /* protects search/modification */ TimestampTz last_updated; + int local_node_id; /* streaming failover */ NodeState node_state; NodeVotingStatus voting_status; @@ -66,6 +67,9 @@ void _PG_fini(void); static void repmgr_shmem_startup(void); +Datum set_local_node_id(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(set_local_node_id); + Datum standby_set_last_updated(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(standby_set_last_updated); @@ -107,9 +111,9 @@ PG_FUNCTION_INFO_V1(unset_bdr_failover_handler); void _PG_init(void) { - elog(INFO, "repmgr init"); + elog(DEBUG1, "repmgr init"); - // error here? + /* error here? */ if (!process_shared_preload_libraries_in_progress) return; @@ -128,6 +132,8 @@ _PG_init(void) shmem_startup_hook = repmgr_shmem_startup; } + + /* * Module unload callback */ @@ -171,6 +177,7 @@ repmgr_shmem_startup(void) shared_state->lock = LWLockAssign(); #endif + shared_state->local_node_id = UNKNOWN_NODE_ID; shared_state->current_electoral_term = 0; shared_state->voting_status = VS_NO_VOTE; shared_state->candidate_node_id = UNKNOWN_NODE_ID; @@ -186,6 +193,21 @@ repmgr_shmem_startup(void) /* monitoring functions */ /* ==================== */ +Datum +set_local_node_id(PG_FUNCTION_ARGS) +{ + int local_node_id = PG_GETARG_INT32(0); + + if (!shared_state) + PG_RETURN_NULL(); + + LWLockAcquire(shared_state->lock, LW_SHARED); + shared_state->local_node_id = local_node_id; + LWLockRelease(shared_state->lock); + + PG_RETURN_VOID(); +} + /* update and return last updated with current timestamp */ Datum standby_set_last_updated(PG_FUNCTION_ARGS) @@ -254,7 +276,8 @@ request_vote(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - elog(INFO, "requesting node id is %i for electoral term %i (our term: %i)", + elog(INFO, "node %i has received request from node %i for electoral term %i (our term: %i)", + shared_state->local_node_id, requesting_node_id, current_electoral_term, shared_state->current_electoral_term); @@ -270,7 +293,7 @@ request_vote(PG_FUNCTION_ARGS) "SELECT pg_catalog.pg_last_xlog_receive_location()"); #endif - elog(INFO, "query: %s", query.data); + elog(DEBUG1, "query: %s", query.data); ret = SPI_execute(query.data, true, 0); if (ret < 0) @@ -285,7 +308,7 @@ request_vote(PG_FUNCTION_ARGS) 1, &isnull)); - elog(INFO, "Our LSN is %X/%X", + elog(DEBUG1, "our LSN is %X/%X", (uint32) (our_lsn >> 32), (uint32) our_lsn); @@ -388,10 +411,12 @@ notify_follow_primary(PG_FUNCTION_ARGS) if (!shared_state) PG_RETURN_NULL(); - elog(INFO, "received notification to follow node %i", primary_node_id); - LWLockAcquire(shared_state->lock, LW_SHARED); + elog(INFO, "node %i received notification to follow node %i", + shared_state->local_node_id, + primary_node_id); + /* Explicitly set the primary node id */ shared_state->candidate_node_id = primary_node_id; shared_state->follow_new_primary = true; diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 981b5336..2a906284 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -799,7 +799,7 @@ do_primary_failover(void) } else if (election_result == ELECTION_WON) { - log_notice("I am the winner, will now promote self and inform other nodes"); + log_notice("this node is the winner, will now promote self and inform other nodes"); failover_state = promote_self(); } @@ -1648,7 +1648,7 @@ follow_new_primary(int new_primary_id) local_node_info.node_id, upstream_node_info.node_id); - log_notice("%s\n", event_details.data); + log_notice("%s", event_details.data); create_event_notification( upstream_conn, diff --git a/repmgrd.c b/repmgrd.c index 660425c6..bfdb319a 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -307,6 +307,8 @@ main(int argc, char **argv) terminate(ERR_BAD_CONFIG); } + repmgrd_set_local_node_id(local_conn, config_file_options.node_id); + if (config_file_options.replication_type == REPLICATION_TYPE_BDR) { log_debug("node id is %i", local_node_info.node_id); @@ -639,9 +641,13 @@ try_reconnect(t_node_info *node_info) PQfinish(conn); log_notice(_("unable to reconnect to node")); } - log_info(_("sleeping %i seconds until next reconnection attempt"), - config_file_options.reconnect_interval); - sleep(config_file_options.reconnect_interval); + + if (i + 1 < max_attempts) + { + log_info(_("sleeping %i seconds until next reconnection attempt"), + config_file_options.reconnect_interval); + sleep(config_file_options.reconnect_interval); + } }