mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 16:46:28 +00:00
Witness server: on failover attempt to reconnect to new master
Previously it was just quitting.
This commit is contained in:
@@ -443,7 +443,7 @@ get_master_connection(PGconn *standby_conn, char *cluster,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* if it is a standby clear info */
|
/* if it is a standby, clear info */
|
||||||
PQclear(res2);
|
PQclear(res2);
|
||||||
PQfinish(master_conn);
|
PQfinish(master_conn);
|
||||||
*master_id = -1;
|
*master_id = -1;
|
||||||
|
|||||||
55
repmgrd.c
55
repmgrd.c
@@ -482,23 +482,41 @@ witness_monitor(void)
|
|||||||
char monitor_witness_timestamp[MAXLEN];
|
char monitor_witness_timestamp[MAXLEN];
|
||||||
PGresult *res;
|
PGresult *res;
|
||||||
char sqlquery[QUERY_STR_LEN];
|
char sqlquery[QUERY_STR_LEN];
|
||||||
|
bool connection_ok;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check if the master is still available, if after 5 minutes of retries
|
* Check if master is available;
|
||||||
* we cannot reconnect, return false.
|
* if not, assume failover situation and try to determie new master
|
||||||
|
* ZZZ only if `AUTOMATIC_FAILOVER` set???
|
||||||
*/
|
*/
|
||||||
check_connection(primary_conn, "master"); /* this take up to
|
connection_ok = check_connection(primary_conn, "master");
|
||||||
* local_options.reconnect_attempts
|
|
||||||
* local_options.reconnect_intvl seconds
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
if(connection_ok == FALSE)
|
||||||
{
|
{
|
||||||
/*
|
|
||||||
* If we can't reconnect, just exit... XXX we need to make witness
|
log_debug(_("Old primary node ID: %i\n"), primary_options.node);
|
||||||
* connect to the new master
|
/* We need to wait a while for the new primary to be promoted */
|
||||||
*/
|
// ZZZ loop here `local_options.reconnect_attempts` times
|
||||||
terminate(0);
|
|
||||||
|
log_info(
|
||||||
|
_("Waiting %i seconds for a new master to be promoted...\n"),
|
||||||
|
local_options.master_response_timeout
|
||||||
|
);
|
||||||
|
|
||||||
|
sleep(local_options.master_response_timeout);
|
||||||
|
|
||||||
|
primary_conn = get_master_connection(my_local_conn,
|
||||||
|
local_options.cluster_name, &primary_options.node, NULL);
|
||||||
|
|
||||||
|
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_err(_("Unable to determine a valid master server, exiting...\n"));
|
||||||
|
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
terminate(ERR_DB_CON);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_debug(_("New master found with node ID: %i\n"), primary_options.node);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Fast path for the case where no history is requested */
|
/* Fast path for the case where no history is requested */
|
||||||
@@ -602,6 +620,7 @@ standby_monitor(void)
|
|||||||
if (local_options.failover == MANUAL_FAILOVER)
|
if (local_options.failover == MANUAL_FAILOVER)
|
||||||
{
|
{
|
||||||
log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
|
log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
|
||||||
|
// ZZZ why 6 here? make config option?
|
||||||
for (connection_retries = 0; connection_retries < 6; connection_retries++)
|
for (connection_retries = 0; connection_retries < 6; connection_retries++)
|
||||||
{
|
{
|
||||||
primary_conn = get_master_connection(my_local_conn,
|
primary_conn = get_master_connection(my_local_conn,
|
||||||
@@ -805,7 +824,7 @@ do_failover(void)
|
|||||||
res = PQexec(my_local_conn, sqlquery);
|
res = PQexec(my_local_conn, sqlquery);
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_err(_("Can't get nodes' info: %s\n"), PQerrorMessage(my_local_conn));
|
log_err(_("Unable to retrieve node records: %s\n"), PQerrorMessage(my_local_conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
terminate(ERR_DB_QUERY);
|
terminate(ERR_DB_QUERY);
|
||||||
}
|
}
|
||||||
@@ -1201,9 +1220,11 @@ check_connection(PGconn *conn, const char *type)
|
|||||||
}
|
}
|
||||||
if (!is_pgup(conn, local_options.master_response_timeout))
|
if (!is_pgup(conn, local_options.master_response_timeout))
|
||||||
{
|
{
|
||||||
log_err(_("%s: We couldn't reconnect for long enough, exiting...\n"),
|
log_err(_("%s: Unable to reconnect to master after %i seconds...\n"),
|
||||||
progname);
|
progname,
|
||||||
/* XXX Anything else to do here? */
|
local_options.master_response_timeout
|
||||||
|
);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@@ -1258,6 +1279,8 @@ check_node_configuration(void)
|
|||||||
*/
|
*/
|
||||||
log_info(_("%s Checking node %d in cluster '%s'\n"),
|
log_info(_("%s Checking node %d in cluster '%s'\n"),
|
||||||
progname, local_options.node, local_options.cluster_name);
|
progname, local_options.node, local_options.cluster_name);
|
||||||
|
|
||||||
|
// ZZZ change to COUNT(*) ???
|
||||||
sqlquery_snprintf(sqlquery,
|
sqlquery_snprintf(sqlquery,
|
||||||
"SELECT * "
|
"SELECT * "
|
||||||
" FROM %s.repl_nodes "
|
" FROM %s.repl_nodes "
|
||||||
|
|||||||
Reference in New Issue
Block a user