Regularly sync witness server repl_nodes table.

Although the witness server will resync the repl_nodes table following
a failover, other operations (e.g. removing or cloning a standby)
were previously not reflected in the witness server's copy of this
table.

As a short-term workaround, automatically resync the table at regular
intervals (defined by the configuration file parameter
"witness_repl_nodes_sync_interval_secs", default 30 seconds).
This commit is contained in:
Ian Barwick
2016-03-29 16:46:18 +09:00
committed by Ian Barwick
parent f9dba283d4
commit 0ebd9c15d9
7 changed files with 54 additions and 16 deletions

View File

@@ -235,6 +235,9 @@ parse_config(t_configuration_options *options)
options->monitor_interval_secs = 2;
options->retry_promote_interval_secs = 300;
/* default to resyncing repl_nodes table every 30 seconds on the witness server */
options->witness_repl_nodes_sync_interval_secs = 30;
memset(options->event_notification_command, 0, sizeof(options->event_notification_command));
options->tablespace_mapping.head = NULL;
@@ -358,6 +361,8 @@ parse_config(t_configuration_options *options)
options->monitor_interval_secs = repmgr_atoi(value, "monitor_interval_secs", &config_errors, false);
else if (strcmp(name, "retry_promote_interval_secs") == 0)
options->retry_promote_interval_secs = repmgr_atoi(value, "retry_promote_interval_secs", &config_errors, false);
else if (strcmp(name, "witness_repl_nodes_sync_interval_secs") == 0)
options->witness_repl_nodes_sync_interval_secs = repmgr_atoi(value, "witness_repl_nodes_sync_interval_secs", &config_errors, false);
else if (strcmp(name, "use_replication_slots") == 0)
/* XXX we should have a dedicated boolean argument format */
options->use_replication_slots = repmgr_atoi(value, "use_replication_slots", &config_errors, false);

View File

@@ -75,13 +75,14 @@ typedef struct
char logfile[MAXLEN];
int monitor_interval_secs;
int retry_promote_interval_secs;
int witness_repl_nodes_sync_interval_secs;
int use_replication_slots;
char event_notification_command[MAXLEN];
EventNotificationList event_notifications;
TablespaceList tablespace_mapping;
} t_configuration_options;
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} }
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} }
typedef struct ErrorListCell
{

View File

@@ -1138,7 +1138,7 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
}
sqlquery_snprintf(sqlquery,
"SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name FROM %s.repl_nodes",
"SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name, active FROM %s.repl_nodes",
get_repmgr_schema_quoted(masterconn));
log_verbose(LOG_DEBUG, "copy_configuration():\n%s\n", sqlquery);
@@ -1158,7 +1158,7 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
log_verbose(LOG_DEBUG,
"copy_configuration(): writing node record for node %s (id: %s)\n",
PQgetvalue(res, i, 4),
PQgetvalue(res, i, 3),
PQgetvalue(res, i, 0));
node_record_created = create_node_record(witnessconn,
@@ -1174,7 +1174,10 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
atoi(PQgetvalue(res, i, 5)),
strlen(PQgetvalue(res, i, 6))
? PQgetvalue(res, i, 6)
: NULL
: NULL,
(strcmp(PQgetvalue(res, i, 7), "t") == 0)
? true
: false
);
if (node_record_created == false)
@@ -1200,7 +1203,7 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
* XXX we should pass the record parameters as a struct.
*/
bool
create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name)
create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active)
{
char sqlquery[QUERY_STR_LEN];
char upstream_node_id[MAXLEN];
@@ -1241,8 +1244,8 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
sqlquery_snprintf(sqlquery,
"INSERT INTO %s.repl_nodes "
" (id, type, upstream_node_id, cluster, "
" name, conninfo, slot_name, priority) "
"VALUES (%i, '%s', %s, '%s', '%s', '%s', %s, %i) ",
" name, conninfo, slot_name, priority, active) "
"VALUES (%i, '%s', %s, '%s', '%s', '%s', %s, %i, %s) ",
get_repmgr_schema_quoted(conn),
node,
type,
@@ -1251,7 +1254,8 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
node_name,
conninfo,
slot_name_buf,
priority);
priority,
active == true ? "TRUE" : "FALSE");
log_verbose(LOG_DEBUG, "create_node_record(): %s\n", sqlquery);
@@ -1291,7 +1295,7 @@ delete_node_record(PGconn *conn, int node, char *action)
if (action != NULL)
{
log_verbose(LOG_DEBUG, "create_node_record(): action is \"%s\"\n", action);
log_verbose(LOG_DEBUG, "delete_node_record(): action is \"%s\"\n", action);
}
res = PQexec(conn, sqlquery);

View File

@@ -122,7 +122,7 @@ bool start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint);
bool stop_backup(PGconn *conn, char *last_wal_segment);
bool set_config_bool(PGconn *conn, const char *config_param, bool state);
bool copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name);
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name);
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active);
bool delete_node_record(PGconn *conn, int node, char *action);
int get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info);
bool update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active);

View File

@@ -1071,7 +1071,8 @@ do_master_register(void)
options.node_name,
options.conninfo,
options.priority,
repmgr_slot_name_ptr);
repmgr_slot_name_ptr,
true);
if (record_created == false)
{
@@ -1172,9 +1173,8 @@ do_standby_register(void)
options.node_name,
options.conninfo,
options.priority,
repmgr_slot_name_ptr);
repmgr_slot_name_ptr,
true);
if (record_created == false)
{
@@ -3784,7 +3784,8 @@ do_witness_create(void)
options.node_name,
options.conninfo,
options.priority,
NULL);
NULL,
true);
if (record_created == false)
{

View File

@@ -146,3 +146,6 @@ monitor_interval_secs=2
# seconds; by default this would be half an hour, as 'retry_promote_interval_secs'
# default value is 300)
retry_promote_interval_secs=300
# Number of seconds after which the witness server resyncs the repl_nodes table
witness_repl_nodes_sync_interval_secs=15

View File

@@ -298,9 +298,12 @@ main(int argc, char **argv)
*/
do
{
/* Timer for repl_nodes synchronisation interval */
int sync_repl_nodes_elapsed = 0;
/*
* Set my server mode, establish a connection to master and start
* monitor
* monitoring
*/
switch (node_info.type)
@@ -472,6 +475,24 @@ main(int argc, char **argv)
sleep(local_options.monitor_interval_secs);
/*
* On a witness node, regularly resync the repl_nodes table
* to keep up with any changes on the primary
*
* TODO: only resync the table if changes actually detected
*/
if (node_info.type == WITNESS)
{
sync_repl_nodes_elapsed += local_options.monitor_interval_secs;
log_debug(_("%i - %i \n"), sync_repl_nodes_elapsed, local_options.witness_repl_nodes_sync_interval_secs);
if(sync_repl_nodes_elapsed >= local_options.witness_repl_nodes_sync_interval_secs)
{
log_debug(_("Resyncing repl_nodes table\n"));
copy_configuration(master_conn, my_local_conn, local_options.cluster_name);
sync_repl_nodes_elapsed = 0;
}
}
if (got_SIGHUP)
{
/*
@@ -486,6 +507,7 @@ main(int argc, char **argv)
}
got_SIGHUP = false;
}
if (failover_done)
{
log_debug(_("standby check loop will terminate\n"));
@@ -1954,6 +1976,8 @@ check_node_configuration(void)
/* Adding the node */
log_info(_("adding node %d to cluster '%s'\n"),
local_options.node, local_options.cluster_name);
/* XXX use create_node_record() */
sqlquery_snprintf(sqlquery,
"INSERT INTO %s.repl_nodes"
" (id, cluster, name, conninfo, priority, witness) "