diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index a1b1d0e986..6ba6e6021d 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1103,6 +1103,7 @@ void clusterInit(void) {
     server.cluster->failover_auth_time = 0;
     server.cluster->failover_auth_count = 0;
     server.cluster->failover_auth_rank = 0;
+    server.cluster->failover_failed_primary_rank = 0;
     server.cluster->failover_auth_epoch = 0;
     server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
     server.cluster->lastVoteEpoch = 0;
@@ -4510,6 +4511,45 @@ int clusterGetReplicaRank(void) {
     return rank;
 }
 
+/* This function returns the "rank" of this instance's primary in the list
+ * of failed primaries. A failed primary is ignored if its fail time exceeds
+ * cluster-node-timeout * cluster-replica-validity-factor.
+ *
+ * If multiple primaries go down at the same time, there is a chance that
+ * their replicas initiate their elections simultaneously, so that none of
+ * them collects enough votes.
+ *
+ * The failed primary rank is used to add a delay before starting an
+ * election, so that replicas of different failed primaries are staggered. */
+int clusterGetFailedPrimaryRank(void) {
+    serverAssert(nodeIsReplica(myself));
+    serverAssert(myself->replicaof);
+
+    int rank = 0;
+    mstime_t now = mstime();
+    dictIterator *di;
+    dictEntry *de;
+
+    di = dictGetSafeIterator(server.cluster->nodes);
+    while ((de = dictNext(di)) != NULL) {
+        clusterNode *node = dictGetVal(de);
+
+        /* Skip nodes that do not participate in the rank. */
+        if (!nodeFailed(node) || !clusterNodeIsVotingPrimary(node) || node->num_replicas == 0) continue;
+
+        /* If cluster-replica-validity-factor is enabled, skip stale failed nodes. */
+        if (server.cluster_replica_validity_factor) {
+            if ((now - node->fail_time) > (server.cluster_node_timeout * server.cluster_replica_validity_factor))
+                continue;
+        }
+
+        if (memcmp(node->shard_id, myself->shard_id, CLUSTER_NAMELEN) < 0) rank++;
+    }
+    dictReleaseIterator(di);
+
+    return rank;
+}
+
 /* This function is called by clusterHandleReplicaFailover() in order to
  * let the replica log why it is not able to failover. Sometimes there are
  * not the conditions, but since the failover function is called again and
@@ -4691,6 +4731,11 @@ void clusterHandleReplicaFailover(void) {
      * Specifically 1 second * rank. This way replicas that have a probably
      * less updated replication offset, are penalized. */
     server.cluster->failover_auth_time += server.cluster->failover_auth_rank * 1000;
+    /* We add another delay that is proportional to the failed primary rank.
+     * Specifically 0.5 second * rank. This way replicas of different failed
+     * primaries are elected in rank order, avoiding vote conflicts. */
+    server.cluster->failover_failed_primary_rank = clusterGetFailedPrimaryRank();
+    server.cluster->failover_auth_time += server.cluster->failover_failed_primary_rank * 500;
     /* However if this is a manual failover, no delay is needed. */
     if (server.cluster->mf_end) {
         server.cluster->failover_auth_time = now;
@@ -4701,9 +4746,9 @@
     }
     serverLog(LL_NOTICE,
               "Start of election delayed for %lld milliseconds "
-              "(rank #%d, offset %lld).",
+              "(rank #%d, primary rank #%d, offset %lld).",
               server.cluster->failover_auth_time - now, server.cluster->failover_auth_rank,
-              replicationGetReplicaOffset());
+              server.cluster->failover_failed_primary_rank, replicationGetReplicaOffset());
     /* Now that we have a scheduled election, broadcast our offset
      * to all the other replicas so that they'll updated their offsets
      * if our offset is better. */
@@ -4719,6 +4764,9 @@
      * replicas for the same primary since we computed our election delay.
      * Update the delay if our rank changed.
      *
+     * It is also possible that we received a message telling us that a shard
+     * is up again. Update the delay if our failed_primary_rank changed.
+     *
      * Not performed if this is a manual failover. */
     if (server.cluster->failover_auth_sent == 0 && server.cluster->mf_end == 0) {
         int newrank = clusterGetReplicaRank();
@@ -4729,6 +4777,15 @@
             serverLog(LL_NOTICE, "Replica rank updated to #%d, added %lld milliseconds of delay.", newrank,
                       added_delay);
         }
+
+        int new_failed_primary_rank = clusterGetFailedPrimaryRank();
+        if (new_failed_primary_rank != server.cluster->failover_failed_primary_rank) {
+            long long added_delay = (new_failed_primary_rank - server.cluster->failover_failed_primary_rank) * 500;
+            server.cluster->failover_auth_time += added_delay;
+            server.cluster->failover_failed_primary_rank = new_failed_primary_rank;
+            serverLog(LL_NOTICE, "Failed primary rank updated to #%d, added %lld milliseconds of delay.",
+                      new_failed_primary_rank, added_delay);
+        }
     }
 
     /* Return ASAP if we can't still start the election. */
diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index ac14bd583c..226842c5dc 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -382,13 +382,14 @@ struct clusterState {
     clusterNode *importing_slots_from[CLUSTER_SLOTS];
     clusterNode *slots[CLUSTER_SLOTS];
     /* The following fields are used to take the replica state on elections. */
-    mstime_t failover_auth_time;  /* Time of previous or next election. */
-    int failover_auth_count;      /* Number of votes received so far. */
-    int failover_auth_sent;       /* True if we already asked for votes. */
-    int failover_auth_rank;       /* This replica rank for current auth request. */
-    uint64_t failover_auth_epoch; /* Epoch of the current election. */
-    int cant_failover_reason;     /* Why a replica is currently not able to
-                                     failover. See the CANT_FAILOVER_* macros. */
+    mstime_t failover_auth_time;      /* Time of previous or next election. */
+    int failover_auth_count;          /* Number of votes received so far. */
+    int failover_auth_sent;           /* True if we already asked for votes. */
+    int failover_auth_rank;           /* This replica rank for current auth request. */
+    int failover_failed_primary_rank; /* Rank of this instance's primary among all failed primaries. */
+    uint64_t failover_auth_epoch;     /* Epoch of the current election. */
+    int cant_failover_reason;         /* Why a replica is currently not able to
+                                       * failover. See the CANT_FAILOVER_* macros. */
     /* Manual failover state in common. */
     mstime_t mf_end;              /* Manual failover time limit (ms unixtime).
                                      It is zero if there is no MF in progress. */
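To make the staggering concrete, here is a minimal standalone sketch (not part of the patch) of the ranking rule above: a replica counts the failed shards whose shard id compares lower than its own, and each unit of rank adds 500 ms of election delay. The shard ids, lengths, and counts below are made up for illustration; the real code compares CLUSTER_NAMELEN-byte binary ids over the live nodes dict.

#include <stdio.h>
#include <string.h>

#define NUM_FAILED 3
#define ID_LEN 8

/* Hypothetical shard ids of the failed primaries. */
static const char failed_shards[NUM_FAILED][ID_LEN] = {"aaaa", "bbbb", "cccc"};

/* Number of failed shards whose id compares lower than ours, mirroring
 * the memcmp() ranking in clusterGetFailedPrimaryRank(). */
static int failed_primary_rank(const char *my_shard) {
    int rank = 0;
    for (int i = 0; i < NUM_FAILED; i++)
        if (memcmp(failed_shards[i], my_shard, ID_LEN) < 0) rank++;
    return rank;
}

int main(void) {
    /* Each replica of a failed primary adds 500 ms per unit of rank, so
     * the three shards end up with 0 ms, 500 ms and 1000 ms extra delay. */
    for (int i = 0; i < NUM_FAILED; i++) {
        int rank = failed_primary_rank(failed_shards[i]);
        printf("replica of shard %s: failed primary rank %d -> +%d ms delay\n",
               failed_shards[i], rank, rank * 500);
    }
    return 0;
}

Because the rank is derived from a total order on shard ids, every surviving replica computes the same order without extra coordination, which is the design choice that avoids simultaneous elections.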
diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
index 9262049e4e..2272a150ee 100644
--- a/tests/unit/cluster/failover2.tcl
+++ b/tests/unit/cluster/failover2.tcl
@@ -62,10 +62,8 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval
         verify_no_log_message -3 "*Failover attempt expired*" 0
         verify_no_log_message -6 "*Failover attempt expired*" 0
     }
-
 } ;# start_cluster
-
 start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
     test "Primaries will not time out then they are elected in the same epoch" {
         # Since we have the delay time, so these node may not initiate the
@@ -102,3 +100,36 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval
         resume_process [srv -2 pid]
     }
 } ;# start_cluster
+
+run_solo {cluster} {
+    start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
+        test "Multiple primary nodes are down, rank them based on the failed primary" {
+            # Kill these primary nodes.
+            for {set j 0} {$j < 15} {incr j} {
+                pause_process [srv -$j pid]
+            }
+
+            # Make sure that a node starts the failover.
+            wait_for_condition 1000 100 {
+                [s -40 role] == "master"
+            } else {
+                fail "No failover detected"
+            }
+
+            # Wait for the cluster state to become ok.
+            for {set j 0} {$j < [llength $::servers]} {incr j} {
+                if {[process_is_paused [srv -$j pid]]} continue
+                wait_for_condition 1000 100 {
+                    [CI $j cluster_state] eq "ok"
+                } else {
+                    fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+                }
+            }
+
+            # Resume these primary nodes to speed up the shutdown.
+            for {set j 0} {$j < 15} {incr j} {
+                resume_process [srv -$j pid]
+            }
+        }
+    } ;# start_cluster
+} ;# run_solo
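For reference, a small sketch of the resulting delay arithmetic. It assumes the fixed 500 ms base delay plus up-to-500 ms random jitter that clusterHandleReplicaFailover() already applies before the hunks shown above (that code is outside this diff), and uses illustrative rank values:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    /* Illustrative values only. */
    int replica_rank = 1;        /* rank by replication offset within the shard */
    int failed_primary_rank = 2; /* rank of our shard among the failed primaries */

    /* Base delay and jitter applied before the rank terms (pre-existing). */
    long long delay = 500 + rand() % 500;
    delay += replica_rank * 1000;       /* 1 s per replica rank (pre-existing) */
    delay += failed_primary_rank * 500; /* 0.5 s per failed primary rank (this patch) */

    printf("election would be delayed by %lld ms\n", delay);
    return 0;
}

With these values the election starts roughly 3 to 3.5 seconds after failure detection, which is why the new 32-primary test above raises cluster-node-timeout to 15000 ms: the accumulated per-rank delays must still fit inside the failover window.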