Skip to content
This repository has been archived by the owner on Feb 18, 2025. It is now read-only.

Commit

Permalink
Merge pull request #1332 from openark/recover-non-writeable-master
Browse files Browse the repository at this point in the history
Introducing RecoverNonWriteableMaster flag
  • Loading branch information
shlomi-noach authored Apr 13, 2021
2 parents 06041e3 + 5bd38af commit 96533fd
Show file tree
Hide file tree
Showing 21 changed files with 76 additions and 15 deletions.
2 changes: 2 additions & 0 deletions go/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ type Configuration struct {
PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
PostGracefulTakeoverProcesses []string // Processes to execute after running a graceful master takeover. Uses same placeholders as PostFailoverProcesses
PostTakeMasterProcesses []string // Processes to execute after a successful Take-Master event has taken place
RecoverNonWriteableMaster bool // When 'true', orchestrator treats a read-only master as a failure scenario and attempts to make the master writeable
CoMasterRecoveryMustPromoteOtherCoMaster bool // When 'false', anything can get promoted (and candidates are preferred over others). When 'true', orchestrator will promote the other co-master or else fail
DetachLostSlavesAfterMasterFailover bool // synonym to DetachLostReplicasAfterMasterFailover
DetachLostReplicasAfterMasterFailover bool // Should replicas that are not to be lost in master recovery (i.e. were more up-to-date than promoted replica) be forcibly detached
Expand Down Expand Up @@ -414,6 +415,7 @@ func newConfiguration() *Configuration {
PostUnsuccessfulFailoverProcesses: []string{},
PostGracefulTakeoverProcesses: []string{},
PostTakeMasterProcesses: []string{},
RecoverNonWriteableMaster: false,
CoMasterRecoveryMustPromoteOtherCoMaster: true,
DetachLostSlavesAfterMasterFailover: true,
ApplyMySQLPromotionAfterMasterFailover: true,
Expand Down
23 changes: 11 additions & 12 deletions go/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import (
)

type AnalysisCode string
type StructureAnalysisCode string

const (
NoProblem AnalysisCode = "NoProblem"
Expand Down Expand Up @@ -62,16 +61,16 @@ const (
)

const (
StatementAndMixedLoggingReplicasStructureWarning StructureAnalysisCode = "StatementAndMixedLoggingReplicasStructureWarning"
StatementAndRowLoggingReplicasStructureWarning = "StatementAndRowLoggingReplicasStructureWarning"
MixedAndRowLoggingReplicasStructureWarning = "MixedAndRowLoggingReplicasStructureWarning"
MultipleMajorVersionsLoggingReplicasStructureWarning = "MultipleMajorVersionsLoggingReplicasStructureWarning"
NoLoggingReplicasStructureWarning = "NoLoggingReplicasStructureWarning"
DifferentGTIDModesStructureWarning = "DifferentGTIDModesStructureWarning"
ErrantGTIDStructureWarning = "ErrantGTIDStructureWarning"
NoFailoverSupportStructureWarning = "NoFailoverSupportStructureWarning"
NoWriteableMasterStructureWarning = "NoWriteableMasterStructureWarning"
NotEnoughValidSemiSyncReplicasStructureWarning = "NotEnoughValidSemiSyncReplicasStructureWarning"
StatementAndMixedLoggingReplicasStructureWarning AnalysisCode = "StatementAndMixedLoggingReplicasStructureWarning"
StatementAndRowLoggingReplicasStructureWarning = "StatementAndRowLoggingReplicasStructureWarning"
MixedAndRowLoggingReplicasStructureWarning = "MixedAndRowLoggingReplicasStructureWarning"
MultipleMajorVersionsLoggingReplicasStructureWarning = "MultipleMajorVersionsLoggingReplicasStructureWarning"
NoLoggingReplicasStructureWarning = "NoLoggingReplicasStructureWarning"
DifferentGTIDModesStructureWarning = "DifferentGTIDModesStructureWarning"
ErrantGTIDStructureWarning = "ErrantGTIDStructureWarning"
NoFailoverSupportStructureWarning = "NoFailoverSupportStructureWarning"
NoWriteableMasterStructureWarning = "NoWriteableMasterStructureWarning"
NotEnoughValidSemiSyncReplicasStructureWarning = "NotEnoughValidSemiSyncReplicasStructureWarning"
)

type InstanceAnalysis struct {
Expand Down Expand Up @@ -140,7 +139,7 @@ type ReplicationAnalysis struct {
IsFailingToConnectToMaster bool
Analysis AnalysisCode
Description string
StructureAnalysis []StructureAnalysisCode
StructureAnalysis []AnalysisCode
IsDowntimed bool
IsReplicasDowntimed bool // as good as downtimed because all replicas are downtimed AND analysis is all about the replicas (e.g. AllMasterReplicasNotReplicating)
DowntimeEndTimestamp string
Expand Down
6 changes: 5 additions & 1 deletion go/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.Description = "Master cannot be reached by orchestrator and all of its replicas are lagging"
//
} else if a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 {
// partial success is here to redice noise
// partial success is here to reduce noise
a.Analysis = UnreachableMaster
a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue"
//
Expand All @@ -531,6 +531,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.Description = "Semi sync master seems to be locked, more samplings needed to validate"
}
//
} else if a.IsMaster && a.LastCheckValid && a.IsReadOnly && a.CountValidReplicatingReplicas > 0 && config.Config.RecoverNonWriteableMaster {
a.Analysis = NoWriteableMasterStructureWarning
a.Description = "Master with replicas is read_only"
//
} else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = MasterSingleReplicaNotReplicating
a.Description = "Master is reachable but its single replica is not replicating"
Expand Down
34 changes: 32 additions & 2 deletions go/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (
ometrics "github.com/openark/orchestrator/go/metrics"
"github.com/openark/orchestrator/go/os"
"github.com/openark/orchestrator/go/process"
"github.com/openark/orchestrator/go/raft"
orcraft "github.com/openark/orchestrator/go/raft"
"github.com/openark/orchestrator/go/util"
"github.com/patrickmn/go-cache"
"github.com/rcrowley/go-metrics"
Expand Down Expand Up @@ -1450,7 +1450,34 @@ func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candida
return true, topologyRecovery, err
}

// checkAndRecoverGenericProblem is a general-purpose recovery function
// checkAndRecoverNonWriteableMaster tries to bring a read-only master back to a writeable
// state by switching read_only off on the analyzed instance.
// The whole flow is gated by config.Config.RecoverNonWriteableMaster: with the flag off,
// nothing is registered and nothing is attempted.
//
// It returns whether a recovery was attempted, the registered recovery entry (nil when the
// flag is off or when registration yielded no entry), and any error met along the way.
func checkAndRecoverNonWriteableMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
	if featureEnabled := config.Config.RecoverNonWriteableMaster; !featureEnabled {
		// Feature flag is off: neither an attempt nor an error.
		return false, nil, nil
	}

	topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true)
	if topologyRecovery == nil {
		// Registration gave us no entry; audit that and hand back whatever error registration reported.
		auditMessage := fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another checkAndRecoverNonWriteableMaster.", analysisEntry.AnalyzedInstanceKey)
		AuditTopologyRecovery(topologyRecovery, auditMessage)
		return false, nil, err
	}

	masterKey := &analysisEntry.AnalyzedInstanceKey
	inst.AuditOperation("recover-non-writeable-master", masterKey, "problem found; will recover")

	if !skipProcesses {
		// A failing pre-failover hook aborts the recovery; the error is recorded on the recovery entry.
		hooksErr := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true)
		if hooksErr != nil {
			return false, topologyRecovery, topologyRecovery.AddError(hooksErr)
		}
	}

	// The actual recovery step: turn read_only off on the analyzed master.
	writeableMaster, setErr := inst.SetReadOnly(masterKey, false)
	if setErr != nil {
		// The attempt was made but failed; the recovery is left unresolved.
		return true, topologyRecovery, setErr
	}
	resolveRecovery(topologyRecovery, writeableMaster)
	return true, topologyRecovery, nil
}

// checkAndRecoverLockedSemiSyncMaster
func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {

topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true)
Expand Down Expand Up @@ -1664,6 +1691,9 @@ func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstance
// replication group members
case inst.DeadReplicationGroupMemberWithReplicas:
return checkAndRecoverDeadGroupMemberWithReplicas, true
// recoverable structure analysis
case inst.NoWriteableMasterStructureWarning:
return checkAndRecoverNonWriteableMaster, true
}
// Right now this is mostly causing noise with no clear action.
// Will revisit this in the future.
Expand Down
1 change: 1 addition & 0 deletions tests/integration/analysis-no-writeable-master/create.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
UPDATE database_instance SET read_only=1 where port=22293;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
testhost:22293 (cluster testhost:22293): NoWriteableMasterStructureWarning
1 change: 1 addition & 0 deletions tests/integration/analysis-no-writeable-master/extra_args
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-c replication-analysis
1 change: 1 addition & 0 deletions tests/system/orchestrator-ci-system.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
"ReduceReplicationAnalysisCount": false,
"FailureDetectionPeriodBlockMinutes": 5,
"FailMasterPromotionOnLagMinutes": 1,
"RecoverNonWriteableMaster": true,
"RecoveryPeriodBlockSeconds": 5,
"RecoveryIgnoreHostnameFilters": [],
"RecoverMasterClusterFilters": [
Expand Down
3 changes: 3 additions & 0 deletions tests/system/recover-read-only-master/01-recover/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"RecoverNonWriteableMaster": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
127.0.0.1:10111
1 change: 1 addition & 0 deletions tests/system/recover-read-only-master/01-recover/run
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
orchestrator-client -c which-cluster-master -alias ci
6 changes: 6 additions & 0 deletions tests/system/recover-read-only-master/01-recover/setup
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Abort on the first failing command.
set -e

# Turn read_only on for the MySQL server at 127.0.0.1:10111, connecting as user "ci".
mysql -uci -pci -h 127.0.0.1 --port=10111 -e "set global read_only=1"
# Pause before the test's run step; presumably this gives orchestrator time to
# notice the read-only master and recover it -- TODO confirm the required delay.
sleep 6
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
false
3 changes: 3 additions & 0 deletions tests/system/recover-read-only-master/02-read-only/run
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Look up the master of the cluster aliased "ci"; the client reports it as host:port.
cluster_master="$(orchestrator-client -c which-cluster-master -alias ci)"
# The instance API path below uses host/port, so swap the ':' separator for '/'.
# Expansions are quoted so the value is never word-split or glob-expanded.
cluster_master_path="$(echo "$cluster_master" | tr ':' '/')"
# Print the raw ReadOnly field of the master's instance record.
orchestrator-client -c api -path "instance/$cluster_master_path" | jq -r '.ReadOnly'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3
3 changes: 3 additions & 0 deletions tests/system/recover-read-only-master/03-count-replicas/run
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Look up the master of the cluster aliased "ci"; the client reports it as host:port.
cluster_master="$(orchestrator-client -c which-cluster-master -alias ci)"
# The instance API path below uses host/port, so swap the ':' separator for '/'.
# Expansions are quoted so the value is never word-split or glob-expanded.
cluster_master_path="$(echo "$cluster_master" | tr ':' '/')"
# Print how many entries the master's instance record lists under Replicas.
orchestrator-client -c api -path "instance/$cluster_master_path" | jq '.Replicas | length'
Empty file.
1 change: 1 addition & 0 deletions tests/system/recover-read-only-master/04-downtimed/run
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
orchestrator-client -c api -path downtimed | jq -r '.[] | select(.IsDowntimed==true and .DowntimeReason=="lost-in-recovery") | .Key.Port'
1 change: 1 addition & 0 deletions tests/system/recover-read-only-master/depends-on
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
all-instances
Empty file.
1 change: 1 addition & 0 deletions tests/system/recover-read-only-master/teardown_redeploy
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sleep 3

0 comments on commit 96533fd

Please sign in to comment.