Skip to content

Commit

Permalink
feat: improve errant gtid detection logic to also handle the case whe…
Browse files Browse the repository at this point in the history
…re the replica is not connected to any primary

Signed-off-by: Manan Gupta <[email protected]>
  • Loading branch information
GuptaManan100 committed Nov 21, 2024
1 parent 4a5ab95 commit 916e97f
Show file tree
Hide file tree
Showing 2 changed files with 171 additions and 29 deletions.
82 changes: 53 additions & 29 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,35 +357,7 @@ Cleanup:
// Add replication group ancestry UUID as well. Otherwise, VTOrc thinks there are errant GTIDs in group
// members and its replicas, even though they are not.
instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",")
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
// This is because vtorc may pool primary and replica at an inconvenient timing,
// such that the replica may _seems_ to have more entries than the primary, when in fact
// it's just that the primary's probing is stale.
redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet)
for _, uuid := range strings.Split(instance.AncestryUUID, ",") {
if uuid != instance.ServerUUID {
redactedExecutedGtidSet.RemoveUUID(uuid)
}
if instance.IsCoPrimary && uuid == instance.ServerUUID {
// If this is a co-primary, then this server is likely to show its own generated GTIDs as errant,
// because its co-primary has not applied them yet
redactedExecutedGtidSet.RemoveUUID(uuid)
}
}
// Avoid querying the database if there's no point:
if !redactedExecutedGtidSet.IsEmpty() {
redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet)
redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID)

instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String())
if err == nil {
var gtidCount int64
gtidCount, err = replication.GTIDCount(instance.GtidErrant)
currentErrantGTIDCount.Set(tabletAlias, gtidCount)
}
}
}
err = detectErrantGTIDs(tabletAlias, instance, tablet)
}

latency.Stop("instance")
Expand All @@ -412,6 +384,58 @@ Cleanup:
return nil, err
}

// detectErrantGTIDs detects the errant GTIDs on an instance.
func detectErrantGTIDs(tabletAlias string, instance *Instance, tablet *topodatapb.Tablet) (err error) {
// If the tablet is not replicating from anyone, then it could be the previous primary.
// We should check for errant GTIDs by finding the difference with the shard's current primary.
if instance.primaryExecutedGtidSet == "" && instance.SourceHost == "" {
var primaryInstance *Instance
primaryAlias, _, _ := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard)
if primaryAlias != "" {
primaryInstance, _, _ = ReadInstance(primaryAlias)
}
// Only run errant GTID detection, if we are sure that the data read of the current primary
// is up-to-date enough to reflect that it has been promoted. This is needed to prevent
// flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs
// accepted by the old primary (this tablet) that don't show in the new primary's set.
if primaryInstance != nil {
if primaryInstance.SourceHost == "" {
instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet
}
}
}
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
// This is because vtorc may pool primary and replica at an inconvenient timing,
// such that the replica may _seems_ to have more entries than the primary, when in fact
// it's just that the primary's probing is stale.
redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet)
for _, uuid := range strings.Split(instance.AncestryUUID, ",") {
if uuid != instance.ServerUUID {
redactedExecutedGtidSet.RemoveUUID(uuid)
}
if instance.IsCoPrimary && uuid == instance.ServerUUID {
// If this is a co-primary, then this server is likely to show its own generated GTIDs as errant,
// because its co-primary has not applied them yet
redactedExecutedGtidSet.RemoveUUID(uuid)
}
}
// Avoid querying the database if there's no point:
if !redactedExecutedGtidSet.IsEmpty() {
redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet)
redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID)

instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String())
if err == nil {
var gtidCount int64
gtidCount, err = replication.GTIDCount(instance.GtidErrant)
currentErrantGTIDCount.Set(tabletAlias, gtidCount)
}
}
}
return err
}

// getKeyspaceShardName returns a single string having both the keyspace and shard
func getKeyspaceShardName(keyspace, shard string) string {
return fmt.Sprintf("%v:%v", keyspace, shard)
Expand Down
118 changes: 118 additions & 0 deletions go/vt/vtorc/inst/instance_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"vitess.io/vitess/go/vt/external/golib/sqlutils"
"vitess.io/vitess/go/vt/log"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/vtorc/config"
"vitess.io/vitess/go/vt/vtorc/db"
Expand Down Expand Up @@ -773,3 +774,120 @@ func TestExpireTableData(t *testing.T) {
})
}
}

func TestDetectErrantGTIDs(t *testing.T) {
tests := []struct {
name string
instance *Instance
primaryInstance *Instance
wantErr bool
wantErrantGTID string
}{
{
name: "No errant GTIDs",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a",
},
}, {
name: "Errant GTIDs on replica",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:34",
primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a",
},
wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:34",
},
{
name: "No errant GTIDs on old primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
},
},
{
name: "Errant GTIDs on old primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
},
wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:342",
}, {
name: "Old information for new primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "localhost",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-311",
},
},
}

keyspaceName := "ks"
shardName := "0"
tablet := &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone-1",
Uid: 100,
},
Keyspace: keyspaceName,
Shard: shardName,
}
primaryTablet := &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone-1",
Uid: 100,
},
Keyspace: keyspaceName,
Shard: shardName,
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.
defer func() {
db.ClearVTOrcDatabase()
}()
db.ClearVTOrcDatabase()

// Save shard record for the primary tablet.
err := SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{
PrimaryAlias: primaryTablet.Alias,
}, nil))
require.NoError(t, err)

if tt.primaryInstance != nil {
tt.primaryInstance.InstanceAlias = topoproto.TabletAliasString(primaryTablet.Alias)
err = SaveTablet(primaryTablet)
require.NoError(t, err)
err = WriteInstance(tt.primaryInstance, true, nil)
require.NoError(t, err)
}

err = detectErrantGTIDs(topoproto.TabletAliasString(tablet.Alias), tt.instance, tablet)
if tt.wantErr {
require.Error(t, err)
return
}
require.NoError(t, err)
require.EqualValues(t, tt.wantErrantGTID, tt.instance.GtidErrant)
})
}
}

0 comments on commit 916e97f

Please sign in to comment.