Skip to content

Commit

Permalink
DAOS-16477 mgmt: return suspect engines for pool healthy query (#15458)
Browse files Browse the repository at this point in the history
* DAOS-16477 mgmt: return suspect engines for pool healthy query

After significant failures, the system may leave behind some suspect
engines that were marked as DEAD by the SWIM protocol, but were not
excluded from the system to prevent data loss. An administrator
can bring these ranks back online by restarting them.

This PR provides an administrative interface for querying suspect
engines following a massive failure. These suspect engines can be
listed with the --health-only option of the daos/dmg pool query command.

Example output of dmg pool query --health-only:

Pool 6f450a68-8c7d-4da9-8900-02691650f6a2, ntarget=8, disabled=2, leader=3, version=4, state=Degraded
Pool health info:
- Disabled ranks: 1
- Suspect ranks: 2
- Rebuild busy, 0 objs, 0 recs

Features: DmgPoolQueryRanks
skip-nlt: true
Required-githooks: true
Signed-off-by: Wang Shilong <[email protected]>
Signed-off-by: Phil Henderson <[email protected]>
Co-authored-by: Phil Henderson <[email protected]>
  • Loading branch information
wangshilong and phender committed Nov 25, 2024
1 parent 3d65778 commit 1f7ed6a
Show file tree
Hide file tree
Showing 33 changed files with 930 additions and 421 deletions.
22 changes: 22 additions & 0 deletions docs/admin/pool_operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,28 @@ The example below shows a rebuild in progress and NVMe space allocated.
Rebuild busy, 75 objs, 9722 recs
```

After experiencing significant failures, the pool may retain some suspect
engines that have been marked as DEAD by the SWIM protocol but were intentionally
left in the pool in order to prevent potential data inconsistency. An administrator can bring
these engines back online by restarting them. The example below illustrates the
system’s status with suspect and disabled engines.

```bash
$ dmg pool query tank -t
```

NB: The --health-only/-t option is required to restrict the command to pool health-related queries.
This is important because suspect ranks may cause commands to hang and time out, so identifying
and restarting them is a useful procedure.

```bash
Pool 6f450a68-8c7d-4da9-8900-02691650f6a2, ntarget=8, disabled=2, leader=3, version=4, state=Degraded
Pool health info:
- Disabled ranks: 1
- Suspect ranks: 2
- Rebuild busy, 0 objs, 0 recs
```

Additional status and telemetry data is planned to be exported through
management tools and will be documented here once available.

Expand Down
19 changes: 7 additions & 12 deletions src/control/cmd/daos/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,24 +95,19 @@ func (cmd *healthCheckCmd) Execute([]string) error {
}
}()

queryMask := daos.MustNewPoolQueryMask(daos.PoolQueryOptionEnabledEngines)
queryMask := daos.MustNewPoolQueryMask(daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionSuspectEngines)
if pool.DisabledTargets > 0 {
queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)
}
tpi, err := queryPool(poolHdl, queryMask)
if err != nil {
cmd.Errorf("failed to query pool %s: %v", pool.Label, err)
continue
}
pool.EnabledRanks = tpi.EnabledRanks

if pool.DisabledTargets > 0 {
queryMask.ClearAll()
queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)
tpi, err = queryPool(poolHdl, queryMask)
if err != nil {
cmd.Errorf("failed to query pool %s: %v", pool.Label, err)
continue
}
pool.DisabledRanks = tpi.DisabledRanks
}
pool.DisabledRanks = tpi.DisabledRanks
pool.SuspectRanks = tpi.SuspectRanks

poolConts, err := listContainers(poolHdl)
if err != nil {
Expand Down
67 changes: 65 additions & 2 deletions src/control/cmd/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,12 @@ func convertPoolInfo(pinfo *C.daos_pool_info_t) (*daos.PoolInfo, error) {
return poolInfo, nil
}

func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
var rlPtr **C.d_rank_list_t = nil
var rl *C.d_rank_list_t = nil

if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) ||
queryMask.HasOption(daos.PoolQueryOptionSuspectEngines) {
rlPtr = &rl
}

Expand Down Expand Up @@ -330,6 +331,68 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo
if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
poolInfo.DisabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionSuspectEngines) {
poolInfo.SuspectRanks = rs
}
}

return poolInfo, nil
}
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
poolInfo := &daos.PoolInfo{}
originalMask := queryMask // Save the original queryMask

// Function to handle the query and return a single RankList
queryAndUpdate := func(option string) error {
// Clear previous options and set new option
queryMask.ClearAll()
queryMask.SetOptions(option)

poolInfo1, err := queryPoolRankLists(poolHdl, queryMask)
if err != nil {
return err
}

switch option {
case daos.PoolQueryOptionEnabledEngines:
poolInfo.EnabledRanks = poolInfo1.EnabledRanks
case daos.PoolQueryOptionDisabledEngines:
poolInfo.DisabledRanks = poolInfo1.DisabledRanks
case daos.PoolQueryOptionSuspectEngines:
poolInfo.SuspectRanks = poolInfo1.SuspectRanks
}
return nil
}

// Preprocess queryMask, select one option for the first query
var firstOption string
if originalMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
firstOption = daos.PoolQueryOptionEnabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
firstOption = daos.PoolQueryOptionDisabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionSuspectEngines) {
firstOption = daos.PoolQueryOptionSuspectEngines
}

// Perform the first query to get basic information
if err := queryAndUpdate(firstOption); err != nil {
return nil, err
}

// Check the original query mask and update fields as needed
queryOptions := []string{
daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionDisabledEngines,
daos.PoolQueryOptionSuspectEngines,
}

// Process each option sequentially
for _, opt := range queryOptions {
if originalMask.HasOption(opt) && opt != firstOption {
if err := queryAndUpdate(opt); err != nil {
return nil, err
}
}
}

return poolInfo, nil
Expand Down
7 changes: 7 additions & 0 deletions src/control/cmd/daos/pretty/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ func printPoolHealth(out io.Writer, pi *daos.PoolInfo, verbose bool) {
}

var healthStrings []string
if pi.SuspectRanks != nil && pi.SuspectRanks.Count() > 0 {
degStr := "Suspect"
if verbose {
degStr += fmt.Sprintf(" %s", pi.SuspectRanks)
}
healthStrings = append(healthStrings, degStr)
}
if pi.DisabledTargets > 0 {
degStr := "Degraded"
if verbose {
Expand Down
4 changes: 4 additions & 0 deletions src/control/cmd/daos/pretty/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ func PrintPoolInfo(pi *daos.PoolInfo, out io.Writer) error {
if pi.DisabledRanks != nil && pi.DisabledRanks.Count() > 0 {
fmt.Fprintf(w, "- Disabled ranks: %s\n", pi.DisabledRanks)
}
if pi.QueryMask.HasOption(daos.PoolQueryOptionSuspectEngines) &&
pi.SuspectRanks != nil && pi.SuspectRanks.Count() > 0 {
fmt.Fprintf(w, "- Suspect ranks: %s\n", pi.SuspectRanks)
}
if pi.Rebuild != nil {
if pi.Rebuild.Status == 0 {
fmt.Fprintf(w, "- Rebuild %s, %d objs, %d recs\n",
Expand Down
39 changes: 39 additions & 0 deletions src/control/cmd/daos/pretty/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,45 @@ Pool space info:
- Storage tier 1 (NVMe):
Total size: 2 B
Free: 1 B, min:0 B, max:0 B, mean:0 B
`, poolUUID.String()),
},
"normal response; suspect ranks": {
pi: &daos.PoolInfo{
QueryMask: daos.HealthOnlyPoolQueryMask,
State: daos.PoolServiceStateDegraded,
UUID: poolUUID,
TotalTargets: 2,
DisabledTargets: 1,
ActiveTargets: 1,
ServiceLeader: 42,
Version: 100,
PoolLayoutVer: 1,
UpgradeLayoutVer: 2,
DisabledRanks: ranklist.MustCreateRankSet("[0,1,3]"),
SuspectRanks: ranklist.MustCreateRankSet("[2]"),
Rebuild: &daos.PoolRebuildStatus{
State: daos.PoolRebuildStateBusy,
Objects: 42,
Records: 21,
},
TierStats: []*daos.StorageUsageStats{
{
Total: 2,
Free: 1,
},
{
Total: 2,
Free: 1,
},
},
},
expPrintStr: fmt.Sprintf(`
Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=Degraded
Pool layout out of date (1 < 2) -- see `+backtickStr+` for details.
Pool health info:
- Disabled ranks: 0-1,3
- Suspect ranks: 2
- Rebuild busy, 42 objs, 21 recs
`, poolUUID.String()),
},
"normal response; disabled ranks": {
Expand Down
4 changes: 2 additions & 2 deletions src/control/common/proto/ctl/storage_nvme.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/control/common/proto/ctl/support.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 1f7ed6a

Please sign in to comment.