Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

resmgr,agent: propagate startup config error back to CR. #416

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func TemplateConfigInterface() ConfigInterface {
}

// NotifyFn is a function to call when the effective configuration changes.
type NotifyFn func(cfg interface{}) error
type NotifyFn func(cfg interface{}) (bool, error)

var (
// Our logger instance for the agent.
Expand Down Expand Up @@ -544,12 +544,17 @@ func (a *Agent) updateConfig(cfg metav1.Object) {
}
}

err := a.notifyFn(cfg)
fatal, err := a.notifyFn(cfg)
a.patchConfigStatus(a.currentCfg, cfg, err)

if err != nil {
log.Errorf("failed to apply configuration: %v", err)
if fatal {
log.Fatalf("failed to apply configuration: %v", err)
} else {
log.Errorf("failed to apply configuration: %v", err)
}
}

a.patchConfigStatus(a.currentCfg, cfg, err)
a.currentCfg = cfg
}

Expand Down
14 changes: 7 additions & 7 deletions pkg/resmgr/resource-manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,17 @@ func (m *resmgr) Start() error {
return nil
}

func (m *resmgr) updateConfig(newCfg interface{}) error {
func (m *resmgr) updateConfig(newCfg interface{}) (bool, error) {
if newCfg == nil {
return fmt.Errorf("can't run without effective configuration...")
return false, fmt.Errorf("can't run without effective configuration...")
}

cfg, ok := newCfg.(cfgapi.ResmgrConfig)
if !ok {
if !m.running {
log.Fatalf("got initial configuration of unexpected type %T", newCfg)
return true, fmt.Errorf("got initial configuration of unexpected type %T", newCfg)
} else {
return fmt.Errorf("got configuration of unexpected type %T", newCfg)
return false, fmt.Errorf("got configuration of unexpected type %T", newCfg)
}
}

Expand All @@ -151,17 +151,17 @@ func (m *resmgr) updateConfig(newCfg interface{}) error {
log.InfoBlock(" <initial config> ", "%s", dump)

if err := m.start(cfg); err != nil {
log.Fatalf("failed to start with initial configuration: %v", err)
return true, fmt.Errorf("failed to start with initial configuration: %v", err)
}

m.running = true
return nil
return false, nil
}

log.Infof("configuration update %s (generation %d):", meta.GetName(), meta.GetGeneration())
log.InfoBlock(" <updated config> ", "%s", dump)

return m.reconfigure(cfg)
return false, m.reconfigure(cfg)
}

// Start resource management once we acquired initial configuration.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
helm-terminate
CONFIG_GROUP="group.test"

cleanup() {
vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/default" || :
vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/$CONFIG_GROUP" || :
vm-command "kubectl label nodes --all config.nri/group-" || :
helm-terminate || :
}

cleanup
helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware

sleep 1
Expand All @@ -12,6 +21,7 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \
error "expected initial Success status"
}

# verify propagation of errors back to source CR
vm-put-file $(RESERVED_CPU=750x instantiate custom-config.yaml) broken-config.yaml
vm-command "kubectl apply -f broken-config.yaml"

Expand All @@ -26,3 +36,19 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \
}

helm-terminate

# verify propagation of initial configuration errors back to source CR
vm-put-file $(CONFIG_NAME="$CONFIG_GROUP" RESERVED_CPU=750x instantiate custom-config.yaml) \
broken-group-config.yaml
vm-command "kubectl apply -f broken-group-config.yaml" || \
error "failed to install broken group config"
vm-command "kubectl label nodes --all config.nri/group=test" || \
error "failed to label nodes for group config"

expect_error=1 launch_timeout=5s helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware
get-config-node-status-error topologyawarepolicies/$CONFIG_GROUP | \
grep "failed to parse" | grep -q 750x || {
error "expected initial error not found in status"
}

cleanup
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: config.nri/v1alpha1
kind: TopologyAwarePolicy
metadata:
name: default
name: ${CONFIG_NAME:-default}
namespace: kube-system
spec:
pinCPU: true
Expand Down
16 changes: 12 additions & 4 deletions test/e2e/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ helm-launch() { # script API
# default: 20s
# cfgresource: config custom resource to wait for node status to change in
# default: balloonspolicies/default or topologyawarepolicies/default
# expect_error: don't exit, expect availability error
#
# Example:
# helm_config=$(instantiate helm-config.yaml) helm-launch balloons
Expand All @@ -361,6 +362,7 @@ helm-launch() { # script API
local helm_config="${helm_config:-$TEST_DIR/helm-config.yaml}"
local ds_name="${daemonset_name:-}" ctr_name="${container_name:-nri-resource-policy-$1}"
local helm_name="${helm_name:-test}" timeout="${launch_timeout:-20s}"
local expect_error="${expect_error:-0}"
local plugin="$1" cfgresource=${cfgresource:-} cfgstatus
local deadline
shift
Expand Down Expand Up @@ -403,8 +405,15 @@ helm-launch() { # script API

deadline=$(deadline-for-timeout $timeout)
vm-command "kubectl wait -n kube-system ds/${ds_name} --timeout=$timeout \
--for=jsonpath='{.status.numberAvailable}'=1" || \
error "Timeout while waiting daemonset/${ds_name} to be available"
--for=jsonpath='{.status.numberAvailable}'=1"

if [ "$COMMAND_STATUS" != "0" ]; then
if [ "$expect_error" != "1" ]; then
error "Timeout while waiting daemonset/${ds_name} to be available"
else
return 1
fi
fi

timeout=$(timeout-for-deadline $deadline)
timeout=$timeout wait-config-node-status $cfgresource
Expand All @@ -413,7 +422,6 @@ helm-launch() { # script API
if [ "$result" != "Success" ]; then
reason=$(get-config-node-status-error $cfgresource)
error "Plugin $plugin configuration failed: $reason"
return 1
fi

vm-start-log-collection -n kube-system ds/$ds_name -c $ctr_name
Expand Down Expand Up @@ -502,7 +510,7 @@ get-config-node-status-result() {
get-config-node-status-error() {
local resource="$1" node="$(get-hostname-for-vm)"
vm-command-q "kubectl get -n kube-system $resource \
-ojsonpath=\"{.status.nodes['$node'].error}\""
-ojsonpath=\"{.status.nodes['$node'].errors}\""
}

wait-config-node-status() {
Expand Down
Loading