Skip to content

Commit

Permalink
testsuite: cover hello with partial allocation
Browse files Browse the repository at this point in the history
Problem: there is no coverage of reloading the scheduler with
partially released jobs in housekeeping.

Add a test.
  • Loading branch information
garlick committed Nov 20, 2024
1 parent 8e7e06b commit 409670a
Showing 1 changed file with 40 additions and 2 deletions.
42 changes: 40 additions & 2 deletions t/t2226-housekeeping.t
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ kill_ranks () {
flux housekeeping kill --targets=$1 --signal=$2
}

# Usage: straggler_count
# Print the {nnodes} field reported by "flux housekeeping list".
# The -n and -o options are passed separately here; -o takes the
# output format string.
straggler_count () {
	flux housekeeping list -n -o "{nnodes}"
}

# Note: the handoff of resources to housekeeping occurs just before the job
# becomes inactive; it is therefore safe to assume that housekeeping has run
# for the job if it is enclosed between successful 'wait_for_running 0' calls.
Expand All @@ -42,6 +47,16 @@ wait_for_running () {
done
}

# Usage: wait_for_straggler_count count
# Poll straggler_count every 0.1s until it reports no more than 'count'.
# Returns 0 once the reported value is <= count.
# Returns 1 if straggler_count itself fails, if its output is not a single
# integer, or if the threshold is still exceeded after 300 polls (~30s).
wait_for_straggler_count () {
	count=0
	while :; do
		# A failed query must not be mistaken for "threshold reached".
		stragglers=$(straggler_count) || return 1
		# Empty output is treated as zero; quoting keeps test(1) from
		# seeing a missing or split operand.
		test "${stragglers:-0}" -gt "$1"
		case $? in
			0) ;;		# still above the threshold, keep polling
			1) return 0 ;;	# at or below the threshold
			*) return 1 ;;	# test(1) could not compare the values
		esac
		count=$((count+1))
		test $count -eq 300 && return 1 # max 300 * 0.1s sleep = 30s
		sleep 0.1
	done
}

test_expect_success 'flux-housekeeping utility exists' '
flux housekeeping list --help &&
flux housekeeping kill --help
Expand Down Expand Up @@ -343,8 +358,9 @@ test_expect_success 'configure housekeeping with immediate release' '
# Run a trivial job (true) with -N4; per the test title this is what
# triggers housekeeping for the checks that follow.
test_expect_success 'run job that uses 4 nodes to trigger housekeeping' '
flux run -N4 true
'
test_expect_success 'housekeeping is running for 1 job' '
wait_for_running 1
# Requires wait_for_running 1 to succeed, then waits (with a timeout) until
# straggler_count reports no more than 1.
test_expect_success 'housekeeping completed except for one straggler' '
wait_for_running 1 &&
wait_for_straggler_count 1
'
test_expect_success 'reload scheduler without partial hello capability' '
flux dmesg -C &&
Expand All @@ -357,4 +373,26 @@ test_expect_success 'wait for housekeeping to finish' '
# Passes only if flux dmesg output contains a line matching the pattern
# housekeeping:.*will be terminated
test_expect_success 'housekeeping jobs were terminated due to sched reload' '
flux dmesg | grep "housekeeping:.*will be terminated"
'
# The allocated node count must be 0 both from the default
# flux resource list query and from the same query run with
# FLUX_RESOURCE_LIST_RPC set to sched.resource-status.
test_expect_success 'no nodes are allocated' '
test $(flux resource list -s allocated -no {nnodes}) -eq 0 &&
test $(FLUX_RESOURCE_LIST_RPC=sched.resource-status \
flux resource list -s allocated -no {nnodes}) -eq 0
'
# Second round: run another trivial -N4 job so that housekeeping is
# triggered again ahead of the next scheduler reload.
test_expect_success 'run job that uses 4 nodes to trigger housekeeping' '
flux run -N4 true
'
# Requires wait_for_running 1 to succeed, then waits (with a timeout) until
# straggler_count reports no more than 1, so the reload below happens while
# housekeeping is only partially complete.
test_expect_success 'housekeeping completed except for one straggler' '
wait_for_running 1 &&
wait_for_straggler_count 1
'
# Runs flux dmesg -C, force-reloads sched-simple with no extra module
# arguments, then prints flux dmesg -H. The commands are chained with &&,
# so a failure of any one of them fails the test.
test_expect_success 'reload scheduler WITH partial hello capability' '
flux dmesg -C &&
flux module reload -f sched-simple &&
flux dmesg -H
'
# The allocated node count must be exactly 1 both from the default
# flux resource list query and from the same query run with
# FLUX_RESOURCE_LIST_RPC set to sched.resource-status.
test_expect_success 'one node is allocated' '
test $(flux resource list -s allocated -no {nnodes}) -eq 1 &&
test $(FLUX_RESOURCE_LIST_RPC=sched.resource-status \
flux resource list -s allocated -no {nnodes}) -eq 1
'
test_done

0 comments on commit 409670a

Please sign in to comment.