Skip to content

Commit

Permalink
testsuite: cover flub bootstrap
Browse files Browse the repository at this point in the history
Problem: there is no test coverage for adding brokers to
a flux instance.

Add some tests.
  • Loading branch information
garlick committed May 19, 2023
1 parent 3e2072d commit 1650ac5
Show file tree
Hide file tree
Showing 2 changed files with 299 additions and 0 deletions.
1 change: 1 addition & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ TESTSCRIPTS = \
t0023-jobspec1-validate.t \
t0026-flux-R.t \
t0033-size-override.t \
t0034-flub.t \
t1000-kvs.t \
t1001-kvs-internals.t \
t1003-kvs-stress.t \
Expand Down
298 changes: 298 additions & 0 deletions t/t0034-flub.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
#!/bin/sh
#
# Exercise the "flub" bootstrap method: growing a running Flux instance
# by booting additional brokers against -Sbroker.boot-server.

test_description='Test flub bootstrap method'

. `dirname $0`/sharness.sh

# Start an 8-broker test instance with the full rc environment so the
# script can submit batch jobs (which launch subinstances of their own).
test_under_flux 8 full

# Point flux at the testsuite ssh wrapper so URI resolution /
# remote connections work inside the sandboxed test instance
# (presumably tssh fakes ssh locally — see scripts/tssh).
export FLUX_SSH="${SHARNESS_TEST_SRCDIR}/scripts/tssh"

# usage: get_job_uri id
# Block (up to 10s) for job <id> to post its "memo" event — after which
# the job's remote URI should be resolvable — then print the URI obtained
# from "flux uri" on stdout.  Returns nonzero if the event never arrives.
get_job_uri() {
	flux job wait-event -t10 "$1" memo >/dev/null && flux uri "$1"
}

# usage: wait_for_service uri name
# Poll (0.5s interval) until "flux ping -c 1 <name>" succeeds inside the
# instance at <uri>.  The loop has no timeout of its own, so it spins
# forever if the service never comes up.
# NOTE(review): the \""..."\" construction hands "bash -c" a string
# wrapped in literal double quotes; presumably flux proxy re-joins and
# re-parses its command arguments for this to work — verify, and
# simplify to plain double quotes if it does not depend on that.
wait_for_service() {
flux proxy $1 bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\"
}

# A bogus boot-server URI must fail cleanly with a diagnostic
# (rc scripts are disabled so the broker exits as soon as /bin/true does).
test_expect_success 'broker fails with bad broker.boot-server' '
test_must_fail flux broker \
-Sbroker.rc1_path= -Sbroker.rc3_path= \
-Sbroker.boot-server=local://noexist/path \
/bin/true 2>server.err &&
grep "was not found" server.err
'

# Start a size-1 subinstance with no spare slots, then confirm that a
# flub bootstrap against it is rejected with "no available ranks".
test_expect_success 'start a 1 node job with 0 extra ranks' '
id=$(flux batch -N1 --wrap sleep inf) &&
get_job_uri $id >test1.uri
'
test_expect_success 'job has size 1' '
size=$(flux proxy $(cat test1.uri) flux getattr size) &&
test $size -eq 1
'
test_expect_success 'flub bootstrap fails with no available ranks' '
test_must_fail flux broker \
-Sbroker.boot-server=$(cat test1.uri) 2>noranks.err &&
grep "no available ranks" noranks.err
'
# Cancel the sleep-inf batch job so its nodes return to the enclosing instance.
test_expect_success 'clean up' '
flux cancel --all
'


#
# Start 2 node batch job with one extra slot.
# Submit 1 node broker job that fills the slot.
# Run a parallel job across all three nodes in the batch job.
# This test is constrained so that all flubbed nodes are leaf nodes,
# and the flubbed nodes connect to rank 0 only.

# "noverify = true" lets the fake hostlist a,b,c stand in for real
# resources without hwloc verification.
test_expect_success 'create config with fake resources' '
cat >fake2.toml <<-EOT
[resource]
noverify = true
[[resource.config]]
hosts = "a,b,c"
cores = "0-3"
EOT
'
# -Ssize=3 with only 2 real brokers leaves one open slot; quorum=2 lets
# the instance come up before the third rank joins.  tbon.topo=kary:0
# matches the flat, rank-0-only topology described above — confirm.
test_expect_success 'start a 2 node job with 1 extra rank' '
id=$(flux batch -N2 \
--broker-opts=--config-path=fake2.toml \
--broker-opts=-Ssize=3 \
--broker-opts=-Sbroker.quorum=2 \
--broker-opts=-Stbon.topo=kary:0 \
--wrap sleep inf) &&
get_job_uri $id >test2.uri
'
test_expect_success 'job has size 3' '
size=$(flux proxy $(cat test2.uri) flux getattr size) &&
test $size -eq 3
'
# The unfilled slot shows up as rank 2 (host extra0) offline.
test_expect_success 'overlay status shows extra node offline' '
flux proxy $(cat test2.uri) \
flux overlay status --no-pretty >ov2.out &&
grep "2 extra0: offline" ov2.out
'
# Sanity-check the partial (2-rank) instance works before flubbing.
test_expect_success 'run a 2 node job in the initial instance' '
wait_for_service $(cat test2.uri) job-ingest &&
run_timeout 30 flux proxy $(cat test2.uri) \
flux run --label-io -N2 flux pmi barrier
'
# The extra broker must use the same config/topology as the target
# instance; boot-server points it at the instance to join.
test_expect_success 'submit a job that starts 1 extra broker' '
id=$(flux submit -N1 flux broker \
--config-path=fake2.toml \
-Stbon.topo=kary:0 \
-Sbroker.boot-server=$(cat test2.uri)) &&
flux job wait-event -p guest.exec.eventlog $id shell.start
'
test_expect_success 'wait for overlay status to be full' '
flux proxy $(cat test2.uri) \
flux overlay status --summary --wait full --timeout 10s
'
# All 3 ranks (2 real + 1 flubbed) must participate in a PMI barrier.
test_expect_success 'run a 3 node job in the expanded instance' '
run_timeout 30 flux proxy $(cat test2.uri) \
flux run --label-io -N3 flux pmi barrier
'
test_expect_success 'clean up' '
flux cancel --all
'

#
# Start 3 node batch job with four extra slots (kary:2).
# Submit 4 node broker job that fills the slots.
# Run a parallel job across all seven nodes in the batch job.
# This test is constrained so that all flubbed nodes are leaf nodes,
# but they are grafted on different nodes depending on topology.
# 0
# 1 2
# 3 4 5 6 <-- flubbed

# Seven fake hosts for the size-7 kary:2 tree scenarios below
# (this config is reused by the remaining scenarios in this file).
test_expect_success 'create config with fake resources' '
cat >fake3.toml <<-EOT
[resource]
noverify = true
[[resource.config]]
hosts = "a,b,c,d,e,f,g"
cores = "0-3"
EOT
'
# 3 real brokers fill ranks 0-2 (the non-leaf levels); quorum=3 lets the
# instance start while ranks 3-6 are still open slots.
test_expect_success 'start a 3 node job with 4 extra ranks' '
id=$(flux batch -N3 \
--broker-opts=--config-path=fake3.toml \
--broker-opts=-Ssize=7 \
--broker-opts=-Sbroker.quorum=3 \
--broker-opts=-Stbon.topo=kary:2 \
--wrap sleep inf) &&
get_job_uri $id >test3.uri
'
test_expect_success 'job has size 7' '
size=$(flux proxy $(cat test3.uri) flux getattr size) &&
test $size -eq 7
'
# Sanity-check the partial (3-rank) instance before flubbing.
test_expect_success 'run a 3 node job in the initial instance' '
wait_for_service $(cat test3.uri) job-ingest &&
run_timeout 30 flux proxy $(cat test3.uri) \
flux run --label-io -N3 flux pmi barrier
'
# Fill all four leaf slots in one submission.
test_expect_success 'submit a job that starts 4 extra brokers' '
id=$(flux submit -N4 flux broker \
--config-path=fake3.toml \
-Stbon.topo=kary:2 \
-Sbroker.boot-server=$(cat test3.uri)) &&
flux job wait-event -p guest.exec.eventlog $id shell.start
'
test_expect_success 'wait for overlay status to be full' '
flux proxy $(cat test3.uri) \
flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
run_timeout 30 flux proxy $(cat test3.uri) \
flux run --label-io -N7 flux pmi barrier
'
test_expect_success 'clean up' '
flux cancel --all
'

#
# Start 1 node batch job with 6 extra slots (kary:2).
# Submit 2 node broker job that fills the first level slots.
# Run a 3 node parallel job.
# Submit 4 node broker job that fills the second level slots.
# Run a 7 node parallel job.
# 0
# 1 2 <-- flubbed (phase 1)
# 3 4 5 6 <-- flubbed (phase 2)
# This test is constrained so the first level wires up before
# the second level is started.

# Only rank 0 is real at first; quorum=1 lets it come up alone.
test_expect_success 'start a 1 node job with 6 extra ranks' '
id=$(flux batch -N1 \
--broker-opts=--config-path=fake3.toml \
--broker-opts=-Ssize=7 \
--broker-opts=-Sbroker.quorum=1 \
--broker-opts=-Stbon.topo=kary:2 \
--wrap sleep inf) &&
get_job_uri $id >test4.uri
'
test_expect_success 'run a 1 node job in the initial instance' '
wait_for_service $(cat test4.uri) job-ingest &&
run_timeout 30 flux proxy $(cat test4.uri) \
flux run --label-io -N1 flux pmi barrier
'
test_expect_success 'job has size 7' '
size=$(flux proxy $(cat test4.uri) flux getattr size) &&
test $size -eq 7
'
# Phase 1: fill interior ranks 1-2 so they can later parent ranks 3-6.
test_expect_success 'submit a job that starts 2 extra brokers' '
id=$(flux submit -N2 flux broker \
--config-path=fake3.toml \
-Stbon.topo=kary:2 \
-Sbroker.boot-server=$(cat test4.uri)) &&
flux job wait-event -p guest.exec.eventlog $id shell.start
'
# The 3-node run doubles as the barrier guaranteeing level 1 is wired
# up before phase 2 begins.
test_expect_success 'run a 3 node job in the expanded instance' '
run_timeout 30 flux proxy $(cat test4.uri) \
flux run --label-io -N3 flux pmi barrier
'
# Phase 2: fill the leaf ranks 3-6.
test_expect_success 'submit a job that starts 4 extra brokers' '
id=$(flux submit -N4 flux broker \
--config-path=fake3.toml \
-Stbon.topo=kary:2 \
-Sbroker.boot-server=$(cat test4.uri)) &&
flux job wait-event -p guest.exec.eventlog $id shell.start
'
test_expect_success 'wait for overlay status to be full' '
flux proxy $(cat test4.uri) \
flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
run_timeout 30 flux proxy $(cat test4.uri) \
flux run --label-io -N7 flux pmi barrier
'
test_expect_success 'clean up' '
flux cancel --all
'

#
# Start 1 node batch job with 6 extra slots (kary:2).
# Submit 6 node broker job that fills all the slots.
# Run a 7 node parallel job.
#
test_expect_success 'start a 1 node job with 6 extra ranks' '
id=$(flux batch -N1 \
--broker-opts=--config-path=fake3.toml \
--broker-opts=-Ssize=7 \
--broker-opts=-Sbroker.quorum=1 \
--broker-opts=-Stbon.topo=kary:2 \
--wrap sleep inf) &&
get_job_uri $id >test5.uri
'
test_expect_success 'run a 1 node job in the initial instance' '
wait_for_service $(cat test5.uri) job-ingest &&
run_timeout 30 flux proxy $(cat test5.uri) \
flux run --label-io -N1 flux pmi barrier
'
test_expect_success 'job has size 7' '
size=$(flux proxy $(cat test5.uri) flux getattr size) &&
test $size -eq 7
'
# exit-timeout=none keeps this 6-broker job alive when one of its
# brokers exits — presumably needed by the rank-6 replacement tests
# below, which kill and re-add a broker; verify.
test_expect_success 'submit a job that starts 6 extra brokers' '
id=$(flux submit -N6 -o exit-timeout=none \
flux broker \
--config-path=fake3.toml \
-Stbon.topo=kary:2 \
-Sbroker.boot-server=$(cat test5.uri)) &&
flux job wait-event -p guest.exec.eventlog $id shell.start &&
echo $id >xtra_id
'
test_expect_success 'wait for overlay status to be full' '
flux proxy $(cat test5.uri) \
flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
run_timeout 30 flux proxy $(cat test5.uri) \
flux run --label-io -N7 flux pmi barrier
'

#
# Show that a node can be replaced

# Forcibly detach rank 6 from the overlay of the test5 instance.
test_expect_success 'disconnect rank 6' '
flux proxy $(cat test5.uri) \
flux overlay disconnect 6
'
# The failed ping marks the rank unreachable (EHOSTUNREACH) so the
# overlay notices the loss.
test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' '
test_must_fail flux proxy $(cat test5.uri) \
flux ping -c1 6
'
test_expect_success 'wait for overlay status to be degraded' '
flux proxy $(cat test5.uri) \
flux overlay status --summary --wait degraded --timeout 10s
'
# A fresh flubbed broker should claim the vacated rank-6 slot.
test_expect_success 'submit a job that starts 1 broker' '
id=$(flux submit -N1 flux broker \
--config-path=fake3.toml \
-Stbon.topo=kary:2 \
-Sbroker.boot-server=$(cat test5.uri)) &&
flux job wait-event -p guest.exec.eventlog $id shell.start
'
test_expect_success 'wait for overlay status to be full' '
flux proxy $(cat test5.uri) \
flux overlay status --summary --wait full --timeout 10s
'
# Full 7-node run proves the replacement rank is usable.
test_expect_success 'run a 7 node job in the expanded instance' '
run_timeout 30 flux proxy $(cat test5.uri) \
flux run --label-io -N7 flux pmi barrier
'

test_expect_success 'clean up' '
flux cancel --all
'

test_done

0 comments on commit 1650ac5

Please sign in to comment.