diff --git a/src/broker/Makefile.am b/src/broker/Makefile.am index 42c2aad866bc..9b21e120f2e9 100644 --- a/src/broker/Makefile.am +++ b/src/broker/Makefile.am @@ -53,6 +53,8 @@ libbroker_la_SOURCES = \ boot_config.c \ boot_pmi.h \ boot_pmi.c \ + boot_flub.h \ + boot_flub.c \ publisher.h \ publisher.c \ groups.h \ diff --git a/src/broker/boot_flub.c b/src/broker/boot_flub.c new file mode 100644 index 000000000000..1d716b06158d --- /dev/null +++ b/src/broker/boot_flub.c @@ -0,0 +1,223 @@ +/************************************************************\ + * Copyright 2023 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +/* boot_flub.c - FLUx Boot protocol + * + * Add a broker to an existing Flux instance. + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include + +#include "src/common/libidset/idset.h" +#include "src/common/libutil/errprintf.h" +#include "src/common/libutil/errno_safe.h" +#include "src/common/libutil/ipaddr.h" +#include "ccan/str/str.h" + +#include "attr.h" +#include "overlay.h" +#include "topology.h" + +#include "boot_flub.h" + +struct boot_info { + int size; + int rank; + json_t *attrs; +}; + +struct boot_parent { + char *pubkey; + int rank; + const char *uri; +}; + +static int wait_for_group_membership (flux_t *h, + const char *name, + int rank, + flux_error_t *error) +{ + flux_future_t *f; + bool online = false; + + if (!(f = flux_rpc_pack (h, + "groups.get", + 0, + FLUX_RPC_STREAMING, + "{s:s}", + "name", "broker.online"))) { + errprintf (error, "broker.online: %s", strerror (errno)); + return -1; + } + do { + const char *members; + struct idset *ids; + + if (flux_rpc_get_unpack (f, "{s:s}", "members", &members) < 0) { + errprintf (error, "broker.online: %s", future_strerror (f, errno)); + break; + } + if ((ids = idset_decode (members)) + && idset_test (ids, rank)) + online = true; + idset_destroy (ids); + flux_future_reset (f); + } while (!online); + flux_future_destroy (f); + + return online ? 0 : -1; +} + +static int set_attrs (attr_t *attrs, json_t *dict) +{ + const char *key; + json_t *val; + + json_object_foreach (dict, key, val) { + const char *s = json_string_value (val); + if (!s) { + errno = EPROTO; + return -1; + } + if (attr_add (attrs, key, s, ATTR_IMMUTABLE) < 0) + return -1; + } + return 0; +} + +int boot_flub (struct broker *ctx, flux_error_t *error) +{ + const char *uri = NULL; + flux_t *h; + flux_future_t *f = NULL; + flux_future_t *f2 = NULL; + struct topology *topo = NULL; + const char *topo_uri; + struct boot_info info; + struct boot_parent parent; + const char *bind_uri = NULL; + int rc = -1; + + /* Ask a Flux instance to allocate an available rank. + * N.B. the broker unsets FLUX_URI so either it is set on the + * command line via broker.boot-server, or the compiled-in path + * is used (system instance). + */ + (void)attr_get (ctx->attrs, "broker.boot-server", &uri, NULL); + if (!(h = flux_open_ex (uri, 0, error))) + return -1; + if (!(f = flux_rpc_pack (h, + "overlay.flub-getinfo", + 0, + 0, + "{}")) + || flux_rpc_get_unpack (f, + "{s:i s:i s:o}", + "rank", &info.rank, + "size", &info.size, + "attrs", &info.attrs) < 0) { + errprintf (error, "%s", future_strerror (f, errno)); + goto out; + } + /* Set instance attributes obtained from boot server. + */ + if (set_attrs (ctx->attrs, info.attrs) < 0) { + errprintf (error, "error setting attributes: %s", strerror (errno)); + goto out; + } + /* Create topology. All ranks are assumed to have the same topology. + * The tbon.topo attribute is set in overlay_create() if not provided + * on the command line. + */ + if (attr_get (ctx->attrs, "tbon.topo", &topo_uri, NULL) < 0) { + errprintf (error, "error fetching tbon.topo attribute"); + goto out; + } + if (!(topo = topology_create (topo_uri, info.size, NULL)) + || topology_set_rank (topo, info.rank) < 0 + || overlay_set_topology (ctx->overlay, topo) < 0) { + errprintf (error, "error creating topology: %s", strerror (errno)); + goto out; + } + if ((parent.rank = topology_get_parent (topo)) < 0) { + errprintf (error, + "rank %d has no parent in %s topology", + info.rank, + topo_uri); + goto out; + } + /* Exchange public keys with TBON parent and obtain its URI. + */ + if (wait_for_group_membership (h, "broker.online", parent.rank, error) < 0) + goto out; + if (!(f2 = flux_rpc_pack (h, + "overlay.flub-kex", + parent.rank, + 0, + "{s:s s:s}", + "name", overlay_cert_name (ctx->overlay), + "pubkey", overlay_cert_pubkey (ctx->overlay))) + || flux_rpc_get_unpack (f2, + "{s:s s:s}", + "pubkey", &parent.pubkey, + "uri", &parent.uri) < 0) { + errprintf (error, "%s", future_strerror (f, errno)); + goto out; + } + /* Inform overlay subsystem of parent info. + */ + if (overlay_set_parent_uri (ctx->overlay, parent.uri) < 0 + || overlay_set_parent_pubkey (ctx->overlay, parent.pubkey) < 0) { + errprintf (error, + "error setting up overlay parameters: %s", + strerror (errno)); + goto out; + } + /* If there are children, bind to zmq socket and update tbon.endpoint. + * Since we don't know if OUR children are co-located on the same node, + * always use the tcp transport. + */ + if (topology_get_child_ranks (topo, NULL, 0) > 0) { + char ipaddr[HOST_NAME_MAX + 1]; + char *wild = NULL; + + overlay_set_ipv6 (ctx->overlay, 1); + if (ipaddr_getprimary (ipaddr, + sizeof (ipaddr), + error->text, + sizeof (error->text)) < 0) + goto out; + if (asprintf (&wild, "tcp://%s:*", ipaddr) < 0 + || overlay_bind (ctx->overlay, wild) < 0) { + errprintf (error, "error binding to tcp://%s:*", ipaddr); + ERRNO_SAFE_WRAP (free, wild); + goto out; + } + bind_uri = overlay_get_bind_uri (ctx->overlay); + } + if (attr_add (ctx->attrs, "tbon.endpoint", bind_uri, ATTR_IMMUTABLE) < 0) { + errprintf (error, "setattr tbon.endpoint"); + goto out; + } + rc = 0; +out: + topology_decref (topo); + flux_future_destroy (f); + flux_future_destroy (f2); + flux_close (h); + return rc; +} + +// vi:ts=4 sw=4 expandtab diff --git a/src/broker/boot_flub.h b/src/broker/boot_flub.h new file mode 100644 index 000000000000..4bf2ee5963b3 --- /dev/null +++ b/src/broker/boot_flub.h @@ -0,0 +1,22 @@ +/************************************************************\ + * Copyright 2023 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +#ifndef BROKER_BOOT_FLUB_H +#define BROKER_BOOT_FLUB_H + +#include + +#include "broker.h" + +int boot_flub (struct broker *ctx, flux_error_t *error); + +#endif /* BROKER_BOOT_FLUB_H */ + +// vi:ts=4 sw=4 expandtab diff --git a/src/broker/boot_pmi.c b/src/broker/boot_pmi.c index b9d77f8c4626..a0255960279d 100644 --- a/src/broker/boot_pmi.c +++ b/src/broker/boot_pmi.c @@ -75,17 +75,22 @@ static char *pmi_mapping_to_taskmap (const char *s) } /* Set broker.mapping attribute from enclosing instance taskmap. + * Skip setting the taskmap if extra_ranks is nonzero since the + * mapping of those ranks is unknown. N.B. when broker.mapping is missing, + * use_ipc() below will return false, a good thing since it is unknown + * whether ipc:// would work for the TBON wire-up of extra nodes. */ static int set_broker_mapping_attr (struct upmi *upmi, int size, - attr_t *attrs) + attr_t *attrs, + int extra_ranks) { char *val = NULL; int rc; if (size == 1) val = strdup ("[[0,1,1,1]]"); - else { + else if (extra_ranks == 0) { /* First attempt to get flux.taskmap, falling back to * PMI_process_mapping if this key is not available. */ @@ -163,7 +168,7 @@ static int format_bind_uri (char *buf, int bufsz, attr_t *attrs, int rank) return -1; } -static int set_hostlist_attr (attr_t *attrs, struct hostlist *hl) +static int set_hostlist_attr (attr_t *attrs,struct hostlist *hl) { const char *value; char *s; @@ -220,9 +225,10 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) int child_count; int *child_ranks = NULL; const char *uri; - int i; int upmi_flags = UPMI_LIBPMI_NOFLUX; const char *upmi_method; + const char *s; + int size; // N.B. overlay_create() sets the tbon.topo attribute if (attr_get (attrs, "tbon.topo", &topo_uri, NULL) < 0) { @@ -249,17 +255,35 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) log_err ("set_instance_level_attr"); goto error; } - if (set_broker_mapping_attr (upmi, info.size, attrs) < 0) { + /* Allow the PMI size to be overridden with a larger one so that + * additional ranks can be grafted on later. + */ + size = info.size; + if (attr_get (attrs, "size", &s, NULL) == 0) { + errno = 0; + size = strtoul (s, NULL, 10); + if (errno != 0 || size <= info.size) { + log_msg ("instance size may only be increased"); + goto error; + } + } + if (set_broker_mapping_attr (upmi, size, attrs, size - info.size) < 0) { log_err ("error setting broker.mapping attribute"); goto error; } - if (!(topo = topology_create (topo_uri, info.size, &error))) { + if (!(topo = topology_create (topo_uri, size, &error))) { log_msg ("error creating '%s' topology: %s", topo_uri, error.text); goto error; } if (topology_set_rank (topo, info.rank) < 0 || overlay_set_topology (overlay, topo) < 0) goto error; + if (info.rank == 0 && size > info.size) { + if (overlay_flub_provision (overlay, info.size, size - 1, true) < 0) { + log_msg ("error provisioning flub allocator"); + goto error; + } + } if (gethostname (hostname, sizeof (hostname)) < 0) { log_err ("gethostname"); goto error; @@ -269,16 +293,6 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) goto error; } - /* A size=1 instance has no peers, so skip the PMI exchange. - */ - if (info.size == 1) { - if (hostlist_append (hl, hostname) < 0) { - log_err ("hostlist_append"); - goto error; - } - goto done; - } - /* Enable ipv6 for maximum flexibility in address selection. */ overlay_set_ipv6 (overlay, 1); @@ -310,6 +324,16 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) goto error; } + /* If the PMI size is 1, then skip the PMI exchange entirely. + */ + if (info.size == 1) { + if (hostlist_append (hl, hostname) < 0) { + log_err ("hostlist_append"); + goto error; + } + goto done; + } + /* Each broker writes a "business card" consisting of hostname, * public key, and URI (empty string for leaf node). */ @@ -390,10 +414,13 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) /* Fetch the business card of children and inform overlay of public keys. */ - for (i = 0; i < child_count; i++) { + for (int i = 0; i < child_count; i++) { const char *peer_pubkey; int child_rank = child_ranks[i]; + if (child_rank >= info.size) + break; + if (snprintf (key, sizeof (key), "%d", child_rank) >= sizeof (key)) { log_msg ("pmi key string overflow"); goto error; @@ -428,7 +455,7 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) /* Fetch the business card of all ranks and build hostlist. * The hostlist is built independently (and in parallel) on all ranks. */ - for (i = 0; i < info.size; i++) { + for (int i = 0; i < info.size; i++) { const char *peer_hostname; if (snprintf (key, sizeof (key), "%d", i) >= sizeof (key)) { @@ -471,6 +498,17 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) } done: + /* If the instance size is greater than the PMI size, add placeholder + * names to the hostlist for the ranks that haven't joined yet. + */ + for (int i = info.size; i < size; i++) { + char buf[64]; + snprintf (buf, sizeof (buf), "extra%d", i - info.size); + if (hostlist_append (hl, buf) < 0) { + log_err ("hostlist_append"); + goto error; + } + } if (set_hostlist_attr (attrs, hl) < 0) { log_err ("setattr hostlist"); goto error; diff --git a/src/broker/broker.c b/src/broker/broker.c index e417e4d6c659..850d132936e6 100644 --- a/src/broker/broker.c +++ b/src/broker/broker.c @@ -75,6 +75,7 @@ #include "runat.h" #include "heaptrace.h" #include "exec.h" +#include "boot_flub.h" #include "boot_config.h" #include "boot_pmi.h" #include "publisher.h" @@ -211,7 +212,6 @@ int main (int argc, char *argv[]) struct sigaction old_sigact_term; flux_msg_handler_t **handlers = NULL; const flux_conf_t *conf; - const char *method; flux_error_t error; setlocale (LC_ALL, ""); @@ -355,24 +355,31 @@ int main (int argc, char *argv[]) init_attrs_starttime (ctx.attrs, ctx.starttime); /* Execute broker network bootstrap. - * Default method is pmi. - * If [bootstrap] is defined in configuration, use static configuration. */ - if (attr_get (ctx.attrs, "broker.boot-method", &method, NULL) < 0) { + const char *boot_method; + if (attr_get (ctx.attrs, "broker.boot-method", &boot_method, NULL) < 0) { if (flux_conf_unpack (conf, NULL, "{s:{}}", "bootstrap") == 0) - method = "config"; + boot_method = "config"; + else if (attr_get (ctx.attrs, "broker.boot-server", NULL, NULL) == 0) + boot_method = "flub"; else - method = NULL; + boot_method = NULL; } - if (!method || !streq (method, "config")) { - if (boot_pmi (ctx.overlay, ctx.attrs) < 0) { - log_msg ("bootstrap failed"); + if (boot_method && streq (boot_method, "config")) { + if (boot_config (ctx.h, ctx.overlay, ctx.attrs) < 0) { + log_msg ("boot-config failed"); + goto cleanup; + } + } + else if (boot_method && streq (boot_method, "flub")) { + if (boot_flub (&ctx, &error) < 0) { + log_msg ("boot-flub: %s", error.text); goto cleanup; } } else { - if (boot_config (ctx.h, ctx.overlay, ctx.attrs) < 0) { - log_msg ("bootstrap failed"); + if (boot_pmi (ctx.overlay, ctx.attrs) < 0) { + log_msg ("boot-pmi failed"); goto cleanup; } } diff --git a/src/broker/overlay.c b/src/broker/overlay.c index b53f419d1701..0c19726f5617 100644 --- a/src/broker/overlay.c +++ b/src/broker/overlay.c @@ -38,6 +38,7 @@ #include "src/common/libutil/errprintf.h" #include "src/common/librouter/rpc_track.h" #include "ccan/str/str.h" +#include "ccan/array_size/array_size.h" #include "overlay.h" #include "attr.h" @@ -192,6 +193,7 @@ struct overlay { struct flux_msglist *health_requests; struct flux_msglist *trace_requests; + struct idset *flub_rankpool; }; static void overlay_mcast_child (struct overlay *ov, flux_msg_t *msg); @@ -297,6 +299,16 @@ int overlay_set_topology (struct overlay *ov, struct topology *topo) ov->size = topology_get_size (topo); ov->rank = topology_get_rank (topo); + if (ov->rank == 0) { + if (!(ov->flub_rankpool = idset_create (ov->size, 0))) + goto error; + } + if (!cert_meta_get (ov->cert, "name")) { + char val[16]; + snprintf (val, sizeof (val), "%lu", (unsigned long)ov->rank); + if (cert_meta_set (ov->cert, "name", val) < 0) + goto error; + } ov->child_count = child_count; if (ov->child_count > 0) { int i; @@ -1584,6 +1596,7 @@ int overlay_register_attrs (struct overlay *overlay) overlay->rank, ATTR_IMMUTABLE) < 0) return -1; + (void)attr_delete (overlay->attrs, "size", true); if (attr_add_uint32 (overlay->attrs, "size", overlay->size, ATTR_IMMUTABLE) < 0) @@ -2319,11 +2332,142 @@ static int overlay_configure_topo (struct overlay *ov) return 0; } +static int overlay_flub_alloc (struct overlay *ov, int *rank) +{ + unsigned int id; + + if (!ov->flub_rankpool) { // created by overlay_set_topology() + errno = EINVAL; + return -1; + } + if ((id = idset_first (ov->flub_rankpool)) != IDSET_INVALID_ID) { + if (idset_clear (ov->flub_rankpool, id) < 0) + return -1; + *rank = (int)id; + return 0; + } + errno = ENOENT; + return -1; +} + +int overlay_flub_provision (struct overlay *ov, + uint32_t lo_rank, + uint32_t hi_rank, + bool available) +{ + if (!ov->flub_rankpool) { // created by overlay_set_topology() + errno = EINVAL; + return -1; + } + if (available) + return idset_range_set (ov->flub_rankpool, lo_rank, hi_rank); + return idset_range_clear (ov->flub_rankpool, lo_rank, hi_rank); +} + +static json_t *flub_dict_create (attr_t *attrs) +{ + const char *names[] = { "hostlist", "instance-level" }; + json_t *dict; + + if (!(dict = json_object ())) + goto nomem; + for (int i = 0; i < ARRAY_SIZE (names); i++) { + const char *val; + json_t *o; + if (attr_get (attrs, names[i], &val, NULL) < 0) + goto error; + if (!(o = json_string (val)) + || json_object_set_new (dict, names[i], o) < 0) { + json_decref (o); + goto nomem; + } + } + return dict; +nomem: + errno = ENOMEM; +error: + ERRNO_SAFE_WRAP (json_decref, dict); + return NULL; +} + +static void overlay_flub_getinfo_cb (flux_t *h, + flux_msg_handler_t *mh, + const flux_msg_t *msg, + void *arg) +{ + struct overlay *ov = arg; + const char *errmsg = NULL; + json_t *attrs = NULL; + int rank; + + if (flux_request_unpack (msg, NULL, "{}") < 0) + goto error; + if (overlay_flub_alloc (ov, &rank) < 0) { + errmsg = "there are no available ranks"; + goto error; + } + if (!(attrs = flub_dict_create (ov->attrs))) + goto error; + if (flux_respond_pack (h, + msg, + "{s:i s:i s:O}", + "rank", rank, + "size", ov->size, + "attrs", attrs) < 0) + flux_log_error (h, "error responding to overlay.flub-getinfo request"); + json_decref (attrs); + return; +error: + if (flux_respond_error (h, msg, errno, errmsg) < 0) + flux_log_error (h, "error responding to overlay.flub-getinfo request"); + json_decref (attrs); +} + +static void overlay_flub_kex_cb (flux_t *h, + flux_msg_handler_t *mh, + const flux_msg_t *msg, + void *arg) +{ + struct overlay *ov = arg; + const char *errmsg = NULL; + const char *name; + const char *pubkey; + + if (flux_request_unpack (msg, + NULL, + "{s:s s:s}", + "name", &name, + "pubkey", &pubkey) < 0) + goto error; + if (ov->child_count == 0) { + errmsg = "this broker cannot have children"; + errno = EINVAL; + goto error; + } + if (overlay_authorize (ov, name, pubkey) < 0) { + errmsg = "failed to authorize public key"; + goto error; + } + if (flux_respond_pack (h, + msg, + "{s:s s:s s:s}", + "pubkey", overlay_cert_pubkey (ov), + "name", overlay_cert_name (ov), + "uri", overlay_get_bind_uri (ov)) < 0) + flux_log_error (h, "error responding to overlay.flub-kex request"); + return; +error: + if (flux_respond_error (h, msg, errno, errmsg) < 0) + flux_log_error (h, "error responding to overlay.flub-kex request"); +} + void overlay_destroy (struct overlay *ov) { if (ov) { int saved_errno = errno; + idset_destroy (ov->flub_rankpool); + flux_msglist_destroy (ov->health_requests); cert_destroy (ov->cert); @@ -2377,6 +2521,18 @@ static const struct flux_msg_handler_spec htab[] = { overlay_trace_cb, 0, }, + { + FLUX_MSGTYPE_REQUEST, + "overlay.flub-kex", + overlay_flub_kex_cb, + 0, + }, + { + FLUX_MSGTYPE_REQUEST, + "overlay.flub-getinfo", + overlay_flub_getinfo_cb, + 0, + }, { FLUX_MSGTYPE_REQUEST, "overlay.stats-get", diff --git a/src/broker/overlay.h b/src/broker/overlay.h index 78be2de45a85..687f8454b4ad 100644 --- a/src/broker/overlay.h +++ b/src/broker/overlay.h @@ -145,6 +145,13 @@ int overlay_set_monitor_cb (struct overlay *ov, overlay_monitor_f cb, void *arg); +/* Make a range of ranks available/unavailable for flub bootstrap + */ +int overlay_flub_provision (struct overlay *ov, + uint32_t lo_rank, + uint32_t hi_rank, + bool available); + /* Register overlay-related broker attributes. */ int overlay_register_attrs (struct overlay *overlay); diff --git a/src/broker/state_machine.c b/src/broker/state_machine.c index f3c993ae5147..bfbb4b78e1cc 100644 --- a/src/broker/state_machine.c +++ b/src/broker/state_machine.c @@ -925,6 +925,24 @@ static void broker_online_cb (flux_future_t *f, void *arg) } idset_destroy (loss); } + /* A broker that drops out of s->quorum.online is provisioned + * for replacement via flub, and unprovisioned if it returns. + */ + if (previous_online) { + unsigned int id; + id = idset_first (previous_online); + while (id != IDSET_INVALID_ID) { // online -> offline + if (!idset_test (s->quorum.online, id)) + (void)overlay_flub_provision (s->ctx->overlay, id, id, true); + id = idset_next (previous_online, id); + } + id = idset_first (s->quorum.online); + while (id != IDSET_INVALID_ID) { // offline -> online + if (!idset_test (previous_online, id)) + (void)overlay_flub_provision (s->ctx->overlay, id, id, false); + id = idset_next (s->quorum.online, id); + } + } idset_destroy (previous_online); flux_future_reset (f); diff --git a/t/Makefile.am b/t/Makefile.am index d36d56e0cadb..c37aabbd3897 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -95,6 +95,8 @@ TESTSCRIPTS = \ t0022-jj-reader.t \ t0023-jobspec1-validate.t \ t0026-flux-R.t \ + t0033-size-override.t \ + t0034-flub.t \ t0090-content-enospc.t \ t1000-kvs.t \ t1001-kvs-internals.t \ diff --git a/t/t0033-size-override.t b/t/t0033-size-override.t new file mode 100755 index 000000000000..ba8c2ba904b4 --- /dev/null +++ b/t/t0033-size-override.t @@ -0,0 +1,52 @@ +#!/bin/sh +# + +test_description='Test instance size override' + +. `dirname $0`/sharness.sh + +test_expect_success 'size override fails if too small' ' + test_must_fail flux broker \ + -Sbroker.rc1_path= -Sbroker.rc3_path= \ + -Ssize=1,-Sbroker.quorum=1 /bin/true 2>toosmall.err && + grep "may only be increased" toosmall.err +' +test_expect_success 'an instance can be started with an extra rank' ' + flux broker \ + -Sbroker.rc1_path= -Sbroker.rc3_path= \ + -Ssize=2 -Sbroker.quorum=1 \ + flux overlay status --no-pretty >overlay.out +' +test_expect_success 'flux overlay status shows the extra rank offline' ' + grep "1 extra0: offline" overlay.out +' +test_expect_success 'an instance can be started with PMI plus extra' ' + flux start -s2 -o,-Stbon.topo=kary:0 \ + -o,-Sbroker.rc1_path=,-Sbroker.rc3_path= \ + -o,-Ssize=3,-Sbroker.quorum=2 \ + flux overlay status --no-pretty >overlay2.out +' +test_expect_success 'flux overlay status shows the extra rank offline' ' + grep "2 extra0: offline" overlay2.out +' +test_expect_success 'create config with fake resources' ' + cat >fake.toml <<-EOT + [resource] + noverify = true + [[resource.config]] + hosts = "a,b,c" + cores = "0" + EOT +' +test_expect_success 'start full 2-broker instance with one extra rank' ' + flux start -s2 -o,-Ssize=3,-Sbroker.quorum=2,-cfake.toml \ + flux resource status -s offline -no {nnodes} >offline.out +' +test_expect_success 'flux resource status reports one node offline' ' + cat >offline.exp <<-EOT && + 1 + EOT + test_cmp offline.exp offline.out +' + +test_done diff --git a/t/t0034-flub.t b/t/t0034-flub.t new file mode 100755 index 000000000000..2a19f8bb1ecb --- /dev/null +++ b/t/t0034-flub.t @@ -0,0 +1,193 @@ +#!/bin/sh +# + +test_description='Test flub bootstrap method' + +. `dirname $0`/sharness.sh + +test_under_flux 8 full + +export FLUX_URI_RESOLVE_LOCAL=t + +# usage: get_job_uri id +get_job_uri() { + flux job wait-event -t10 $1 memo >/dev/null && flux uri $1 +} + +# usage: wait_for_service uri name +wait_for_service() { + flux proxy $1 bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\" +} + +test_expect_success 'broker fails with bad broker.boot-server' ' + test_must_fail flux broker \ + -Sbroker.rc1_path= -Sbroker.rc3_path= \ + -Sbroker.boot-server=local://noexist/path \ + /bin/true 2>server.err && + grep "was not found" server.err +' + +test_expect_success 'start a 1 node job with 0 extra ranks' ' + id=$(flux batch -N1 --wrap sleep inf) && + get_job_uri $id >test1.uri +' +test_expect_success 'job has size 1' ' + size=$(flux proxy $(cat test1.uri) flux getattr size) && + test $size -eq 1 +' +test_expect_success 'flub bootstrap fails with no available ranks' ' + test_must_fail flux broker \ + -Sbroker.boot-server=$(cat test1.uri) 2>noranks.err && + grep "no available ranks" noranks.err +' +test_expect_success 'clean up' ' + flux cancel --all +' + + +# +# Start 2 node batch job with one extra slot. +# Submit 1 node broker job that fills the slot. +# Run a parallel job across all three nodes in the batch job. +# This test is constrained so that all flubbed nodes are leaf nodes, +# and the flubbed nodes connect to rank 0 only. + +test_expect_success 'create config with 3 fake nodes' ' + cat >fake3.toml <<-EOT + [resource] + noverify = true + [[resource.config]] + hosts = "a,b,c" + cores = "0-3" + EOT +' +test_expect_success 'start a 2 node job with 1 extra rank' ' + id=$(flux batch -N2 \ + --broker-opts=--config-path=fake3.toml \ + --broker-opts=-Ssize=3 \ + --broker-opts=-Sbroker.quorum=2 \ + --broker-opts=-Stbon.topo=kary:0 \ + --wrap sleep inf) && + get_job_uri $id >test2.uri +' +test_expect_success 'job has size 3' ' + size=$(flux proxy $(cat test2.uri) flux getattr size) && + test $size -eq 3 +' +test_expect_success 'overlay status shows extra node offline' ' + flux proxy $(cat test2.uri) \ + flux overlay status --no-pretty >ov2.out && + grep "2 extra0: offline" ov2.out +' +test_expect_success 'run a 2 node job in the initial instance' ' + wait_for_service $(cat test2.uri) job-ingest && + run_timeout 30 flux proxy $(cat test2.uri) \ + flux run --label-io -N2 flux pmi barrier +' +test_expect_success 'submit a job that starts 1 extra broker' ' + id=$(flux submit -N1 flux broker \ + --config-path=fake3.toml \ + -Stbon.topo=kary:0 \ + -Sbroker.boot-server=$(cat test2.uri)) && + flux job wait-event -p guest.exec.eventlog $id shell.start +' +test_expect_success 'wait for overlay status to be full' ' + flux proxy $(cat test2.uri) \ + flux overlay status --summary --wait full --timeout 30s +' +test_expect_success 'run a 3 node job in the expanded instance' ' + run_timeout 30 flux proxy $(cat test2.uri) \ + flux run --label-io -N3 flux pmi barrier +' +test_expect_success 'clean up' ' + flux cancel --all +' + +test_expect_success 'create config with 7 fake nodes' ' + cat >fake7.toml <<-EOT + [resource] + noverify = true + [[resource.config]] + hosts = "a,b,c,d,e,f,g" + cores = "0-3" + EOT +' + +# +# Start 1 node batch job with 6 extra slots (kary:2). +# Submit 6 node broker job that fills all the slots. +# Run a 7 node parallel job. +# +test_expect_success 'start a 1 node job with 6 extra ranks' ' + id=$(flux batch -N1 \ + --broker-opts=--config-path=fake7.toml \ + --broker-opts=-Ssize=7 \ + --broker-opts=-Sbroker.quorum=1 \ + --broker-opts=-Stbon.topo=kary:2 \ + --wrap sleep inf) && + get_job_uri $id >test5.uri +' +test_expect_success 'run a 1 node job in the initial instance' ' + wait_for_service $(cat test5.uri) job-ingest && + run_timeout 30 flux proxy $(cat test5.uri) \ + flux run --label-io -N1 flux pmi barrier +' +test_expect_success 'job has size 7' ' + size=$(flux proxy $(cat test5.uri) flux getattr size) && + test $size -eq 7 +' +# N.B. include exit-timeout=none so we can safely disconnect one node later +test_expect_success 'submit a job that starts 6 extra brokers' ' + id=$(flux submit -N6 -o exit-timeout=none \ + flux broker \ + --config-path=fake7.toml \ + -Stbon.topo=kary:2 \ + -Sbroker.boot-server=$(cat test5.uri)) && + flux job wait-event -p guest.exec.eventlog $id shell.start && + echo $id >xtra_id +' +test_expect_success 'wait for overlay status to be full' ' + flux proxy $(cat test5.uri) \ + flux overlay status --summary --wait full --timeout 10s +' +test_expect_success 'run a 7 node job in the expanded instance' ' + run_timeout 30 flux proxy $(cat test5.uri) \ + flux run --label-io -N7 flux pmi barrier +' + +# +# Show that a node can be replaced + +test_expect_success 'disconnect rank 6' ' + flux proxy $(cat test5.uri) \ + flux overlay disconnect 6 +' +test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' ' + test_must_fail flux proxy $(cat test5.uri) \ + flux ping -c1 6 +' +test_expect_success 'wait for overlay status to be degraded' ' + flux proxy $(cat test5.uri) \ + flux overlay status --summary --wait degraded --timeout 10s +' +test_expect_success 'submit a job that starts 1 broker' ' + id=$(flux submit -N1 flux broker \ + --config-path=fake7.toml \ + -Stbon.topo=kary:2 \ + -Sbroker.boot-server=$(cat test5.uri)) && + flux job wait-event -p guest.exec.eventlog $id shell.start +' +test_expect_success 'wait for overlay status to be full' ' + flux proxy $(cat test5.uri) \ + flux overlay status --summary --wait full --timeout 10s +' +test_expect_success 'run a 7 node job in the expanded instance' ' + run_timeout 30 flux proxy $(cat test5.uri) \ + flux run --label-io -N7 flux pmi barrier +' + +test_expect_success 'clean up' ' + flux cancel --all +' + +test_done