From 3b3b26c32112d449b4e918cf20500e8721044040 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 19 Nov 2024 13:45:09 -0800 Subject: [PATCH] prov/efa: Disable resource management when user set FI_OPT_EFA_RNR_RETRY If a user is explicitly asking for a retry count that means they also want to manage the resources themselves and the EFA provider should disable resource management to prevent implicit retries. Also remove the unused function. Signed-off-by: Jessie Yang --- fabtests/prov/efa/src/efa_rnr_shared.c | 8 ++++++-- prov/efa/src/rdm/efa_rdm_ep.h | 21 --------------------- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 6 ++++++ prov/efa/test/efa_unit_test_ep.c | 25 +++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 1 + 6 files changed, 39 insertions(+), 23 deletions(-) diff --git a/fabtests/prov/efa/src/efa_rnr_shared.c b/fabtests/prov/efa/src/efa_rnr_shared.c index 87a3f452b90..fb62be4f49d 100644 --- a/fabtests/prov/efa/src/efa_rnr_shared.c +++ b/fabtests/prov/efa/src/efa_rnr_shared.c @@ -77,9 +77,13 @@ int ft_efa_rnr_init_fabric() } fprintf(stdout, "Setting RNR retry count to %zu ...\n", rnr_retry); - ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_EFA_RNR_RETRY, &rnr_retry, sizeof(rnr_retry)); + /* + * Cannot use setopt FI_OPT_EFA_RNR_RETRY here because it sets FI_RM_DISABLED, + * which conflicts with the queue/resend test that requires FI_RM_ENABLED. + */ + ret = setenv("FI_EFA_RNR_RETRY", "0", 1); if (ret) { - FT_PRINTERR("fi_setopt", -ret); + FT_PRINTERR("setenv", -ret); return ret; } fprintf(stdout, "RNR retry count has been set to %zu.\n", rnr_retry); diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index d7a8fc5ddc2..5d43b28afcd 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -263,27 +263,6 @@ struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep) void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep); -/** - * @brief return whether this endpoint should write error cq entry for RNR. - * - * For an endpoint to write RNR completion, two conditions must be met: - * - * First, the end point must be able to receive RNR completion from rdma-core, - * which means rnr_etry must be less then EFA_RNR_INFINITE_RETRY. - * - * Second, the app need to request this feature when opening endpoint - * (by setting info->domain_attr->resource_mgmt to FI_RM_DISABLED). - * The setting was saved as efa_rdm_ep->handle_resource_management. - * - * @param[in] ep endpoint - */ -static inline -bool efa_rdm_ep_should_write_rnr_completion(struct efa_rdm_ep *ep) -{ - return (efa_env.rnr_retry < EFA_RNR_INFINITE_RETRY) && - (ep->handle_resource_management == FI_RM_DISABLED); -} - /* * @brief: check whether we should use p2p for this transaction * diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index d8a1a3fc5e9..48ec1347289 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1697,6 +1697,12 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, return -FI_ENOSYS; } efa_rdm_ep->base_ep.rnr_retry = *(size_t *)optval; + /* + * If a user is explicitly asking for a retry count that means + * they also want to manage the resources themselves and the EFA + * provider should disable resource management to prevent implicit retries. + */ + efa_rdm_ep->handle_resource_management = FI_RM_DISABLED; break; case FI_OPT_FI_HMEM_P2P: if (optlen != sizeof(int)) diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 375ada94683..9403d2a44cc 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -858,6 +858,31 @@ void test_efa_rdm_ep_setopt_shared_memory_permitted(struct efa_resource **state) assert_null(ep->shm_ep); } +/* Test resource management is disabled when user sets FI_OPT_EFA_RNR_RETRY */ +void test_efa_rdm_ep_setopt_rnr_retry(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + size_t rnr_retry = 5; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + + efa_unit_test_resource_construct_with_hints( + resource, FI_EP_RDM, FI_VERSION(1, 14), resource->hints, false, + true); + + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, + FI_OPT_EFA_RNR_RETRY, &rnr_retry, + sizeof(rnr_retry)), 0); + + assert_int_equal(fi_enable(resource->ep), 0); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal(ep->handle_resource_management, FI_RM_DISABLED); +} + /** * @brief Test fi_enable with different optval of fi_setopt for * FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES optname. diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 2232ea36059..c1302fc29c6 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -92,6 +92,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_getopt_undersized_optlen, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_getopt_oversized_optlen, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_setopt_shared_memory_permitted, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_setopt_rnr_retry, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_pkt_pool_flags, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index d44368bc81f..9f4ff7bfef4 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -117,6 +117,7 @@ void test_efa_rdm_ep_send_with_shm_no_copy(); void test_efa_rdm_ep_rma_without_caps(); void test_efa_rdm_ep_atomic_without_caps(); void test_efa_rdm_ep_setopt_shared_memory_permitted(); +void test_efa_rdm_ep_setopt_rnr_retry(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good(); void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(); void test_efa_rdm_ep_user_zcpy_rx_disabled();