Skip to content

Commit

Permalink
prov/efa: Map EFA errnos to Libfabric codes
Browse files Browse the repository at this point in the history
This adds a rudimentary function to map proprietary EFA status codes to
common Libfabric status codes. This is useful when reporting errors to
the application for operations that rely solely on ibverbs or RDMA Core,
such as CQ polling.

Signed-off-by: Darryl Abbate <[email protected]>
  • Loading branch information
darrylabbate authored and shijin-aws committed Apr 17, 2024
1 parent b825778 commit acc217b
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 13 deletions.
43 changes: 43 additions & 0 deletions prov/efa/src/efa_errno.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#ifndef EFA_ERRNO_H
#define EFA_ERRNO_H

#include <ofi_osd.h>

#define EFA_IO_COMP_STATUS_START 0

/**
Expand Down Expand Up @@ -132,6 +134,47 @@ enum efa_errno {
#undef EFA_IO_COMP_STATUS_ENUM
#undef EFA_PROV_ERRNO_ENUM

/**
 * @brief Translate an EFA provider error code into a generic Libfabric errno
 *
 * Several EFA codes collapse onto the same Libfabric value. Any code that
 * has no explicit mapping below yields FI_EOTHER.
 *
 * @param[in]	err	EFA-specific error code to translate
 * @return	The corresponding common Libfabric error code
 *
 * @sa fi_errno(3)
 */
static inline int to_fi_errno(enum efa_errno err)
{
	int fi_err;

	switch (err) {
	case EFA_IO_COMP_STATUS_OK:
		fi_err = FI_SUCCESS;
		break;

	case EFA_IO_COMP_STATUS_FLUSHED:
		fi_err = FI_EHOSTDOWN;
		break;

	/* Locally detected bad arguments and a bad remote address */
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR:
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH:
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY:
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE:
	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS:
		fi_err = FI_EINVAL;
		break;

	case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
		fi_err = FI_EHOSTUNREACH;
		break;

	/* Length errors reported by either side */
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH:
	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH:
		fi_err = FI_EMSGSIZE;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT:
	case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
		fi_err = FI_ECONNABORTED;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN:
		fi_err = FI_ENOTCONN;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR:
		fi_err = FI_ENORX;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS:
		fi_err = FI_EREMOTEIO;
		break;

	case FI_EFA_ERR_OOM:
		fi_err = FI_ENOMEM;
		break;

	default:
		fi_err = FI_EOTHER;
		break;
	}

	return fi_err;
}

/*
 * Prototype only; the definition is not visible in this header section.
 * NOTE(review): presumably returns a human-readable description of the
 * given EFA error code -- confirm against the definition.
 */
const char *efa_strerror(enum efa_errno);
/*
 * Prototype only; the definition is not visible in this header section.
 * NOTE(review): presumably emits troubleshooting help for the given EFA
 * error code -- confirm against the definition.
 */
void efa_show_help(enum efa_errno);

Expand Down
4 changes: 2 additions & 2 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -318,11 +318,11 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq)
case IBV_WC_SEND: /* fall through */
case IBV_WC_RDMA_WRITE: /* fall through */
case IBV_WC_RDMA_READ:
efa_rdm_pke_handle_tx_error(pkt_entry, FI_EIO, prov_errno);
efa_rdm_pke_handle_tx_error(pkt_entry, prov_errno);
break;
case IBV_WC_RECV: /* fall through */
case IBV_WC_RECV_RDMA_WITH_IMM:
efa_rdm_pke_handle_rx_error(pkt_entry, FI_EIO, prov_errno);
efa_rdm_pke_handle_rx_error(pkt_entry, prov_errno);
break;
default:
EFA_WARN(FI_LOG_EP_CTRL, "Unhandled op code %d\n", opcode);
Expand Down
11 changes: 6 additions & 5 deletions prov/efa/src/rdm/efa_rdm_pke_cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,16 +374,17 @@ void efa_rdm_pke_handle_data_copied(struct efa_rdm_pke *pkt_entry)
* For other types of error, an error EQ entry is written.
*
* @param[in] pkt_entry pkt entry
* @param[in] err libfabric error code
* @param[in] prov_errno provider specific error code
*/
void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int err, int prov_errno)
void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno)
{
struct efa_rdm_peer *peer;
struct efa_rdm_ope *txe;
struct efa_rdm_ope *rxe;
struct efa_rdm_ep *ep;

int err = to_fi_errno(prov_errno);

assert(pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_TX_POOL);

EFA_DBG(FI_LOG_CQ, "Packet send error: %s (%d)\n",
Expand Down Expand Up @@ -459,7 +460,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int err, int pro
*/
if (!(txe->internal_flags & EFA_RDM_TXE_WRITTEN_RNR_CQ_ERR_ENTRY)) {
txe->internal_flags |= EFA_RDM_TXE_WRITTEN_RNR_CQ_ERR_ENTRY;
efa_rdm_txe_handle_error(pkt_entry->ope, FI_ENORX, prov_errno);
efa_rdm_txe_handle_error(pkt_entry->ope, err, prov_errno);
}

efa_rdm_pke_release_tx(pkt_entry);
Expand Down Expand Up @@ -653,12 +654,12 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry)
* This function will write error cq or eq entry, then release the packet entry.
*
* @param[in] pkt_entry pkt entry
* @param[in] err libfabric error code
* @param[in] prov_errno provider specific error code
*/
void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int err, int prov_errno)
void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno)
{
struct efa_rdm_ep *ep;
int err = to_fi_errno(prov_errno);

ep = pkt_entry->ep;
/*
Expand Down
6 changes: 2 additions & 4 deletions prov/efa/src/rdm/efa_rdm_pke_cmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,11 @@ fi_addr_t efa_rdm_pke_determine_addr(struct efa_rdm_pke *pkt_entry);

void efa_rdm_pke_handle_data_copied(struct efa_rdm_pke *pkt_entry);

void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry,
int err, int prov_errno);
void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno);

void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry);

void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry,
int err, int prov_errno);
void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno);

void efa_rdm_pke_handle_recv_completion(struct efa_rdm_pke *pkt_entry);

Expand Down
4 changes: 2 additions & 2 deletions prov/efa/test/efa_unit_test_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource,
strerror = fi_cq_strerror(resource->cq, cq_err_entry.prov_errno, cq_err_entry.err_data, NULL, 0);

assert_int_equal(ret, 1);
assert_int_equal(cq_err_entry.err, FI_EIO);
assert_int_not_equal(cq_err_entry.err, FI_SUCCESS);
assert_int_equal(cq_err_entry.prov_errno, vendor_error);

/* Reset value */
Expand Down Expand Up @@ -326,7 +326,7 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state)

ret = fi_eq_readerr(resource->eq, &eq_err_entry, 0);
assert_int_equal(ret, sizeof(eq_err_entry));
assert_int_equal(eq_err_entry.err, FI_EIO);
assert_int_not_equal(eq_err_entry.err, FI_SUCCESS);
assert_int_equal(eq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
}

Expand Down

0 comments on commit acc217b

Please sign in to comment.