Skip to content

Commit

Permalink
prov/efa: Map EFA errnos to Libfabric codes
Browse files Browse the repository at this point in the history
This adds a rudimentary function to map proprietary EFA status codes to
common Libfabric status codes. This is useful when reporting errors to
the application for operations that rely solely on ibverbs or RDMA Core,
such as CQ polling.

Signed-off-by: Darryl Abbate <drl@amazon.com>
  • Loading branch information
darrylabbate committed Apr 3, 2024
1 parent d187bde commit 2897565
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 11 deletions.
43 changes: 43 additions & 0 deletions prov/efa/src/efa_errno.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#ifndef EFA_ERRNO_H
#define EFA_ERRNO_H

#include <rdma/fi_errno.h>

#define EFA_IO_COMP_STATUS_START 0

/**
Expand Down Expand Up @@ -132,6 +134,47 @@ enum efa_errno {
#undef EFA_IO_COMP_STATUS_ENUM
#undef EFA_PROV_ERRNO_ENUM

/**
 * @brief Translate an EFA provider error code into a generic Libfabric error code
 *
 * Several EFA codes share one Libfabric code. Any code that has no explicit
 * entry in the table below is reported as FI_EOTHER.
 *
 * @param[in]	err	EFA-specific error code to translate
 * @return	Libfabric error code paired with @p err: FI_SUCCESS for
 *		EFA_IO_COMP_STATUS_OK, FI_EOTHER for codes with no mapping
 *
 * @sa fi_errno(3)
 */
static inline int to_fi_errno(enum efa_errno err)
{
	/* Start from the catch-all so unmapped codes need no extra handling. */
	int fi_err = FI_EOTHER;

	switch (err) {
	case EFA_IO_COMP_STATUS_OK:
		fi_err = FI_SUCCESS;
		break;

	/* All of these are reported as "operation in bad state". */
	case EFA_IO_COMP_STATUS_FLUSHED:
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR:
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH:
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY:
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
	case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
		fi_err = FI_EOPBADSTATE;
		break;

	case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE:
		fi_err = FI_EOPNOTSUPP;
		break;

	/* A bad length on either side maps to the same message-size error. */
	case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH:
	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH:
		fi_err = FI_EMSGSIZE;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS:
		fi_err = FI_EFAULT;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT:
		fi_err = FI_ECONNABORTED;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN:
		fi_err = FI_ENOTCONN;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR:
		fi_err = FI_ENORX;
		break;

	case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS:
		fi_err = FI_EREMOTEIO;
		break;

	case FI_EFA_ERR_OOM:
		fi_err = FI_ENOMEM;
		break;

	default:
		/* Keep the FI_EOTHER fallback assigned above. */
		break;
	}

	return fi_err;
}

const char *efa_strerror(enum efa_errno);
void efa_show_help(enum efa_errno);

Expand Down
4 changes: 2 additions & 2 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -318,11 +318,11 @@ void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq)
case IBV_WC_SEND: /* fall through */
case IBV_WC_RDMA_WRITE: /* fall through */
case IBV_WC_RDMA_READ:
efa_rdm_pke_handle_tx_error(pkt_entry, FI_EIO, prov_errno);
efa_rdm_pke_handle_tx_error(pkt_entry, prov_errno);
break;
case IBV_WC_RECV: /* fall through */
case IBV_WC_RECV_RDMA_WITH_IMM:
efa_rdm_pke_handle_rx_error(pkt_entry, FI_EIO, prov_errno);
efa_rdm_pke_handle_rx_error(pkt_entry, prov_errno);
break;
default:
EFA_WARN(FI_LOG_EP_CTRL, "Unhandled op code %d\n", opcode);
Expand Down
11 changes: 6 additions & 5 deletions prov/efa/src/rdm/efa_rdm_pke_cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,16 +374,17 @@ void efa_rdm_pke_handle_data_copied(struct efa_rdm_pke *pkt_entry)
* For other types of error, an error EQ entry is written.
*
* @param[in] pkt_entry pkt entry
* @param[in] err libfabric error code
* @param[in] prov_errno provider specific error code
*/
void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int err, int prov_errno)
void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno)
{
struct efa_rdm_peer *peer;
struct efa_rdm_ope *txe;
struct efa_rdm_ope *rxe;
struct efa_rdm_ep *ep;

int err = to_fi_errno(prov_errno);

assert(pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_TX_POOL);

EFA_DBG(FI_LOG_CQ, "Packet send error: %s (%d)\n",
Expand Down Expand Up @@ -459,7 +460,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int err, int pro
*/
if (!(txe->internal_flags & EFA_RDM_TXE_WRITTEN_RNR_CQ_ERR_ENTRY)) {
txe->internal_flags |= EFA_RDM_TXE_WRITTEN_RNR_CQ_ERR_ENTRY;
efa_rdm_txe_handle_error(pkt_entry->ope, FI_ENORX, prov_errno);
efa_rdm_txe_handle_error(pkt_entry->ope, err, prov_errno);
}

efa_rdm_pke_release_tx(pkt_entry);
Expand Down Expand Up @@ -653,12 +654,12 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry)
* This function will write error cq or eq entry, then release the packet entry.
*
* @param[in] pkt_entry pkt entry
* @param[in] err libfabric error code
* @param[in] prov_errno provider specific error code
*/
void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int err, int prov_errno)
void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno)
{
struct efa_rdm_ep *ep;
int err = to_fi_errno(prov_errno);

ep = pkt_entry->ep;
/*
Expand Down
6 changes: 2 additions & 4 deletions prov/efa/src/rdm/efa_rdm_pke_cmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,11 @@ fi_addr_t efa_rdm_pke_determine_addr(struct efa_rdm_pke *pkt_entry);

void efa_rdm_pke_handle_data_copied(struct efa_rdm_pke *pkt_entry);

void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry,
int err, int prov_errno);
void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno);

void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry);

void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry,
int err, int prov_errno);
void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno);

void efa_rdm_pke_handle_recv_completion(struct efa_rdm_pke *pkt_entry);

Expand Down

0 comments on commit 2897565

Please sign in to comment.