diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 977002a3e09..70828f38130 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -594,6 +594,7 @@ struct fi_opx_rma_request { uint64_t hmem_device; enum fi_hmem_iface hmem_iface; uint32_t padding; + uint64_t hmem_handle; }; /* @@ -1840,12 +1841,12 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep *opx_ep, const union o switch (hdr->cts.target.opcode) { case FI_OPX_HFI_DPUT_OPCODE_RZV: case FI_OPX_HFI_DPUT_OPCODE_RZV_TID: { - const union fi_opx_hfi1_dput_iov *const dput_iov = payload->cts.iov; - const uintptr_t target_context_vaddr = hdr->cts.target.vaddr.target_context_vaddr; - const uint32_t niov = hdr->cts.target.vaddr.niov; - uint64_t *origin_byte_counter = (uint64_t *) hdr->cts.target.vaddr.origin_byte_counter_vaddr; + const union opx_hfi1_dput_iov *const dput_iov = payload->cts.iov; + const uintptr_t target_context_vaddr = hdr->cts.target.vaddr.target_context_vaddr; + const uint32_t niov = hdr->cts.target.vaddr.niov; + uint64_t *origin_byte_counter = (uint64_t *) hdr->cts.target.vaddr.origin_byte_counter_vaddr; OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RZV-CTS-HFI:%p", (void *) target_context_vaddr); - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, hdr, (const void *const) payload, 0, u8_rx, niov, dput_iov, + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, hdr, (const void *const) payload, 0, u8_rx, niov, dput_iov, NULL, (const uint8_t)(FI_NOOP - 1), (const uint8_t)(FI_VOID - 1), (uintptr_t) NULL, /* No RMA Request */ target_context_vaddr, origin_byte_counter, hdr->cts.target.opcode, NULL, @@ -1855,11 +1856,11 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep *opx_ep, const union o OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RZV-CTS-HFI:%p", (void *) target_context_vaddr); } break; case FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG: { - const union fi_opx_hfi1_dput_iov *const dput_iov = payload->cts.iov; - 
const uintptr_t target_context_vaddr = hdr->cts.target.vaddr.target_context_vaddr; - const uint32_t niov = hdr->cts.target.vaddr.niov; - uint64_t *origin_byte_counter = (uint64_t *) hdr->cts.target.vaddr.origin_byte_counter_vaddr; - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, hdr, (const void *const) payload, 0, u8_rx, niov, dput_iov, + const union opx_hfi1_dput_iov *const dput_iov = payload->cts.iov; + const uintptr_t target_context_vaddr = hdr->cts.target.vaddr.target_context_vaddr; + const uint32_t niov = hdr->cts.target.vaddr.niov; + uint64_t *origin_byte_counter = (uint64_t *) hdr->cts.target.vaddr.origin_byte_counter_vaddr; + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, hdr, (const void *const) payload, 0, u8_rx, niov, dput_iov, NULL, (const uint8_t)(FI_NOOP - 1), (const uint8_t)(FI_VOID - 1), (uintptr_t) NULL, /* No RMA Request */ target_context_vaddr, origin_byte_counter, @@ -1886,33 +1887,35 @@ void fi_opx_ep_rx_process_header_rzv_cts(struct fi_opx_ep *opx_ep, const union o #ifdef OPX_HMEM // Our MR code only supports 1 IOV per registration. 
uint64_t hmem_device; - enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device); + uint64_t hmem_handle; + enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device, &hmem_handle); assert(niov == 1); - const union fi_opx_hfi1_dput_iov dput_iov = {.rbuf = payload->cts.iov[0].rbuf, - .sbuf = payload->cts.iov[0].sbuf, - .bytes = payload->cts.iov[0].bytes, - .rbuf_iface = payload->cts.iov[0].rbuf_iface, - .rbuf_device = payload->cts.iov[0].rbuf_device, - .sbuf_iface = hmem_iface, - .sbuf_device = hmem_device}; - const union fi_opx_hfi1_dput_iov *const dput_iov_ptr = &dput_iov; + const union opx_hfi1_dput_iov dput_iov = {.rbuf = payload->cts.iov[0].rbuf, + .sbuf = payload->cts.iov[0].sbuf, + .bytes = payload->cts.iov[0].bytes, + .rbuf_iface = payload->cts.iov[0].rbuf_iface, + .rbuf_device = payload->cts.iov[0].rbuf_device, + .sbuf_iface = hmem_iface, + .sbuf_device = hmem_device, + .sbuf_handle = hmem_handle}; + const union opx_hfi1_dput_iov *const dput_iov_ptr = &dput_iov; #else - const union fi_opx_hfi1_dput_iov *const dput_iov_ptr = payload->cts.iov; + const union opx_hfi1_dput_iov *const dput_iov_ptr = payload->cts.iov; #endif - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, opx_mr, hdr, (const void *const) payload, 0, u8_rx, niov, dput_iov_ptr, - hdr->cts.target.mr.op, hdr->cts.target.mr.dt, rma_request_vaddr, - (uintptr_t) NULL, /* Target completion counter is in rma_request */ - NULL, /* No origin byte counter here */ - FI_OPX_HFI_DPUT_OPCODE_GET, NULL, - is_intranode, /* compile-time constant expression */ - reliability, /* compile-time constant expression */ - u32_ext_rx, hfi1_type); + FI_OPX_FABRIC_RX_RZV_CTS( + opx_ep, hdr, (const void *const) payload, 0, u8_rx, niov, dput_iov_ptr, opx_mr->base_addr, + hdr->cts.target.mr.op, hdr->cts.target.mr.dt, rma_request_vaddr, + (uintptr_t) NULL, /* Target completion counter is in rma_request */ + NULL, /* No origin byte counter here */ + FI_OPX_HFI_DPUT_OPCODE_GET, NULL, is_intranode, /* 
compile-time constant expression */ + reliability, /* compile-time constant expression */ + u32_ext_rx, hfi1_type); } break; case FI_OPX_HFI_DPUT_OPCODE_PUT_CQ: { - const union fi_opx_hfi1_dput_iov *const dput_iov_ptr = payload->cts.iov; - FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, NULL, hdr, (const void *const) payload, 0, u8_rx, - hdr->cts.target.rma.niov, dput_iov_ptr, hdr->cts.target.rma.op, - hdr->cts.target.rma.dt, hdr->cts.target.rma.origin_rma_request_vaddr, + const union opx_hfi1_dput_iov *const dput_iov_ptr = payload->cts.iov; + FI_OPX_FABRIC_RX_RZV_CTS(opx_ep, hdr, (const void *const) payload, 0, u8_rx, hdr->cts.target.rma.niov, + dput_iov_ptr, NULL, hdr->cts.target.rma.op, hdr->cts.target.rma.dt, + hdr->cts.target.rma.origin_rma_request_vaddr, hdr->cts.target.rma.rma_request_vaddr, NULL, /* No origin byte counter here */ FI_OPX_HFI_DPUT_OPCODE_PUT_CQ, NULL, is_intranode, /* compile-time constant expression */ @@ -1961,7 +1964,8 @@ void fi_opx_ep_rx_process_header_rma_rts(struct fi_opx_ep *opx_ep, const union o uint64_t *rbuf_qws = (uint64_t *) (((uint8_t *) opx_mr->base_addr) + payload->rma_rts.iov[0].rbuf); uint64_t rbuf_device; - enum fi_hmem_iface rbuf_iface = opx_hmem_get_mr_iface(opx_mr, &rbuf_device); + uint64_t rbuf_handle; + enum fi_hmem_iface rbuf_iface = opx_hmem_get_mr_iface(opx_mr, &rbuf_device, &rbuf_handle); struct opx_context *context = (struct opx_context *) ofi_buf_alloc(opx_ep->rx->ctx_pool); context->flags = FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE; @@ -1972,10 +1976,10 @@ void fi_opx_ep_rx_process_header_rma_rts(struct fi_opx_ep *opx_ep, const union o context->err_entry.err = 0; context->err_entry.op_context = NULL; - fi_opx_hfi1_rx_rma_rts(opx_ep, hdr, payload, hdr->rma_rts.niov, hdr->rma_rts.rma_request_vaddr, context, - (uintptr_t) rbuf_qws, rbuf_iface, rbuf_device, payload->rma_rts.iov, is_intranode, - reliability, /* compile-time constant expression */ - hfi1_type); + opx_hfi1_rx_rma_rts(opx_ep, hdr, payload, hdr->rma_rts.niov, 
hdr->rma_rts.rma_request_vaddr, context, + (uintptr_t) rbuf_qws, rbuf_iface, rbuf_device, rbuf_handle, payload->rma_rts.iov, + is_intranode, reliability, /* compile-time constant expression */ + hfi1_type); /* post a pending completion event for when the PUT completes */ if (lock_required) { @@ -2171,7 +2175,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep *opx_ep, const union uint64_t *rbuf_qws = (uint64_t *) (((uint8_t *) opx_mr->base_addr) + fi_opx_dput_rbuf_in(hdr->dput.target.mr.offset)); uint64_t hmem_device; - enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device); + uint64_t hmem_handle; + enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device, &hmem_handle); /* In a multi-packet SDMA send, the driver sets the high bit on * in the PSN to indicate this is the last packet. The payload @@ -2193,8 +2198,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep *opx_ep, const union #endif // Optimize Memcpy if (hdr->dput.target.op == FI_NOOP - 1 && hdr->dput.target.dt == FI_VOID - 1) { - OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, hmem_iface, hmem_device); + OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, hmem_handle, OPX_HMEM_DEV_REG_RECV_THRESHOLD, + hmem_iface, hmem_device); } else { OPX_HMEM_ATOMIC_DISPATCH(sbuf_qws, rbuf_qws, bytes, hdr->dput.target.dt, hdr->dput.target.op, hmem_iface, hmem_device); @@ -2229,8 +2234,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep *opx_ep, const union #endif // Optimize Memcpy if (hdr->dput.target.op == (FI_NOOP - 1) && hdr->dput.target.dt == (FI_VOID - 1)) { - OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, rma_req->hmem_iface, rma_req->hmem_device); + OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, rma_req->hmem_handle, + OPX_HMEM_DEV_REG_RECV_THRESHOLD, rma_req->hmem_iface, rma_req->hmem_device); } else { OPX_HMEM_ATOMIC_DISPATCH(sbuf_qws, 
rbuf_qws, bytes, hdr->dput.target.dt, hdr->dput.target.op, rma_req->hmem_iface, rma_req->hmem_device); @@ -2279,8 +2284,8 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep *opx_ep, const union } #endif if (hdr->dput.target.dt == (FI_VOID - 1)) { - OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, rma_req->hmem_iface, rma_req->hmem_device); + OPX_HMEM_COPY_TO(rbuf_qws, sbuf_qws, bytes, rma_req->hmem_handle, + OPX_HMEM_DEV_REG_RECV_THRESHOLD, rma_req->hmem_iface, rma_req->hmem_device); } else { OPX_HMEM_ATOMIC_DISPATCH(sbuf_qws, rbuf_qws, bytes, hdr->dput.target.dt, FI_ATOMIC_WRITE, rma_req->hmem_iface, rma_req->hmem_device); @@ -2325,25 +2330,27 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep *opx_ep, const union #endif assert(bytes > sizeof(*dput_fetch)); uint64_t hmem_device; - enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device); + uint64_t hmem_handle; + enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device, &hmem_handle); - // rbuf_iface & rbuf_hmem are contained in the rma_request that + // rbuf_iface and rbuf_hmem are contained in the rma_request that // resides in the originating endpoint, so can just be set to - // system/0 here. - union fi_opx_hfi1_dput_iov dput_iov = {.sbuf = mr_offset, - .rbuf = dput_fetch->fetch_rbuf, - .bytes = bytes - sizeof(struct fi_opx_hfi1_dput_fetch), - .rbuf_iface = FI_HMEM_SYSTEM, - .sbuf_iface = hmem_iface, - .rbuf_device = 0, - .sbuf_device = hmem_device}; + // system/0/OPX_HMEM_NO_HANDLE here. 
+ union opx_hfi1_dput_iov dput_iov = {.sbuf = mr_offset, + .rbuf = dput_fetch->fetch_rbuf, + .bytes = bytes - sizeof(struct fi_opx_hfi1_dput_fetch), + .rbuf_iface = FI_HMEM_SYSTEM, + .sbuf_iface = hmem_iface, + .rbuf_device = 0, + .sbuf_device = hmem_device, + .sbuf_handle = hmem_handle}; assert(dput_iov.bytes <= FI_OPX_HFI1_PACKET_MTU - sizeof(*dput_fetch)); assert(hdr->dput.target.op != (FI_NOOP - 1)); assert(hdr->dput.target.dt != (FI_VOID - 1)); // Do the FETCH part of this atomic fetch operation union fi_opx_hfi1_deferred_work *work = FI_OPX_FABRIC_RX_RZV_CTS( - opx_ep, opx_mr, hdr, (const void *const) payload, bytes, u8_rx, 1, &dput_iov, + opx_ep, hdr, (const void *const) payload, bytes, u8_rx, 1, &dput_iov, opx_mr->base_addr, hdr->dput.target.op, hdr->dput.target.dt, dput_fetch->rma_request_vaddr, (uintptr_t) NULL, /* target byte counter is in rma_request */ NULL, /* No origin byte counter here */ @@ -2390,25 +2397,27 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep *opx_ep, const union #endif assert(bytes > sizeof(*dput_fetch)); uint64_t hmem_device; - enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device); + uint64_t hmem_handle; + enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(opx_mr, &hmem_device, &hmem_handle); - // rbuf_iface & rbuf_hmem are contained in the rma_request that + // rbuf_iface and rbuf_hmem are contained in the rma_request that // resides in the originating endpoint, so can just be set to - // system/0 here. - union fi_opx_hfi1_dput_iov dput_iov = {.sbuf = mr_offset, - .rbuf = dput_fetch->fetch_rbuf, - .bytes = (bytes - sizeof(struct fi_opx_hfi1_dput_fetch)) >> 1, - .rbuf_iface = FI_HMEM_SYSTEM, - .sbuf_iface = hmem_iface, - .rbuf_device = 0, - .sbuf_device = hmem_device}; + // system/0/OPX_HMEM_NO_HANDLE here. 
+ union opx_hfi1_dput_iov dput_iov = {.sbuf = mr_offset, + .rbuf = dput_fetch->fetch_rbuf, + .bytes = (bytes - sizeof(struct fi_opx_hfi1_dput_fetch)) >> 1, + .rbuf_iface = FI_HMEM_SYSTEM, + .sbuf_iface = hmem_iface, + .rbuf_device = 0, + .sbuf_device = hmem_device, + .sbuf_handle = hmem_handle}; assert(dput_iov.bytes <= ((FI_OPX_HFI1_PACKET_MTU - sizeof(*dput_fetch)) >> 1)); assert(hdr->dput.target.op != (FI_NOOP - 1)); assert(hdr->dput.target.dt != (FI_VOID - 1)); // Do the FETCH part of this atomic fetch operation union fi_opx_hfi1_deferred_work *work = FI_OPX_FABRIC_RX_RZV_CTS( - opx_ep, opx_mr, hdr, (const void *const) payload, bytes, u8_rx, 1, &dput_iov, + opx_ep, hdr, (const void *const) payload, bytes, u8_rx, 1, &dput_iov, opx_mr->base_addr, hdr->dput.target.op, hdr->dput.target.dt, dput_fetch->rma_request_vaddr, (uintptr_t) NULL, /* Target completion counter is in rma request */ NULL, /* No origin byte counter here */ @@ -3241,13 +3250,14 @@ ssize_t fi_opx_ep_rx_recv_internal(struct fi_opx_ep *opx_ep, void *buf, size_t l #ifdef OPX_HMEM uint64_t hmem_device; - enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device); + uint64_t hmem_handle; + enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device, &hmem_handle); if (hmem_iface != FI_HMEM_SYSTEM) { FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_MSG, opx_ep->debug_counters.hmem.posted_recv_msg); FI_OPX_DEBUG_COUNTERS_INC_COND(static_flags & FI_TAGGED, opx_ep->debug_counters.hmem.posted_recv_tag); hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; - hmem_info->hmem_dev_reg_handle = ((struct fi_opx_mr *) desc)->hmem_dev_reg_handle; + hmem_info->hmem_dev_reg_handle = hmem_handle; context->flags |= FI_OPX_CQ_CONTEXT_HMEM; @@ -3358,11 +3368,12 @@ static inline ssize_t fi_opx_ep_rx_recvmsg_internal(struct fi_opx_ep *opx_ep, co struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) &context->hmem_info_qws[0]; uint64_t hmem_device; enum fi_hmem_iface 
hmem_iface; + uint64_t hmem_handle; if (msg->desc && msg->desc[0]) { - hmem_iface = opx_hmem_get_mr_iface(msg->desc[0], &hmem_device); + hmem_iface = opx_hmem_get_mr_iface(msg->desc[0], &hmem_device, &hmem_handle); hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; - hmem_info->hmem_dev_reg_handle = ((struct fi_opx_mr *) msg->desc[0])->hmem_dev_reg_handle; + hmem_info->hmem_dev_reg_handle = hmem_handle; hmem_info->is_unified = ((struct fi_opx_mr *) msg->desc[0])->hmem_unified; } else { hmem_iface = FI_HMEM_SYSTEM; @@ -3376,8 +3387,9 @@ static inline ssize_t fi_opx_ep_rx_recvmsg_internal(struct fi_opx_ep *opx_ep, co if (msg->iov_count > 1) { for (int i = 1; i < msg->iov_count; ++i) { uint64_t tmp_hmem_device; + uint64_t hmem_handle; enum fi_hmem_iface tmp_hmem_iface = - opx_hmem_get_mr_iface(msg->desc ? msg->desc[i] : NULL, &tmp_hmem_device); + opx_hmem_get_mr_iface(msg->desc ? msg->desc[i] : NULL, &tmp_hmem_device, &hmem_handle); assert(tmp_hmem_iface == hmem_iface); assert(tmp_hmem_device == hmem_device); } @@ -3501,12 +3513,12 @@ void fi_opx_ep_tx_cq_completion_rzv(struct fid_ep *ep, void *context, const size } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_try_mp_egr(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_send_try_mp_egr(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface 
hmem_iface, const uint64_t hmem_device, + const uint64_t hmem_handle, const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = {.fi = dest_addr}; @@ -3526,10 +3538,10 @@ ssize_t fi_opx_hfi1_tx_send_try_mp_egr(struct fid_ep *ep, const void *buf, size_ uint8_t *buf_bytes_ptr = (uint8_t *) buf; ssize_t rc; - rc = fi_opx_hfi1_tx_send_mp_egr_first_common(opx_ep, (void **) &buf_bytes_ptr, len, desc, opx_ep->hmem_copy_buf, - pbc_dlid, bth_rx, lrh_dlid, addr, tag, data, lock_required, - tx_op_flags, caps, reliability, &first_packet_psn, hmem_iface, - hmem_device, hfi1_type); + rc = opx_hfi1_tx_send_mp_egr_first_common(opx_ep, (void **) &buf_bytes_ptr, len, opx_ep->hmem_copy_buf, + pbc_dlid, bth_rx, lrh_dlid, addr, tag, data, lock_required, + tx_op_flags, caps, reliability, &first_packet_psn, hmem_iface, + hmem_device, hmem_handle, hfi1_type); if (rc != FI_SUCCESS) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.mp_eager.send_fall_back_to_rzv); @@ -3674,24 +3686,25 @@ ssize_t fi_opx_hfi1_tx_send_try_mp_egr(struct fid_ep *ep, const void *buf, size_ #endif __OPX_FORCE_INLINE__ -ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, const void *buf, size_t len, void *desc, - const union fi_opx_addr addr, uint64_t tag, void *context, - const struct iovec *local_iov, size_t niov, size_t total_len, const uint32_t data, - const int lock_required, const unsigned is_contiguous, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, - const bool mp_eager_fallback, const enum opx_hfi1_type hfi1_type) +ssize_t opx_ep_tx_send_try_eager(struct fid_ep *ep, const void *buf, size_t len, const union fi_opx_addr addr, + uint64_t tag, void *context, const struct iovec *local_iov, size_t niov, + size_t total_len, const uint32_t data, const 
int lock_required, + const unsigned is_contiguous, const unsigned override_flags, + const uint64_t tx_op_flags, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, + const uint64_t hmem_handle, const bool mp_eager_fallback, + const enum opx_hfi1_type hfi1_type) { ssize_t rc; if (is_contiguous) { - rc = FI_OPX_FABRIC_TX_SEND_EGR(ep, buf, len, desc, addr.fi, tag, context, data, lock_required, - override_flags, tx_op_flags, addr.hfi1_subctxt_rx, caps, reliability, - do_cq_completion, hmem_iface, hmem_device, hfi1_type); + rc = OPX_FABRIC_TX_SEND_EGR(ep, buf, len, addr.fi, tag, context, data, lock_required, override_flags, + tx_op_flags, addr.hfi1_subctxt_rx, caps, reliability, do_cq_completion, + hmem_iface, hmem_device, hmem_handle, hfi1_type); } else { - rc = FI_OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, desc, addr.fi, tag, context, data, - lock_required, override_flags, tx_op_flags, addr.hfi1_subctxt_rx, caps, - reliability, do_cq_completion, hmem_iface, hmem_device, hfi1_type); + rc = OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, addr.fi, tag, context, data, lock_required, + override_flags, tx_op_flags, addr.hfi1_subctxt_rx, caps, reliability, + do_cq_completion, hmem_iface, hmem_device, hmem_handle, hfi1_type); } if (OFI_LIKELY(rc == FI_SUCCESS)) { return rc; @@ -3719,15 +3732,15 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, const void *buf, size_t l uint64_t loop = 0; do { if (is_contiguous) { - rc = FI_OPX_FABRIC_TX_SEND_EGR(ep, buf, len, desc, addr.fi, tag, context, data, lock_required, - override_flags, tx_op_flags, addr.hfi1_subctxt_rx, caps, - reliability, do_cq_completion, hmem_iface, hmem_device, - hfi1_type); + rc = OPX_FABRIC_TX_SEND_EGR(ep, buf, len, addr.fi, tag, context, data, lock_required, + override_flags, tx_op_flags, addr.hfi1_subctxt_rx, caps, + reliability, do_cq_completion, hmem_iface, 
hmem_device, hmem_handle, + hfi1_type); } else { - rc = FI_OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, desc, addr.fi, tag, context, - data, lock_required, override_flags, tx_op_flags, - addr.hfi1_subctxt_rx, caps, reliability, do_cq_completion, - hmem_iface, hmem_device, hfi1_type); + rc = OPX_FABRIC_TX_SENDV_EGR(ep, local_iov, niov, total_len, addr.fi, tag, context, data, + lock_required, override_flags, tx_op_flags, addr.hfi1_subctxt_rx, + caps, reliability, do_cq_completion, hmem_iface, hmem_device, + hmem_handle, hfi1_type); } fi_opx_ep_rx_poll(ep, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type); } while (rc == -FI_ENOBUFS && loop++ < FI_OPX_EP_TX_SEND_EAGER_MAX_RETRIES); @@ -3736,28 +3749,28 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, const void *buf, size_t l } __OPX_FORCE_INLINE__ -ssize_t fi_opx_ep_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, void *desc, const union fi_opx_addr addr, - uint64_t tag, void *context, const struct iovec *local_iov, size_t niov, size_t total_len, - const uint32_t data, const int lock_required, const unsigned is_contiguous, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) +ssize_t opx_ep_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, const union fi_opx_addr addr, uint64_t tag, + void *context, const struct iovec *local_iov, size_t niov, size_t total_len, + const uint32_t data, const int lock_required, const unsigned is_contiguous, + const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = 
container_of(ep, struct fi_opx_ep, ep_fid); ssize_t rc; do { if (is_contiguous) { - rc = FI_OPX_FABRIC_TX_SEND_RZV(ep, buf, len, desc, addr.fi, tag, context, data, lock_required, - override_flags, tx_op_flags, addr.hfi1_subctxt_rx, caps, - reliability, do_cq_completion, hmem_iface, hmem_device, - hfi1_type); + rc = OPX_FABRIC_TX_SEND_RZV(ep, buf, len, addr.fi, tag, context, data, lock_required, + override_flags, tx_op_flags, addr.hfi1_subctxt_rx, caps, + reliability, do_cq_completion, hmem_iface, hmem_device, hmem_handle, + hfi1_type); } else { - rc = FI_OPX_FABRIC_TX_SENDV_RZV(ep, local_iov, niov, total_len, desc, addr.fi, tag, context, - data, lock_required, override_flags, tx_op_flags, - addr.hfi1_subctxt_rx, caps, reliability, do_cq_completion, - hmem_iface, hmem_device, hfi1_type); + rc = OPX_FABRIC_TX_SENDV_RZV(ep, local_iov, niov, total_len, addr.fi, tag, context, data, + lock_required, override_flags, tx_op_flags, addr.hfi1_subctxt_rx, + caps, reliability, do_cq_completion, hmem_iface, hmem_device, + hmem_handle, hfi1_type); } if (OFI_UNLIKELY(rc == -EAGAIN)) { @@ -3780,7 +3793,8 @@ static inline ssize_t fi_opx_ep_tx_send_internal(struct fid_ep *ep, const void * OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND"); uint64_t hmem_device; - enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device); + uint64_t hmem_handle; + enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device, &hmem_handle); assert(is_contiguous == OPX_CONTIG_FALSE || is_contiguous == OPX_CONTIG_TRUE); @@ -3836,10 +3850,10 @@ static inline ssize_t fi_opx_ep_tx_send_internal(struct fid_ep *ep, const void * const bool mp_eager_fallback = (total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE(hfi1_type) && total_len <= opx_ep->tx->mp_eager_max_payload_bytes); if (total_len <= opx_ep->tx->pio_max_eager_tx_bytes) { - rc = fi_opx_ep_tx_send_try_eager(ep, buf, len, desc, addr, tag, context, local_iov, niov, - total_len, data, lock_required, is_contiguous, override_flags, - 
tx_op_flags, caps, reliability, do_cq_completion, hmem_iface, - hmem_device, mp_eager_fallback, hfi1_type); + rc = opx_ep_tx_send_try_eager(ep, buf, len, addr, tag, context, local_iov, niov, total_len, + data, lock_required, is_contiguous, override_flags, tx_op_flags, + caps, reliability, do_cq_completion, hmem_iface, hmem_device, + hmem_handle, mp_eager_fallback, hfi1_type); if (OFI_LIKELY(rc == FI_SUCCESS)) { OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -3856,9 +3870,10 @@ static inline ssize_t fi_opx_ep_tx_send_internal(struct fid_ep *ep, const void * /* If hmem_iface != FI_HMEM_SYSTEM, we skip MP EGR because RZV yields better performance for devices */ if (is_contiguous && mp_eager_fallback && !fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps) && (caps & FI_TAGGED) && hmem_iface == FI_HMEM_SYSTEM) { - rc = fi_opx_hfi1_tx_send_try_mp_egr( - ep, buf, len, desc, addr.fi, tag, context, data, lock_required, override_flags, - tx_op_flags, caps, reliability, do_cq_completion, FI_HMEM_SYSTEM, 0ul, hfi1_type); + rc = opx_hfi1_tx_send_try_mp_egr(ep, buf, len, addr.fi, tag, context, data, lock_required, + override_flags, tx_op_flags, caps, reliability, + do_cq_completion, FI_HMEM_SYSTEM, 0ul, OPX_HMEM_NO_HANDLE, + hfi1_type); if (OFI_LIKELY(rc == FI_SUCCESS)) { OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -3882,9 +3897,9 @@ static inline ssize_t fi_opx_ep_tx_send_internal(struct fid_ep *ep, const void * } } - rc = fi_opx_ep_tx_send_rzv(ep, buf, len, desc, addr, tag, context, local_iov, niov, total_len, data, - lock_required, is_contiguous, override_flags, tx_op_flags, caps, reliability, - do_cq_completion, hmem_iface, hmem_device, hfi1_type); + rc = opx_ep_tx_send_rzv(ep, buf, len, addr, tag, context, local_iov, niov, total_len, data, lock_required, + is_contiguous, override_flags, tx_op_flags, caps, reliability, do_cq_completion, + hmem_iface, hmem_device, 
hmem_handle, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND (end)\n"); diff --git a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h index c6b1ae82839..7f1a759632b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_fabric_transport.h @@ -39,10 +39,10 @@ #include "rdma/opx/fi_opx_hfi1_transport.h" #define FI_OPX_FABRIC_TX_INJECT fi_opx_hfi1_tx_inject -#define FI_OPX_FABRIC_TX_SEND_EGR fi_opx_hfi1_tx_send_egr_select -#define FI_OPX_FABRIC_TX_SENDV_EGR fi_opx_hfi1_tx_sendv_egr_select -#define FI_OPX_FABRIC_TX_SEND_RZV fi_opx_hfi1_tx_send_rzv_select -#define FI_OPX_FABRIC_TX_SENDV_RZV fi_opx_hfi1_tx_sendv_rzv +#define OPX_FABRIC_TX_SEND_EGR opx_hfi1_tx_send_egr_select +#define OPX_FABRIC_TX_SENDV_EGR opx_hfi1_tx_sendv_egr_select +#define OPX_FABRIC_TX_SEND_RZV opx_hfi1_tx_send_rzv_select +#define OPX_FABRIC_TX_SENDV_RZV opx_hfi1_tx_sendv_rzv #define FI_OPX_FABRIC_RX_RZV_RTS fi_opx_hfi1_rx_rzv_rts #define FI_OPX_FABRIC_RX_RZV_RTS_ETRUNC fi_opx_hfi1_rx_rzv_rts_etrunc #define FI_OPX_FABRIC_RX_RZV_CTS fi_opx_hfi1_rx_rzv_cts diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h index 5d7f4050af8..145c83202d6 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_inlines.h @@ -43,8 +43,8 @@ size_t opx_hfi1_dput_write_header_and_payload_put(struct fi_opx_ep *opx_ep, unio const uint64_t op64, const uint64_t dt64, const size_t payload_bytes, const uint64_t key_or_rma_req, uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf, const enum opx_hfi1_type hfi1_type, - const uint32_t opcode) + const uint64_t sbuf_handle, uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type, const uint32_t opcode) { if (hfi1_type & 
(OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | opcode | (dt64 << 16) | (op64 << 24) | @@ -60,8 +60,8 @@ size_t opx_hfi1_dput_write_header_and_payload_put(struct fi_opx_ep *opx_ep, unio if (tx_payload) { assert(!iov); - OPX_HMEM_COPY_FROM((void *) tx_payload, (const void *) *sbuf, payload_bytes, OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, sbuf_iface, sbuf_device); + OPX_HMEM_COPY_FROM((void *) tx_payload, (const void *) *sbuf, payload_bytes, sbuf_handle, + OPX_HMEM_DEV_REG_SEND_THRESHOLD, sbuf_iface, sbuf_device); } else { assert(iov); iov->iov_base = (void *) *sbuf; @@ -217,7 +217,8 @@ size_t opx_hfi1_dput_write_header_and_payload_get(struct fi_opx_ep *opx_ep, unio const uint64_t dt64, const size_t payload_bytes, const uintptr_t rma_request_vaddr, uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uintptr_t *rbuf, const enum opx_hfi1_type hfi1_type) + const uint64_t sbuf_handle, uintptr_t *rbuf, + const enum opx_hfi1_type hfi1_type) { if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { hdr->qw_9B[4] = opx_ep->rx->tx.dput_9B.hdr.qw_9B[4] | FI_OPX_HFI_DPUT_OPCODE_GET | (dt64 << 16) | @@ -234,8 +235,8 @@ size_t opx_hfi1_dput_write_header_and_payload_get(struct fi_opx_ep *opx_ep, unio if (tx_payload) { assert(!iov); if (dt64 == (FI_VOID - 1)) { - OPX_HMEM_COPY_FROM((void *) tx_payload, (const void *) *sbuf, payload_bytes, OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, sbuf_iface, sbuf_device); + OPX_HMEM_COPY_FROM((void *) tx_payload, (const void *) *sbuf, payload_bytes, sbuf_handle, + OPX_HMEM_DEV_REG_SEND_THRESHOLD, sbuf_iface, sbuf_device); } else { OPX_HMEM_ATOMIC_DISPATCH((void *) *sbuf, (void *) tx_payload, payload_bytes, dt64, FI_ATOMIC_WRITE, sbuf_iface, sbuf_device); @@ -258,8 +259,8 @@ size_t opx_hfi1_dput_write_header_and_payload_rzv(struct fi_opx_ep *opx_ep, unio const uint64_t op64, const uint64_t dt64, const size_t payload_bytes, const uint32_t opcode, 
const uintptr_t target_byte_counter_vaddr, uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, - const uint64_t sbuf_device, uintptr_t *rbuf, - enum opx_hfi1_type hfi1_type) + const uint64_t sbuf_device, const uint64_t sbuf_handle, + uintptr_t *rbuf, enum opx_hfi1_type hfi1_type) { if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_JKR_9B)) { hdr->qw_9B[4] = opx_ep->rx->tx.rzv_dput_9B.hdr.qw_9B[4] | (opcode) | (payload_bytes << 48); @@ -273,8 +274,8 @@ size_t opx_hfi1_dput_write_header_and_payload_rzv(struct fi_opx_ep *opx_ep, unio if (tx_payload) { assert(!iov); - OPX_HMEM_COPY_FROM((void *) tx_payload, (const void *) *sbuf, payload_bytes, OPX_HMEM_NO_HANDLE, - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, sbuf_iface, sbuf_device); + OPX_HMEM_COPY_FROM((void *) tx_payload, (const void *) *sbuf, payload_bytes, sbuf_handle, + OPX_HMEM_DEV_REG_SEND_THRESHOLD, sbuf_iface, sbuf_device); } else { assert(iov); iov->iov_base = (void *) *sbuf; @@ -294,9 +295,9 @@ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, union opx_hfi1_packe const uint64_t bth_rx, const size_t payload_bytes, const uint64_t key, const uint64_t fetch_vaddr, const uintptr_t target_byte_counter_vaddr, const uintptr_t rma_request_vaddr, uint64_t bytes_sent, uint8_t **sbuf, - const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, uint8_t **cbuf, - const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, uintptr_t *rbuf, - const enum opx_hfi1_type hfi1_type) + const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, + const uint64_t sbuf_handle, uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, + const uint64_t cbuf_device, uintptr_t *rbuf, const enum opx_hfi1_type hfi1_type) { uint64_t psn = (uint64_t) htonl((uint32_t) psn_orig); @@ -341,25 +342,25 @@ size_t opx_hfi1_dput_write_packet(struct fi_opx_ep *opx_ep, union opx_hfi1_packe case FI_OPX_HFI_DPUT_OPCODE_RZV: case FI_OPX_HFI_DPUT_OPCODE_RZV_TID: case FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG: - return 
opx_hfi1_dput_write_header_and_payload_rzv(opx_ep, hdr, tx_payload, iov, op64, dt64, - payload_bytes, opcode, target_byte_counter_vaddr, - sbuf, sbuf_iface, sbuf_device, rbuf, hfi1_type); + return opx_hfi1_dput_write_header_and_payload_rzv( + opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, opcode, target_byte_counter_vaddr, + sbuf, sbuf_iface, sbuf_device, sbuf_handle, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_GET: return opx_hfi1_dput_write_header_and_payload_get(opx_ep, hdr, tx_payload, iov, dt64, payload_bytes, rma_request_vaddr, sbuf, sbuf_iface, sbuf_device, - rbuf, hfi1_type); + sbuf_handle, rbuf, hfi1_type); break; case FI_OPX_HFI_DPUT_OPCODE_PUT: return opx_hfi1_dput_write_header_and_payload_put(opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, key, sbuf, sbuf_iface, sbuf_device, - rbuf, hfi1_type, opcode); + sbuf_handle, rbuf, hfi1_type, opcode); break; case FI_OPX_HFI_DPUT_OPCODE_PUT_CQ: return opx_hfi1_dput_write_header_and_payload_put( opx_ep, hdr, tx_payload, iov, op64, dt64, payload_bytes, target_byte_counter_vaddr, /* this is the remote rma_request */ - sbuf, sbuf_iface, sbuf_device, rbuf, hfi1_type, opcode); + sbuf, sbuf_iface, sbuf_device, sbuf_handle, rbuf, hfi1_type, opcode); break; case FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH: return opx_hfi1_dput_write_header_and_payload_atomic_fetch( @@ -384,13 +385,13 @@ size_t opx_hfi1_dput_write_header_and_payload( const uint64_t lrh_dlid, const uint64_t bth_rx, const size_t payload_bytes, const uint64_t key, const uint64_t fetch_vaddr, const uintptr_t target_byte_counter_vaddr, const uintptr_t rma_request_vaddr, uint64_t bytes_sent, uint8_t **sbuf, const enum fi_hmem_iface sbuf_iface, const uint64_t sbuf_device, - uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, uintptr_t *rbuf, - const enum opx_hfi1_type hfi1_type) + const uint64_t sbuf_handle, uint8_t **cbuf, const enum fi_hmem_iface cbuf_iface, const uint64_t cbuf_device, + uintptr_t *rbuf, const 
enum opx_hfi1_type hfi1_type) { return opx_hfi1_dput_write_packet(opx_ep, hdr, tx_payload, NULL, opcode, psn_orig, lrh_dws, op64, dt64, lrh_dlid, bth_rx, payload_bytes, key, fetch_vaddr, target_byte_counter_vaddr, - rma_request_vaddr, bytes_sent, sbuf, sbuf_iface, sbuf_device, cbuf, - cbuf_iface, cbuf_device, rbuf, hfi1_type); + rma_request_vaddr, bytes_sent, sbuf, sbuf_iface, sbuf_device, sbuf_handle, + cbuf, cbuf_iface, cbuf_device, rbuf, hfi1_type); } __OPX_FORCE_INLINE__ @@ -408,7 +409,7 @@ size_t opx_hfi1_dput_write_header_and_iov(struct fi_opx_ep *opx_ep, union opx_hf */ return opx_hfi1_dput_write_packet(opx_ep, hdr, NULL, iov, opcode, 0, lrh_dws, op64, dt64, lrh_dlid, bth_rx, payload_bytes, key, fetch_vaddr, target_byte_counter_vaddr, rma_request_vaddr, - bytes_sent, sbuf, FI_HMEM_SYSTEM, 0ul, cbuf, FI_HMEM_SYSTEM, 0ul, rbuf, - hfi1_type); + bytes_sent, sbuf, FI_HMEM_SYSTEM, 0ul, OPX_HMEM_NO_HANDLE, cbuf, + FI_HMEM_SYSTEM, 0ul, rbuf, hfi1_type); } #endif diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index 6bfe0dbeb20..fa98b61c05b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -1355,8 +1355,8 @@ union cacheline { uint8_t byte[64]; }; -union fi_opx_hfi1_dput_iov { - uint64_t qw[6]; +union opx_hfi1_dput_iov { + uint64_t qw[8]; struct { uintptr_t rbuf; uintptr_t sbuf; @@ -1365,6 +1365,8 @@ union fi_opx_hfi1_dput_iov { uint64_t sbuf_device; enum fi_hmem_iface rbuf_iface; enum fi_hmem_iface sbuf_iface; + uint64_t pad; + uint64_t sbuf_handle; }; }; @@ -1400,10 +1402,10 @@ struct fi_opx_hmem_iov { } __attribute__((__packed__)); #define FI_OPX_MAX_HMEM_IOV ((FI_OPX_HFI1_PACKET_MTU - sizeof(uintptr_t)) / sizeof(struct fi_opx_hmem_iov)) -#define FI_OPX_MAX_DPUT_IOV ((FI_OPX_HFI1_PACKET_MTU / sizeof(union fi_opx_hfi1_dput_iov) - 4) + 3) +#define FI_OPX_MAX_DPUT_IOV ((FI_OPX_HFI1_PACKET_MTU / sizeof(union opx_hfi1_dput_iov) - 4) + 3) #define 
FI_OPX_MAX_DPUT_TIDPAIRS \ - ((FI_OPX_HFI1_PACKET_MTU - sizeof(union fi_opx_hfi1_dput_iov) - (4 * sizeof(uint32_t))) / sizeof(uint32_t)) + ((FI_OPX_HFI1_PACKET_MTU - sizeof(union opx_hfi1_dput_iov) - (4 * sizeof(uint32_t))) / sizeof(uint32_t)) #define OPX_IMMEDIATE_BYTE_COUNT_SHIFT (5) #define OPX_IMMEDIATE_BYTE_COUNT_MASK (0xE0) @@ -1494,23 +1496,23 @@ union fi_opx_hfi1_packet_payload { } rendezvous; struct { - union fi_opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; + union opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; } rma_rts; struct { - union fi_opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; + union opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; } cts; /* tid_cts extends cts*/ struct { /* ==== CACHE LINE 0 ==== */ - union fi_opx_hfi1_dput_iov iov[1]; - uint32_t tid_offset; - uint32_t ntidpairs; - int32_t origin_byte_counter_adjust; - uint32_t unused; - + union opx_hfi1_dput_iov iov[1]; /* ==== CACHE LINE 1 ==== */ + uint32_t tid_offset; + uint32_t ntidpairs; + int32_t origin_byte_counter_adjust; + uint32_t unused; + uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; } tid_cts; } __attribute__((__aligned__(32))); diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index cce7ee7e2e5..c80f7c987df 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -682,9 +682,9 @@ void fi_opx_hfi1_rx_rzv_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packe const enum opx_hfi1_type hfi1_type); union fi_opx_hfi1_deferred_work * -fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, struct fi_opx_mr *opx_mr, const union opx_hfi1_packet_hdr *const hdr, - const void *const payload, size_t payload_bytes_to_copy, const uint8_t u8_rx, - const uint32_t niov, const union fi_opx_hfi1_dput_iov *const dput_iov, const uint8_t op, +fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr *const hdr, const void *const payload, + size_t payload_bytes_to_copy, 
const uint8_t u8_rx, const uint32_t niov, + const union opx_hfi1_dput_iov *const dput_iov, uint8_t *src_base_addr, const uint8_t op, const uint8_t dt, const uintptr_t rma_request_vaddr, const uintptr_t target_byte_counter_vaddr, uint64_t *origin_byte_counter, uint32_t op_kind, void (*completion_action)(union fi_opx_hfi1_deferred_work *work_state), @@ -705,10 +705,10 @@ struct fi_opx_work_elem { struct fi_opx_hfi1_dput_params { struct fi_opx_work_elem work_elem; struct fi_opx_ep *opx_ep; - struct fi_opx_mr *opx_mr; + uint8_t *src_base_addr; uint64_t lrh_dlid; uint64_t pbc_dlid; - union fi_opx_hfi1_dput_iov *dput_iov; + union opx_hfi1_dput_iov *dput_iov; void *fetch_vaddr; void *compare_vaddr; struct fi_opx_completion_counter *cc; @@ -752,19 +752,20 @@ struct fi_opx_hfi1_dput_params { struct fi_opx_hmem_iov compare_iov; uint8_t inject_data[FI_OPX_HFI1_PACKET_IMM]; - uint32_t unused_padding; + uint32_t unused_padding[3]; /* Either FI_OPX_MAX_DPUT_IOV iov's or 1 iov and FI_OPX_MAX_DPUT_TIDPAIRS tidpairs */ union { - union fi_opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; + union opx_hfi1_dput_iov iov[FI_OPX_MAX_DPUT_IOV]; struct { - union fi_opx_hfi1_dput_iov reserved; /* skip 1 iov */ - uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; + union opx_hfi1_dput_iov reserved; /* skip 1 iov */ + uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; }; }; } __attribute__((__aligned__(L2_CACHE_LINE_SIZE))) __attribute__((__packed__)); OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_hfi1_dput_params, compare_iov) & 7) == 0, "compare_iov not 8-byte aligned!"); +OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_hfi1_dput_params, iov) & 63) == 0, "iov not 64-byte aligned!"); struct fi_opx_hfi1_rx_rzv_rts_params { /* == CACHE LINE 0 == */ @@ -806,18 +807,21 @@ struct fi_opx_hfi1_rx_rzv_rts_params { uint32_t offset; int32_t origin_byte_counter_adj; } tid_info; + uint64_t unused2[3]; - union fi_opx_hfi1_dput_iov elided_head; - union fi_opx_hfi1_dput_iov elided_tail; + /* == CACHE LINE 3 == */ + union 
opx_hfi1_dput_iov elided_head; + /* == CACHE LINE 4 == */ + union opx_hfi1_dput_iov elided_tail; /* Either FI_OPX_MAX_DPUT_IOV iov's or 1 iov and FI_OPX_MAX_DPUT_TIDPAIRS tidpairs */ + /* == CACHE LINE 5 == */ union { - union fi_opx_hfi1_dput_iov dput_iov[FI_OPX_MAX_DPUT_IOV]; + union opx_hfi1_dput_iov dput_iov[FI_OPX_MAX_DPUT_IOV]; struct { - union fi_opx_hfi1_dput_iov reserved; /* skip 1 iov */ - uint64_t pad_to_16b_boundary; - uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; + union opx_hfi1_dput_iov reserved; /* skip 1 iov */ + uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; }; }; } __attribute__((__aligned__(L2_CACHE_LINE_SIZE))) __attribute__((__packed__)); @@ -825,12 +829,12 @@ OPX_COMPILE_TIME_ASSERT(sizeof(((struct fi_opx_hfi1_rx_rzv_rts_params *) 0)->dpu "sizeof(fi_opx_hfi1_rx_rzv_rts_params->dput_iov) should be < MAX PACKET MTU!"); OPX_COMPILE_TIME_ASSERT( sizeof(((struct fi_opx_hfi1_rx_rzv_rts_params *) 0)->tidpairs) < - (FI_OPX_HFI1_PACKET_MTU - sizeof(union fi_opx_hfi1_dput_iov)), + (FI_OPX_HFI1_PACKET_MTU - sizeof(union opx_hfi1_dput_iov)), "sizeof(fi_opx_hfi1_rx_rzv_rts_params->tidpairs) should be < (MAX PACKET MTU - sizeof(dput iov)!"); OPX_COMPILE_TIME_ASSERT((offsetof(struct fi_opx_hfi1_rx_rzv_rts_params, tidpairs) & 0xF) == 0, - "offsetof(fi_opx_hfi1_rx_rma_rts_params->tidpairs) should be 16-byte aligned!"); + "offsetof(opx_hfi1_rx_rma_rts_params->tidpairs) should be 16-byte aligned!"); -struct fi_opx_hfi1_rx_rma_rts_params { +struct opx_hfi1_rx_rma_rts_params { /* == CACHE LINE 0 == */ struct fi_opx_work_elem work_elem; // 40 bytes struct fi_opx_ep *opx_ep; @@ -857,12 +861,12 @@ struct fi_opx_hfi1_rx_rma_rts_params { uint8_t unused[4]; /* == CACHE LINE 2 == */ - union fi_opx_hfi1_dput_iov dput_iov[FI_OPX_MAX_DPUT_IOV]; + union opx_hfi1_dput_iov dput_iov[FI_OPX_MAX_DPUT_IOV]; } __attribute__((__aligned__(L2_CACHE_LINE_SIZE))) __attribute__((__packed__)); -OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_hfi1_rx_rma_rts_params, dput_iov) == 
FI_OPX_CACHE_LINE_SIZE * 2, - "fi_opx_hfi1_rx_rma_rts_params->dput_iov should start on cacheline 2!"); -OPX_COMPILE_TIME_ASSERT(sizeof(((struct fi_opx_hfi1_rx_rma_rts_params *) 0)->dput_iov) < FI_OPX_HFI1_PACKET_MTU, - "sizeof(fi_opx_hfi1_rx_rma_rts_params->dput_iov) should be < MAX PACKET MTU!"); +OPX_COMPILE_TIME_ASSERT(offsetof(struct opx_hfi1_rx_rma_rts_params, dput_iov) == FI_OPX_CACHE_LINE_SIZE * 2, + "opx_hfi1_rx_rma_rts_params->dput_iov should start on cacheline 2!"); +OPX_COMPILE_TIME_ASSERT(sizeof(((struct opx_hfi1_rx_rma_rts_params *) 0)->dput_iov) < FI_OPX_HFI1_PACKET_MTU, + "sizeof(opx_hfi1_rx_rma_rts_params->dput_iov) should be < MAX PACKET MTU!"); struct fi_opx_hfi1_rx_dput_fence_params { struct fi_opx_work_elem work_elem; @@ -880,23 +884,23 @@ struct fi_opx_hfi1_rx_readv_params { struct fi_opx_work_elem work_elem; struct fi_opx_ep *opx_ep; struct fi_opx_rma_request *rma_request; - union fi_opx_hfi1_dput_iov dput_iov; size_t niov; + union opx_hfi1_dput_iov dput_iov; union fi_opx_addr opx_target_addr; struct fi_opx_completion_counter *cc; uint64_t key; uint64_t dest_rx; uint32_t u32_extended_rx; + uint32_t opcode; uint64_t lrh_dlid; uint64_t bth_rx; uint64_t pbc_dws; uint64_t lrh_dws; uint64_t op; uint64_t dt; - uint32_t opcode; + uint64_t pbc_dlid; enum ofi_reliability_kind reliability; bool is_intranode; - uint64_t pbc_dlid; } __attribute__((__aligned__(L2_CACHE_LINE_SIZE))) __attribute__((__packed__)); union fi_opx_hfi1_deferred_work { @@ -905,7 +909,7 @@ union fi_opx_hfi1_deferred_work { struct fi_opx_hfi1_rx_rzv_rts_params rx_rzv_rts; struct fi_opx_hfi1_rx_dput_fence_params fence; struct fi_opx_hfi1_rx_readv_params readv; - struct fi_opx_hfi1_rx_rma_rts_params rx_rma_rts; + struct opx_hfi1_rx_rma_rts_params rx_rma_rts; } __attribute__((__aligned__(L2_CACHE_LINE_SIZE))) __attribute__((__packed__)); int opx_hfi1_do_dput_fence(union fi_opx_hfi1_deferred_work *work); @@ -1377,14 +1381,14 @@ ssize_t fi_opx_hfi1_tx_check_credits(struct fi_opx_ep 
*opx_ep, union fi_opx_hfi1 } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, const struct iovec *iov, size_t niov, - const uint16_t lrh_dws, const uint64_t lrh_dlid_9B, const uint64_t bth_rx, - size_t total_len, const size_t payload_qws_total, - const size_t xfer_bytes_tail, void *desc, const union fi_opx_addr *addr, - uint64_t tag, void *context, const uint32_t data, int lock_required, - const uint64_t dest_rx, const uint64_t tx_op_flags, const uint64_t caps, - const uint64_t do_cq_completion, const enum fi_hmem_iface iface, - const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, const struct iovec *iov, size_t niov, const uint16_t lrh_dws, + const uint64_t lrh_dlid_9B, const uint64_t bth_rx, size_t total_len, + const size_t payload_qws_total, const size_t xfer_bytes_tail, + const union fi_opx_addr *addr, uint64_t tag, void *context, const uint32_t data, + int lock_required, const uint64_t dest_rx, const uint64_t tx_op_flags, + const uint64_t caps, const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, const uint64_t hmem_device, + const uint64_t hmem_handle, const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -1417,12 +1421,10 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, const struct iovec bounce buffer, and then proceed as if we only have a single IOV that points to the bounce buffer. 
*/ if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - unsigned iov_total_len = 0; + unsigned iov_total_len = 0; for (int i = 0; i < niov; ++i) { - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, - &opx_ep->hmem_copy_buf[iov_total_len], iov[i].iov_base, iov[i].iov_len, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + opx_copy_from_hmem(iface, hmem_device, hmem_handle, &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); iov_total_len += iov[i].iov_len; } @@ -1491,13 +1493,13 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode(struct fid_ep *ep, const struct iovec } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, - int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, + fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, int lock_required, + const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface iface, + const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -1529,10 +1531,10 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz size_t *niov_ptr = &niov; if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { - 
return fi_opx_hfi1_tx_sendv_egr_intranode(ep, iov, niov, lrh_dws, lrh_dlid_9B, bth_rx, total_len, - payload_qws_total, xfer_bytes_tail, desc, &addr, tag, context, - data, lock_required, dest_rx, tx_op_flags, caps, - do_cq_completion, iface, hmem_device, hfi1_type); + return opx_hfi1_tx_sendv_egr_intranode(ep, iov, niov, lrh_dws, lrh_dlid_9B, bth_rx, total_len, + payload_qws_total, xfer_bytes_tail, &addr, tag, context, data, + lock_required, dest_rx, tx_op_flags, caps, do_cq_completion, + iface, hmem_device, hmem_handle, hfi1_type); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1567,12 +1569,10 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz bounce buffer, and then proceed as if we only have a single IOV that points to the bounce buffer. */ if (iface != FI_HMEM_SYSTEM) { - unsigned iov_total_len = 0; - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; + unsigned iov_total_len = 0; for (int i = 0; i < niov; ++i) { - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, - &opx_ep->hmem_copy_buf[iov_total_len], iov[i].iov_base, iov[i].iov_len, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + opx_copy_from_hmem(iface, hmem_device, hmem_handle, &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); iov_total_len += iov[i].iov_len; } @@ -1649,14 +1649,15 @@ ssize_t fi_opx_hfi1_tx_sendv_egr(struct fid_ep *ep, const struct iovec *iov, siz } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, - const uint16_t lrh_qws, const uint64_t lrh_dlid_16B, - const uint64_t bth_rx, size_t total_len, const size_t payload_qws_total, - const size_t xfer_bytes_tail, void *desc, const union fi_opx_addr *addr, - uint64_t tag, void *context, const uint32_t data, int lock_required, - const uint64_t dest_rx, const uint64_t tx_op_flags, const uint64_t caps, - const uint64_t do_cq_completion, const enum fi_hmem_iface 
iface, - const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, + const uint16_t lrh_qws, const uint64_t lrh_dlid_16B, const uint64_t bth_rx, + size_t total_len, const size_t payload_qws_total, + const size_t xfer_bytes_tail, const union fi_opx_addr *addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const uint64_t dest_rx, const uint64_t tx_op_flags, const uint64_t caps, + const uint64_t do_cq_completion, const enum fi_hmem_iface iface, + const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -1689,12 +1690,10 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, const struct i bounce buffer, and then proceed as if we only have a single IOV that points to the bounce buffer. */ if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - unsigned iov_total_len = 0; + unsigned iov_total_len = 0; for (int i = 0; i < niov; ++i) { - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, - &opx_ep->hmem_copy_buf[iov_total_len], iov[i].iov_base, iov[i].iov_len, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + opx_copy_from_hmem(iface, hmem_device, hmem_handle, &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); iov_total_len += iov[i].iov_len; } @@ -1770,13 +1769,13 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_intranode_16B(struct fid_ep *ep, const struct i } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, - void *desc, fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, - int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum 
ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, + fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, + int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, const uint64_t hmem_device, + const uint64_t hmem_handle, const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); @@ -1815,10 +1814,10 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, size_t *niov_ptr = &niov; if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { - return fi_opx_hfi1_tx_sendv_egr_intranode_16B(ep, iov, niov, lrh_qws, lrh_dlid_16B, bth_rx, total_len, - payload_qws_total, xfer_bytes_tail, desc, &addr, tag, - context, data, lock_required, dest_rx, tx_op_flags, caps, - do_cq_completion, iface, hmem_device, hfi1_type); + return opx_hfi1_tx_sendv_egr_intranode_16B( + ep, iov, niov, lrh_qws, lrh_dlid_16B, bth_rx, total_len, payload_qws_total, xfer_bytes_tail, + &addr, tag, context, data, lock_required, dest_rx, tx_op_flags, caps, do_cq_completion, iface, + hmem_device, hmem_handle, hfi1_type); } FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -1853,12 +1852,10 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, bounce buffer, and then proceed as if we only have a single IOV that points to the bounce buffer. 
*/ if (iface != FI_HMEM_SYSTEM) { - unsigned iov_total_len = 0; - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; + unsigned iov_total_len = 0; for (int i = 0; i < niov; ++i) { - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, - &opx_ep->hmem_copy_buf[iov_total_len], iov[i].iov_base, iov[i].iov_len, - OPX_HMEM_DEV_REG_SEND_THRESHOLD); + opx_copy_from_hmem(iface, hmem_device, hmem_handle, &opx_ep->hmem_copy_buf[iov_total_len], + iov[i].iov_base, iov[i].iov_len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); iov_total_len += iov[i].iov_len; } @@ -1941,37 +1938,37 @@ ssize_t fi_opx_hfi1_tx_sendv_egr_16B(struct fid_ep *ep, const struct iovec *iov, } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_sendv_egr_select(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, - void *desc, fi_addr_t dest_addr, uint64_t tag, void *context, - const uint32_t data, int lock_required, const unsigned override_flags, - const uint64_t tx_op_flags, const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_sendv_egr_select(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, + fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, + int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, const uint64_t hmem_device, + const uint64_t hmem_handle, const enum opx_hfi1_type hfi1_type) { if (hfi1_type & OPX_HFI1_WFR) { - return fi_opx_hfi1_tx_sendv_egr(ep, iov, niov, total_len, desc, dest_addr, tag, context, data, - lock_required, override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, iface, hmem_device, OPX_HFI1_WFR); + return 
opx_hfi1_tx_sendv_egr(ep, iov, niov, total_len, dest_addr, tag, context, data, lock_required, + override_flags, tx_op_flags, dest_rx, caps, reliability, do_cq_completion, + iface, hmem_device, hmem_handle, OPX_HFI1_WFR); } else if (hfi1_type & OPX_HFI1_JKR) { - return fi_opx_hfi1_tx_sendv_egr_16B(ep, iov, niov, total_len, desc, dest_addr, tag, context, data, - lock_required, override_flags, tx_op_flags, dest_rx, caps, - reliability, do_cq_completion, iface, hmem_device, OPX_HFI1_JKR); + return opx_hfi1_tx_sendv_egr_16B(ep, iov, niov, total_len, dest_addr, tag, context, data, lock_required, + override_flags, tx_op_flags, dest_rx, caps, reliability, + do_cq_completion, iface, hmem_device, hmem_handle, OPX_HFI1_JKR); } else if (hfi1_type & OPX_HFI1_JKR_9B) { - return fi_opx_hfi1_tx_sendv_egr(ep, iov, niov, total_len, desc, dest_addr, tag, context, data, - lock_required, override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, iface, hmem_device, OPX_HFI1_JKR_9B); + return opx_hfi1_tx_sendv_egr(ep, iov, niov, total_len, dest_addr, tag, context, data, lock_required, + override_flags, tx_op_flags, dest_rx, caps, reliability, do_cq_completion, + iface, hmem_device, hmem_handle, OPX_HFI1_JKR_9B); } abort(); return (ssize_t) -1L; } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, - int lock_required, const uint64_t dest_rx, const uint64_t tx_op_flags, - const uint64_t caps, const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, const uint64_t hmem_device) +ssize_t opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, + uint64_t tag, void *context, const uint32_t data, int lock_required, + const uint64_t dest_rx, const uint64_t tx_op_flags, const uint64_t caps, + const uint64_t do_cq_completion, const enum fi_hmem_iface iface, + const uint64_t 
hmem_device, const uint64_t hmem_handle) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = {.fi = dest_addr}; @@ -2009,8 +2006,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, const void *buf, si #ifdef OPX_HMEM if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, buf, len, + opx_copy_from_hmem(iface, hmem_device, hmem_handle, opx_ep->hmem_copy_buf, buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; FI_OPX_DEBUG_COUNTERS_INC( @@ -2061,11 +2057,11 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode(struct fid_ep *ep, const void *buf, si } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *context, const uint32_t data, - int lock_required, const uint64_t dest_rx, const uint64_t tx_op_flags, - const uint64_t caps, const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, const uint64_t hmem_device) +ssize_t opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, + uint64_t tag, void *context, const uint32_t data, int lock_required, + const uint64_t dest_rx, const uint64_t tx_op_flags, const uint64_t caps, + const uint64_t do_cq_completion, const enum fi_hmem_iface iface, + const uint64_t hmem_device, const uint64_t hmem_handle) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = {.fi = dest_addr}; @@ -2103,8 +2099,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_intranode_16B(struct fid_ep *ep, const void *buf #ifdef OPX_HMEM if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, buf, len, + opx_copy_from_hmem(iface, 
hmem_device, hmem_handle, opx_ep->hmem_copy_buf, buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; FI_OPX_DEBUG_COUNTERS_INC( @@ -2432,12 +2427,12 @@ void fi_opx_hfi1_tx_send_egr_write_replay_data(struct fi_opx_ep *opx_ep, const u } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, - const uint64_t caps, const enum ofi_reliability_kind reliability, - const uint64_t do_cq_completion, const enum fi_hmem_iface iface, - const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_send_egr(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, const unsigned override_flags, + const uint64_t tx_op_flags, const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = {.fi = dest_addr}; @@ -2445,9 +2440,9 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, const void *buf, size_t len, OPX_NO_16B_SUPPORT(hfi1_type); if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { - return fi_opx_hfi1_tx_send_egr_intranode(ep, buf, len, desc, dest_addr, tag, context, data, - lock_required, dest_rx, tx_op_flags, caps, do_cq_completion, - iface, hmem_device); + return opx_hfi1_tx_send_egr_intranode(ep, buf, len, dest_addr, tag, context, data, lock_required, + dest_rx, tx_op_flags, caps, do_cq_completion, iface, hmem_device, + hmem_handle); } const size_t xfer_bytes_tail = len & 0x07ul; @@ -2508,8 +2503,7 @@ ssize_t 
fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, const void *buf, size_t len, #ifdef OPX_HMEM if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, buf, len, + opx_copy_from_hmem(iface, hmem_device, hmem_handle, opx_ep->hmem_copy_buf, buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; FI_OPX_DEBUG_COUNTERS_INC( @@ -2571,12 +2565,12 @@ ssize_t fi_opx_hfi1_tx_send_egr(struct fid_ep *ep, const void *buf, size_t len, } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, - const uint64_t caps, const enum ofi_reliability_kind reliability, - const uint64_t do_cq_completion, const enum fi_hmem_iface iface, - const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, const unsigned override_flags, + const uint64_t tx_op_flags, const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface iface, const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { struct fi_opx_ep *opx_ep = container_of(ep, struct fi_opx_ep, ep_fid); const union fi_opx_addr addr = {.fi = dest_addr}; @@ -2584,9 +2578,9 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, const void *buf, size_t l OPX_NO_9B_SUPPORT(hfi1_type); if (fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps)) { - return fi_opx_hfi1_tx_send_egr_intranode_16B(ep, buf, len, desc, dest_addr, tag, context, data, - lock_required, dest_rx, tx_op_flags, 
caps, - do_cq_completion, iface, hmem_device); + return opx_hfi1_tx_send_egr_intranode_16B(ep, buf, len, dest_addr, tag, context, data, lock_required, + dest_rx, tx_op_flags, caps, do_cq_completion, iface, + hmem_device, hmem_handle); } const size_t xfer_bytes_tail = len & 0x07ul; @@ -2654,8 +2648,7 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, const void *buf, size_t l #ifdef OPX_HMEM if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, opx_ep->hmem_copy_buf, buf, len, + opx_copy_from_hmem(iface, hmem_device, hmem_handle, opx_ep->hmem_copy_buf, buf, len, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf = opx_ep->hmem_copy_buf; FI_OPX_DEBUG_COUNTERS_INC( @@ -2739,26 +2732,26 @@ ssize_t fi_opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, const void *buf, size_t l } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_egr_select(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_send_egr_select(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, + const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface iface, + const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { if (hfi1_type & OPX_HFI1_WFR) { - return fi_opx_hfi1_tx_send_egr(ep, buf, len, desc, dest_addr, tag, context, data, 
lock_required, - override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, iface, hmem_device, OPX_HFI1_WFR); + return opx_hfi1_tx_send_egr(ep, buf, len, dest_addr, tag, context, data, lock_required, override_flags, + tx_op_flags, dest_rx, caps, reliability, do_cq_completion, iface, + hmem_device, hmem_handle, OPX_HFI1_WFR); } else if (hfi1_type & OPX_HFI1_JKR) { - return fi_opx_hfi1_tx_send_egr_16B(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, - override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, iface, hmem_device, OPX_HFI1_JKR); + return opx_hfi1_tx_send_egr_16B(ep, buf, len, dest_addr, tag, context, data, lock_required, + override_flags, tx_op_flags, dest_rx, caps, reliability, + do_cq_completion, iface, hmem_device, hmem_handle, OPX_HFI1_JKR); } else if (hfi1_type & OPX_HFI1_JKR_9B) { - return fi_opx_hfi1_tx_send_egr(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, - override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, iface, hmem_device, OPX_HFI1_JKR_9B); + return opx_hfi1_tx_send_egr(ep, buf, len, dest_addr, tag, context, data, lock_required, override_flags, + tx_op_flags, dest_rx, caps, reliability, do_cq_completion, iface, + hmem_device, hmem_handle, OPX_HFI1_JKR_9B); } abort(); return (ssize_t) -1L; @@ -2980,12 +2973,14 @@ ssize_t fi_opx_hfi1_tx_mp_egr_write_nth_packet_header_no_payload( } __OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_mp_egr_first_common( - struct fi_opx_ep *opx_ep, void **buf, const uint64_t payload_bytes_total, const void *desc, - uint8_t *hmem_bounce_buf, const uint64_t pbc_dlid, const uint64_t bth_rx, const uint64_t lrh_dlid, - const union fi_opx_addr addr, uint64_t tag, const uint32_t data, int lock_required, const uint64_t tx_op_flags, - const uint64_t caps, const enum ofi_reliability_kind reliability, uint32_t *psn_out, - const enum fi_hmem_iface iface, const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type) +ssize_t 
opx_hfi1_tx_send_mp_egr_first_common(struct fi_opx_ep *opx_ep, void **buf, const uint64_t payload_bytes_total, + uint8_t *hmem_bounce_buf, const uint64_t pbc_dlid, const uint64_t bth_rx, + const uint64_t lrh_dlid, const union fi_opx_addr addr, uint64_t tag, + const uint32_t data, int lock_required, const uint64_t tx_op_flags, + const uint64_t caps, const enum ofi_reliability_kind reliability, + uint32_t *psn_out, const enum fi_hmem_iface iface, + const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { assert(lock_required == 0); @@ -3018,9 +3013,8 @@ ssize_t fi_opx_hfi1_tx_send_mp_egr_first_common( buffer for this first MP Eager packet as well as all subsequent MP Eager Nth packets. */ if (iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(iface, hmem_device, desc_mr->hmem_dev_reg_handle, hmem_bounce_buf, *buf, - payload_bytes_total, OPX_HMEM_DEV_REG_SEND_THRESHOLD); + opx_copy_from_hmem(iface, hmem_device, hmem_handle, hmem_bounce_buf, *buf, payload_bytes_total, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); *buf = hmem_bounce_buf; FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.hmem.hfi.kind[FI_OPX_KIND_TAG].send.mp_eager); } @@ -3590,59 +3584,60 @@ static inline void fi_opx_shm_write_fence(struct fi_opx_ep *opx_ep, const uint8_ opx_shm_tx_advance(&opx_ep->tx->shm, (void *) hdr, pos); } -ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, - int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, +ssize_t opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, + fi_addr_t dest_addr, 
uint64_t tag, void *user_context, const uint32_t data, + int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, + const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type); + +ssize_t opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, + const uint64_t tx_op_flags, const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface hmem_iface, const uint64_t hmem_handle, + const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); + +ssize_t opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *user_context, const uint32_t data, int lock_required, + const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, + const uint64_t hmem_handle, const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); -ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *user_context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, - const uint64_t caps, const enum ofi_reliability_kind reliability, - const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); - -ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - 
uint64_t tag, void *user_context, const uint32_t data, int lock_required, +__OPX_FORCE_INLINE__ +ssize_t opx_hfi1_tx_send_rzv_select(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *context, const uint32_t data, int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, const uint64_t caps, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device, const enum opx_hfi1_type hfi1_type); - -__OPX_FORCE_INLINE__ -ssize_t fi_opx_hfi1_tx_send_rzv_select(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) + const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { if (hfi1_type & OPX_HFI1_WFR) { - return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, - override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, hmem_iface, hmem_device, OPX_HFI1_WFR); + return opx_hfi1_tx_send_rzv(ep, buf, len, dest_addr, tag, context, data, lock_required, override_flags, + tx_op_flags, dest_rx, caps, reliability, do_cq_completion, hmem_iface, + hmem_device, hmem_handle, OPX_HFI1_WFR); } else if (hfi1_type & OPX_HFI1_JKR) { - return fi_opx_hfi1_tx_send_rzv_16B(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, - override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, hmem_iface, hmem_device, OPX_HFI1_JKR); + return opx_hfi1_tx_send_rzv_16B(ep, buf, len, dest_addr, tag, context, data, lock_required, + 
override_flags, tx_op_flags, dest_rx, caps, reliability, + do_cq_completion, hmem_iface, hmem_device, hmem_handle, OPX_HFI1_JKR); } else if (hfi1_type & OPX_HFI1_JKR_9B) { - return fi_opx_hfi1_tx_send_rzv(ep, buf, len, desc, dest_addr, tag, context, data, lock_required, - override_flags, tx_op_flags, dest_rx, caps, reliability, - do_cq_completion, hmem_iface, hmem_device, OPX_HFI1_JKR_9B); + return opx_hfi1_tx_send_rzv(ep, buf, len, dest_addr, tag, context, data, lock_required, override_flags, + tx_op_flags, dest_rx, caps, reliability, do_cq_completion, hmem_iface, + hmem_device, hmem_handle, OPX_HFI1_JKR_9B); } abort(); return (ssize_t) -1L; } -void fi_opx_hfi1_rx_rma_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr *const hdr, - const void *const payload, const uint64_t niov, uintptr_t origin_rma_req, - struct opx_context *const target_context, const uintptr_t dst_vaddr, - const enum fi_hmem_iface dst_iface, const uint64_t dst_device, - const union fi_opx_hfi1_dput_iov *src_iovs, const unsigned is_intranode, - const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type); +void opx_hfi1_rx_rma_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr *const hdr, + const void *const payload, const uint64_t niov, uintptr_t origin_rma_req, + struct opx_context *const target_context, const uintptr_t dst_vaddr, + const enum fi_hmem_iface dst_iface, const uint64_t dst_device, const uint64_t dst_handle, + const union opx_hfi1_dput_iov *src_iovs, const unsigned is_intranode, + const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type); #endif /* _FI_PROV_OPX_HFI1_TRANSPORT_H_ */ diff --git a/prov/opx/include/rdma/opx/fi_opx_hmem.h b/prov/opx/include/rdma/opx/fi_opx_hmem.h index 304714c1898..79f453d5c0d 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hmem.h +++ b/prov/opx/include/rdma/opx/fi_opx_hmem.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023-2024 by Cornelis Networks. 
+ * Copyright (C) 2023-2025 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -106,7 +106,7 @@ enum fi_hmem_iface opx_hmem_get_ptr_iface(const void *ptr, uint64_t *device, uin } __OPX_FORCE_INLINE__ -enum fi_hmem_iface opx_hmem_get_mr_iface(const struct fi_opx_mr *desc, uint64_t *device) +enum fi_hmem_iface opx_hmem_get_mr_iface(const struct fi_opx_mr *desc, uint64_t *device, uint64_t *handle) { #ifdef OPX_HMEM if (desc && !desc->hmem_unified) { @@ -120,10 +120,12 @@ enum fi_hmem_iface opx_hmem_get_mr_iface(const struct fi_opx_mr *desc, uint64_t default: *device = 0ul; } + *handle = desc->hmem_dev_reg_handle; return desc->attr.iface; } #endif *device = 0ul; + *handle = OPX_HMEM_NO_HANDLE; return FI_HMEM_SYSTEM; } @@ -139,8 +141,7 @@ int opx_copy_to_hmem(enum fi_hmem_iface iface, uint64_t device, uint64_t hmem_ha int ret; - assert((hmem_handle == OPX_HMEM_NO_HANDLE && threshold == OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET) || - (hmem_handle != OPX_HMEM_NO_HANDLE && threshold != OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET)); + assert(hmem_handle == OPX_HMEM_NO_HANDLE || threshold != OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "COPY-TO-HMEM"); switch (iface) { @@ -198,8 +199,7 @@ int opx_copy_from_hmem(enum fi_hmem_iface iface, uint64_t device, uint64_t hmem_ int ret; - assert((hmem_handle == OPX_HMEM_NO_HANDLE && threshold == OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET) || - (hmem_handle != OPX_HMEM_NO_HANDLE && threshold != OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET)); + assert(hmem_handle == OPX_HMEM_NO_HANDLE || threshold != OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "COPY-FROM-HMEM"); switch (iface) { @@ -246,7 +246,8 @@ int opx_copy_from_hmem(enum fi_hmem_iface iface, uint64_t device, uint64_t hmem_ } __OPX_FORCE_INLINE__ -unsigned fi_opx_hmem_iov_init(const void *buf, const size_t len, const void *desc, struct fi_opx_hmem_iov 
*iov) +unsigned opx_hmem_iov_init(const void *buf, const size_t len, const void *desc, struct fi_opx_hmem_iov *iov, + uint64_t *handle) { iov->buf = (uintptr_t) buf; iov->len = len; @@ -254,10 +255,11 @@ unsigned fi_opx_hmem_iov_init(const void *buf, const size_t len, const void *des uint64_t hmem_device; enum fi_hmem_iface hmem_iface; if (desc) { - hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device); + hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device, handle); } else { uint64_t is_unified __attribute__((__unused__)); hmem_iface = opx_hmem_get_ptr_iface(buf, &hmem_device, &is_unified); + *handle = OPX_HMEM_NO_HANDLE; } iov->iface = hmem_iface; iov->device = hmem_device; @@ -265,6 +267,7 @@ unsigned fi_opx_hmem_iov_init(const void *buf, const size_t len, const void *des #else iov->iface = FI_HMEM_SYSTEM; iov->device = 0ul; + *handle = OPX_HMEM_NO_HANDLE; return 0; #endif } diff --git a/prov/opx/include/rdma/opx/fi_opx_rma.h b/prov/opx/include/rdma/opx/fi_opx_rma.h index ab0bedab6b3..49b5dcdf3ad 100644 --- a/prov/opx/include/rdma/opx/fi_opx_rma.h +++ b/prov/opx/include/rdma/opx/fi_opx_rma.h @@ -62,13 +62,13 @@ int opx_hfi1_tx_rma_rts(union fi_opx_hfi1_deferred_work *work); int opx_hfi1_tx_rma_rts_intranode(union fi_opx_hfi1_deferred_work *work); __OPX_FORCE_INLINE__ -void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_iov *iov, const size_t niov, - const union fi_opx_addr opx_target_addr, const uint64_t *addr_offset, const uint64_t *key, - const uint64_t tx_op_flags, const struct fi_opx_cq *opx_cq, - const struct fi_opx_cntr *opx_cntr, struct fi_opx_completion_counter *cc, - enum fi_datatype dt, enum fi_op op, const uint32_t opcode, const int lock_required, - const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) +void opx_readv_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_iov *iov, const size_t niov, + const uint64_t *hmem_handle, const union fi_opx_addr 
opx_target_addr, + const uint64_t *addr_offset, const uint64_t *key, const uint64_t tx_op_flags, + const struct fi_opx_cq *opx_cq, const struct fi_opx_cntr *opx_cntr, + struct fi_opx_completion_counter *cc, enum fi_datatype dt, enum fi_op op, const uint32_t opcode, + const int lock_required, const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "READV_INTERNAL"); @@ -93,18 +93,18 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_io 2 + /* lrh */ 3 + /* bth */ 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - 16; /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ + 16; /* one "struct opx_hfi1_dput_iov" */ params->lrh_dws = htons(params->pbc_dws - 2 + 1); /* (BE: LRH DW) does not include pbc (8 bytes), but does include icrc (4 bytes) */ params->lrh_dlid = FI_OPX_ADDR_TO_HFI1_LRH_DLID_9B(opx_target_addr.lid); } else { - params->pbc_dws = 2 + /* pbc */ - 4 + /* lrh uncompressed */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - 16 + /* one "struct fi_opx_hfi1_dput_iov", padded to cache line */ - 2; /* ICRC/tail */ + params->pbc_dws = 2 + /* pbc */ + 4 + /* lrh uncompressed */ + 3 + /* bth */ + 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ + 16 + /* one "struct opx_hfi1_dput_iov" */ + 2; /* ICRC/tail */ params->lrh_dws = (params->pbc_dws - 2) >> 1; /* (LRH QW) does not include pbc (8 bytes) */ params->lrh_dlid = opx_target_addr.lid; } @@ -124,14 +124,16 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_io params->dput_iov.bytes = iov->len; params->dput_iov.rbuf_iface = iov->iface; params->dput_iov.rbuf_device = iov->device; - params->dput_iov.sbuf_iface = FI_HMEM_SYSTEM; // TBD by remote node - params->dput_iov.sbuf_device = 0; // TBD by remote node + params->dput_iov.sbuf_iface = FI_HMEM_SYSTEM; // TBD by remote node + params->dput_iov.sbuf_device = 0; // TBD by remote node + 
params->dput_iov.sbuf_handle = OPX_HMEM_NO_HANDLE; // TBD by remote node params->rma_request = ofi_buf_alloc(opx_ep->tx->rma_request_pool); assert(params->rma_request != NULL); params->rma_request->cc = cc; params->rma_request->hmem_iface = iov->iface; params->rma_request->hmem_device = iov->device; + params->rma_request->hmem_handle = hmem_handle[0]; if (params->is_intranode) { params->work_elem.work_fn = fi_opx_do_readv_internal_intranode; @@ -169,12 +171,12 @@ void fi_opx_readv_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_io } __OPX_FORCE_INLINE__ -void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_iov *iov, const size_t niov, - const uint64_t data, const union fi_opx_addr opx_dst_addr, uint64_t addr_offset, - const uint64_t key, struct fi_opx_completion_counter *cc, enum fi_datatype dt, enum fi_op op, - const uint64_t tx_op_flags, const uint64_t is_hmem, const int lock_required, - const uint64_t caps, const enum ofi_reliability_kind reliability, - const enum opx_hfi1_type hfi1_type) +void opx_write_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_iov *iov, const size_t niov, + const uint64_t data, const union fi_opx_addr opx_dst_addr, uint64_t addr_offset, + const uint64_t key, struct fi_opx_completion_counter *cc, enum fi_datatype dt, enum fi_op op, + const uint64_t tx_op_flags, const uint64_t is_hmem, const uint64_t handle, int lock_required, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const enum opx_hfi1_type hfi1_type) { OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "WRITE_INTERNAL"); assert(niov == 1); // TODO, support something ... 
bigger @@ -200,15 +202,15 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_io struct fi_opx_rma_request *rma_req = ofi_buf_alloc(opx_ep->tx->rma_request_pool); assert(rma_req != NULL); - struct fi_opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; - params->work_elem.slist_entry.next = NULL; - params->work_elem.completion_action = NULL; - params->work_elem.payload_copy = NULL; - params->work_elem.complete = false; - params->opx_ep = opx_ep; - params->lrh_dlid = lrh_dlid; - params->slid = slid; - params->pbc_dlid = pbc_dlid; + struct opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; + params->work_elem.slist_entry.next = NULL; + params->work_elem.completion_action = NULL; + params->work_elem.payload_copy = NULL; + params->work_elem.complete = false; + params->opx_ep = opx_ep; + params->lrh_dlid = lrh_dlid; + params->slid = slid; + params->pbc_dlid = pbc_dlid; params->niov = niov; rma_req->cc = cc; @@ -216,10 +218,11 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_io params->key = key; params->data = data; - // params->iov[0].rbuf_iface = FI_HMEM_SYSTEM; // TBD on remote node - // params->iov[0].rbuf_device = 0; // TBD on remote node + // params->iov[0].rbuf_iface = FI_HMEM_SYSTEM; // TBD on remote node + // params->iov[0].rbuf_device = 0; // TBD on remote node params->dput_iov[0].sbuf_iface = iov[0].iface; params->dput_iov[0].sbuf_device = iov[0].device; + params->dput_iov[0].sbuf_handle = handle; params->dput_iov[0].rbuf = addr_offset; params->dput_iov[0].sbuf = iov[0].buf; params->dput_iov[0].bytes = iov[0].len; @@ -267,6 +270,7 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_io params->iov[0].sbuf = iov->buf; params->iov[0].sbuf_iface = iov->iface; params->iov[0].sbuf_device = iov->device; + params->iov[0].sbuf_handle = handle; params->iov[0].rbuf_iface = FI_HMEM_SYSTEM; // TBD on remote node params->iov[0].rbuf_device = 0; // TBD on remote node 
params->dput_iov = &params->iov[0]; @@ -277,7 +281,7 @@ void fi_opx_write_internal(struct fi_opx_ep *opx_ep, const struct fi_opx_hmem_io params->reliability = reliability; params->cur_iov = 0; params->bytes_sent = 0; - params->opx_mr = NULL; + params->src_base_addr = NULL; params->origin_byte_counter = NULL; params->payload_bytes_for_iovec = 0; params->target_hfi_unit = opx_dst_addr.hfi1_unit; diff --git a/prov/opx/src/fi_opx_atomic.c b/prov/opx/src/fi_opx_atomic.c index d42d225ae54..9c74be6639f 100644 --- a/prov/opx/src/fi_opx_atomic.c +++ b/prov/opx/src/fi_opx_atomic.c @@ -96,9 +96,9 @@ static inline int fi_opx_check_atomic(struct fi_opx_ep *opx_ep, enum fi_datatype void fi_opx_atomic_completion_action(union fi_opx_hfi1_deferred_work *work_state) { - struct fi_opx_hfi1_dput_params *params = &work_state->dput; - uint64_t *rbuf_qws = (uint64_t *) (((uint8_t *) params->opx_mr->base_addr) + params->dput_iov->sbuf); - const uint64_t *sbuf_qws = + struct fi_opx_hfi1_dput_params *params = &work_state->dput; + uint64_t *rbuf_qws = (uint64_t *) (params->src_base_addr + params->dput_iov->sbuf); + const uint64_t *sbuf_qws = (uint64_t *) &work_state->work_elem.payload_copy->byte[sizeof(struct fi_opx_hfi1_dput_fetch)]; assert(params->op != (FI_NOOP - 1)); assert(params->dt != (FI_VOID - 1)); @@ -163,7 +163,7 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, const uint32_t opcode, params->bytes_sent = 0; params->cc = NULL; params->user_cc = NULL; - params->opx_mr = NULL; + params->src_base_addr = NULL; params->origin_byte_counter = NULL; params->payload_bytes_for_iovec = sizeof(struct fi_opx_hfi1_dput_fetch); params->fetch_vaddr = (void *) fetch_iov->buf; @@ -254,12 +254,13 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, const void *buf, size_t FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC READ (begin)\n"); struct fi_opx_hmem_iov fetch_iov; - fi_opx_hmem_iov_init(fetch_vaddr, buf_len, NULL, &fetch_iov); + uint64_t 
hmem_handle; + opx_hmem_iov_init(fetch_vaddr, buf_len, NULL, &fetch_iov, &hmem_handle); cc->cntr = opx_ep->read_cntr; - fi_opx_readv_internal(opx_ep, &fetch_iov, 1, opx_dst_addr, &addr, &key, opx_ep->tx->op_flags, - opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, FI_OPX_HFI_DPUT_OPCODE_GET, - lock_required, caps, reliability, hfi1_type); + opx_readv_internal(opx_ep, &fetch_iov, 1, &hmem_handle, opx_dst_addr, &addr, &key, opx_ep->tx->op_flags, + opx_ep->rx->cq, opx_ep->read_cntr, cc, datatype, op, FI_OPX_HFI_DPUT_OPCODE_GET, + lock_required, caps, reliability, hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC READ (end)\n"); return count; @@ -272,14 +273,16 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, const void *buf, size_t const uint64_t is_intranode = fi_opx_hfi1_tx_is_intranode(opx_ep, opx_dst_addr, caps); struct fi_opx_hmem_iov fetch_iov; - fi_opx_hmem_iov_init(fetch_vaddr, buf_len, NULL, &fetch_iov); + uint64_t fetch_handle; + opx_hmem_iov_init(fetch_vaddr, buf_len, NULL, &fetch_iov, &fetch_handle); /* Note that is_hmem is only used to indicate if either of the send buffers are HMEM, as that's what we care about for determining the best way to send out the data. Whether or not the fetch_iov buffer is HMEM doesn't matter here and can be ignored. 
*/ struct fi_opx_hmem_iov buf_iov; - uint64_t is_hmem = fi_opx_hmem_iov_init(buf, buf_len, NULL, &buf_iov); + uint64_t handle; + uint64_t is_hmem = opx_hmem_iov_init(buf, buf_len, NULL, &buf_iov, &handle); if (!is_compare) { FI_OPX_DEBUG_COUNTERS_INC_COND((is_hmem || fetch_iov.iface != FI_HMEM_SYSTEM) && is_intranode, @@ -294,7 +297,8 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, const void *buf, size_t } else { struct fi_opx_hmem_iov compare_iov; - is_hmem |= fi_opx_hmem_iov_init(compare_vaddr, buf_len, NULL, &compare_iov); + uint64_t handle; + is_hmem |= opx_hmem_iov_init(compare_vaddr, buf_len, NULL, &compare_iov, &handle); buf_iov.len <<= 1; FI_OPX_DEBUG_COUNTERS_INC_COND((is_hmem || fetch_iov.iface != FI_HMEM_SYSTEM) && is_intranode, @@ -319,10 +323,11 @@ size_t fi_opx_atomic_internal(struct fi_opx_ep *opx_ep, const void *buf, size_t FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC WRITE (begin)\n"); cc->cntr = opx_ep->write_cntr; struct fi_opx_hmem_iov buf_iov; - uint64_t is_hmem = fi_opx_hmem_iov_init(buf, buf_len, NULL, &buf_iov); + uint64_t handle; + uint64_t is_hmem = opx_hmem_iov_init(buf, buf_len, NULL, &buf_iov, &handle); - fi_opx_write_internal(opx_ep, &buf_iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr, key, cc, datatype, op, - opx_ep->tx->op_flags, is_hmem, lock_required, caps, reliability, hfi1_type); + opx_write_internal(opx_ep, &buf_iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr, key, cc, datatype, op, + opx_ep->tx->op_flags, is_hmem, handle, lock_required, caps, reliability, hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC WRITE (end)\n"); return count; @@ -876,10 +881,13 @@ ssize_t fi_opx_inject_atomic_generic(struct fid_ep *ep, const void *buf, size_t "===================================== ATOMIC INJECT WRITE (begin)\n"); struct fi_opx_hmem_iov iov; - const uint64_t is_hmem = (const uint64_t) fi_opx_hmem_iov_init(buf, count * 
sizeofdt(datatype), NULL, &iov); + uint64_t handle; + const uint64_t is_hmem = + (const uint64_t) opx_hmem_iov_init(buf, count * sizeofdt(datatype), NULL, &iov, &handle); - fi_opx_write_internal(opx_ep, &iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr, key, cc, datatype, op, - opx_ep->tx->op_flags | FI_INJECT, is_hmem, lock_required, caps, reliability, hfi1_type); + opx_write_internal(opx_ep, &iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr, key, cc, datatype, op, + opx_ep->tx->op_flags | FI_INJECT, is_hmem, handle, lock_required, caps, reliability, + hfi1_type); FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== ATOMIC INJECT WRITE (end)\n"); diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index ca0c813a2c2..64f56535ac8 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1120,7 +1120,7 @@ int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); const uint64_t tid_payload = params->tid_info.npairs ? ((params->tid_info.npairs + 4) * sizeof(params->tidpairs[0])) : 0; - const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)) + tid_payload; + const uint64_t payload_bytes = (params->niov * sizeof(union opx_hfi1_dput_iov)) + tid_payload; const uint64_t pbc_dws = 2 + /* pbc */ 2 + /* lrh */ 3 + /* bth */ @@ -1240,7 +1240,7 @@ int opx_hfi1_rx_rzv_rts_send_cts_16B(union fi_opx_hfi1_deferred_work *work) OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RZV-CTS-HFI:%p", params->rzv_comp); const uint64_t tid_payload = params->tid_info.npairs ? 
((params->tid_info.npairs + 4) * sizeof(params->tidpairs[0])) : 0; - const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)) + tid_payload; + const uint64_t payload_bytes = (params->niov * sizeof(union opx_hfi1_dput_iov)) + tid_payload; FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "payload_bytes = %ld\n", payload_bytes); const uint64_t pbc_dws = 2 + /* pbc */ 4 + /* lrh uncompressed */ @@ -2311,11 +2311,11 @@ void opx_hfi1_dput_fence(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_h int opx_hfi1_rx_rma_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work) { - struct fi_opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; - struct fi_opx_ep *opx_ep = params->opx_ep; - const uint64_t lrh_dlid = params->lrh_dlid; - const uint64_t bth_rx = ((uint64_t) params->u8_rx) << OPX_BTH_RX_SHIFT; - const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; + struct opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t) params->u8_rx) << OPX_BTH_RX_SHIFT; + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== RECV, SHM -- RMA RTS (begin)\n"); @@ -2386,10 +2386,10 @@ int opx_hfi1_rx_rma_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work int opx_hfi1_rx_rma_rts_send_cts(union fi_opx_hfi1_deferred_work *work) { - struct fi_opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; - struct fi_opx_ep *opx_ep = params->opx_ep; - const uint64_t lrh_dlid = params->lrh_dlid; - const uint64_t bth_rx = ((uint64_t) params->u8_rx) << OPX_BTH_RX_SHIFT; + struct opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t) params->u8_rx) << OPX_BTH_RX_SHIFT; const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; @@ -2398,7 
+2398,7 @@ int opx_hfi1_rx_rma_rts_send_cts(union fi_opx_hfi1_deferred_work *work) params, params->rma_req, params->rma_req->context); assert(params->rma_req->context->byte_counter >= params->dput_iov[0].bytes); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RMA-CTS-HFI:%p", params->rma_req); - const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)); + const uint64_t payload_bytes = (params->niov * sizeof(union opx_hfi1_dput_iov)); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; const uint16_t total_credits_needed = 1 + /* packet header */ ((payload_bytes + 63) >> 6); /* payload blocks needed */ @@ -2524,19 +2524,19 @@ int opx_hfi1_rx_rma_rts_send_cts(union fi_opx_hfi1_deferred_work *work) return FI_SUCCESS; } -void fi_opx_hfi1_rx_rma_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr *const hdr, - const void *const payload, const uint64_t niov, uintptr_t origin_rma_req, - struct opx_context *const target_context, const uintptr_t dst_vaddr, - const enum fi_hmem_iface dst_iface, const uint64_t dst_device, - const union fi_opx_hfi1_dput_iov *src_iovs, const unsigned is_intranode, - const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) +void opx_hfi1_rx_rma_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr *const hdr, + const void *const payload, const uint64_t niov, uintptr_t origin_rma_req, + struct opx_context *const target_context, const uintptr_t dst_vaddr, + const enum fi_hmem_iface dst_iface, const uint64_t dst_device, const uint64_t dst_handle, + const union opx_hfi1_dput_iov *src_iovs, const unsigned is_intranode, + const enum ofi_reliability_kind reliability, const enum opx_hfi1_type hfi1_type) { OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "RECV-RMA-RTS-HFI:%ld", hdr->qw_9B[6]); union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool); assert(work != NULL); - struct fi_opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; - 
params->work_elem.slist_entry.next = NULL; - params->opx_ep = opx_ep; + struct opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; + params->work_elem.slist_entry.next = NULL; + params->opx_ep = opx_ep; opx_lid_t lid; if (hfi1_type == OPX_HFI1_WFR) { @@ -2555,13 +2555,14 @@ void fi_opx_hfi1_rx_rma_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packe params->slid = lid; assert(niov <= MIN(FI_OPX_MAX_HMEM_IOV, FI_OPX_MAX_DPUT_IOV)); - params->niov = niov; - const union fi_opx_hfi1_dput_iov *src_iov = src_iovs; - uint64_t rbuf_offset = 0; + params->niov = niov; + const union opx_hfi1_dput_iov *src_iov = src_iovs; + uint64_t rbuf_offset = 0; for (int i = 0; i < niov; i++) { params->dput_iov[i].sbuf = src_iov->sbuf; params->dput_iov[i].sbuf_iface = src_iov->sbuf_iface; params->dput_iov[i].sbuf_device = src_iov->sbuf_device; + params->dput_iov[i].sbuf_handle = src_iov->sbuf_handle; params->dput_iov[i].rbuf = dst_vaddr + rbuf_offset; params->dput_iov[i].rbuf_iface = dst_iface; params->dput_iov[i].rbuf_device = dst_device; @@ -2600,6 +2601,7 @@ void fi_opx_hfi1_rx_rma_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packe params->rma_req->context = target_context; params->rma_req->hmem_device = dst_device; params->rma_req->hmem_iface = dst_iface; + params->rma_req->hmem_handle = dst_handle; int rc = params->work_elem.work_fn(work); if (rc == FI_SUCCESS) { @@ -2618,10 +2620,10 @@ void fi_opx_hfi1_rx_rma_rts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packe int opx_hfi1_tx_rma_rts(union fi_opx_hfi1_deferred_work *work) { - struct fi_opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; - struct fi_opx_ep *opx_ep = params->opx_ep; - const uint64_t lrh_dlid = params->lrh_dlid; - const uint64_t bth_rx = ((uint64_t) params->u8_rx) << OPX_BTH_RX_SHIFT; + struct opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t) params->u8_rx) << 
OPX_BTH_RX_SHIFT; FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, HFI -- RMA RTS (begin) (params=%p origin_rma_req=%p cc=%p)\n", @@ -2629,7 +2631,7 @@ int opx_hfi1_tx_rma_rts(union fi_opx_hfi1_deferred_work *work) assert(params->origin_rma_req->cc->byte_counter >= params->dput_iov[0].bytes); OPX_TRACER_TRACE(OPX_TRACER_BEGIN, "SEND-RMA-RTS-HFI:%p", params->origin_rma_req); - const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)); + const uint64_t payload_bytes = (params->niov * sizeof(union opx_hfi1_dput_iov)); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "payload_bytes = %ld\n", payload_bytes); union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; const uint16_t total_credits_needed = 1 + /* packet header */ @@ -2765,11 +2767,11 @@ int opx_hfi1_tx_rma_rts(union fi_opx_hfi1_deferred_work *work) int opx_hfi1_tx_rma_rts_intranode(union fi_opx_hfi1_deferred_work *work) { - struct fi_opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; - struct fi_opx_ep *opx_ep = params->opx_ep; - const uint64_t lrh_dlid = params->lrh_dlid; - const uint64_t bth_rx = ((uint64_t) params->u8_rx) << OPX_BTH_RX_SHIFT; - uint64_t pos; + struct opx_hfi1_rx_rma_rts_params *params = &work->rx_rma_rts; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint64_t lrh_dlid = params->lrh_dlid; + const uint64_t bth_rx = ((uint64_t) params->u8_rx) << OPX_BTH_RX_SHIFT; + uint64_t pos; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND, SHM -- RENDEZVOUS RMA (begin) context %p\n", NULL); @@ -2847,21 +2849,20 @@ int opx_hfi1_tx_rma_rts_intranode(union fi_opx_hfi1_deferred_work *work) int fi_opx_hfi1_do_dput(union fi_opx_hfi1_deferred_work *work) { - struct fi_opx_hfi1_dput_params *params = &work->dput; - struct fi_opx_ep *opx_ep = params->opx_ep; - struct fi_opx_mr *opx_mr = params->opx_mr; - const uint8_t u8_rx = params->u8_rx; - const uint32_t niov = params->niov; - const union 
fi_opx_hfi1_dput_iov *const dput_iov = params->dput_iov; - const uintptr_t target_byte_counter_vaddr = params->target_byte_counter_vaddr; - uint64_t *origin_byte_counter = params->origin_byte_counter; - uint64_t key = params->key; - struct fi_opx_completion_counter *cc = params->cc; - uint64_t op64 = params->op; - uint64_t dt64 = params->dt; - uint32_t opcode = params->opcode; - const unsigned is_intranode = params->is_intranode; - const enum ofi_reliability_kind reliability = params->reliability; + struct fi_opx_hfi1_dput_params *params = &work->dput; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint8_t u8_rx = params->u8_rx; + const uint32_t niov = params->niov; + const union opx_hfi1_dput_iov *const dput_iov = params->dput_iov; + const uintptr_t target_byte_counter_vaddr = params->target_byte_counter_vaddr; + uint64_t *origin_byte_counter = params->origin_byte_counter; + uint64_t key = params->key; + struct fi_opx_completion_counter *cc = params->cc; + uint64_t op64 = params->op; + uint64_t dt64 = params->dt; + uint32_t opcode = params->opcode; + const unsigned is_intranode = params->is_intranode; + const enum ofi_reliability_kind reliability = params->reliability; /* use the slid from the lrh header of the incoming packet * as the dlid for the lrh header of the outgoing packet */ const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; @@ -2873,7 +2874,7 @@ int fi_opx_hfi1_do_dput(union fi_opx_hfi1_deferred_work *work) assert((opx_ep->tx->pio_max_eager_tx_bytes & 0x3fu) == 0); unsigned i; - const void *sbuf_start = (opx_mr == NULL) ? 
0 : opx_mr->base_addr; + const void *sbuf_start = params->src_base_addr; /* Note that lrh_dlid is just the version of params->slid shifted so that it can be OR'd into the correct position in the packet header */ @@ -2920,6 +2921,7 @@ int fi_opx_hfi1_do_dput(union fi_opx_hfi1_deferred_work *work) enum fi_hmem_iface sbuf_iface = dput_iov[i].sbuf_iface; uint64_t sbuf_device = dput_iov[i].sbuf_device; + uint64_t sbuf_handle = dput_iov[i].sbuf_handle; uint64_t bytes_to_send = dput_iov[i].bytes - params->bytes_sent; while (bytes_to_send > 0) { @@ -2980,16 +2982,16 @@ int fi_opx_hfi1_do_dput(union fi_opx_hfi1_deferred_work *work) bth_rx, bytes_to_send_this_packet, key, (const uint64_t) params->fetch_vaddr, target_byte_counter_vaddr, params->rma_request_vaddr, params->bytes_sent, &sbuf, sbuf_iface, - sbuf_device, (uint8_t **) ¶ms->compare_vaddr, cbuf_iface, - cbuf_device, &rbuf, OPX_HFI1_JKR); + sbuf_device, sbuf_handle, (uint8_t **) ¶ms->compare_vaddr, + cbuf_iface, cbuf_device, &rbuf, OPX_HFI1_JKR); } else { bytes_sent = opx_hfi1_dput_write_header_and_payload( opx_ep, hdr, tx_payload, opcode, 0, lrh_dws, op64, dt64, lrh_dlid, bth_rx, bytes_to_send_this_packet, key, (const uint64_t) params->fetch_vaddr, target_byte_counter_vaddr, params->rma_request_vaddr, params->bytes_sent, &sbuf, sbuf_iface, - sbuf_device, (uint8_t **) ¶ms->compare_vaddr, cbuf_iface, - cbuf_device, &rbuf, OPX_HFI1_WFR); + sbuf_device, sbuf_handle, (uint8_t **) ¶ms->compare_vaddr, + cbuf_iface, cbuf_device, &rbuf, OPX_HFI1_WFR); } opx_shm_tx_advance(&opx_ep->tx->shm, (void *) hdr, pos); @@ -3038,8 +3040,8 @@ int fi_opx_hfi1_do_dput(union fi_opx_hfi1_deferred_work *work) op64, dt64, lrh_dlid, bth_rx, bytes_to_send_this_packet, key, (const uint64_t) params->fetch_vaddr, target_byte_counter_vaddr, params->rma_request_vaddr, params->bytes_sent, &sbuf, sbuf_iface, - sbuf_device, (uint8_t **) ¶ms->compare_vaddr, cbuf_iface, - cbuf_device, &rbuf, OPX_HFI1_JKR); + sbuf_device, sbuf_handle, (uint8_t **) 
¶ms->compare_vaddr, + cbuf_iface, cbuf_device, &rbuf, OPX_HFI1_JKR); } else { replay->scb.scb_9B.qw0 = opx_ep->rx->tx.dput_9B.qw0 | OPX_PBC_LEN(pbc_dws, OPX_HFI1_WFR) | @@ -3050,8 +3052,8 @@ int fi_opx_hfi1_do_dput(union fi_opx_hfi1_deferred_work *work) op64, dt64, lrh_dlid, bth_rx, bytes_to_send_this_packet, key, (const uint64_t) params->fetch_vaddr, target_byte_counter_vaddr, params->rma_request_vaddr, params->bytes_sent, &sbuf, sbuf_iface, - sbuf_device, (uint8_t **) ¶ms->compare_vaddr, cbuf_iface, - cbuf_device, &rbuf, OPX_HFI1_WFR); + sbuf_device, sbuf_handle, (uint8_t **) ¶ms->compare_vaddr, + cbuf_iface, cbuf_device, &rbuf, OPX_HFI1_WFR); } FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); @@ -3176,18 +3178,17 @@ void fi_opx_hfi1_dput_copy_to_bounce_buf(uint32_t opcode, uint8_t *target_buf, u int fi_opx_hfi1_do_dput_sdma(union fi_opx_hfi1_deferred_work *work) { - struct fi_opx_hfi1_dput_params *params = &work->dput; - struct fi_opx_ep *opx_ep = params->opx_ep; - struct fi_opx_mr *opx_mr = params->opx_mr; - const uint8_t u8_rx = params->u8_rx; - const uint32_t niov = params->niov; - const union fi_opx_hfi1_dput_iov *const dput_iov = params->dput_iov; - const uintptr_t target_byte_counter_vaddr = params->target_byte_counter_vaddr; - uint64_t key = params->key; - uint64_t op64 = params->op; - uint64_t dt64 = params->dt; - uint32_t opcode = params->opcode; - const enum ofi_reliability_kind reliability = params->reliability; + struct fi_opx_hfi1_dput_params *params = &work->dput; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint8_t u8_rx = params->u8_rx; + const uint32_t niov = params->niov; + const union opx_hfi1_dput_iov *const dput_iov = params->dput_iov; + const uintptr_t target_byte_counter_vaddr = params->target_byte_counter_vaddr; + uint64_t key = params->key; + uint64_t op64 = params->op; + uint64_t dt64 = params->dt; + uint32_t opcode = params->opcode; + const enum ofi_reliability_kind reliability = params->reliability; /* use the slid from the lrh header 
of the incoming packet * as the dlid for the lrh header of the outgoing packet */ const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; @@ -3195,7 +3196,7 @@ int fi_opx_hfi1_do_dput_sdma(union fi_opx_hfi1_deferred_work *work) const uint64_t bth_rx = ((uint64_t) u8_rx) << OPX_BTH_RX_SHIFT; assert((opx_ep->tx->pio_max_eager_tx_bytes & 0x3fu) == 0); unsigned i; - const void *sbuf_start = (opx_mr == NULL) ? 0 : opx_mr->base_addr; + const void *sbuf_start = params->src_base_addr; const bool sdma_no_bounce_buf = params->sdma_no_bounce_buf; /* Note that lrh_dlid is just the version of params->slid shifted so @@ -3454,25 +3455,24 @@ int fi_opx_hfi1_do_dput_sdma(union fi_opx_hfi1_deferred_work *work) int fi_opx_hfi1_do_dput_sdma_tid(union fi_opx_hfi1_deferred_work *work) { - struct fi_opx_hfi1_dput_params *params = &work->dput; - struct fi_opx_ep *opx_ep = params->opx_ep; - struct fi_opx_mr *opx_mr = params->opx_mr; - const uint8_t u8_rx = params->u8_rx; - const uint32_t niov = params->niov; - const union fi_opx_hfi1_dput_iov *const dput_iov = params->dput_iov; - const uintptr_t target_byte_counter_vaddr = params->target_byte_counter_vaddr; - uint64_t key = params->key; - uint64_t op64 = params->op; - uint64_t dt64 = params->dt; - uint32_t opcode = params->opcode; - const enum ofi_reliability_kind reliability = params->reliability; - const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; + struct fi_opx_hfi1_dput_params *params = &work->dput; + struct fi_opx_ep *opx_ep = params->opx_ep; + const uint8_t u8_rx = params->u8_rx; + const uint32_t niov = params->niov; + const union opx_hfi1_dput_iov *const dput_iov = params->dput_iov; + const uintptr_t target_byte_counter_vaddr = params->target_byte_counter_vaddr; + uint64_t key = params->key; + uint64_t op64 = params->op; + uint64_t dt64 = params->dt; + uint32_t opcode = params->opcode; + const enum ofi_reliability_kind reliability = params->reliability; + const enum opx_hfi1_type hfi1_type = OPX_HFI1_TYPE; /* use the slid from the 
lrh header of the incoming packet * as the dlid for the lrh header of the outgoing packet */ const uint64_t lrh_dlid = params->lrh_dlid; const uint64_t bth_rx = ((uint64_t) u8_rx) << OPX_BTH_RX_SHIFT; unsigned i; - const void *sbuf_start = (opx_mr == NULL) ? 0 : opx_mr->base_addr; + const void *sbuf_start = params->src_base_addr; const bool sdma_no_bounce_buf = params->sdma_no_bounce_buf; assert(params->ntidpairs != 0); assert(niov == 1); @@ -3652,7 +3652,7 @@ int fi_opx_hfi1_do_dput_sdma_tid(union fi_opx_hfi1_deferred_work *work) if (params->sdma_we->use_bounce_buf) { OPX_HMEM_COPY_FROM(params->sdma_we->bounce_buf.buf, sbuf, MIN((packet_count * max_dput_bytes), bytes_to_send), - OPX_HMEM_NO_HANDLE, OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET, + dput_iov[i].sbuf_handle, OPX_HMEM_DEV_REG_SEND_THRESHOLD, dput_iov[i].sbuf_iface, dput_iov[i].sbuf_device); sbuf_tmp = params->sdma_we->bounce_buf.buf; } else { @@ -3891,9 +3891,9 @@ int fi_opx_hfi1_do_dput_sdma_tid(union fi_opx_hfi1_deferred_work *work) } union fi_opx_hfi1_deferred_work * -fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, struct fi_opx_mr *opx_mr, const union opx_hfi1_packet_hdr *const hdr, - const void *const payload, size_t payload_bytes_to_copy, const uint8_t u8_rx, - const uint32_t niov, const union fi_opx_hfi1_dput_iov *const dput_iov, const uint8_t op, +fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr *const hdr, const void *const payload, + size_t payload_bytes_to_copy, const uint8_t u8_rx, const uint32_t niov, + const union opx_hfi1_dput_iov *const dput_iov, uint8_t *src_base_addr, const uint8_t op, const uint8_t dt, const uintptr_t rma_request_vaddr, const uintptr_t target_byte_counter_vaddr, uint64_t *origin_byte_counter, uint32_t opcode, void (*completion_action)(union fi_opx_hfi1_deferred_work *work_state), @@ -3908,7 +3908,7 @@ fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, struct fi_opx_mr *opx_mr, const params->work_elem.payload_copy = NULL; params->work_elem.complete 
= false; params->opx_ep = opx_ep; - params->opx_mr = opx_mr; + params->src_base_addr = src_base_addr; if (hfi1_type == OPX_HFI1_WFR) { params->slid = (opx_lid_t) __be16_to_cpu24((__be16) hdr->lrh_9B.slid); params->lrh_dlid = (hdr->lrh_9B.qw[0] & 0xFFFF000000000000ul) >> 32; @@ -4032,13 +4032,13 @@ fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, struct fi_opx_mr *opx_mr, const uint64_t num_sends; uint64_t total_sendv_bytes; -ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, void *desc, - fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, - int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, - const uint64_t dest_rx, const uint64_t caps, - const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, - const enum fi_hmem_iface hmem_iface, const uint64_t hmem_device, - const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, size_t niov, size_t total_len, + fi_addr_t dest_addr, uint64_t tag, void *user_context, const uint32_t data, + int lock_required, const unsigned override_flags, const uint64_t tx_op_flags, + const uint64_t dest_rx, const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, + const uint64_t hmem_device, const uint64_t hmem_handle, + const enum opx_hfi1_type hfi1_type) { // We should already have grabbed the lock prior to calling this function assert(!lock_required); @@ -4410,12 +4410,12 @@ ssize_t fi_opx_hfi1_tx_sendv_rzv(struct fid_ep *ep, const struct iovec *iov, si return FI_SUCCESS; } -ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *user_context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, - const uint64_t caps, const enum 
ofi_reliability_kind reliability, - const uint64_t do_cq_completion, const enum fi_hmem_iface src_iface, - const uint64_t src_device_id, const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *user_context, const uint32_t data, int lock_required, const unsigned override_flags, + const uint64_t tx_op_flags, const uint64_t dest_rx, const uint64_t caps, + const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, + const enum fi_hmem_iface src_iface, const uint64_t src_device_id, + const uint64_t src_handle, const enum opx_hfi1_type hfi1_type) { // We should already have grabbed the lock prior to calling this function assert(!lock_required); @@ -4556,12 +4556,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, if (immediate_total) { uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(src_iface, src_device_id, - desc_mr ? desc_mr->hmem_dev_reg_handle : OPX_HMEM_NO_HANDLE, - opx_ep->hmem_copy_buf, buf, immediate_total, - desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD : - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + opx_copy_from_hmem(src_iface, src_device_id, src_handle, opx_ep->hmem_copy_buf, buf, + immediate_total, OPX_HMEM_DEV_REG_SEND_THRESHOLD); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; @@ -4664,11 +4660,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, if (immediate_tail) { uint8_t *buf_tail_bytes = ((uint8_t *) buf + len) - OPX_IMMEDIATE_TAIL_BYTE_COUNT; if (src_iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem( - src_iface, src_device_id, desc_mr ? desc_mr->hmem_dev_reg_handle : OPX_HMEM_NO_HANDLE, - opx_ep->hmem_copy_buf, buf_tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, - desc_mr ? 
OPX_HMEM_DEV_REG_SEND_THRESHOLD : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + opx_copy_from_hmem(src_iface, src_device_id, src_handle, opx_ep->hmem_copy_buf, buf_tail_bytes, + OPX_IMMEDIATE_TAIL_BYTE_COUNT, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf_tail_bytes = opx_ep->hmem_copy_buf; } @@ -4739,11 +4732,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(src_iface, src_device_id, - desc_mr ? desc_mr->hmem_dev_reg_handle : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, - buf, immediate_total, - desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + opx_copy_from_hmem(src_iface, src_device_id, src_handle, opx_ep->hmem_copy_buf, buf, immediate_total, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; @@ -4823,12 +4813,13 @@ ssize_t fi_opx_hfi1_tx_send_rzv(struct fid_ep *ep, const void *buf, size_t len, return FI_SUCCESS; } -ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, - uint64_t tag, void *user_context, const uint32_t data, int lock_required, - const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, - const uint64_t caps, const enum ofi_reliability_kind reliability, - const uint64_t do_cq_completion, const enum fi_hmem_iface src_iface, - const uint64_t src_device_id, const enum opx_hfi1_type hfi1_type) +ssize_t opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, + void *user_context, const uint32_t data, int lock_required, + const unsigned override_flags, const uint64_t tx_op_flags, const uint64_t dest_rx, + const uint64_t caps, const enum ofi_reliability_kind reliability, + const uint64_t do_cq_completion, const enum fi_hmem_iface src_iface, + const uint64_t src_device_id, 
const uint64_t src_handle, + const enum opx_hfi1_type hfi1_type) { // We should already have grabbed the lock prior to calling this function assert(!lock_required); @@ -4992,12 +4983,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t l if (immediate_total) { uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(src_iface, src_device_id, - desc_mr ? desc_mr->hmem_dev_reg_handle : OPX_HMEM_NO_HANDLE, - opx_ep->hmem_copy_buf, buf, immediate_total, - desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD : - OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + opx_copy_from_hmem(src_iface, src_device_id, src_handle, opx_ep->hmem_copy_buf, buf, + immediate_total, OPX_HMEM_DEV_REG_SEND_THRESHOLD); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; @@ -5098,11 +5085,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t l if (immediate_tail) { uint8_t *buf_tail_bytes = ((uint8_t *) buf + len) - OPX_IMMEDIATE_TAIL_BYTE_COUNT; if (src_iface != FI_HMEM_SYSTEM) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem( - src_iface, src_device_id, desc_mr ? desc_mr->hmem_dev_reg_handle : OPX_HMEM_NO_HANDLE, - opx_ep->hmem_copy_buf, buf_tail_bytes, OPX_IMMEDIATE_TAIL_BYTE_COUNT, - desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + opx_copy_from_hmem(src_iface, src_device_id, src_handle, opx_ep->hmem_copy_buf, buf_tail_bytes, + OPX_IMMEDIATE_TAIL_BYTE_COUNT, OPX_HMEM_DEV_REG_SEND_THRESHOLD); buf_tail_bytes = opx_ep->hmem_copy_buf; } @@ -5195,11 +5179,8 @@ ssize_t fi_opx_hfi1_tx_send_rzv_16B(struct fid_ep *ep, const void *buf, size_t l uint8_t *sbuf; if (src_iface != FI_HMEM_SYSTEM && immediate_total) { - struct fi_opx_mr *desc_mr = (struct fi_opx_mr *) desc; - opx_copy_from_hmem(src_iface, src_device_id, - desc_mr ? 
desc_mr->hmem_dev_reg_handle : OPX_HMEM_NO_HANDLE, opx_ep->hmem_copy_buf, - buf, immediate_total, - desc_mr ? OPX_HMEM_DEV_REG_SEND_THRESHOLD : OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET); + opx_copy_from_hmem(src_iface, src_device_id, src_handle, opx_ep->hmem_copy_buf, buf, immediate_total, + OPX_HMEM_DEV_REG_SEND_THRESHOLD); sbuf = opx_ep->hmem_copy_buf; } else { sbuf = (uint8_t *) buf; diff --git a/prov/opx/src/fi_opx_rma.c b/prov/opx/src/fi_opx_rma.c index 0e6bfe19762..9923cee50e6 100644 --- a/prov/opx/src/fi_opx_rma.c +++ b/prov/opx/src/fi_opx_rma.c @@ -302,11 +302,12 @@ ssize_t fi_opx_inject_write_internal(struct fid_ep *ep, const void *buf, size_t cc->hit_zero = fi_opx_hit_zero; struct fi_opx_hmem_iov iov; - const uint64_t is_hmem = fi_opx_hmem_iov_init(buf, len, NULL, &iov); + uint64_t handle; + const uint64_t is_hmem = opx_hmem_iov_init(buf, len, NULL, &iov, &handle); - fi_opx_write_internal(opx_ep, &iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr_offset, key, cc, FI_VOID, - FI_NOOP, opx_ep->tx->op_flags | FI_INJECT, is_hmem, lock_required, caps, reliability, - hfi1_type); + opx_write_internal(opx_ep, &iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr_offset, key, cc, FI_VOID, FI_NOOP, + opx_ep->tx->op_flags | FI_INJECT, is_hmem, handle, lock_required, caps, reliability, + hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "INJECT_WRITE"); return 0; @@ -390,9 +391,10 @@ ssize_t fi_opx_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, cc->hit_zero = fi_opx_hit_zero; struct fi_opx_hmem_iov iov; - const uint64_t is_hmem = fi_opx_hmem_iov_init(buf, len, desc, &iov); - fi_opx_write_internal(opx_ep, &iov, 1, data, opx_dst_addr, addr_offset, key, cc, FI_VOID, FI_NOOP, op_flags, - is_hmem, lock_required, caps, reliability, hfi1_type); + uint64_t handle; + const uint64_t is_hmem = opx_hmem_iov_init(buf, len, desc, &iov, &handle); + opx_write_internal(opx_ep, &iov, 1, data, opx_dst_addr, addr_offset, key, cc, FI_VOID, FI_NOOP, op_flags, + is_hmem, 
handle, lock_required, caps, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "WRITE"); return 0; @@ -483,10 +485,11 @@ ssize_t fi_opx_writev_internal(struct fid_ep *ep, const struct iovec *iov, void mr_ptr = NULL; } struct fi_opx_hmem_iov hmem_iov; + uint64_t handle; const uint64_t is_hmem = - fi_opx_hmem_iov_init(iov[index].iov_base, iov[index].iov_len, mr_ptr, &hmem_iov); - fi_opx_write_internal(opx_ep, &hmem_iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr_offset, key, cc, - FI_VOID, FI_NOOP, 0, is_hmem, lock_required, caps, reliability, hfi1_type); + opx_hmem_iov_init(iov[index].iov_base, iov[index].iov_len, mr_ptr, &hmem_iov, &handle); + opx_write_internal(opx_ep, &hmem_iov, 1, OPX_NO_REMOTE_CQ_DATA, opx_dst_addr, addr_offset, key, cc, + FI_VOID, FI_NOOP, 0, is_hmem, handle, lock_required, caps, reliability, hfi1_type); addr_offset += iov[index].iov_len; } @@ -638,12 +641,13 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg } else { mr_ptr = NULL; } - uint64_t is_hmem = fi_opx_hmem_iov_init((void *) msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); + uint64_t handle; + uint64_t is_hmem = opx_hmem_iov_init((void *) msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov, &handle); size_t len = (msg_iov_bytes <= rma_iov_bytes) ? 
msg_iov_bytes : rma_iov_bytes; iov.buf = msg_iov_vaddr; iov.len = len; - fi_opx_write_internal(opx_ep, &iov, 1, msg->data, opx_dst_addr, rma_iov_addr, rma_iov_key, cc, FI_VOID, - FI_NOOP, flags, is_hmem, lock_required, caps, reliability, hfi1_type); + opx_write_internal(opx_ep, &iov, 1, msg->data, opx_dst_addr, rma_iov_addr, rma_iov_key, cc, FI_VOID, + FI_NOOP, flags, is_hmem, handle, lock_required, caps, reliability, hfi1_type); msg_iov_bytes -= len; msg_iov_vaddr += len; @@ -652,7 +656,7 @@ ssize_t fi_opx_writemsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg ++msg_iov_index; msg_iov_bytes = msg->msg_iov[msg_iov_index].iov_len; msg_iov_vaddr = (uintptr_t) msg->msg_iov[msg_iov_index].iov_base; - is_hmem = fi_opx_hmem_iov_init((void *) msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov); + is_hmem = opx_hmem_iov_init((void *) msg_iov_vaddr, msg_iov_bytes, mr_ptr, &iov, &handle); } rma_iov_bytes -= len; @@ -712,10 +716,12 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des #ifdef OPX_HMEM uint64_t hmem_device; - enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device); + uint64_t hmem_handle; + enum fi_hmem_iface hmem_iface = opx_hmem_get_mr_iface(desc, &hmem_device, &hmem_handle); #else const enum fi_hmem_iface hmem_iface = FI_HMEM_SYSTEM; const uint64_t hmem_device = 0; + const uint64_t hmem_handle = OPX_HMEM_NO_HANDLE; #endif struct fi_opx_hmem_iov iov = {.buf = (uintptr_t) buf, .len = len, .iface = hmem_iface, .device = hmem_device}; @@ -750,9 +756,9 @@ ssize_t fi_opx_read_internal(struct fid_ep *ep, void *buf, size_t len, void *des cc->context = context; cc->hit_zero = fi_opx_hit_zero; - fi_opx_readv_internal(opx_ep, &iov, 1, opx_addr, &addr_offset, &key, opx_ep->tx->op_flags, opx_ep->tx->cq, - opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, - reliability, hfi1_type); + opx_readv_internal(opx_ep, &iov, 1, &hmem_handle, opx_addr, &addr_offset, &key, 
opx_ep->tx->op_flags, + opx_ep->tx->cq, opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, + lock_required, caps, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READ"); return FI_SUCCESS; @@ -839,6 +845,7 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, si uint64_t hmem_device; enum fi_hmem_iface hmem_iface; + uint64_t hmem_handle[8]; struct fi_opx_hmem_iov hmem_iovs[8]; /* max 8 descriptors (iovecs) per readv_internal */ @@ -853,17 +860,18 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, si } else { mr_ptr = NULL; } - hmem_iface = opx_hmem_get_mr_iface(mr_ptr, &hmem_device); + hmem_iface = opx_hmem_get_mr_iface(mr_ptr, &hmem_device, &hmem_handle[i]); hmem_iovs[i].buf = (uintptr_t) iov[index + i].iov_base; hmem_iovs[i].len = iov[index + i].iov_len; hmem_iovs[i].iface = hmem_iface; hmem_iovs[i].device = hmem_device; } - fi_opx_readv_internal(opx_ep, hmem_iovs, 8, opx_addr, addr_v, key_v, 0, NULL, NULL, cc, FI_VOID, - FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); + opx_readv_internal(opx_ep, hmem_iovs, 8, hmem_handle, opx_addr, addr_v, key_v, 0, NULL, NULL, cc, + FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, + hfi1_type); } - /* if 'partial_ndesc' is zero, the fi_opx_readv_internal() will fence */ + /* if 'partial_ndesc' is zero, the opx_readv_internal() will fence */ const size_t partial_ndesc = count & 0x07ull; for (int i = 0; i < partial_ndesc; ++i) { struct fi_opx_mr *mr_ptr; @@ -873,15 +881,15 @@ ssize_t fi_opx_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, si } else { mr_ptr = NULL; } - hmem_iface = opx_hmem_get_mr_iface(mr_ptr, &hmem_device); + hmem_iface = opx_hmem_get_mr_iface(mr_ptr, &hmem_device, &hmem_handle[i]); hmem_iovs[i].buf = (uintptr_t) iov[index + i].iov_base; hmem_iovs[i].len = iov[index + i].iov_len; hmem_iovs[i].iface = hmem_iface; hmem_iovs[i].device = 
hmem_device; } - fi_opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, opx_addr, addr_v, key_v, tx_op_flags, opx_ep->tx->cq, - opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, - reliability, hfi1_type); + opx_readv_internal(opx_ep, hmem_iovs, partial_ndesc, hmem_handle, opx_addr, addr_v, key_v, tx_op_flags, + opx_ep->tx->cq, opx_ep->read_cntr, cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, + lock_required, caps, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READV"); return 0; @@ -955,6 +963,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, struct fi_opx_hmem_iov iov[8]; uint64_t addr[8]; uint64_t key[8]; + uint64_t hmem_handle[8]; ssize_t index; struct fi_opx_completion_counter *cc = ofi_buf_alloc(opx_ep->rma_counter_pool); @@ -997,7 +1006,7 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, mr_ptr = NULL; } const size_t len = (dst_iov_bytes <= src_iov_bytes) ? 
dst_iov_bytes : src_iov_bytes; - fi_opx_hmem_iov_init(dst_iov_vaddr, len, mr_ptr, &iov[niov]); + opx_hmem_iov_init(dst_iov_vaddr, len, mr_ptr, &iov[niov], &hmem_handle[niov]); addr[niov] = src_iov_addr; key[niov] = src_iov_key; @@ -1021,10 +1030,10 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, } assert(totsize_issued <= totsize); #endif - fi_opx_readv_internal(opx_ep, iov, niov + 1, opx_src_addr, addr, key, flags, cq, - opx_ep->read_cntr, /* enable_cq, enable_cntr */ - cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, - lock_required, caps, reliability, hfi1_type); + opx_readv_internal(opx_ep, iov, niov + 1, hmem_handle, opx_src_addr, addr, key, + flags, cq, opx_ep->read_cntr, /* enable_cq, enable_cntr */ + cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, + lock_required, caps, reliability, hfi1_type); OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "READMSG_INTERNAL"); return 0; @@ -1069,9 +1078,10 @@ ssize_t fi_opx_readmsg_internal(struct fid_ep *ep, const struct fi_msg_rma *msg, } assert(totsize_issued <= totsize); #endif - fi_opx_readv_internal( - opx_ep, iov, 8, opx_src_addr, addr, key, 0, NULL, NULL, /* disable_cq, disable_cntr */ - cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, hfi1_type); + opx_readv_internal(opx_ep, iov, 8, hmem_handle, opx_src_addr, addr, key, 0, NULL, + NULL, /* disable_cq, disable_cntr */ + cc, FI_VOID, FI_NOOP, FI_OPX_HFI_DPUT_OPCODE_GET, lock_required, caps, reliability, + hfi1_type); } /* end while */ diff --git a/prov/opx/src/fi_opx_tagged.c b/prov/opx/src/fi_opx_tagged.c index 2d643744131..5d931a3bf5a 100644 --- a/prov/opx/src/fi_opx_tagged.c +++ b/prov/opx/src/fi_opx_tagged.c @@ -97,12 +97,13 @@ ssize_t fi_opx_trecvmsg_generic(struct fid_ep *ep, const struct fi_msg_tagged *m /* NOTE: Assume that all IOVs reside in the same HMEM space */ struct fi_opx_hmem_info *hmem_info = (struct fi_opx_hmem_info *) context->hmem_info_qws; uint64_t hmem_device; + uint64_t 
hmem_handle; enum fi_hmem_iface hmem_iface; if (msg->desc && msg->desc[0]) { - hmem_iface = opx_hmem_get_mr_iface(msg->desc[0], &hmem_device); + hmem_iface = opx_hmem_get_mr_iface(msg->desc[0], &hmem_device, &hmem_handle); hmem_info->iface = hmem_iface; hmem_info->device = hmem_device; - hmem_info->hmem_dev_reg_handle = ((struct fi_opx_mr *) msg->desc[0])->hmem_dev_reg_handle; + hmem_info->hmem_dev_reg_handle = hmem_handle; hmem_info->is_unified = ((struct fi_opx_mr *) msg->desc[0])->hmem_unified; } else { hmem_iface = FI_HMEM_SYSTEM; @@ -117,7 +118,7 @@ ssize_t fi_opx_trecvmsg_generic(struct fid_ep *ep, const struct fi_msg_tagged *m for (int i = 1; i < msg->iov_count; ++i) { uint64_t tmp_hmem_device; enum fi_hmem_iface tmp_hmem_iface = - opx_hmem_get_mr_iface(msg->desc ? msg->desc[i] : NULL, &tmp_hmem_device); + opx_hmem_get_mr_iface(msg->desc ? msg->desc[i] : NULL, &tmp_hmem_device, &hmem_handle); assert(tmp_hmem_iface == hmem_iface); assert(tmp_hmem_device == hmem_device); }