diff --git a/prov/shm/src/smr.h b/prov/shm/src/smr.h index 495475528c8..0b061d8df35 100644 --- a/prov/shm/src/smr.h +++ b/prov/shm/src/smr.h @@ -37,12 +37,114 @@ #include "ofi_shm_p2p.h" #include "ofi_util.h" +struct smr_ep { + struct util_ep util_ep; + size_t tx_size; + size_t rx_size; + const char *name; + uint64_t msg_id; + struct smr_region *volatile region; + struct smr_map *map; + struct fid_peer_srx *srx; + struct ofi_bufpool *cmd_ctx_pool; + struct ofi_bufpool *unexp_buf_pool; + struct ofi_bufpool *pend_pool; + + struct slist overflow_list; + struct dlist_entry async_cpy_list; + size_t min_multi_recv_size; + + int ep_idx; + enum ofi_shm_p2p_type p2p_type; + void *dsa_context; + void (*smr_progress_async)(struct smr_ep *ep); +}; + + +struct smr_map { + int64_t cur_id; + int num_peers; + uint16_t flags; + struct ofi_rbmap rbmap; + struct smr_peer peers[SMR_MAX_PEERS]; +}; + +struct smr_av { + struct util_av util_av; + struct smr_map smr_map; + size_t used; +}; + +static inline struct smr_region *smr_peer_region(struct smr_ep *ep, int i) +{ + return ep->map->peers[i].region; +} + +void smr_map_add(struct smr_map *map, const char *name, int64_t *id); +int smr_map_to_region(struct smr_map *map, int64_t id); +void smr_unmap_region(struct smr_map *map, int64_t id, bool found); +void smr_map_to_endpoint(struct smr_ep *ep, int64_t id); + +static inline uintptr_t smr_local_to_peer(struct smr_ep *ep, + int64_t id, int64_t peer_id, + uintptr_t local_ptr) +{ + struct smr_region *peer_smr = smr_peer_region(ep, id); + uint64_t offset = local_ptr - (uintptr_t) ep->region; + + return smr_peer_data(peer_smr)[peer_id].local_region + offset; +} + +static inline uintptr_t smr_peer_to_peer(struct smr_ep *ep, + int64_t id, uintptr_t local_ptr) +{ + struct smr_region *peer_smr = smr_peer_region(ep, id); + uint64_t offset = local_ptr - (uintptr_t) peer_smr; + + return (uintptr_t) peer_smr->base_addr + offset; +} + +static inline uintptr_t smr_peer_to_owner(struct smr_ep *ep, + int64_t id, uintptr_t local_ptr) +{ + struct smr_region *peer_smr = smr_peer_region(ep, id); + uint64_t offset = local_ptr - (uintptr_t) peer_smr; + + return (uintptr_t) peer_smr->base_addr + offset; +} + +static inline void smr_return_cmd(struct smr_ep *ep, struct smr_cmd *cmd) +{ + struct smr_region *peer_smr = smr_peer_region(ep, cmd->hdr.rx_id); + uintptr_t peer_ptr; + int64_t pos; + struct smr_return_entry *queue_entry; + int ret; + + ret = smr_return_queue_next(smr_return_queue(peer_smr), &queue_entry, + &pos); + if (ret == -FI_ENOENT) { + /* return queue runs in parallel to command stack + * ie we will never run out of space + */ + assert(0); + } + + peer_ptr = smr_peer_to_owner(ep, cmd->hdr.rx_id, (uintptr_t) cmd); + assert(peer_ptr >= (uintptr_t) peer_smr->base_addr && + peer_ptr < (uintptr_t) peer_smr->base_addr + + peer_smr->total_size); + queue_entry->ptr = peer_ptr; + + smr_return_queue_commit(queue_entry, pos); +} + struct smr_env { - size_t sar_threshold; - int disable_cma; - int use_dsa_sar; - size_t max_gdrcopy_size; - int use_xpmem; + size_t sar_threshold; + int disable_cma; + int use_dsa_sar; + size_t max_gdrcopy_size; + int use_xpmem; }; extern struct smr_env smr_env; @@ -52,13 +154,7 @@ extern struct util_prov smr_util_prov; extern int smr_global_ep_idx; //protected by the ep_list_lock int smr_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, - void *context); - -struct smr_av { - struct util_av util_av; - struct smr_map smr_map; - size_t used; -}; + void *context); static inline int64_t 
smr_addr_lookup(struct util_av *av, fi_addr_t fiaddr) { @@ -66,7 +162,7 @@ static inline int64_t smr_addr_lookup(struct util_av *av, fi_addr_t fiaddr) } int smr_domain_open(struct fid_fabric *fabric, struct fi_info *info, - struct fid_domain **dom, void *context); + struct fid_domain **dom, void *context); int smr_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, struct fid_eq **eq, void *context); @@ -75,54 +171,48 @@ int smr_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct fid_av **av, void *context); int smr_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, - enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags); - -#define SMR_IOV_LIMIT 4 - -struct smr_tx_entry { - struct smr_cmd cmd; - int64_t peer_id; - void *context; - struct iovec iov[SMR_IOV_LIMIT]; - uint32_t iov_count; - uint64_t op_flags; - size_t bytes_done; - void *map_ptr; - struct smr_ep_name *map_name; - struct ofi_mr *mr[SMR_IOV_LIMIT]; + enum fi_op op, struct fi_atomic_attr *attr, + uint64_t flags); + +enum { + SMR_TX_ENTRY, + SMR_RX_ENTRY, }; struct smr_pend_entry { - struct dlist_entry entry; - struct smr_cmd cmd; - struct fi_peer_rx_entry *rx_entry; - struct smr_cmd_ctx *cmd_ctx; - size_t bytes_done; - struct iovec iov[SMR_IOV_LIMIT]; - size_t iov_count; - struct ofi_mr *mr[SMR_IOV_LIMIT]; - struct ofi_mr_entry *ipc_entry; - ofi_hmem_async_event_t async_event; + struct dlist_entry entry; + struct { + struct fi_peer_rx_entry *rx_entry; + struct ofi_mr_entry *ipc_entry; + ofi_hmem_async_event_t async_event;\ + } rx; + uint8_t type; + struct smr_cmd *cmd; + struct iovec iov[SMR_IOV_LIMIT]; + size_t iov_count; + struct ofi_mr *mr[SMR_IOV_LIMIT]; + size_t bytes_done; + void *comp_ctx; + uint64_t comp_flags; + int sar_dir; + ssize_t (*sar_copy_fn)( + struct smr_ep *ep, + struct smr_pend_entry *pend); }; struct smr_cmd_ctx { - struct dlist_entry entry; - struct smr_ep *ep; - struct smr_cmd cmd; - struct smr_pend_entry *sar_entry; - struct slist buf_list; -}; - -OFI_DECLARE_FREESTACK(struct smr_tx_entry, smr_tx_fs); - -struct smr_fabric { - struct util_fabric util_fabric; + struct dlist_entry entry; + struct smr_ep *ep; + struct smr_pend_entry *pend; + struct smr_cmd *cmd; + struct smr_cmd cmd_cpy; + char msg[SMR_MSG_DATA_LEN]; + struct slist buf_list; }; struct smr_domain { struct util_domain util_domain; - int fast_rma; - /* cache for use with hmem ipc */ + bool fast_rma; struct ofi_mr_cache *ipc_cache; struct fid_ep rx_ep; struct fid_peer_srx *srx; @@ -131,11 +221,11 @@ struct smr_domain { #define SMR_PREFIX "fi_shm://" #define SMR_PREFIX_NS "fi_ns://" -#define SMR_RMA_ORDER (OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | FI_ORDER_RAS | \ - OFI_ORDER_WAR_SET | OFI_ORDER_WAW_SET | FI_ORDER_WAS | \ +#define SMR_RMA_ORDER (OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | FI_ORDER_RAS | \ + OFI_ORDER_WAR_SET | OFI_ORDER_WAW_SET | FI_ORDER_WAS | \ FI_ORDER_SAR | FI_ORDER_SAW) #define smr_fast_rma_enabled(mode, order) ((mode & FI_MR_VIRT_ADDR) && \ - !(order & SMR_RMA_ORDER)) + !(order & SMR_RMA_ORDER)) static inline uint64_t smr_get_offset(void *base, void *addr) { @@ -152,29 +242,6 @@ struct smr_unexp_buf { char buf[SMR_SAR_SIZE]; }; -struct smr_ep { - struct util_ep util_ep; - size_t tx_size; - size_t rx_size; - const char *name; - uint64_t msg_id; - struct smr_region *volatile region; - struct fid_peer_srx *srx; - struct ofi_bufpool *cmd_ctx_pool; - struct ofi_bufpool *unexp_buf_pool; - struct ofi_bufpool *pend_buf_pool; - - struct smr_tx_fs *tx_fs; - struct dlist_entry sar_list; - struct 
dlist_entry ipc_cpy_pend_list; - size_t min_multi_recv_size; - - int ep_idx; - enum ofi_shm_p2p_type p2p_type; - void *dsa_context; - void (*smr_progress_ipc_list)(struct smr_ep *ep); -}; - #define smr_ep_rx_flags(smr_ep) ((smr_ep)->util_ep.rx_op_flags) #define smr_ep_tx_flags(smr_ep) ((smr_ep)->util_ep.tx_op_flags) @@ -186,8 +253,7 @@ static inline int smr_mmap_name(char *shm_name, const char *ep_name, } int smr_endpoint(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context); -void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id); + struct fid_ep **ep, void *context); int smr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); @@ -196,29 +262,27 @@ int smr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr); -void smr_format_pend_resp(struct smr_tx_entry *pend, struct smr_cmd *cmd, - void *context, struct ofi_mr **mr, - const struct iovec *iov, uint32_t iov_count, - uint64_t op_flags, int64_t id, struct smr_resp *resp); -void smr_generic_format(struct smr_cmd *cmd, int64_t peer_id, uint32_t op, - uint64_t tag, uint64_t data, uint64_t op_flags); -size_t smr_copy_to_sar(struct smr_freestack *sar_pool, struct smr_resp *resp, - struct smr_cmd *cmd, struct ofi_mr **mr, - const struct iovec *iov, size_t count, - size_t *bytes_done); -size_t smr_copy_from_sar(struct smr_freestack *sar_pool, struct smr_resp *resp, - struct smr_cmd *cmd, struct ofi_mr **mr, - const struct iovec *iov, size_t count, - size_t *bytes_done); +void smr_format_tx_pend(struct smr_pend_entry *pend, struct smr_cmd *cmd, + void *context, struct ofi_mr **mr, + const struct iovec *iov, uint32_t iov_count, + uint64_t op_flags); +void smr_generic_format(struct smr_cmd *cmd, int64_t tx_id, int64_t rx_id, + uint32_t op, uint64_t tag, uint64_t data, + uint64_t op_flags); +size_t smr_copy_to_sar(struct smr_ep *ep, struct smr_region *smr, + struct smr_pend_entry *pend); +size_t smr_copy_from_sar(struct smr_ep *ep, struct smr_region *smr, + struct smr_pend_entry *pend); int smr_select_proto(void **desc, size_t iov_count, bool cma_avail, bool ipc_valid, uint32_t op, uint64_t total_len, uint64_t op_flags); -typedef ssize_t (*smr_proto_func)(struct smr_ep *ep, struct smr_region *peer_smr, - int64_t id, int64_t peer_id, uint32_t op, uint64_t tag, +typedef ssize_t (*smr_send_func)( + struct smr_ep *ep, struct smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, uint64_t tag, uint64_t data, uint64_t op_flags, struct ofi_mr **desc, const struct iovec *iov, size_t iov_count, size_t total_len, void *context, struct smr_cmd *cmd); -extern smr_proto_func smr_proto_ops[smr_src_max]; +extern smr_send_func smr_send_ops[smr_proto_max]; int smr_write_err_comp(struct util_cq *cq, void *context, uint64_t flags, uint64_t tag, int err); @@ -237,24 +301,21 @@ static inline uint64_t smr_rx_cq_flags(uint64_t rx_flags, uint16_t op_flags) void smr_ep_progress(struct util_ep *util_ep); +/* Returns whether any VMA interface is available */ static inline bool smr_vma_enabled(struct smr_ep *ep, struct smr_region *peer_smr) { - if (ep->region == peer_smr) - return (ep->region->cma_cap_self == SMR_VMA_CAP_ON || - ep->region->xpmem_cap_self == SMR_VMA_CAP_ON); - else - return (ep->region->cma_cap_peer == SMR_VMA_CAP_ON || - peer_smr->xpmem_cap_self == SMR_VMA_CAP_ON); + return ep->region == peer_smr ? 
ep->region->self_vma_caps : + ep->region->peer_vma_caps; } -static inline void smr_set_ipc_valid(struct smr_region *region, uint64_t id) +static inline void smr_set_ipc_valid(struct smr_ep *ep, uint64_t id) { if (ofi_hmem_is_initialized(FI_HMEM_ZE) && - region->map->peers[id].pid_fd == -1) - smr_peer_data(region)[id].ipc_valid = 0; + ep->map->peers[id].pid_fd == -1) + smr_peer_data(ep->region)[id].ipc_valid = 0; else - smr_peer_data(region)[id].ipc_valid = 1; + smr_peer_data(ep->region)[id].ipc_valid = 1; } static inline bool smr_ipc_valid(struct smr_ep *ep, struct smr_region *peer_smr, @@ -264,42 +325,21 @@ static inline bool smr_ipc_valid(struct smr_ep *ep, struct smr_region *peer_smr, smr_peer_data(peer_smr)[peer_id].ipc_valid); } -static inline bool smr_ze_ipc_enabled(struct smr_region *smr, - struct smr_region *peer_smr) -{ - return (smr->flags & SMR_FLAG_IPC_SOCK) && - (peer_smr->flags & SMR_FLAG_IPC_SOCK); -} - -static inline struct smr_inject_buf * -smr_get_txbuf(struct smr_region *smr) -{ - struct smr_inject_buf *txbuf; - - pthread_spin_lock(&smr->lock); - if (!smr_freestack_isempty(smr_inject_pool(smr))) - txbuf = smr_freestack_pop(smr_inject_pool(smr)); - else - txbuf = NULL; - pthread_spin_unlock(&smr->lock); - return txbuf; -} - -static inline void -smr_release_txbuf(struct smr_region *smr, - struct smr_inject_buf *tx_buf) +static inline struct smr_freestack *smr_pend_sar_pool( + struct smr_ep *ep, struct smr_pend_entry *pend) { - pthread_spin_lock(&smr->lock); - smr_freestack_push(smr_inject_pool(smr), tx_buf); - pthread_spin_unlock(&smr->lock); + if (pend->type == SMR_TX_ENTRY) + return smr_sar_pool(ep->region); + return smr_sar_pool(smr_peer_region(ep, pend->cmd->hdr.rx_id)); } int smr_unexp_start(struct fi_peer_rx_entry *rx_entry); -void smr_progress_ipc_list(struct smr_ep *ep); -static inline void smr_progress_ipc_list_noop(struct smr_ep *ep) +void smr_progress_async(struct smr_ep *ep); +static inline void smr_progress_async_noop(struct smr_ep *ep) { // noop } +ssize_t smr_copy_sar(struct smr_ep *ep, struct smr_pend_entry *pend); -#endif +#endif /* _SMR_H_ */ \ No newline at end of file diff --git a/prov/shm/src/smr_atomic.c b/prov/shm/src/smr_atomic.c index 8d95aab30dc..b3821c21ecc 100644 --- a/prov/shm/src/smr_atomic.c +++ b/prov/shm/src/smr_atomic.c @@ -33,7 +33,8 @@ #include "smr.h" #include "ofi_atomic.h" -static void smr_format_rma_ioc(struct smr_cmd *cmd, const struct fi_rma_ioc *rma_ioc, +static void smr_format_rma_ioc(struct smr_cmd *cmd, + const struct fi_rma_ioc *rma_ioc, size_t ioc_count) { cmd->rma.rma_count = ioc_count; @@ -43,64 +44,67 @@ static void smr_format_rma_ioc(struct smr_cmd *cmd, const struct fi_rma_ioc *rma static void smr_generic_atomic_format(struct smr_cmd *cmd, uint8_t datatype, uint8_t atomic_op) { - cmd->msg.hdr.datatype = datatype; - cmd->msg.hdr.atomic_op = atomic_op; + cmd->hdr.datatype = datatype; + cmd->hdr.atomic_op = atomic_op; } static void smr_format_inline_atomic(struct smr_cmd *cmd, struct ofi_mr **mr, const struct iovec *iov, size_t count) { - cmd->msg.hdr.op_src = smr_src_inline; - - cmd->msg.hdr.size = ofi_copy_from_mr_iov(cmd->msg.data.msg, - SMR_MSG_DATA_LEN, mr, - iov, count, 0); + cmd->hdr.proto = smr_proto_inline; + cmd->hdr.tx_ctx = 0; + cmd->hdr.size = ofi_copy_from_mr_iov(cmd->data.msg, SMR_MSG_DATA_LEN, + mr, iov, count, 0); } -static void smr_do_atomic_inline(struct smr_ep *ep, struct smr_region *peer_smr, - int64_t id, int64_t peer_id, uint32_t op, +static void smr_do_atomic_inline( + struct smr_ep *ep, struct 
smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, uint64_t op_flags, uint8_t datatype, uint8_t atomic_op, struct ofi_mr **desc, const struct iovec *iov, size_t iov_count, size_t total_len, struct smr_cmd *cmd) { - smr_generic_format(cmd, peer_id, op, 0, 0, op_flags); + smr_generic_format(cmd, tx_id, rx_id, op, 0, 0, op_flags); smr_generic_atomic_format(cmd, datatype, atomic_op); smr_format_inline_atomic(cmd, desc, iov, iov_count); } -static void smr_format_inject_atomic(struct smr_cmd *cmd, struct ofi_mr **desc, +static void smr_format_inject_atomic( + struct smr_cmd *cmd, struct ofi_mr **desc, const struct iovec *iov, size_t count, const struct iovec *resultv, size_t result_count, struct ofi_mr **comp_desc, const struct iovec *compv, - size_t comp_count, struct smr_region *smr, - struct smr_inject_buf *tx_buf) + size_t comp_count, struct smr_region *smr) { + struct smr_inject_buf *tx_buf; size_t comp_size; - cmd->msg.hdr.op_src = smr_src_inject; - cmd->msg.hdr.src_data = smr_get_offset(smr, tx_buf); + cmd->hdr.proto = smr_proto_inject; - switch (cmd->msg.hdr.op) { + tx_buf = smr_get_inject_buf(smr, cmd); + switch (cmd->hdr.op) { case ofi_op_atomic: - cmd->msg.hdr.size = ofi_copy_from_mr_iov(tx_buf->data, - SMR_INJECT_SIZE, desc, iov, count, 0); + cmd->hdr.size = ofi_copy_from_mr_iov( + tx_buf->data, SMR_INJECT_SIZE, desc, + iov, count, 0); break; case ofi_op_atomic_fetch: - if (cmd->msg.hdr.atomic_op == FI_ATOMIC_READ) - cmd->msg.hdr.size = ofi_total_iov_len(resultv, result_count); + if (cmd->hdr.atomic_op == FI_ATOMIC_READ) + cmd->hdr.size = ofi_total_iov_len(resultv, + result_count); else - cmd->msg.hdr.size = ofi_copy_from_mr_iov(tx_buf->data, - SMR_INJECT_SIZE, desc, iov, - count, 0); + cmd->hdr.size = ofi_copy_from_mr_iov( + tx_buf->data, SMR_INJECT_SIZE, + desc, iov, count, 0); break; case ofi_op_atomic_compare: - cmd->msg.hdr.size = ofi_copy_from_mr_iov(tx_buf->buf, - SMR_COMP_INJECT_SIZE, - desc, iov, count, 0); - comp_size = ofi_copy_from_mr_iov(tx_buf->comp, - SMR_COMP_INJECT_SIZE, + cmd->hdr.size = ofi_copy_from_mr_iov(tx_buf->buf, + SMR_COMP_INJECT_SIZE, + desc, iov, count, 0); + comp_size = ofi_copy_from_mr_iov( + tx_buf->comp, SMR_COMP_INJECT_SIZE, comp_desc, compv, comp_count, 0); - if (comp_size != cmd->msg.hdr.size) + if (comp_size != cmd->hdr.size) FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "atomic and compare buffer size mismatch\n"); break; @@ -110,8 +114,9 @@ static void smr_format_inject_atomic(struct smr_cmd *cmd, struct ofi_mr **desc, } } -static ssize_t smr_do_atomic_inject(struct smr_ep *ep, struct smr_region *peer_smr, - int64_t id, int64_t peer_id, uint32_t op, +static ssize_t smr_do_atomic_inject( + struct smr_ep *ep, struct smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, uint64_t op_flags, uint8_t datatype, uint8_t atomic_op, struct ofi_mr **desc, const struct iovec *iov, size_t iov_count, struct ofi_mr **res_desc, @@ -120,35 +125,25 @@ static ssize_t smr_do_atomic_inject(struct smr_ep *ep, struct smr_region *peer_s size_t comp_count, size_t total_len, void *context, uint16_t smr_flags, struct smr_cmd *cmd) { - struct smr_inject_buf *tx_buf; - struct smr_tx_entry *pend; - struct smr_resp *resp; - - tx_buf = smr_get_txbuf(peer_smr); - if (!tx_buf) - return -FI_EAGAIN; + struct smr_pend_entry *pend; - smr_generic_format(cmd, peer_id, op, 0, 0, op_flags); + smr_generic_format(cmd, tx_id, rx_id, op, 0, 0, op_flags); smr_generic_atomic_format(cmd, datatype, atomic_op); smr_format_inject_atomic(cmd, desc, iov, iov_count, resultv, 
result_count, comp_desc, compv, comp_count, - peer_smr, tx_buf); - - if (smr_flags & SMR_RMA_REQ || op_flags & FI_DELIVERY_COMPLETE) { - if (ofi_cirque_isfull(smr_resp_queue(ep->region))) { - smr_release_txbuf(peer_smr, tx_buf); - return -FI_EAGAIN; - } - resp = ofi_cirque_next(smr_resp_queue(ep->region)); - pend = ofi_freestack_pop(ep->tx_fs); - smr_format_pend_resp(pend, cmd, context, res_desc, resultv, - result_count, op_flags, id, resp); - cmd->msg.hdr.data = smr_get_offset(ep->region, resp); - ofi_cirque_commit(smr_resp_queue(ep->region)); + ep->region); + + if (op == ofi_op_atomic_fetch || op == ofi_op_atomic_compare || + atomic_op == FI_ATOMIC_READ || op_flags & FI_DELIVERY_COMPLETE) { + pend = ofi_buf_alloc(ep->pend_pool); + assert(pend); + cmd->hdr.tx_ctx = (uintptr_t) pend; + smr_format_tx_pend(pend, cmd, context, res_desc, resultv, + result_count, op_flags); + } else { + cmd->hdr.tx_ctx = 0; } - cmd->msg.hdr.op_flags |= smr_flags; - return FI_SUCCESS; } @@ -157,13 +152,14 @@ static int smr_select_atomic_proto(uint32_t op, uint64_t total_len, { if (op == ofi_op_atomic_compare || op == ofi_op_atomic_fetch || op_flags & FI_DELIVERY_COMPLETE || total_len > SMR_MSG_DATA_LEN) - return smr_src_inject; + return smr_proto_inject; - return smr_src_inline; + return smr_proto_inline; } -static ssize_t smr_generic_atomic(struct smr_ep *ep, - const struct fi_ioc *ioc, void **desc, size_t count, +static ssize_t smr_generic_atomic( + struct smr_ep *ep, const struct fi_ioc *ioc, + void **desc, size_t count, const struct fi_ioc *compare_ioc, void **compare_desc, size_t compare_count, struct fi_ioc *result_ioc, void **result_desc, size_t result_count, @@ -173,37 +169,39 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, uint64_t op_flags) { struct smr_cmd_entry *ce; + struct smr_cmd *cmd; struct smr_region *peer_smr; struct iovec iov[SMR_IOV_LIMIT]; struct iovec compare_iov[SMR_IOV_LIMIT]; struct iovec result_iov[SMR_IOV_LIMIT]; uint16_t smr_flags = 0; - int64_t id, peer_id; + int64_t tx_id, rx_id, pos; int proto; ssize_t ret = 0; size_t total_len; - int64_t pos; assert(count <= SMR_IOV_LIMIT); assert(result_count <= SMR_IOV_LIMIT); assert(compare_count <= SMR_IOV_LIMIT); assert(rma_count <= SMR_IOV_LIMIT); - id = smr_verify_peer(ep, addr); - if (id < 0) + tx_id = smr_verify_peer(ep, addr); + if (tx_id < 0) return -FI_EAGAIN; - peer_id = smr_peer_data(ep->region)[id].addr.id; - peer_smr = smr_peer_region(ep->region, id); + rx_id = smr_peer_data(ep->region)[tx_id].id; + peer_smr = smr_peer_region(ep, tx_id); - if (smr_peer_data(ep->region)[id].sar_status) + if (smr_peer_data(ep->region)[tx_id].sar_status) return -FI_EAGAIN; + ofi_genlock_lock(&ep->util_ep.lock); ret = smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos); - if (ret == -FI_ENOENT) - return -FI_EAGAIN; + if (ret == -FI_ENOENT) { + ret = -FI_EAGAIN; + goto unlock; + } - ofi_genlock_lock(&ep->util_ep.lock); total_len = ofi_datatype_size(datatype) * ofi_total_ioc_cnt(ioc, count); switch (op) { @@ -217,12 +215,12 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, assert(result_ioc); ofi_ioc_to_iov(result_ioc, result_iov, result_count, ofi_datatype_size(datatype)); - smr_flags = SMR_RMA_REQ; /* fall through */ case ofi_op_atomic: if (atomic_op != FI_ATOMIC_READ) { assert(ioc); - ofi_ioc_to_iov(ioc, iov, count, ofi_datatype_size(datatype)); + ofi_ioc_to_iov(ioc, iov, count, + ofi_datatype_size(datatype)); } else { count = 0; } @@ -234,26 +232,37 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, proto = 
smr_select_atomic_proto(op, total_len, op_flags); - if (proto == smr_src_inline) { - smr_do_atomic_inline(ep, peer_smr, id, peer_id, ofi_op_atomic, + if (proto == smr_proto_inline) { + cmd = &ce->cmd; + ce->ptr = smr_peer_to_peer(ep, tx_id, (uintptr_t) cmd); + smr_do_atomic_inline(ep, peer_smr, tx_id, rx_id, ofi_op_atomic, op_flags, datatype, atomic_op, (struct ofi_mr **) desc, iov, count, - total_len, &ce->cmd); + total_len, cmd); } else { - ret = smr_do_atomic_inject(ep, peer_smr, id, peer_id, op, + if (smr_freestack_isempty(smr_cmd_stack(ep->region))) { + smr_cmd_queue_discard(ce, pos); + ret = -FI_EAGAIN; + goto unlock; + } + + cmd = smr_freestack_pop(smr_cmd_stack(ep->region)); + assert(cmd); + ce->ptr = smr_local_to_peer(ep, tx_id, rx_id, (uintptr_t) cmd); + ret = smr_do_atomic_inject(ep, peer_smr, tx_id, rx_id, op, op_flags, datatype, atomic_op, (struct ofi_mr **) desc, iov, count, (struct ofi_mr **) result_desc, result_iov, result_count, (struct ofi_mr **) compare_desc, compare_iov, compare_count, total_len, context, - smr_flags, &ce->cmd); + smr_flags, cmd); if (ret) { smr_cmd_queue_discard(ce, pos); goto unlock; } } - if (!(smr_flags & SMR_RMA_REQ) && !(op_flags & FI_DELIVERY_COMPLETE)) { + if (!cmd->hdr.tx_ctx) { ret = smr_complete_tx(ep, context, op, op_flags); if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -261,7 +270,7 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, } } - smr_format_rma_ioc(&ce->rma_cmd, rma_ioc, rma_count); + smr_format_rma_ioc(cmd, rma_ioc, rma_count); smr_cmd_queue_commit(ce, pos); unlock: ofi_genlock_unlock(&ep->util_ep.lock); @@ -269,7 +278,8 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, } static ssize_t smr_atomic_writemsg(struct fid_ep *ep_fid, - const struct fi_msg_atomic *msg, uint64_t flags) + const struct fi_msg_atomic *msg, + uint64_t flags) { struct smr_ep *ep; @@ -279,13 +289,15 @@ static ssize_t smr_atomic_writemsg(struct fid_ep *ep_fid, NULL, NULL, 0, NULL, NULL, 0, msg->addr, msg->rma_iov, msg->rma_iov_count, msg->datatype, msg->op, msg->context, - ofi_op_atomic, flags | ep->util_ep.tx_msg_flags); + ofi_op_atomic, + flags | ep->util_ep.tx_msg_flags); } -static ssize_t smr_atomic_writev(struct fid_ep *ep_fid, - const struct fi_ioc *iov, void **desc, size_t count, - fi_addr_t dest_addr, uint64_t addr, uint64_t key, - enum fi_datatype datatype, enum fi_op op, void *context) +static ssize_t smr_atomic_writev( + struct fid_ep *ep_fid, const struct fi_ioc *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, enum fi_datatype datatype, + enum fi_op op, void *context) { struct smr_ep *ep; struct fi_rma_ioc rma_iov; @@ -298,10 +310,12 @@ static ssize_t smr_atomic_writev(struct fid_ep *ep_fid, return smr_generic_atomic(ep, iov, desc, count, NULL, NULL, 0, NULL, NULL, 0, dest_addr, &rma_iov, 1, datatype, - op, context, ofi_op_atomic, smr_ep_tx_flags(ep)); + op, context, ofi_op_atomic, + smr_ep_tx_flags(ep)); } -static ssize_t smr_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count, +static ssize_t smr_atomic_write( + struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) @@ -319,24 +333,26 @@ static ssize_t smr_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t c rma_iov.count = count; rma_iov.key = key; - return smr_generic_atomic(ep, &iov, &desc, 1, NULL, NULL, 0, NULL, NULL, 0, - dest_addr, &rma_iov, 1, datatype, op, context, - ofi_op_atomic, 
smr_ep_tx_flags(ep)); + return smr_generic_atomic(ep, &iov, &desc, 1, NULL, NULL, 0, NULL, NULL, + 0, dest_addr, &rma_iov, 1, datatype, op, + context, ofi_op_atomic, smr_ep_tx_flags(ep)); } -static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf, - size_t count, fi_addr_t dest_addr, uint64_t addr, - uint64_t key, enum fi_datatype datatype, enum fi_op op) +static ssize_t smr_atomic_inject( + struct fid_ep *ep_fid, const void *buf, size_t count, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op) { struct smr_cmd_entry *ce; + struct smr_cmd *cmd; struct smr_ep *ep; struct smr_region *peer_smr; struct iovec iov; struct fi_rma_ioc rma_ioc; - int64_t id, peer_id; - ssize_t ret = 0; + int64_t id, peer_id, pos; + ssize_t ret = -FI_EAGAIN; size_t total_len; - int64_t pos; + int proto; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); @@ -344,17 +360,21 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf, if (id < 0) return -FI_EAGAIN; - peer_id = smr_peer_data(ep->region)[id].addr.id; - peer_smr = smr_peer_region(ep->region, id); + peer_id = smr_peer_data(ep->region)[id].id; + peer_smr = smr_peer_region(ep, id); + + ofi_genlock_lock(&ep->util_ep.lock); if (smr_peer_data(ep->region)[id].sar_status) { ret = -FI_EAGAIN; - goto out; + goto unlock; } ret = smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos); - if (ret == -FI_ENOENT) - return -FI_EAGAIN; + if (ret == -FI_ENOENT) { + ret = -FI_EAGAIN; + goto unlock; + } total_len = count * ofi_datatype_size(datatype); assert(total_len <= SMR_INJECT_SIZE); @@ -367,30 +387,47 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf, rma_ioc.key = key; if (total_len <= SMR_MSG_DATA_LEN) { + proto = smr_proto_inline; + cmd = &ce->cmd; + ce->ptr = smr_peer_to_peer(ep, id, (uintptr_t) cmd); smr_do_atomic_inline(ep, peer_smr, id, peer_id, ofi_op_atomic, 0, datatype, op, NULL, &iov, 1, total_len, &ce->cmd); - } else if (total_len <= SMR_INJECT_SIZE) { + } else { + proto = smr_proto_inject; + if (smr_freestack_isempty(smr_cmd_stack(ep->region))) { + smr_cmd_queue_discard(ce, pos); + ret = -FI_EAGAIN; + goto unlock; + } + + cmd = smr_freestack_pop(smr_cmd_stack(ep->region)); + assert(cmd); + ce->ptr = smr_local_to_peer(ep, id, peer_id, (uintptr_t) cmd); ret = smr_do_atomic_inject(ep, peer_smr, id, peer_id, - ofi_op_atomic, 0, datatype, op, NULL, &iov, 1, - NULL, NULL, 0, NULL, NULL, 0, total_len, NULL, - 0, &ce->cmd); + ofi_op_atomic, 0, datatype, op, NULL, + &iov, 1, NULL, NULL, 0, NULL, NULL, + 0, total_len, NULL, 0, cmd); if (ret) { smr_cmd_queue_discard(ce, pos); - goto out; + goto unlock; } } - smr_format_rma_ioc(&ce->rma_cmd, &rma_ioc, 1); + smr_format_rma_ioc(cmd, &rma_ioc, 1); smr_cmd_queue_commit(ce, pos); - ofi_ep_peer_tx_cntr_inc(&ep->util_ep, ofi_op_atomic); -out: + + if (proto == smr_proto_inline) + ofi_ep_peer_tx_cntr_inc(&ep->util_ep, ofi_op_atomic); +unlock: + ofi_genlock_unlock(&ep->util_ep.lock); return ret; } -static ssize_t smr_atomic_readwritemsg(struct fid_ep *ep_fid, - const struct fi_msg_atomic *msg, struct fi_ioc *resultv, - void **result_desc, size_t result_count, uint64_t flags) +static ssize_t smr_atomic_readwritemsg( + struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, void **result_desc, + size_t result_count, uint64_t flags) { struct smr_ep *ep; @@ -405,12 +442,12 @@ static ssize_t smr_atomic_readwritemsg(struct fid_ep *ep_fid, flags | ep->util_ep.tx_msg_flags); } -static ssize_t 
smr_atomic_readwritev(struct fid_ep *ep_fid, - const struct fi_ioc *iov, void **desc, size_t count, - struct fi_ioc *resultv, void **result_desc, - size_t result_count, fi_addr_t dest_addr, uint64_t addr, - uint64_t key, enum fi_datatype datatype, enum fi_op op, - void *context) +static ssize_t smr_atomic_readwritev( + struct fid_ep *ep_fid, const struct fi_ioc *iov, + void **desc, size_t count, struct fi_ioc *resultv, + void **result_desc, size_t result_count, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) { struct smr_ep *ep; struct fi_rma_ioc rma_iov; @@ -427,11 +464,11 @@ static ssize_t smr_atomic_readwritev(struct fid_ep *ep_fid, ofi_op_atomic_fetch, smr_ep_tx_flags(ep)); } -static ssize_t smr_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, - size_t count, void *desc, void *result, - void *result_desc, fi_addr_t dest_addr, uint64_t addr, - uint64_t key, enum fi_datatype datatype, enum fi_op op, - void *context) +static ssize_t smr_atomic_readwrite( + struct fid_ep *ep_fid, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) { struct smr_ep *ep; struct fi_ioc iov, resultv; @@ -455,8 +492,8 @@ static ssize_t smr_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, smr_ep_tx_flags(ep)); } -static ssize_t smr_atomic_compwritemsg(struct fid_ep *ep_fid, - const struct fi_msg_atomic *msg, +static ssize_t smr_atomic_compwritemsg( + struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) @@ -475,8 +512,9 @@ static ssize_t smr_atomic_compwritemsg(struct fid_ep *ep_fid, flags | ep->util_ep.tx_msg_flags); } -static ssize_t smr_atomic_compwritev(struct fid_ep *ep_fid, - const struct fi_ioc *iov, void **desc, size_t count, +static ssize_t smr_atomic_compwritev( + struct fid_ep *ep_fid, const struct fi_ioc *iov, + void **desc, size_t count, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, @@ -499,11 +537,12 @@ static ssize_t smr_atomic_compwritev(struct fid_ep *ep_fid, smr_ep_tx_flags(ep)); } -static ssize_t smr_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, - size_t count, void *desc, const void *compare, - void *compare_desc, void *result, void *result_desc, - fi_addr_t dest_addr, uint64_t addr, uint64_t key, - enum fi_datatype datatype, enum fi_op op, void *context) +static ssize_t smr_atomic_compwrite( + struct fid_ep *ep_fid, const void *buf, size_t count, + void *desc, const void *compare, void *compare_desc, + void *result, void *result_desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, enum fi_datatype datatype, + enum fi_op op, void *context) { struct smr_ep *ep; struct fi_ioc iov, resultv, comparev; @@ -618,4 +657,4 @@ struct fi_ops_atomic smr_atomic_ops = { .writevalid = smr_atomic_valid, .readwritevalid = smr_atomic_fetch_valid, .compwritevalid = smr_atomic_comp_valid, -}; +}; \ No newline at end of file diff --git a/prov/shm/src/smr_attr.c b/prov/shm/src/smr_attr.c index 0ab09945252..40bac964d38 100644 --- a/prov/shm/src/smr_attr.c +++ b/prov/shm/src/smr_attr.c @@ -101,7 +101,7 @@ struct fi_domain_attr smr_domain_attr = { .av_type = FI_AV_UNSPEC, .mr_mode = OFI_MR_BASIC | OFI_MR_SCALABLE, .mr_key_size = 
sizeof_field(struct fi_rma_iov, key), - .cq_data_size = sizeof_field(struct smr_msg_hdr, data), + .cq_data_size = sizeof_field(struct smr_cmd_hdr, cq_data), .cq_cnt = (1 << 10), .ep_cnt = SMR_MAX_PEERS, .tx_ctx_cnt = (1 << 10), @@ -121,7 +121,7 @@ struct fi_domain_attr smr_hmem_domain_attr = { .av_type = FI_AV_UNSPEC, .mr_mode = FI_MR_HMEM, .mr_key_size = sizeof_field(struct fi_rma_iov, key), - .cq_data_size = sizeof_field(struct smr_msg_hdr, data), + .cq_data_size = sizeof_field(struct smr_cmd_hdr, cq_data), .cq_cnt = (1 << 10), .ep_cnt = SMR_MAX_PEERS, .tx_ctx_cnt = (1 << 10), @@ -157,4 +157,4 @@ struct fi_info smr_info = { .domain_attr = &smr_domain_attr, .fabric_attr = &smr_fabric_attr, .next = &smr_hmem_info, -}; +}; \ No newline at end of file diff --git a/prov/shm/src/smr_av.c b/prov/shm/src/smr_av.c index f74359dc838..dffd9d02cfa 100644 --- a/prov/shm/src/smr_av.c +++ b/prov/shm/src/smr_av.c @@ -31,11 +31,284 @@ */ #include "smr.h" +#include -static void smr_peer_addr_init(struct smr_addr *peer) +void smr_map_to_endpoint(struct smr_ep *ep, int64_t id) { - memset(peer->name, 0, SMR_NAME_MAX); - peer->id = -1; + int ret; + struct smr_region *peer_smr; + struct smr_peer_data *local_peers; + + assert(ofi_genlock_held(&container_of(ep->util_ep.av, struct smr_av, + util_av)->util_av.lock)); + peer_smr = smr_peer_region(ep, id); + if (!ep->map->peers[id].id_assigned || !peer_smr) + return; + + local_peers = smr_peer_data(ep->region); + local_peers[id].local_region = (uintptr_t) peer_smr; + + if (ep->region == peer_smr || !(ep->region->flags & SMR_FLAG_CMA_INIT)) + smr_cma_check(ep->region, peer_smr); + + /* enable xpmem locally if the peer also has it enabled */ + if (smr_get_vma_cap(peer_smr->self_vma_caps, FI_SHM_P2P_XPMEM) && + smr_get_vma_cap(ep->region->self_vma_caps, FI_SHM_P2P_XPMEM)) { + ret = ofi_xpmem_enable(&peer_smr->xpmem_self, + &local_peers[id].xpmem); + if (ret) { + local_peers[id].xpmem.avail = false; + smr_set_vma_cap(&ep->region->self_vma_caps, + FI_SHM_P2P_XPMEM, false); + return; + } + local_peers[id].xpmem.avail = true; + local_peers[id].xpmem.addr_max = + peer_smr->xpmem_self.address_max; + } else { + local_peers[id].xpmem.avail = false; + } + + smr_set_ipc_valid(ep, id); + + return; +} + +static int smr_match_name(struct dlist_entry *item, const void *args) +{ + return !strcmp(container_of(item, struct smr_ep_name, entry)->name, + (char *) args); +} + +int smr_map_to_region(struct smr_map *map, int64_t id) +{ + struct smr_peer *peer_buf = &map->peers[id]; + struct smr_region *peer; + struct util_ep *util_ep; + struct smr_ep *smr_ep; + struct smr_av *av = container_of(map, struct smr_av, smr_map); + size_t size; + int fd, ret = 0; + struct stat sts; + struct dlist_entry *entry; + const char *name = smr_no_prefix(peer_buf->name); + char tmp[SMR_PATH_MAX]; + + pthread_mutex_lock(&ep_list_lock); + entry = dlist_find_first_match(&ep_name_list, smr_match_name, name); + if (entry) { + peer_buf->region = container_of(entry, struct smr_ep_name, + entry)->region; + pthread_mutex_unlock(&ep_list_lock); + return FI_SUCCESS; + } + pthread_mutex_unlock(&ep_list_lock); + + if (peer_buf->region) + return FI_SUCCESS; + + assert(ofi_genlock_held(&av->util_av.lock)); + fd = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); + if (fd < 0) { + FI_WARN_ONCE(&smr_prov, FI_LOG_AV, + "shm_open error: name %s errno %d\n", name, errno); + return -errno; + } + + memset(tmp, 0, sizeof(tmp)); + snprintf(tmp, sizeof(tmp), "%s%s", SMR_DIR, name); + if (stat(tmp, &sts) == -1) { + ret = -errno; + goto out; 
+ } + + if (sts.st_size < sizeof(*peer)) { + ret = -FI_ENOENT; + goto out; + } + + peer = mmap(NULL, sizeof(*peer), PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (peer == MAP_FAILED) { + FI_WARN(&smr_prov, FI_LOG_AV, "mmap error\n"); + ret = -errno; + goto out; + } + + if (!peer->pid) { + FI_WARN(&smr_prov, FI_LOG_AV, "peer not initialized\n"); + munmap(peer, sizeof(*peer)); + ret = -FI_ENOENT; + goto out; + } + + size = peer->total_size; + munmap(peer, sizeof(*peer)); + + peer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + peer_buf->region = peer; + + if (map->flags & SMR_FLAG_HMEM_ENABLED) { + ret = ofi_hmem_host_register(peer, peer->total_size); + if (ret) + FI_WARN(&smr_prov, FI_LOG_AV, + "unable to register shm with iface\n"); + if (ofi_hmem_is_initialized(FI_HMEM_ZE)) { + peer_buf->pid_fd = ofi_pidfd_open(peer->pid, 0); + if (peer_buf->pid_fd < 0) { + FI_WARN(&smr_prov, FI_LOG_AV, + "unable to open pidfd\n"); + } + } else { + peer_buf->pid_fd = -1; + } + } + + dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep, + av_entry) { + smr_ep = container_of(util_ep, struct smr_ep, util_ep); + smr_map_to_endpoint(smr_ep, id); + } + +out: + close(fd); + return ret; +} + +static void smr_unmap_from_endpoint(struct smr_ep *ep, int64_t id) +{ + struct smr_region *peer_smr; + struct smr_peer_data *local_peers, *peer_peers; + int64_t peer_id; + + if (!ep->map->peers[id].id_assigned) + return; + + peer_smr = smr_peer_region(ep, id); + assert(peer_smr); + peer_peers = smr_peer_data(peer_smr); + peer_id = smr_peer_data(ep->region)[id].id; + + peer_peers[peer_id].id = -1; + peer_peers[peer_id].name_sent = 0; + + local_peers = smr_peer_data(ep->region); + ofi_xpmem_release(&local_peers[peer_id].xpmem); +} + +void smr_unmap_region(struct smr_map *map, int64_t peer_id, bool local) +{ + struct smr_region *peer_region; + struct smr_peer *peer; + struct util_ep *util_ep; + struct smr_ep *smr_ep; + struct smr_av *av; + int ret = 0; + + av = container_of(map, struct smr_av, smr_map); + + assert(ofi_genlock_held(&av->util_av.lock)); + peer_region = map->peers[peer_id].region; + if (!peer_region) + return; + + peer = &map->peers[peer_id]; + dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep, + av_entry) { + smr_ep = container_of(util_ep, struct smr_ep, util_ep); + smr_unmap_from_endpoint(smr_ep, peer_id); + } + + /* Don't unmap memory owned by this pid because the endpoint it belongs + * to might still be active. 
+ */ + if (local) + return; + + if (map->flags & SMR_FLAG_HMEM_ENABLED) { + ret = ofi_hmem_host_unregister(peer_region); + if (ret) + FI_WARN(&smr_prov, FI_LOG_AV, + "unable to unregister shm with iface\n"); + + if (peer->pid_fd != -1) { + close(peer->pid_fd); + peer->pid_fd = -1; + } + } + + munmap(peer_region, peer_region->total_size); + peer->region = NULL; +} + +void smr_map_add(struct smr_map *map, const char *name, int64_t *id) +{ + struct ofi_rbnode *node; + const char *shm_name = smr_no_prefix(name); + int tries = 0, ret = 0; + + assert(ofi_genlock_held(&container_of(map, struct smr_av, + smr_map)->util_av.lock)); + + ret = ofi_rbmap_insert(&map->rbmap, (void *) shm_name, + (void *) (intptr_t) *id, &node); + if (ret) { + assert(ret == -FI_EALREADY); + *id = (intptr_t) node->data; + return; + } + + while (map->peers[map->cur_id].id_assigned && tries < SMR_MAX_PEERS) { + if (++map->cur_id == SMR_MAX_PEERS) + map->cur_id = 0; + tries++; + } + + assert(map->cur_id < SMR_MAX_PEERS && tries < SMR_MAX_PEERS); + *id = map->cur_id; + if (++map->cur_id == SMR_MAX_PEERS) + map->cur_id = 0; + node->data = (void *) (intptr_t) *id; + strncpy(map->peers[*id].name, shm_name, SMR_NAME_MAX); + map->peers[*id].name[SMR_NAME_MAX - 1] = '\0'; + map->peers[*id].region = NULL; + map->num_peers++; + map->peers[*id].id_assigned = true; +} + +static void smr_map_del(struct smr_map *map, int64_t id) +{ + struct smr_ep_name *name; + bool local = false; + + assert(ofi_genlock_held(&container_of(map, struct smr_av, + smr_map)->util_av.lock)); + + assert(id >= 0 && id < SMR_MAX_PEERS); + pthread_mutex_lock(&ep_list_lock); + dlist_foreach_container(&ep_name_list, struct smr_ep_name, name, + entry) { + if (!strcmp(name->name, map->peers[id].name)) { + local = true; + break; + } + } + pthread_mutex_unlock(&ep_list_lock); + + + smr_unmap_region(map, id, local); + map->peers[id].fiaddr = FI_ADDR_NOTAVAIL; + map->peers[id].id_assigned = false; + map->num_peers--; + ofi_rbmap_find_delete(&map->rbmap, map->peers[id].name); +} + +struct smr_region *smr_map_get(struct smr_map *map, int64_t id) +{ + if (id < 0 || id >= SMR_MAX_PEERS) + return NULL; + + return map->peers[id].region; } static int smr_name_compare(struct ofi_rbmap *map, void *key, void *data) @@ -44,71 +317,66 @@ static int smr_name_compare(struct ofi_rbmap *map, void *key, void *data) smr_map = container_of(map, struct smr_map, rbmap); - return strncmp(smr_map->peers[(uintptr_t) data].peer.name, + return strncmp(smr_map->peers[(uintptr_t) data].name, (char *) key, SMR_NAME_MAX); } -static int smr_map_init(const struct fi_provider *prov, struct smr_map *map, - int peer_count, uint16_t flags) +static int smr_map_init(struct smr_map *map, int peer_count, uint16_t flags) { int i; for (i = 0; i < peer_count; i++) { - smr_peer_addr_init(&map->peers[i].peer); + memset(&map->peers[i].name, 0, SMR_NAME_MAX); + map->peers[i].id_assigned = 0; map->peers[i].fiaddr = FI_ADDR_NOTAVAIL; } map->flags = flags; ofi_rbmap_init(&map->rbmap, smr_name_compare); - ofi_spin_init(&map->lock); return 0; } -static void smr_map_cleanup(struct smr_map *map) +static void smr_map_cleanup(struct smr_av *av) { int64_t i; + ofi_genlock_lock(&av->util_av.lock); for (i = 0; i < SMR_MAX_PEERS; i++) { - if (map->peers[i].peer.id < 0) - continue; - - smr_map_del(map, i); + if (av->smr_map.peers[i].id_assigned) + smr_map_del(&av->smr_map, i); } - ofi_rbmap_cleanup(&map->rbmap); + ofi_rbmap_cleanup(&av->smr_map.rbmap); + ofi_genlock_unlock(&av->util_av.lock); } static int smr_av_close(struct fid 
*fid) { + struct smr_av *av; int ret; - struct util_av *av; - struct smr_av *smr_av; - av = container_of(fid, struct util_av, av_fid); - smr_av = container_of(av, struct smr_av, util_av); + av = container_of(fid, struct smr_av, util_av.av_fid); + + smr_map_cleanup(av); - ret = ofi_av_close(av); + ret = ofi_av_close(&av->util_av); if (ret) return ret; - smr_map_cleanup(&smr_av->smr_map); free(av); return 0; } - static fi_addr_t smr_get_addr(struct fi_peer_rx_entry *rx_entry) { struct smr_cmd_ctx *cmd_ctx = rx_entry->peer_context; + struct smr_av *av; - return cmd_ctx->ep->region->map->peers[cmd_ctx->cmd.msg.hdr.id].fiaddr; -} + av = container_of(cmd_ctx->ep->util_ep.av, struct smr_av, util_av); + return av->smr_map.peers[cmd_ctx->cmd->hdr.rx_id].fiaddr; +} -/* - * Input address: smr name (string) - * output address: index (fi_addr_t), the output from util_av - */ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { @@ -125,19 +393,14 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, util_av = container_of(av_fid, struct util_av, av_fid); smr_av = container_of(util_av, struct smr_av, util_av); + ofi_genlock_lock(&util_av->lock); for (i = 0; i < count; i++, addr = (char *) addr + strlen(addr) + 1) { FI_INFO(&smr_prov, FI_LOG_AV, "%s\n", (const char *) addr); util_addr = FI_ADDR_NOTAVAIL; if (smr_av->used < SMR_MAX_PEERS) { - ret = smr_map_add(&smr_prov, &smr_av->smr_map, - addr, &shm_id); - if (!ret) { - ofi_genlock_lock(&util_av->lock); - ret = ofi_av_insert_addr(util_av, &shm_id, - &util_addr); - ofi_genlock_unlock(&util_av->lock); - } + smr_map_add(&smr_av->smr_map, addr, &shm_id); + ret = ofi_av_insert_addr(util_av, &shm_id, &util_addr); } else { FI_WARN(&smr_prov, FI_LOG_AV, "AV insert failed. 
The maximum number of AV " @@ -145,7 +408,8 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, ret = -FI_ENOMEM; } - FI_INFO(&smr_prov, FI_LOG_AV, "fi_addr: %" PRIu64 "\n", util_addr); + FI_INFO(&smr_prov, FI_LOG_AV, "fi_addr: %" PRIu64 "\n", + util_addr); if (ret) { if (fi_addr) @@ -174,18 +438,19 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, av_entry); smr_ep = container_of(util_ep, struct smr_ep, util_ep); smr_ep->region->max_sar_buf_per_peer = - SMR_MAX_PEERS / smr_av->smr_map.num_peers; - smr_ep->srx->owner_ops->foreach_unspec_addr(smr_ep->srx, - &smr_get_addr); + MIN(SMR_BUF_BATCH_MAX, + SMR_MAX_PEERS / smr_av->smr_map.num_peers); + smr_ep->srx->owner_ops->foreach_unspec_addr( + smr_ep->srx, &smr_get_addr); } - } + ofi_genlock_unlock(&util_av->lock); return succ_count; } -static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count, - uint64_t flags) +static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, + size_t count, uint64_t flags) { struct util_av *util_av; struct util_ep *util_ep; @@ -211,15 +476,16 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count smr_map_del(&smr_av->smr_map, id); dlist_foreach(&util_av->ep_list, av_entry) { - util_ep = container_of(av_entry, struct util_ep, av_entry); + util_ep = container_of(av_entry, struct util_ep, + av_entry); smr_ep = container_of(util_ep, struct smr_ep, util_ep); if (smr_av->smr_map.num_peers > 0) smr_ep->region->max_sar_buf_per_peer = - SMR_MAX_PEERS / - smr_av->smr_map.num_peers; + SMR_MAX_PEERS / + smr_av->smr_map.num_peers; else smr_ep->region->max_sar_buf_per_peer = - SMR_BUF_BATCH_MAX; + SMR_BUF_BATCH_MAX; } smr_av->used--; } @@ -240,7 +506,7 @@ static int smr_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, smr_av = container_of(util_av, struct smr_av, util_av); id = smr_addr_lookup(util_av, fi_addr); - name = smr_av->smr_map.peers[id].peer.name; + name = smr_av->smr_map.peers[id].name; strncpy((char *) addr, name, *addrlen); @@ -315,7 +581,8 @@ int smr_av_open(struct fid_domain *domain, struct fi_av_attr *attr, goto out; } - ret = ofi_av_init(util_domain, attr, &util_attr, &smr_av->util_av, context); + ret = ofi_av_init(util_domain, attr, &util_attr, &smr_av->util_av, + context); if (ret) goto out; @@ -324,7 +591,7 @@ int smr_av_open(struct fid_domain *domain, struct fi_av_attr *attr, (*av)->fid.ops = &smr_av_fi_ops; (*av)->ops = &smr_av_ops; - ret = smr_map_init(&smr_prov, &smr_av->smr_map, SMR_MAX_PEERS, + ret = smr_map_init(&smr_av->smr_map, SMR_MAX_PEERS, util_domain->info_domain_caps & FI_HMEM ? 
SMR_FLAG_HMEM_ENABLED : 0); if (ret) @@ -337,5 +604,4 @@ int smr_av_open(struct fid_domain *domain, struct fi_av_attr *attr, out: free(smr_av); return ret; -} - +} \ No newline at end of file diff --git a/prov/shm/src/smr_cntr.c b/prov/shm/src/smr_cntr.c index 2d02e314dbd..dbc2c94970c 100644 --- a/prov/shm/src/smr_cntr.c +++ b/prov/shm/src/smr_cntr.c @@ -65,4 +65,4 @@ int smr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, free: free(cntr); return ret; -} +} \ No newline at end of file diff --git a/prov/shm/src/smr_comp.c b/prov/shm/src/smr_comp.c index 0cf507f4d3b..2ea5c1314de 100644 --- a/prov/shm/src/smr_comp.c +++ b/prov/shm/src/smr_comp.c @@ -54,8 +54,8 @@ int smr_write_err_comp(struct util_cq *cq, void *context, err_entry.op_context = context; err_entry.flags = flags; err_entry.tag = tag; - err_entry.err = err; - err_entry.prov_errno = -err; + err_entry.err = -err; + err_entry.prov_errno = err; return ofi_peer_cq_write_error(cq, &err_entry); } @@ -63,6 +63,8 @@ int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, uint64_t flags, size_t len, void *buf, int64_t id, uint64_t tag, uint64_t data) { + struct smr_av *av; + ofi_ep_peer_rx_cntr_inc(&ep->util_ep, op); if (!(flags & (FI_REMOTE_CQ_DATA | FI_COMPLETION))) @@ -70,6 +72,7 @@ int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, flags &= ~FI_COMPLETION; + av = container_of(ep->util_ep.av, struct smr_av, util_av); return ofi_peer_cq_write(ep->util_ep.rx_cq, context, flags, len, buf, - data, tag, ep->region->map->peers[id].fiaddr); + data, tag, av->smr_map.peers[id].fiaddr); } \ No newline at end of file diff --git a/prov/shm/src/smr_cq.c b/prov/shm/src/smr_cq.c index 084dda8fa7a..f1721f33194 100644 --- a/prov/shm/src/smr_cq.c +++ b/prov/shm/src/smr_cq.c @@ -62,4 +62,4 @@ int smr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, (*cq_fid) = &cq->cq_fid; return FI_SUCCESS; -} +} \ No newline at end of file diff --git a/prov/shm/src/smr_domain.c b/prov/shm/src/smr_domain.c index e090a195fc1..6352eff64e8 100644 --- a/prov/shm/src/smr_domain.c +++ b/prov/shm/src/smr_domain.c @@ -103,7 +103,6 @@ static int smr_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, return -FI_EINVAL; } - static struct fi_ops_domain smr_domain_ops = { .size = sizeof(struct fi_ops_domain), .av_open = smr_av_open, @@ -123,7 +122,8 @@ static int smr_domain_close(fid_t fid) int ret; struct smr_domain *domain; - domain = container_of(fid, struct smr_domain, util_domain.domain_fid.fid); + domain = container_of(fid, struct smr_domain, + util_domain.domain_fid.fid); if (domain->ipc_cache) ofi_ipc_cache_destroy(domain->ipc_cache); @@ -152,11 +152,10 @@ static struct fi_ops_mr smr_mr_ops = { }; int smr_domain_open(struct fid_fabric *fabric, struct fi_info *info, - struct fid_domain **domain, void *context) + struct fid_domain **domain, void *context) { int ret; struct smr_domain *smr_domain; - struct smr_fabric *smr_fabric; ret = ofi_prov_check_info(&smr_util_prov, fabric->api_version, info); if (ret) @@ -173,13 +172,13 @@ int smr_domain_open(struct fid_fabric *fabric, struct fi_info *info, return ret; } - smr_fabric = container_of(fabric, struct smr_fabric, util_fabric.fabric_fid); - ofi_mutex_lock(&smr_fabric->util_fabric.lock); + ofi_mutex_lock(&smr_domain->util_domain.fabric->lock); smr_domain->fast_rma = smr_fast_rma_enabled(info->domain_attr->mr_mode, info->tx_attr->msg_order); - ofi_mutex_unlock(&smr_fabric->util_fabric.lock); + ofi_mutex_unlock(&smr_domain->util_domain.fabric->lock); - ret = 
ofi_ipc_cache_open(&smr_domain->ipc_cache, &smr_domain->util_domain); + ret = ofi_ipc_cache_open(&smr_domain->ipc_cache, + &smr_domain->util_domain); if (ret) { free(smr_domain); return ret; @@ -191,4 +190,4 @@ int smr_domain_open(struct fid_fabric *fabric, struct fi_info *info, (*domain)->mr = &smr_mr_ops; return 0; -} +} \ No newline at end of file diff --git a/prov/shm/src/smr_dsa.c b/prov/shm/src/smr_dsa.c index 85695eabf95..f70a001fd87 100644 --- a/prov/shm/src/smr_dsa.c +++ b/prov/shm/src/smr_dsa.c @@ -52,78 +52,82 @@ #define MAX_CMD_BATCH_SIZE (SMR_BUF_BATCH_MAX + SMR_IOV_LIMIT) struct dsa_bitmap { - int size; - atomic_int data; + int size; + atomic_int data; }; struct dsa_cmd_context { - size_t bytes_in_progress; - int index; - int batch_size; - int dir; - uint32_t op; - // We keep track of the entry type to know which lock to acquire - // when we need to do the updates after completion - void *entry_ptr; + size_t bytes_in_progress; + struct smr_pend_entry *pend; + int batch_size; + int index; }; struct wq_handle { union { - void *mmapped; - int fd; + void *mmapped; + int fd; }; }; struct smr_dsa_context { - struct dsa_hw_desc dsa_work_desc[MAX_CMD_BATCH_SIZE * - CMD_CONTEXT_COUNT]; - - struct dsa_completion_record dsa_work_comp[MAX_CMD_BATCH_SIZE * - CMD_CONTEXT_COUNT]; - - struct dsa_cmd_context dsa_cmd_context[CMD_CONTEXT_COUNT]; - - struct dsa_bitmap dsa_bitmap; - struct wq_handle wq_handle[MAX_WQS_PER_EP]; - int wq_count; - int next_wq; - int (*submit_cmd)(struct wq_handle *wq_handle, struct dsa_hw_desc *desc); - void (*close_wq)(struct wq_handle *wq_handle); - - unsigned long copy_type_stats[2]; - unsigned long page_fault_stats[2]; + struct dsa_hw_desc dsa_work_desc[MAX_CMD_BATCH_SIZE * + CMD_CONTEXT_COUNT]; + + struct dsa_completion_record dsa_work_comp[MAX_CMD_BATCH_SIZE * + CMD_CONTEXT_COUNT]; + + struct dsa_cmd_context dsa_cmd_context[CMD_CONTEXT_COUNT]; + + struct dsa_bitmap dsa_bitmap; + struct wq_handle wq_handle[MAX_WQS_PER_EP]; + int wq_count; + int next_wq; + int (*submit_cmd)( + struct wq_handle *wq_handle, + struct dsa_hw_desc *desc); + void (*close_wq)( + struct wq_handle *wq_handle); + + unsigned long copy_type_stats[2]; + unsigned long page_fault_stats[2]; }; struct dsa_ops { - struct accfg_device *(*accfg_wq_get_device)(struct accfg_wq *wq); - int (*accfg_device_get_cdev_major)(struct accfg_device *dev); - int (*accfg_wq_get_cdev_minor)(struct accfg_wq *wq); - int (*accfg_new)(struct accfg_ctx **ctx); - enum accfg_device_state (*accfg_device_get_state)(struct accfg_device *device); - uint64_t (*accfg_device_get_gen_cap)(struct accfg_device *device); - int (*accfg_device_get_numa_node)(struct accfg_device *device); - enum accfg_wq_state (*accfg_wq_get_state)(struct accfg_wq *wq); - uint64_t (*accfg_wq_get_max_transfer_size)(struct accfg_wq *wq); - enum accfg_wq_type (*accfg_wq_get_type)(struct accfg_wq *wq); - enum accfg_wq_mode (*accfg_wq_get_mode)(struct accfg_wq *wq); - const char *(*accfg_wq_get_devname)(struct accfg_wq *wq); - struct accfg_ctx *(*accfg_unref)(struct accfg_ctx *ctx); - - struct accfg_device *(*accfg_device_get_first)(struct accfg_ctx *ctx); - struct accfg_device *(*accfg_device_get_next)(struct accfg_device *device); - struct accfg_wq *(*accfg_wq_get_first)(struct accfg_device *device); - struct accfg_wq *(*accfg_wq_get_next)(struct accfg_wq *wq); + struct accfg_device *(*accfg_wq_get_device)(struct accfg_wq *wq); + int (*accfg_device_get_cdev_major)( + struct accfg_device *dev); + int (*accfg_wq_get_cdev_minor)(struct accfg_wq *wq); + int 
(*accfg_new)(struct accfg_ctx **ctx); + enum accfg_device_state (*accfg_device_get_state)( + struct accfg_device *device); + uint64_t (*accfg_device_get_gen_cap)( + struct accfg_device *device); + int (*accfg_device_get_numa_node)( + struct accfg_device *device); + enum accfg_wq_state (*accfg_wq_get_state)(struct accfg_wq *wq); + uint64_t (*accfg_wq_get_max_transfer_size)( + struct accfg_wq *wq); + enum accfg_wq_type (*accfg_wq_get_type)(struct accfg_wq *wq); + enum accfg_wq_mode (*accfg_wq_get_mode)(struct accfg_wq *wq); + const char *(*accfg_wq_get_devname)(struct accfg_wq *wq); + struct accfg_ctx *(*accfg_unref)(struct accfg_ctx *ctx); + struct accfg_device *(*accfg_device_get_first)(struct accfg_ctx *ctx); + struct accfg_device *(*accfg_device_get_next)( + struct accfg_device *device); + struct accfg_wq *(*accfg_wq_get_first) + (struct accfg_device *device); + struct accfg_wq *(*accfg_wq_get_next)(struct accfg_wq *wq); }; #define dsa_foreach_device(ctx, device) \ - for (device = dsa_ops.accfg_device_get_first(ctx); \ - device != NULL; \ + for (device = dsa_ops.accfg_device_get_first(ctx); \ + device != NULL; \ device = dsa_ops.accfg_device_get_next(device)) - #define dsa_foreach_wq(device, wq) \ - for (wq = dsa_ops.accfg_wq_get_first(device); \ - wq != NULL; \ + for (wq = dsa_ops.accfg_wq_get_first(device); \ + wq != NULL; \ wq = dsa_ops.accfg_wq_get_next(wq)) static void *libdsa_handle = NULL; @@ -150,7 +154,7 @@ static int dsa_write_cmd(struct wq_handle *wq_handle, struct dsa_hw_desc *desc) return ret != sizeof(*desc) ? 1 : 0; } -static __always_inline void dsa_desc_submit(struct smr_dsa_context *dsa_context, +static __always_inline void dsa_desc_submit(struct smr_dsa_context *dsa_ctx, struct dsa_hw_desc *hw) { int status; @@ -159,10 +163,9 @@ static __always_inline void dsa_desc_submit(struct smr_dsa_context *dsa_context, { asm volatile("sfence":::"memory"); } do { - status = dsa_context->submit_cmd( - &dsa_context->wq_handle[dsa_context->next_wq], hw); - dsa_context->next_wq = - (dsa_context->next_wq + 1) % (dsa_context->wq_count); + status = dsa_ctx->submit_cmd( + &dsa_ctx->wq_handle[dsa_ctx->next_wq], hw); + dsa_ctx->next_wq = (dsa_ctx->next_wq + 1) % (dsa_ctx->wq_count); } while (status); } @@ -263,17 +266,15 @@ static void dsa_idxd_wq_close(struct wq_handle *wq_handle) } static int dsa_idxd_init_wq_array(int shared, int numa_node, - struct smr_dsa_context *dsa_context) + struct smr_dsa_context *dsa_ctx) { static struct accfg_ctx *ctx; struct accfg_wq *wq; void *wq_reg; - int fd; enum accfg_device_state dstate; enum accfg_wq_state wstate; enum accfg_wq_type type; - int mode; - int wq_count = 0; + int mode, fd, wq_count = 0; struct accfg_device *device; bool wq_mmap_support = true; bool wq_write_support = false; @@ -282,7 +283,6 @@ static int dsa_idxd_init_wq_array(int shared, int numa_node, return 0; dsa_foreach_device(ctx, device) { - /* Make sure that the device is enabled */ dstate = (*dsa_ops.accfg_device_get_state)(device); if (dstate != ACCFG_DEVICE_ENABLED) continue; @@ -292,19 +292,17 @@ static int dsa_idxd_init_wq_array(int shared, int numa_node, GENCAP_CACHE_CTRL_MEM) == 0) continue; - /* Match the device to the id requested */ if (numa_node != -1 && (*dsa_ops.accfg_device_get_numa_node)(device) != numa_node) continue; - dsa_foreach_wq(device, wq) - { - /* Get a workqueue that's enabled */ + dsa_foreach_wq(device, wq) { wstate = (*dsa_ops.accfg_wq_get_state)(wq); if (wstate != ACCFG_WQ_ENABLED) continue; - if ((*dsa_ops.accfg_wq_get_max_transfer_size)(wq) < 
SMR_SAR_SIZE) + if ((*dsa_ops.accfg_wq_get_max_transfer_size)(wq) < + SMR_SAR_SIZE) continue; /* The wq type should be user */ @@ -312,16 +310,14 @@ static int dsa_idxd_init_wq_array(int shared, int numa_node, if (type != ACCFG_WQT_USER) continue; - /* Make sure the mode is correct */ mode = (*dsa_ops.accfg_wq_get_mode)(wq); if ((mode == ACCFG_WQ_SHARED && !shared) || (mode == ACCFG_WQ_DEDICATED && shared)) continue; - /* This is a candidate wq */ FI_DBG(&smr_prov, FI_LOG_EP_CTRL, - "DSA WQ: %s\n", - (*dsa_ops.accfg_wq_get_devname)(wq)); + "DSA WQ: %s\n", + (*dsa_ops.accfg_wq_get_devname)(wq)); fd = -1; wq_reg = NULL; @@ -332,7 +328,8 @@ static int dsa_idxd_init_wq_array(int shared, int numa_node, wq_mmap_support = false; wq_write_support = true; } else if (wq_reg != NULL) { - dsa_context->wq_handle[wq_count].mmapped = wq_reg; + dsa_ctx->wq_handle[wq_count].mmapped = + wq_reg; } } @@ -341,7 +338,7 @@ static int dsa_idxd_init_wq_array(int shared, int numa_node, if (fd < 0 && wq_count == 0) wq_write_support = false; else if (fd >= 0) - dsa_context->wq_handle[wq_count].fd = fd; + dsa_ctx->wq_handle[wq_count].fd = fd; } if (wq_reg || fd >= 0 ) { @@ -356,16 +353,15 @@ static int dsa_idxd_init_wq_array(int shared, int numa_node, } if (wq_mmap_support) { - dsa_context->submit_cmd = dsa_enq_cmd; - dsa_context->close_wq = dsa_idxd_wq_unmap; + dsa_ctx->submit_cmd = dsa_enq_cmd; + dsa_ctx->close_wq = dsa_idxd_wq_unmap; } else if (wq_write_support) { - dsa_context->submit_cmd = dsa_write_cmd; - dsa_context->close_wq = dsa_idxd_wq_close; + dsa_ctx->submit_cmd = dsa_write_cmd; + dsa_ctx->close_wq = dsa_idxd_wq_close; } else { assert(wq_count == 0); } - (*dsa_ops.accfg_unref)(ctx); return wq_count; } @@ -382,12 +378,10 @@ static void dsa_bitmap_clear_bit(struct dsa_bitmap *bitmap, int index) atomic_fetch_and(&bitmap->data, ~(1ULL << index)); } -static int dsa_bitmap_allocate(struct dsa_bitmap *bitmap, int size) +static void dsa_bitmap_allocate(struct dsa_bitmap *bitmap, int size) { atomic_init(&bitmap->data, 0); bitmap->size = size; - - return 1; } static int dsa_bitmap_test_bit(struct dsa_bitmap *bitmap, int index) @@ -396,58 +390,53 @@ static int dsa_bitmap_test_bit(struct dsa_bitmap *bitmap, int index) return atomic_load(&bitmap->data) & (1ULL << index); } -static int dsa_bitmap_is_empty(struct dsa_bitmap *bitmap) +static bool dsa_bitmap_is_empty(struct dsa_bitmap *bitmap) { return atomic_load(&bitmap->data) == 0; } -static struct dsa_cmd_context * -dsa_allocate_cmd_context(struct smr_dsa_context *smr_dsa_context) +static struct dsa_cmd_context * dsa_alloc_cmd(struct smr_dsa_context *dsa_ctx) { - struct dsa_cmd_context *dsa_cmd_context; + struct dsa_cmd_context *cmd_ctx; int i; for (i = 0; i < CMD_CONTEXT_COUNT; i++) { - if (!dsa_bitmap_test_and_set_bit(&smr_dsa_context->dsa_bitmap, i)) + if (!dsa_bitmap_test_and_set_bit(&dsa_ctx->dsa_bitmap, i)) break; } if (i == CMD_CONTEXT_COUNT) return NULL; - dsa_cmd_context = &smr_dsa_context->dsa_cmd_context[i]; - memset(dsa_cmd_context, 0, sizeof(*dsa_cmd_context)); - dsa_cmd_context->index = i; + cmd_ctx = &dsa_ctx->dsa_cmd_context[i]; + memset(cmd_ctx, 0, sizeof(*cmd_ctx)); + cmd_ctx->index = i; - return dsa_cmd_context; + return cmd_ctx; } -static void dsa_free_cmd_context(struct dsa_cmd_context *dsa_cmd_context, - struct smr_dsa_context *smr_dsa_context) +static void dsa_free_cmd(struct dsa_cmd_context *cmd_ctx, + struct smr_dsa_context *dsa_ctx) { - dsa_bitmap_clear_bit(&smr_dsa_context->dsa_bitmap, - dsa_cmd_context->index); + 
dsa_bitmap_clear_bit(&dsa_ctx->dsa_bitmap, cmd_ctx->index); } -static struct dsa_hw_desc * -dsa_get_work_descriptor_array_ptr(struct dsa_cmd_context *dsa_cmd_context, - struct smr_dsa_context *dsa_context) +static struct dsa_hw_desc *dsa_get_desc(struct dsa_cmd_context *cmd_ctx, + struct smr_dsa_context *dsa_ctx) { - return &dsa_context->dsa_work_desc[dsa_cmd_context->index * - MAX_CMD_BATCH_SIZE]; + return &dsa_ctx->dsa_work_desc[cmd_ctx->index * MAX_CMD_BATCH_SIZE]; } -static struct dsa_hw_desc * -dsa_get_free_work_descriptor(struct dsa_cmd_context *dsa_cmd_context, - struct smr_dsa_context *dsa_context) +static struct dsa_hw_desc *dsa_alloc_desc(struct dsa_cmd_context *cmd_ctx, + struct smr_dsa_context *dsa_ctx) { struct dsa_hw_desc *free_desc; struct dsa_completion_record *free_comp; - free_desc = &dsa_context->dsa_work_desc[dsa_cmd_context->index * - MAX_CMD_BATCH_SIZE + dsa_cmd_context->batch_size]; - free_comp = &dsa_context->dsa_work_comp[dsa_cmd_context->index * - MAX_CMD_BATCH_SIZE + dsa_cmd_context->batch_size++]; + free_desc = &dsa_ctx->dsa_work_desc[cmd_ctx->index * + MAX_CMD_BATCH_SIZE + cmd_ctx->batch_size]; + free_comp = &dsa_ctx->dsa_work_comp[cmd_ctx->index * + MAX_CMD_BATCH_SIZE + cmd_ctx->batch_size++]; memset(free_desc, 0, sizeof(*free_desc)); memset(free_comp, 0, sizeof(*free_comp)); @@ -456,16 +445,16 @@ dsa_get_free_work_descriptor(struct dsa_cmd_context *dsa_cmd_context, return free_desc; } -static struct dsa_completion_record * -dsa_get_work_completion_array_ptr(struct dsa_cmd_context *dsa_cmd_context, - struct smr_dsa_context *dsa_context) +static struct dsa_completion_record *dsa_get_comp_ptr( + struct dsa_cmd_context *dsa_cmd_context, + struct smr_dsa_context *dsa_context) { return &dsa_context->dsa_work_comp[dsa_cmd_context->index * MAX_CMD_BATCH_SIZE]; } -static struct dsa_cmd_context *dsa_get_cmd_context(struct smr_dsa_context - *dsa_context, int index) +static struct dsa_cmd_context *dsa_get_cmd(struct smr_dsa_context *dsa_context, + int index) { if (dsa_bitmap_test_bit(&dsa_context->dsa_bitmap, index)) return &dsa_context->dsa_cmd_context[index]; @@ -495,8 +484,9 @@ static void dsa_touch_buffer_pages(struct dsa_hw_desc *desc) *dst_addr = *dst_addr; } - // Touch last byte in case start of buffer is not aligned to page - // boundary + /* Touch last byte in case start of buffer is not aligned to page + * boundary + */ src_addr = (char *)desc->src_addr + (desc->xfer_size - 1); dst_addr = (char *)desc->dst_addr + (desc->xfer_size - 1); @@ -506,9 +496,8 @@ static void dsa_touch_buffer_pages(struct dsa_hw_desc *desc) #pragma GCC diagnostic pop } -static void dsa_prepare_copy_desc(struct dsa_hw_desc *desc, - uint32_t xfer_size, uint64_t src_addr, - uint64_t dst_addr) +static void dsa_prepare_desc(struct dsa_hw_desc *desc, uint32_t xfer_size, + uint64_t src_addr, uint64_t dst_addr) { desc->opcode = DSA_OPCODE_MEMMOVE; desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; @@ -517,44 +506,51 @@ static void dsa_prepare_copy_desc(struct dsa_hw_desc *desc, desc->dst_addr = dst_addr; } -static void smr_dsa_copy_sar(struct smr_freestack *sar_pool, - struct smr_dsa_context *dsa_context, - struct dsa_cmd_context *dsa_cmd_context, - struct smr_resp *resp, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, - size_t *bytes_done, struct smr_region *region) +ssize_t smr_dsa_copy_sar(struct smr_ep *ep, struct smr_pend_entry *pend) { + struct smr_dsa_context *dsa_ctx = ep->dsa_context; + struct dsa_cmd_context *cmd_ctx; + struct smr_region *peer_smr; + 
struct smr_freestack *sar_pool; struct smr_sar_buf *smr_sar_buf; - size_t remaining_sar_size; - size_t remaining_iov_size; - size_t iov_len; - size_t iov_index = 0; - int sar_index = 0; - int cmd_index = 0; - size_t iov_offset = *bytes_done; - size_t sar_offset = 0; - size_t cmd_size = 0; - char *iov_buf = NULL; - char *sar_buf = NULL; + size_t remaining_sar_size, remaining_iov_size, iov_len, iov_index = 0; + size_t iov_offset, sar_offset = 0, cmd_size = 0, dsa_bytes_pending = 0; + int sar_index = 0, cmd_index = 0; + char *iov_buf = NULL, *sar_buf = NULL; struct dsa_hw_desc *desc = NULL; - size_t dsa_bytes_pending = 0; - for (iov_index = 0; iov_index < count; iov_index++) { - iov_len = iov[iov_index].iov_len; + assert(smr_env.use_dsa_sar); + + if (pend->type == SMR_RX_ENTRY) { + peer_smr = smr_peer_region(ep, pend->cmd->hdr.rx_id); + if (smr_peer_data(peer_smr)[pend->cmd->hdr.tx_id].sar_status != + SMR_SAR_READY) + return -FI_EAGAIN; + } + cmd_ctx = dsa_alloc_cmd(ep->dsa_context); + if (!cmd_ctx) + return -FI_ENOMEM; + + cmd_ctx->pend = pend; + + iov_offset = pend->bytes_done; + for (iov_index = 0; iov_index < pend->iov_count; iov_index++) { + iov_len = pend->iov[iov_index].iov_len; if (iov_offset < iov_len) break; iov_offset -= iov_len; } - while ((iov_index < count) && - (sar_index < cmd->msg.data.buf_batch_size) && + sar_pool = smr_pend_sar_pool(ep, pend); + while ((iov_index < pend->iov_count) && + (sar_index < pend->cmd->data.buf_batch_size) && (cmd_index < MAX_CMD_BATCH_SIZE)) { smr_sar_buf = smr_freestack_get_entry_from_index( - sar_pool, cmd->msg.data.sar[sar_index]); - iov_len = iov[iov_index].iov_len; + sar_pool, pend->cmd->data.sar[sar_index]); + iov_len = pend->iov[iov_index].iov_len; - iov_buf = (char *)iov[iov_index].iov_base + iov_offset; + iov_buf = (char *)pend->iov[iov_index].iov_base + iov_offset; sar_buf = (char *)smr_sar_buf->buf + sar_offset; remaining_sar_size = SMR_SAR_SIZE - sar_offset; @@ -562,17 +558,16 @@ static void smr_dsa_copy_sar(struct smr_freestack *sar_pool, cmd_size = MIN(remaining_iov_size, remaining_sar_size); assert(cmd_size > 0); - desc = dsa_get_free_work_descriptor(dsa_cmd_context, - dsa_context); + desc = dsa_alloc_desc(cmd_ctx, ep->dsa_context); - if (dsa_cmd_context->dir == OFI_COPY_BUF_TO_IOV) - dsa_prepare_copy_desc(desc, cmd_size, (uintptr_t) - sar_buf, (uintptr_t) iov_buf); + if (pend->sar_dir == OFI_COPY_BUF_TO_IOV) + dsa_prepare_desc(desc, cmd_size, (uintptr_t) sar_buf, + (uintptr_t) iov_buf); else - dsa_prepare_copy_desc(desc, cmd_size, (uintptr_t) - iov_buf, (uintptr_t) sar_buf); + dsa_prepare_desc(desc, cmd_size, (uintptr_t) iov_buf, + (uintptr_t) sar_buf); - dsa_desc_submit(dsa_context, desc); + dsa_desc_submit(ep->dsa_context, desc); cmd_index++; dsa_bytes_pending += cmd_size; @@ -594,158 +589,147 @@ static void smr_dsa_copy_sar(struct smr_freestack *sar_pool, } assert(dsa_bytes_pending > 0); - resp->status = SMR_STATUS_BUSY; + cmd_ctx->bytes_in_progress = dsa_bytes_pending; + dsa_ctx->copy_type_stats[pend->sar_dir]++; - dsa_cmd_context->bytes_in_progress = dsa_bytes_pending; - dsa_context->copy_type_stats[dsa_cmd_context->dir]++; - dsa_cmd_context->op = cmd->msg.hdr.op; + /* FI_EBUSY indicates command was issued successfully but contents are + * not ready yet */ + return -FI_EBUSY; } - -static void -dsa_process_partially_completed_desc(struct smr_dsa_context *dsa_context, - struct dsa_hw_desc *dsa_descriptor) +static void dsa_process_partial_copy(struct smr_dsa_context *dsa_ctx, + struct dsa_hw_desc *dsa_desc) { - uint32_t 
new_xfer_size; - uint64_t new_src_addr; - uint64_t new_dst_addr; - uint32_t bytes_completed; + uint32_t new_xfer_size, bytes_completed; + uint64_t new_src_addr, new_dst_addr; struct dsa_completion_record *comp = - (struct dsa_completion_record *)dsa_descriptor->completion_addr; + (struct dsa_completion_record *)dsa_desc->completion_addr; bytes_completed = comp->bytes_completed; - // Update descriptor src & dst buffer based on copy direction; see 8.3.4 - // of DSA spec - new_xfer_size = dsa_descriptor->xfer_size - bytes_completed; - new_src_addr = - (comp->result ? dsa_descriptor->src_addr - : dsa_descriptor->src_addr + bytes_completed); - new_dst_addr = - (comp->result ? dsa_descriptor->dst_addr - : dsa_descriptor->dst_addr + bytes_completed); - - // Reset completion record. + /* Update descriptor src & dst buffer based on copy direction + * See 8.3.4 of DSA spec + */ + new_xfer_size = dsa_desc->xfer_size - bytes_completed; + new_src_addr = (comp->result ? dsa_desc->src_addr : + dsa_desc->src_addr + bytes_completed); + new_dst_addr = (comp->result ? dsa_desc->dst_addr : + dsa_desc->dst_addr + bytes_completed); + memset(comp, 0, sizeof(*comp)); - dsa_prepare_copy_desc(dsa_descriptor, new_xfer_size, - new_src_addr, new_dst_addr); + dsa_prepare_desc(dsa_desc, new_xfer_size, new_src_addr, new_dst_addr); - dsa_touch_buffer_pages(dsa_descriptor); + dsa_touch_buffer_pages(dsa_desc); - dsa_desc_submit(dsa_context, dsa_descriptor); + dsa_desc_submit(dsa_ctx, dsa_desc); } -static void dsa_update_tx_entry(struct smr_region *smr, - struct dsa_cmd_context *dsa_cmd_context) +static void dsa_complete_tx_work(struct smr_ep *ep, struct smr_pend_entry *pend) { - struct smr_resp *resp; - struct smr_cmd *cmd; - struct smr_tx_entry *tx_entry = dsa_cmd_context->entry_ptr; + int ret; - tx_entry->bytes_done += dsa_cmd_context->bytes_in_progress; - cmd = &tx_entry->cmd; - resp = smr_get_ptr(smr, cmd->msg.hdr.src_data); + if (pend->bytes_done == pend->cmd->hdr.size && + pend->cmd->hdr.op == ofi_op_read_req) { + ret = smr_complete_tx(ep, pend->comp_ctx, pend->cmd->hdr.op, + pend->comp_flags); + if (ret) + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process tx completion\n"); + + smr_peer_data(ep->region)[pend->cmd->hdr.tx_id].sar_status = + SMR_SAR_FREE; + ofi_buf_free(pend); + return; + } - assert(resp->status == SMR_STATUS_BUSY); - resp->status = (dsa_cmd_context->dir == OFI_COPY_IOV_TO_BUF ? - SMR_STATUS_SAR_FULL : SMR_STATUS_SAR_EMPTY); + smr_peer_data(ep->region)[pend->cmd->hdr.tx_id].sar_status = + SMR_SAR_READY; } -static void dsa_update_sar_entry(struct smr_region *smr, - struct dsa_cmd_context *dsa_cmd_context) +static void dsa_complete_rx_work(struct smr_ep *ep, struct smr_pend_entry *pend) { - struct smr_pend_entry *sar_entry = dsa_cmd_context->entry_ptr; - struct smr_region *peer_smr; - struct smr_resp *resp; - struct smr_cmd *cmd; - - sar_entry->bytes_done += dsa_cmd_context->bytes_in_progress; - cmd = &sar_entry->cmd; - peer_smr = smr_peer_region(smr, cmd->msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); + int ret; - assert(resp->status == SMR_STATUS_BUSY); - resp->status = (dsa_cmd_context->dir == OFI_COPY_IOV_TO_BUF ? 
- SMR_STATUS_SAR_FULL : SMR_STATUS_SAR_EMPTY); + if (pend->bytes_done == pend->cmd->hdr.size) { + ret = smr_complete_rx(ep, pend->comp_ctx, pend->cmd->hdr.op, + pend->comp_flags, pend->bytes_done, + pend->iov[0].iov_base, + pend->cmd->hdr.rx_id, pend->cmd->hdr.tag, + pend->cmd->hdr.cq_data); + if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process rx completion\n"); + } + if (pend->rx.rx_entry) + ep->srx->owner_ops->free_entry(pend->rx.rx_entry); + } + smr_return_cmd(ep, pend->cmd); } -static void dsa_process_complete_work(struct smr_region *smr, - struct dsa_cmd_context *dsa_cmd_context, - struct smr_dsa_context *dsa_context) +static void dsa_process_complete_work(struct smr_ep *ep, + struct dsa_cmd_context *cmd_ctx) { - if (dsa_cmd_context->op == ofi_op_read_req) { - if (dsa_cmd_context->dir == OFI_COPY_BUF_TO_IOV) - dsa_update_tx_entry(smr, dsa_cmd_context); - else - dsa_update_sar_entry(smr, dsa_cmd_context); - } else { - if (dsa_cmd_context->dir == OFI_COPY_IOV_TO_BUF) - dsa_update_tx_entry(smr, dsa_cmd_context); - else - dsa_update_sar_entry(smr, dsa_cmd_context); - } + cmd_ctx->pend->bytes_done += cmd_ctx->bytes_in_progress; + + if (cmd_ctx->pend->type == SMR_RX_ENTRY) + dsa_complete_rx_work(ep, cmd_ctx->pend); + else + dsa_complete_tx_work(ep, cmd_ctx->pend); - dsa_free_cmd_context(dsa_cmd_context, dsa_context); + dsa_free_cmd(cmd_ctx, ep->dsa_context); } -static inline void -dsa_page_fault_debug_info(struct dsa_cmd_context *dsa_cmd_context, - struct dsa_completion_record *dsa_work_comp) +static inline void dsa_page_fault_debug_info(struct dsa_cmd_context *cmd_ctx, + struct dsa_completion_record *comp) { - FI_TRACE( - &smr_prov, FI_LOG_EP_CTRL, + FI_TRACE(&smr_prov, FI_LOG_EP_CTRL, "handle_page_fault read_fault %d\ write_fault %d addr %p dir: %d cmd_idx: %d\n", - !(dsa_work_comp->status & DSA_COMP_STATUS_WRITE), - dsa_work_comp->status & DSA_COMP_STATUS_WRITE, - (void *)dsa_work_comp->fault_addr, - dsa_cmd_context->dir, dsa_cmd_context->index); + !(comp->status & DSA_COMP_STATUS_WRITE), + comp->status & DSA_COMP_STATUS_WRITE, + (void *)comp->fault_addr, + cmd_ctx->pend->sar_dir, cmd_ctx->index); } -static bool dsa_check_cmd_status(struct smr_dsa_context *dsa_context, - struct dsa_cmd_context *dsa_cmd_context) +static bool dsa_check_cmd_status(struct smr_dsa_context *dsa_ctx, + struct dsa_cmd_context *cmd_ctx) { int i; struct dsa_hw_desc *dsa_work; - struct dsa_completion_record *dsa_work_comp; - bool dsa_cmd_completed = true; - uint8_t status_value = 0; - dsa_work = dsa_get_work_descriptor_array_ptr(dsa_cmd_context, - dsa_context); - dsa_work_comp = - dsa_get_work_completion_array_ptr(dsa_cmd_context, dsa_context); - - for (i = 0; i < dsa_cmd_context->batch_size; i++) { - status_value = dsa_work_comp[i].status & DSA_COMP_STATUS_MASK; - - switch (status_value) { + struct dsa_completion_record *comp; + bool cmd_completed = true; + uint8_t status = 0; + + dsa_work = dsa_get_desc(cmd_ctx, dsa_ctx); + comp = dsa_get_comp_ptr(cmd_ctx, dsa_ctx); + + for (i = 0; i < cmd_ctx->batch_size; i++) { + status = comp[i].status & DSA_COMP_STATUS_MASK; + + switch (status) { case DSA_COMP_SUCCESS: break; case DSA_COMP_NONE: - dsa_cmd_completed = false; + cmd_completed = false; break; case DSA_COMP_PAGE_FAULT_NOBOF: - dsa_page_fault_debug_info(dsa_cmd_context, - &dsa_work_comp[i]); - dsa_process_partially_completed_desc(dsa_context, - &dsa_work[i]); - dsa_context->page_fault_stats[dsa_cmd_context->dir]++; - dsa_cmd_completed = false; + dsa_page_fault_debug_info(cmd_ctx, &comp[i]); + 
dsa_process_partial_copy(dsa_ctx, &dsa_work[i]); + dsa_ctx->page_fault_stats[cmd_ctx->pend->sar_dir]++; + cmd_completed = false; break; default: FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Unhandled status codes: 0x%x\n", - status_value); + "Unhandled status code: 0x%x\n", status); assert(0); } } - return dsa_cmd_completed; + return cmd_completed; } /* SMR functions */ - void smr_dsa_init(void) { libdsa_handle = dlopen("libaccel-config.so", RTLD_NOW); @@ -755,24 +739,24 @@ void smr_dsa_init(void) return; } - dsa_ops.accfg_wq_get_device = dlsym(libdsa_handle, - "accfg_wq_get_device"); + dsa_ops.accfg_wq_get_device = + dlsym(libdsa_handle, "accfg_wq_get_device"); if (!dsa_ops.accfg_wq_get_device) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_wq_get_device\n"); goto err_dlclose; } - dsa_ops.accfg_device_get_cdev_major = dlsym(libdsa_handle, - "accfg_device_get_cdev_major"); + dsa_ops.accfg_device_get_cdev_major = + dlsym(libdsa_handle, "accfg_device_get_cdev_major"); if (!dsa_ops.accfg_device_get_cdev_major) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_device_get_cdev_major\n"); goto err_dlclose; } - dsa_ops.accfg_wq_get_cdev_minor = dlsym(libdsa_handle, - "accfg_wq_get_cdev_minor"); + dsa_ops.accfg_wq_get_cdev_minor = + dlsym(libdsa_handle, "accfg_wq_get_cdev_minor"); if (!dsa_ops.accfg_wq_get_cdev_minor) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_wq_get_cdev_minor\n"); goto err_dlclose; } @@ -785,25 +769,24 @@ void smr_dsa_init(void) goto err_dlclose; } - - dsa_ops.accfg_device_get_state = dlsym(libdsa_handle, - "accfg_device_get_state"); + dsa_ops.accfg_device_get_state = + dlsym(libdsa_handle, "accfg_device_get_state"); if (!dsa_ops.accfg_device_get_state) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_device_get_state\n"); goto err_dlclose; } - dsa_ops.accfg_device_get_gen_cap = dlsym(libdsa_handle, - "accfg_device_get_gen_cap"); + dsa_ops.accfg_device_get_gen_cap = + dlsym(libdsa_handle, "accfg_device_get_gen_cap"); if (!dsa_ops.accfg_device_get_gen_cap) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_device_get_gen_cap\n"); goto err_dlclose; } - dsa_ops.accfg_device_get_numa_node = dlsym(libdsa_handle, - "accfg_device_get_numa_node"); + dsa_ops.accfg_device_get_numa_node = + dlsym(libdsa_handle, "accfg_device_get_numa_node"); if (!dsa_ops.accfg_device_get_numa_node) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_device_get_numa_node\n"); goto err_dlclose; } @@ -817,8 +800,8 @@ void smr_dsa_init(void) goto err_dlclose; } - dsa_ops.accfg_wq_get_max_transfer_size = dlsym(libdsa_handle, - "accfg_wq_get_max_transfer_size"); + dsa_ops.accfg_wq_get_max_transfer_size = + dlsym(libdsa_handle, "accfg_wq_get_max_transfer_size"); if (!dsa_ops.accfg_wq_get_max_transfer_size) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_wq_get_max_transfer_size\n"); goto err_dlclose; } @@ -839,8 +822,8 @@ void smr_dsa_init(void) goto err_dlclose; } - dsa_ops.accfg_wq_get_devname = dlsym(libdsa_handle, - "accfg_wq_get_devname"); + dsa_ops.accfg_wq_get_devname = + dlsym(libdsa_handle, "accfg_wq_get_devname"); if (!dsa_ops.accfg_wq_get_devname) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_wq_get_devname\n"); @@ -849,7 +832,8 @@ void smr_dsa_init(void) dsa_ops.accfg_unref = dlsym(libdsa_handle, "accfg_unref"); if (!dsa_ops.accfg_unref) { - FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_unref\n"); + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find accfg_unref\n"); goto err_dlclose; } @@ -867,16 +851,16 @@ void smr_dsa_init(void) goto err_dlclose; } - dsa_ops.accfg_device_get_first = 
dlsym(libdsa_handle, - "accfg_device_get_first"); + dsa_ops.accfg_device_get_first = + dlsym(libdsa_handle, "accfg_device_get_first"); if (!dsa_ops.accfg_device_get_first) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_device_get_first\n"); goto err_dlclose; } - dsa_ops.accfg_device_get_next = dlsym(libdsa_handle, - "accfg_device_get_next"); + dsa_ops.accfg_device_get_next = + dlsym(libdsa_handle, "accfg_device_get_next"); if (!dsa_ops.accfg_device_get_next) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find accfg_device_get_next\n"); @@ -911,7 +895,7 @@ void smr_dsa_context_init(struct smr_ep *ep) if (!ep->dsa_context) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "aligned_alloc failed for dsa_context\n"); + "aligned_alloc failed for dsa_context\n"); goto alloc_error; } @@ -922,7 +906,7 @@ void smr_dsa_context_init(struct smr_ep *ep) if (wq_count == 0) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "error: wq mmap and wq write not supported\n"); + "error: wq mmap and wq write not supported\n"); goto wq_get_error; } @@ -935,7 +919,7 @@ void smr_dsa_context_init(struct smr_ep *ep) dsa_context->wq_count = wq_count; FI_DBG(&smr_prov, FI_LOG_EP_CTRL, "Numa node of endpoint CPU: %d\n", - numa_node); + numa_node); return; wq_get_error: @@ -974,76 +958,24 @@ void smr_dsa_context_cleanup(struct smr_ep *ep) void smr_dsa_progress(struct smr_ep *ep) { int index; - struct dsa_cmd_context *dsa_cmd_context; + struct dsa_cmd_context *cmd_ctx; bool dsa_cmd_completed; struct smr_dsa_context *dsa_context = ep->dsa_context; if (!dsa_is_work_in_progress(ep->dsa_context)) return; - pthread_spin_lock(&ep->region->lock); for (index = 0; index < CMD_CONTEXT_COUNT; index++) { - dsa_cmd_context = dsa_get_cmd_context(dsa_context, index); + cmd_ctx = dsa_get_cmd(dsa_context, index); - if (!dsa_cmd_context) + if (!cmd_ctx) continue; - dsa_cmd_completed = dsa_check_cmd_status(dsa_context, - dsa_cmd_context); + dsa_cmd_completed = dsa_check_cmd_status(dsa_context, cmd_ctx); if (dsa_cmd_completed) - dsa_process_complete_work(ep->region, dsa_cmd_context, - dsa_context); + dsa_process_complete_work(ep, cmd_ctx); } - pthread_spin_unlock(&ep->region->lock); -} - -size_t smr_dsa_copy_to_sar(struct smr_ep *ep, struct smr_freestack *sar_pool, - struct smr_resp *resp, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, size_t *bytes_done, - void *entry_ptr) -{ - struct dsa_cmd_context *dsa_cmd_context; - - assert(smr_env.use_dsa_sar); - - if (resp->status != SMR_STATUS_SAR_EMPTY) - return -FI_EAGAIN; - - dsa_cmd_context = dsa_allocate_cmd_context(ep->dsa_context); - if (!dsa_cmd_context) - return -FI_ENOMEM; - - dsa_cmd_context->dir = OFI_COPY_IOV_TO_BUF; - dsa_cmd_context->entry_ptr = entry_ptr; - smr_dsa_copy_sar(sar_pool, ep->dsa_context, dsa_cmd_context, resp, - cmd, iov, count, bytes_done, ep->region); - - return FI_SUCCESS; -} - -size_t smr_dsa_copy_from_sar(struct smr_ep *ep, struct smr_freestack *sar_pool, - struct smr_resp *resp, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, size_t *bytes_done, - void *entry_ptr) -{ - struct dsa_cmd_context *dsa_cmd_context; - - assert(smr_env.use_dsa_sar); - - if (resp->status != SMR_STATUS_SAR_FULL) - return FI_EAGAIN; - - dsa_cmd_context = dsa_allocate_cmd_context(ep->dsa_context); - if (!dsa_cmd_context) - return -FI_ENOMEM; - - dsa_cmd_context->dir = OFI_COPY_BUF_TO_IOV; - dsa_cmd_context->entry_ptr = entry_ptr; - smr_dsa_copy_sar(sar_pool, ep->dsa_context, dsa_cmd_context, resp, - cmd, iov, count, bytes_done, ep->region); - - return FI_SUCCESS; } #else @@ 
-1051,18 +983,7 @@ size_t smr_dsa_copy_from_sar(struct smr_ep *ep, struct smr_freestack *sar_pool, void smr_dsa_init(void) {} void smr_dsa_cleanup(void) {} -size_t smr_dsa_copy_to_sar(struct smr_ep *ep, struct smr_freestack *sar_pool, - struct smr_resp *resp, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, size_t *bytes_done, - void *entry_ptr) -{ - return -FI_ENOSYS; -} - -size_t smr_dsa_copy_from_sar(struct smr_ep *ep, struct smr_freestack *sar_pool, - struct smr_resp *resp, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, size_t *bytes_done, - void *entry_ptr) +ssize_t smr_dsa_copy_sar(struct smr_ep *ep, struct smr_pend_entry *pend) { return -FI_ENOSYS; } @@ -1077,4 +998,4 @@ void smr_dsa_context_cleanup(struct smr_ep *ep) {} void smr_dsa_progress(struct smr_ep *ep) {} -#endif /* SHM_HAVE_DSA */ +#endif /* SHM_HAVE_DSA */ \ No newline at end of file diff --git a/prov/shm/src/smr_dsa.h b/prov/shm/src/smr_dsa.h index 2c51915e621..45cc859702b 100644 --- a/prov/shm/src/smr_dsa.h +++ b/prov/shm/src/smr_dsa.h @@ -39,17 +39,9 @@ extern "C" { #include "smr.h" -/* SMR FUNCTIONS FOR DSA SUPPORT */ void smr_dsa_init(void); void smr_dsa_cleanup(void); -size_t smr_dsa_copy_to_sar(struct smr_ep *ep, struct smr_freestack *sar_pool, - struct smr_resp *resp, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, size_t *bytes_done, - void *entry_ptr); -size_t smr_dsa_copy_from_sar(struct smr_ep *ep, struct smr_freestack *sar_pool, - struct smr_resp *resp, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, size_t *bytes_done, - void *entry_ptr); +ssize_t smr_dsa_copy_sar(struct smr_ep *ep, struct smr_pend_entry *pend); void smr_dsa_context_init(struct smr_ep *ep); void smr_dsa_context_cleanup(struct smr_ep *ep); void smr_dsa_progress(struct smr_ep *ep); diff --git a/prov/shm/src/smr_ep.c b/prov/shm/src/smr_ep.c index f0aa3c29ba4..2894c7c0991 100644 --- a/prov/shm/src/smr_ep.c +++ b/prov/shm/src/smr_ep.c @@ -129,8 +129,9 @@ int smr_ep_setopt(fid_t fid, int level, int optname, const void *optval, if (optname == FI_OPT_CUDA_API_PERMITTED) { if (!hmem_ops[FI_HMEM_CUDA].initialized) { FI_WARN(&smr_prov, FI_LOG_CORE, - "Cannot set option FI_OPT_CUDA_API_PERMITTED when cuda library " - "or cuda device not available\n"); + "Cannot set option FI_OPT_CUDA_API_PERMITTED " + "when the cuda library or cuda device " + "is not available\n"); return -FI_EINVAL; } @@ -164,11 +165,10 @@ static void smr_send_name(struct smr_ep *ep, int64_t id) { struct smr_region *peer_smr; struct smr_cmd_entry *ce; - struct smr_inject_buf *tx_buf; int64_t pos; int ret; - peer_smr = smr_peer_region(ep->region, id); + peer_smr = smr_peer_region(ep, id); if (smr_peer_data(ep->region)[id].name_sent) return; @@ -177,20 +177,14 @@ static void smr_send_name(struct smr_ep *ep, int64_t id) if (ret == -FI_ENOENT) return; - tx_buf = smr_get_txbuf(peer_smr); - if (!tx_buf) { - smr_cmd_queue_discard(ce, pos); - return; - } - - ce->cmd.msg.hdr.op = SMR_OP_MAX + ofi_ctrl_connreq; - ce->cmd.msg.hdr.id = id; - ce->cmd.msg.hdr.data = ep->region->pid; + ce->ptr = smr_peer_to_peer(ep, id, (uintptr_t) &ce->cmd); + ce->cmd.hdr.op = SMR_OP_MAX + ofi_ctrl_connreq; + ce->cmd.hdr.tx_id = id; + ce->cmd.hdr.cq_data = ep->region->pid; - ce->cmd.msg.hdr.src_data = smr_get_offset(peer_smr, tx_buf); - ce->cmd.msg.hdr.size = strlen(ep->name) + 1; - memcpy(tx_buf->data, ep->name, ce->cmd.msg.hdr.size); + ce->cmd.hdr.size = strlen(ep->name) + 1; + memcpy(ce->cmd.data.msg, ep->name, ce->cmd.hdr.size); 
smr_peer_data(ep->region)[id].name_sent = 1; smr_cmd_queue_commit(ce, pos); @@ -206,13 +200,13 @@ int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr) if (id < 0) return -1; - if (smr_peer_data(ep->region)[id].addr.id >= 0) + if (smr_peer_data(ep->region)[id].id >= 0) return id; - if (!ep->region->map->peers[id].region) { - ofi_spin_lock(&ep->region->map->lock); - ret = smr_map_to_region(&smr_prov, ep->region->map, id); - ofi_spin_unlock(&ep->region->map->lock); + if (!ep->map->peers[id].region) { + ofi_genlock_lock(&ep->util_ep.av->lock); + ret = smr_map_to_region(ep->map, id); + ofi_genlock_unlock(&ep->util_ep.av->lock); if (ret) return -1; } @@ -222,312 +216,173 @@ int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr) return -1; } -void smr_format_pend_resp(struct smr_tx_entry *pend, struct smr_cmd *cmd, - void *context, struct ofi_mr **mr, - const struct iovec *iov, uint32_t iov_count, - uint64_t op_flags, int64_t id, struct smr_resp *resp) +void smr_format_tx_pend(struct smr_pend_entry *pend, struct smr_cmd *cmd, + void *context, struct ofi_mr **mr, + const struct iovec *iov, uint32_t iov_count, + uint64_t op_flags) { - pend->cmd = *cmd; - pend->context = context; + pend->type = SMR_TX_ENTRY; + pend->cmd = cmd; + pend->comp_ctx = context; + pend->comp_flags = op_flags; + memcpy(pend->iov, iov, sizeof(*iov) * iov_count); pend->iov_count = iov_count; - pend->peer_id = id; - pend->op_flags = op_flags; - if (cmd->msg.hdr.op_src != smr_src_sar) { - pend->bytes_done = 0; - resp->status = FI_EBUSY; - } + pend->bytes_done = 0; if (mr) memcpy(pend->mr, mr, sizeof(*mr) * iov_count); else memset(pend->mr, 0, sizeof(*mr) * iov_count); - - resp->msg_id = (uint64_t) (uintptr_t) pend; } -void smr_generic_format(struct smr_cmd *cmd, int64_t peer_id, uint32_t op, - uint64_t tag, uint64_t data, uint64_t op_flags) +void smr_generic_format(struct smr_cmd *cmd, int64_t tx_id, int64_t rx_id, + uint32_t op, uint64_t tag, uint64_t data, + uint64_t op_flags) { - cmd->msg.hdr.op = op; - cmd->msg.hdr.op_flags = op == ofi_op_read_req ? 
SMR_RMA_REQ : 0; - cmd->msg.hdr.tag = tag; - cmd->msg.hdr.id = peer_id; - cmd->msg.hdr.data = data; + cmd->hdr.op = op; + cmd->hdr.status = 0; + cmd->hdr.op_flags = 0; + cmd->hdr.tag = tag; + cmd->hdr.tx_id = tx_id; + cmd->hdr.rx_id = rx_id; + cmd->hdr.cq_data = data; + cmd->hdr.rx_ctx = 0; if (op_flags & FI_REMOTE_CQ_DATA) - cmd->msg.hdr.op_flags |= SMR_REMOTE_CQ_DATA; - if (op_flags & FI_COMPLETION) - cmd->msg.hdr.op_flags |= SMR_TX_COMPLETION; + cmd->hdr.op_flags |= SMR_REMOTE_CQ_DATA; } static void smr_format_inline(struct smr_cmd *cmd, struct ofi_mr **mr, const struct iovec *iov, size_t count) { - cmd->msg.hdr.op_src = smr_src_inline; - cmd->msg.hdr.size = ofi_copy_from_mr_iov(cmd->msg.data.msg, - SMR_MSG_DATA_LEN, mr, - iov, count, 0); + cmd->hdr.proto = smr_proto_inline; + cmd->hdr.size = ofi_copy_from_mr_iov(cmd->data.msg, SMR_MSG_DATA_LEN, + mr, iov, count, 0); } -static void smr_format_inject(struct smr_cmd *cmd, struct ofi_mr **mr, - const struct iovec *iov, size_t count, struct smr_region *smr, - struct smr_inject_buf *tx_buf) +static void smr_format_inject(struct smr_ep *ep, struct smr_cmd *cmd, + struct smr_pend_entry *pend) { - cmd->msg.hdr.op_src = smr_src_inject; - cmd->msg.hdr.src_data = smr_get_offset(smr, tx_buf); - cmd->msg.hdr.size = ofi_copy_from_mr_iov(tx_buf->data, SMR_INJECT_SIZE, - mr, iov, count, 0); + struct smr_inject_buf *tx_buf; + + tx_buf = smr_get_inject_buf(ep->region, cmd); + + cmd->hdr.proto = smr_proto_inject; + if (cmd->hdr.op != ofi_op_read_req) { + cmd->hdr.size = ofi_copy_from_mr_iov(tx_buf->data, + SMR_INJECT_SIZE, + pend->mr, pend->iov, + pend->iov_count, 0); + pend->bytes_done = cmd->hdr.size; + } else { + cmd->hdr.size = ofi_total_iov_len(pend->iov, pend->iov_count); + pend->bytes_done = 0; + } } -static void smr_format_iov(struct smr_cmd *cmd, const struct iovec *iov, - size_t count, size_t total_len, struct smr_region *smr, - struct smr_resp *resp) +static void smr_format_iov(struct smr_cmd *cmd, struct smr_pend_entry *pend) { - cmd->msg.hdr.op_src = smr_src_iov; - cmd->msg.hdr.src_data = smr_get_offset(smr, resp); - cmd->msg.data.iov_count = count; - cmd->msg.hdr.size = total_len; - memcpy(cmd->msg.data.iov, iov, sizeof(*iov) * count); + cmd->hdr.proto = smr_proto_iov; + cmd->data.iov_count = pend->iov_count; + cmd->hdr.size = ofi_total_iov_len(pend->iov, pend->iov_count); + memcpy(cmd->data.iov, pend->iov, sizeof(*pend->iov) * pend->iov_count); } static int smr_format_ipc(struct smr_cmd *cmd, void *ptr, size_t len, - struct smr_region *smr, struct smr_resp *resp, + struct smr_region *smr, enum fi_hmem_iface iface, uint64_t device) { int ret; void *base; - cmd->msg.hdr.op_src = smr_src_ipc; - cmd->msg.hdr.src_data = smr_get_offset(smr, resp); - cmd->msg.hdr.size = len; - cmd->msg.data.ipc_info.iface = iface; - cmd->msg.data.ipc_info.device = device; + cmd->hdr.proto = smr_proto_ipc; + cmd->hdr.size = len; + cmd->data.ipc_info.iface = iface; + cmd->data.ipc_info.device = device; - ret = ofi_hmem_get_base_addr(cmd->msg.data.ipc_info.iface, ptr, + ret = ofi_hmem_get_base_addr(cmd->data.ipc_info.iface, ptr, len, &base, - &cmd->msg.data.ipc_info.base_length); + &cmd->data.ipc_info.base_length); if (ret) return ret; - ret = ofi_hmem_get_handle(cmd->msg.data.ipc_info.iface, base, - cmd->msg.data.ipc_info.base_length, - (void **)&cmd->msg.data.ipc_info.ipc_handle); + ret = ofi_hmem_get_handle(cmd->data.ipc_info.iface, base, + cmd->data.ipc_info.base_length, + (void **)&cmd->data.ipc_info.ipc_handle); if (ret) return ret; - 
cmd->msg.data.ipc_info.base_addr = (uintptr_t) base; - cmd->msg.data.ipc_info.offset = (uintptr_t) ptr - (uintptr_t) base; + cmd->data.ipc_info.base_addr = (uintptr_t) base; + cmd->data.ipc_info.offset = (uintptr_t) ptr - (uintptr_t) base; return FI_SUCCESS; } -static int smr_format_mmap(struct smr_ep *ep, struct smr_cmd *cmd, - const struct iovec *iov, size_t count, size_t total_len, - struct smr_tx_entry *pend, struct smr_resp *resp) -{ - void *mapped_ptr; - int fd, ret, num; - uint64_t msg_id; - struct smr_ep_name *map_name; - - msg_id = ep->msg_id++; - map_name = calloc(1, sizeof(*map_name)); - if (!map_name) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "calloc error\n"); - return -FI_ENOMEM; - } - - pthread_mutex_lock(&ep_list_lock); - dlist_insert_tail(&map_name->entry, &ep_name_list); - pthread_mutex_unlock(&ep_list_lock); - num = smr_mmap_name(map_name->name, ep->name, msg_id); - if (num < 0) { - FI_WARN(&smr_prov, FI_LOG_AV, "generating shm file name failed\n"); - ret = -errno; - goto remove_entry; - } - - fd = shm_open(map_name->name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); - if (fd < 0) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "shm_open error\n"); - ret = -errno; - goto remove_entry; - } - - ret = ftruncate(fd, total_len); - if (ret < 0) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "ftruncate error\n"); - goto unlink_close; - } - - mapped_ptr = mmap(NULL, total_len, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (mapped_ptr == MAP_FAILED) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "mmap error\n"); - ret = -errno; - goto unlink_close; - } - - if (cmd->msg.hdr.op != ofi_op_read_req) { - if (ofi_copy_from_iov(mapped_ptr, total_len, iov, count, 0) - != total_len) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "copy from iov error\n"); - ret = -FI_EIO; - goto munmap; - } - munmap(mapped_ptr, total_len); - } else { - pend->map_ptr = mapped_ptr; - } - - cmd->msg.hdr.op_src = smr_src_mmap; - cmd->msg.hdr.msg_id = msg_id; - cmd->msg.hdr.src_data = smr_get_offset(ep->region, resp); - cmd->msg.hdr.size = total_len; - pend->map_name = map_name; - - close(fd); - return 0; - -munmap: - munmap(mapped_ptr, total_len); -unlink_close: - shm_unlink(map_name->name); - close(fd); -remove_entry: - dlist_remove(&map_name->entry); - free(map_name); - return ret; -} - -size_t smr_copy_to_sar(struct smr_freestack *sar_pool, struct smr_resp *resp, - struct smr_cmd *cmd, struct ofi_mr **mr, - const struct iovec *iov, size_t count, - size_t *bytes_done) -{ - struct smr_sar_buf *sar_buf; - size_t start = *bytes_done; - int next_sar_buf = 0; - - if (resp->status != SMR_STATUS_SAR_EMPTY) - return 0; - - while ((*bytes_done < cmd->msg.hdr.size) && - (next_sar_buf < cmd->msg.data.buf_batch_size)) { - sar_buf = smr_freestack_get_entry_from_index( - sar_pool, cmd->msg.data.sar[next_sar_buf]); - - *bytes_done += ofi_copy_from_mr_iov( - sar_buf->buf, SMR_SAR_SIZE, mr, iov, count, - *bytes_done); - - next_sar_buf++; - } - - ofi_wmb(); - - resp->status = SMR_STATUS_SAR_FULL; - - return *bytes_done - start; -} - -size_t smr_copy_from_sar(struct smr_freestack *sar_pool, struct smr_resp *resp, - struct smr_cmd *cmd, struct ofi_mr **mr, - const struct iovec *iov, size_t count, - size_t *bytes_done) +ssize_t smr_copy_sar(struct smr_ep *ep, struct smr_pend_entry *pend) { + struct smr_freestack *sar_pool; struct smr_sar_buf *sar_buf; - size_t start = *bytes_done; int next_sar_buf = 0; - if (resp->status != SMR_STATUS_SAR_FULL) - return 0; - - while ((*bytes_done < cmd->msg.hdr.size) && - (next_sar_buf < cmd->msg.data.buf_batch_size)) { + sar_pool = 
smr_pend_sar_pool(ep, pend); + while (pend->bytes_done < pend->cmd->hdr.size && + next_sar_buf < pend->cmd->data.buf_batch_size) { sar_buf = smr_freestack_get_entry_from_index( - sar_pool, cmd->msg.data.sar[next_sar_buf]); - - *bytes_done += ofi_copy_to_mr_iov(mr, iov, count, *bytes_done, - sar_buf->buf, SMR_SAR_SIZE); + sar_pool, pend->cmd->data.sar[next_sar_buf]); + pend->bytes_done += ofi_copy_mr_iov( + pend->mr, pend->iov, pend->iov_count, + pend->bytes_done, sar_buf->buf, + SMR_SAR_SIZE, pend->sar_dir); next_sar_buf++; } - ofi_wmb(); - - resp->status = SMR_STATUS_SAR_EMPTY; - return *bytes_done - start; + return FI_SUCCESS; } static int smr_format_sar(struct smr_ep *ep, struct smr_cmd *cmd, - struct ofi_mr **mr, const struct iovec *iov, size_t count, - size_t total_len, struct smr_region *smr, - struct smr_region *peer_smr, int64_t id, - struct smr_tx_entry *pending, struct smr_resp *resp) + struct ofi_mr **mr, const struct iovec *iov, + size_t count, size_t total_len, + struct smr_region *smr, struct smr_region *peer_smr, + struct smr_pend_entry *pend) { int i, ret; - uint32_t sar_needed; - if (peer_smr->max_sar_buf_per_peer == 0) + if (ep->region->max_sar_buf_per_peer == 0 || + smr_peer_data(ep->region)[cmd->hdr.tx_id].sar_status) return -FI_EAGAIN; - if (smr_peer_data(ep->region)[id].sar_status) { - return -FI_EAGAIN; - } - - sar_needed = (total_len + SMR_SAR_SIZE - 1) / SMR_SAR_SIZE; - cmd->msg.data.buf_batch_size = MIN(SMR_BUF_BATCH_MAX, - MIN(peer_smr->max_sar_buf_per_peer, sar_needed)); - - pthread_spin_lock(&peer_smr->lock); - for (i = 0; i < cmd->msg.data.buf_batch_size; i++) { - if (smr_freestack_isempty(smr_sar_pool(peer_smr))) { - cmd->msg.data.buf_batch_size = i; - if (i == 0) { - pthread_spin_unlock(&peer_smr->lock); - return -FI_EAGAIN; - } - break; - } + cmd->data.buf_batch_size = MIN3( + ep->region->max_sar_buf_per_peer, + (total_len + SMR_SAR_SIZE - 1) / SMR_SAR_SIZE, + smr_freestack_avail(smr_sar_pool(ep->region))); - cmd->msg.data.sar[i] = - smr_freestack_pop_by_index(smr_sar_pool(peer_smr)); + for (i = 0; i < cmd->data.buf_batch_size; i++) { + cmd->data.sar[i] = + smr_freestack_pop_by_index(smr_sar_pool(ep->region)); } - pthread_spin_unlock(&peer_smr->lock); - - resp->status = SMR_STATUS_SAR_EMPTY; - cmd->msg.hdr.op_src = smr_src_sar; - cmd->msg.hdr.src_data = smr_get_offset(smr, resp); - cmd->msg.hdr.size = total_len; - pending->bytes_done = 0; - /* Nothing to copy for 0 byte transfer */ - if (!cmd->msg.hdr.size) - goto out; + cmd->hdr.proto = smr_proto_sar; + cmd->hdr.size = total_len; - if (cmd->msg.hdr.op != ofi_op_read_req) { - if (smr_env.use_dsa_sar && ofi_mr_all_host(mr, count)) { - ret = smr_dsa_copy_to_sar(ep, smr_sar_pool(peer_smr), - resp, cmd, iov, count, - &pending->bytes_done, pending); - if (ret != FI_SUCCESS) { - for (i = cmd->msg.data.buf_batch_size - 1; - i >= 0; i--) { - smr_freestack_push_by_index( - smr_sar_pool(peer_smr), - cmd->msg.data.sar[i]); - } - return -FI_EAGAIN; + if (cmd->hdr.op != ofi_op_read_req) { + ret = pend->sar_copy_fn(ep, pend); + if (ret < 0 && ret != -FI_EBUSY) { + for (i = cmd->data.buf_batch_size - 1; i >= 0; i--) { + smr_freestack_push_by_index( + smr_sar_pool(ep->region), + cmd->data.sar[i]); } - } else { - smr_copy_to_sar(smr_sar_pool(peer_smr), resp, cmd, - mr, iov, count, &pending->bytes_done); + return -FI_EAGAIN; } + smr_peer_data(ep->region)[cmd->hdr.tx_id].sar_status = + SMR_SAR_BUSY; + } else { + smr_peer_data(ep->region)[cmd->hdr.tx_id].sar_status = + SMR_SAR_READY; } -out: - smr_peer_data(smr)[id].sar_status 
= SMR_STATUS_SAR_FULL; + return FI_SUCCESS; } @@ -555,204 +410,161 @@ int smr_select_proto(void **desc, size_t iov_count, bool vma_avail, } if (op == ofi_op_read_req) { + if (total_len <= SMR_INJECT_SIZE) + return smr_proto_inject; if (use_ipc) - return smr_src_ipc; + return smr_proto_ipc; if (vma_avail && FI_HMEM_SYSTEM == iface) - return smr_src_iov; - return smr_src_sar; + return smr_proto_iov; + return smr_proto_sar; } if (fastcopy_avail && total_len <= smr_env.max_gdrcopy_size) - return total_len <= SMR_MSG_DATA_LEN ? smr_src_inline : - smr_src_inject; + return total_len <= SMR_MSG_DATA_LEN ? smr_proto_inline : + smr_proto_inject; - if (op_flags & FI_INJECT) { + if (op_flags & FI_INJECT || total_len <= SMR_INJECT_SIZE) { if (op_flags & FI_DELIVERY_COMPLETE) - return smr_src_sar; + return smr_proto_inject; return total_len <= SMR_MSG_DATA_LEN ? - smr_src_inline : smr_src_inject; + smr_proto_inline : smr_proto_inject; } if (use_ipc) - return smr_src_ipc; - - if (total_len > SMR_INJECT_SIZE && vma_avail) - return smr_src_iov; + return smr_proto_ipc; - if (op_flags & FI_DELIVERY_COMPLETE) - return smr_src_sar; - - if (total_len <= SMR_MSG_DATA_LEN) - return smr_src_inline; - - if (total_len <= SMR_INJECT_SIZE) - return smr_src_inject; - - if (total_len <= smr_env.sar_threshold) - return smr_src_sar; - - return smr_src_mmap; + return vma_avail ? smr_proto_iov: smr_proto_sar; } -static ssize_t smr_do_inline(struct smr_ep *ep, struct smr_region *peer_smr, int64_t id, - int64_t peer_id, uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags, struct ofi_mr **desc, - const struct iovec *iov, size_t iov_count, size_t total_len, - void *context, struct smr_cmd *cmd) +static ssize_t smr_do_inline(struct smr_ep *ep, struct smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, + uint64_t tag, uint64_t data, uint64_t op_flags, + struct ofi_mr **desc, const struct iovec *iov, + size_t iov_count, size_t total_len, void *context, + struct smr_cmd *cmd) { - smr_generic_format(cmd, peer_id, op, tag, data, op_flags); + cmd->hdr.tx_ctx = 0; + smr_generic_format(cmd, tx_id, rx_id, op, tag, data, op_flags); smr_format_inline(cmd, desc, iov, iov_count); return FI_SUCCESS; } -static ssize_t smr_do_inject(struct smr_ep *ep, struct smr_region *peer_smr, int64_t id, - int64_t peer_id, uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags, struct ofi_mr **desc, - const struct iovec *iov, size_t iov_count, size_t total_len, - void *context, struct smr_cmd *cmd) +static ssize_t smr_do_inject(struct smr_ep *ep, struct smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, + uint64_t tag, uint64_t data, uint64_t op_flags, + struct ofi_mr **desc, const struct iovec *iov, + size_t iov_count, size_t total_len, void *context, + struct smr_cmd *cmd) { - struct smr_inject_buf *tx_buf; + struct smr_pend_entry *pend; - tx_buf = smr_get_txbuf(peer_smr); - if (!tx_buf) - return -FI_EAGAIN; + pend = ofi_buf_alloc(ep->pend_pool); + assert(pend); - smr_generic_format(cmd, peer_id, op, tag, data, op_flags); - smr_format_inject(cmd, desc, iov, iov_count, peer_smr, tx_buf); + cmd->hdr.tx_ctx = (uintptr_t) pend; + smr_format_tx_pend(pend, cmd, context, desc, iov, iov_count, op_flags); + + smr_generic_format(cmd, tx_id, rx_id, op, tag, data, op_flags); + smr_format_inject(ep, cmd, pend); return FI_SUCCESS; } -static ssize_t smr_do_iov(struct smr_ep *ep, struct smr_region *peer_smr, int64_t id, - int64_t peer_id, uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags, struct ofi_mr **desc, - 
const struct iovec *iov, size_t iov_count, size_t total_len, - void *context, struct smr_cmd *cmd) +static ssize_t smr_do_iov(struct smr_ep *ep, struct smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, + uint64_t tag, uint64_t data, uint64_t op_flags, + struct ofi_mr **desc, const struct iovec *iov, + size_t iov_count, size_t total_len, void *context, + struct smr_cmd *cmd) { - struct smr_resp *resp; - struct smr_tx_entry *pend; + struct smr_pend_entry *pend; - if (ofi_cirque_isfull(smr_resp_queue(ep->region))) - return -FI_EAGAIN; + pend = ofi_buf_alloc(ep->pend_pool); + assert(pend); - resp = ofi_cirque_next(smr_resp_queue(ep->region)); - pend = ofi_freestack_pop(ep->tx_fs); + cmd->hdr.tx_ctx = (uintptr_t) pend; + smr_format_tx_pend(pend, cmd, context, desc, iov, iov_count, op_flags); - smr_generic_format(cmd, peer_id, op, tag, data, op_flags); - smr_format_iov(cmd, iov, iov_count, total_len, ep->region, resp); - smr_format_pend_resp(pend, cmd, context, desc, iov, - iov_count, op_flags, id, resp); - ofi_cirque_commit(smr_resp_queue(ep->region)); + smr_generic_format(cmd, tx_id, rx_id, op, tag, data, op_flags); + smr_format_iov(cmd, pend); return FI_SUCCESS; } -static ssize_t smr_do_sar(struct smr_ep *ep, struct smr_region *peer_smr, int64_t id, - int64_t peer_id, uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags, struct ofi_mr **desc, - const struct iovec *iov, size_t iov_count, size_t total_len, - void *context, struct smr_cmd *cmd) +static ssize_t smr_do_sar(struct smr_ep *ep, struct smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, + uint64_t tag, uint64_t data, uint64_t op_flags, + struct ofi_mr **desc, const struct iovec *iov, + size_t iov_count, size_t total_len, void *context, + struct smr_cmd *cmd) { - struct smr_resp *resp; - struct smr_tx_entry *pend; + struct smr_pend_entry *pend; int ret; - if (ofi_cirque_isfull(smr_resp_queue(ep->region))) - return -FI_EAGAIN; + pend = ofi_buf_alloc(ep->pend_pool); + assert(pend); - resp = ofi_cirque_next(smr_resp_queue(ep->region)); - pend = ofi_freestack_pop(ep->tx_fs); + cmd->hdr.tx_ctx = (uintptr_t) pend; + smr_format_tx_pend(pend, cmd, context, desc, iov, iov_count, op_flags); - smr_generic_format(cmd, peer_id, op, tag, data, op_flags); - ret = smr_format_sar(ep, cmd, desc, iov, iov_count, total_len, - ep->region, peer_smr, id, pend, resp); - if (ret) { - ofi_freestack_push(ep->tx_fs, pend); - return ret; - } + pend->sar_dir = op == ofi_op_read_req ? 
+ OFI_COPY_BUF_TO_IOV : OFI_COPY_IOV_TO_BUF; - smr_format_pend_resp(pend, cmd, context, desc, iov, - iov_count, op_flags, id, resp); - ofi_cirque_commit(smr_resp_queue(ep->region)); + if (smr_env.use_dsa_sar && ofi_mr_all_host(pend->mr, pend->iov_count)) + pend->sar_copy_fn = &smr_dsa_copy_sar; + else + pend->sar_copy_fn = &smr_copy_sar; - return FI_SUCCESS; + smr_generic_format(cmd, tx_id, rx_id, op, tag, data, op_flags); + ret = smr_format_sar(ep, cmd, desc, iov, iov_count, total_len, + ep->region, peer_smr, pend); + if (ret) + ofi_buf_free(pend); + + return ret; } -static ssize_t smr_do_ipc(struct smr_ep *ep, struct smr_region *peer_smr, int64_t id, - int64_t peer_id, uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags, struct ofi_mr **desc, - const struct iovec *iov, size_t iov_count, size_t total_len, - void *context, struct smr_cmd *cmd) +static ssize_t smr_do_ipc(struct smr_ep *ep, struct smr_region *peer_smr, + int64_t tx_id, int64_t rx_id, uint32_t op, + uint64_t tag, uint64_t data, uint64_t op_flags, + struct ofi_mr **desc, const struct iovec *iov, + size_t iov_count, size_t total_len, void *context, + struct smr_cmd *cmd) { - struct smr_resp *resp; - struct smr_tx_entry *pend; + struct smr_pend_entry *pend; int ret = -FI_EAGAIN; - if (ofi_cirque_isfull(smr_resp_queue(ep->region))) - return -FI_EAGAIN; + pend = ofi_buf_alloc(ep->pend_pool); + assert(pend); - resp = ofi_cirque_next(smr_resp_queue(ep->region)); - pend = ofi_freestack_pop(ep->tx_fs); - - smr_generic_format(cmd, peer_id, op, tag, data, op_flags); + cmd->hdr.tx_ctx = (uintptr_t) pend; + smr_generic_format(cmd, tx_id, rx_id, op, tag, data, op_flags); assert(iov_count == 1 && desc && desc[0]); ret = smr_format_ipc(cmd, iov[0].iov_base, total_len, ep->region, - resp, desc[0]->iface, desc[0]->device); + desc[0]->iface, desc[0]->device); if (ret) { FI_WARN_ONCE(&smr_prov, FI_LOG_EP_CTRL, - "unable to use IPC for msg, fallback to using SAR\n"); - ofi_freestack_push(ep->tx_fs, pend); - return smr_do_sar(ep, peer_smr, id, peer_id, op, tag, data, + "unable to use IPC for msg, " + "fallback to using SAR\n"); + ofi_buf_free(pend); + return smr_do_sar(ep, peer_smr, tx_id, rx_id, op, tag, data, op_flags, desc, iov, iov_count, total_len, context, cmd); } - smr_format_pend_resp(pend, cmd, context, desc, iov, - iov_count, op_flags, id, resp); - ofi_cirque_commit(smr_resp_queue(ep->region)); + smr_format_tx_pend(pend, cmd, context, desc, iov, iov_count, op_flags); return FI_SUCCESS; } -static ssize_t smr_do_mmap(struct smr_ep *ep, struct smr_region *peer_smr, int64_t id, - int64_t peer_id, uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags, struct ofi_mr **desc, - const struct iovec *iov, size_t iov_count, size_t total_len, - void *context, struct smr_cmd *cmd) -{ - struct smr_resp *resp; - struct smr_tx_entry *pend; - int ret; - - if (ofi_cirque_isfull(smr_resp_queue(ep->region))) - return -FI_EAGAIN; - - resp = ofi_cirque_next(smr_resp_queue(ep->region)); - pend = ofi_freestack_pop(ep->tx_fs); - - smr_generic_format(cmd, peer_id, op, tag, data, op_flags); - ret = smr_format_mmap(ep, cmd, iov, iov_count, total_len, pend, resp); - if (ret) { - ofi_freestack_push(ep->tx_fs, pend); - return ret; - } - - smr_format_pend_resp(pend, cmd, context, desc, iov, - iov_count, op_flags, id, resp); - ofi_cirque_commit(smr_resp_queue(ep->region)); - - return FI_SUCCESS; -} - -smr_proto_func smr_proto_ops[smr_src_max] = { - [smr_src_inline] = &smr_do_inline, - [smr_src_inject] = &smr_do_inject, - [smr_src_iov] = &smr_do_iov, - 
[smr_src_mmap] = &smr_do_mmap, - [smr_src_sar] = &smr_do_sar, - [smr_src_ipc] = &smr_do_ipc, +smr_send_func smr_send_ops[smr_proto_max] = { + [smr_proto_inline] = &smr_do_inline, + [smr_proto_inject] = &smr_do_inject, + [smr_proto_iov] = &smr_do_iov, + [smr_proto_sar] = &smr_do_sar, + [smr_proto_ipc] = &smr_do_ipc, }; static int smr_ep_close(struct fid *fid) @@ -781,10 +593,8 @@ static int smr_ep_close(struct fid *fid) if (ep->unexp_buf_pool) ofi_bufpool_destroy(ep->unexp_buf_pool); - if (ep->pend_buf_pool) - ofi_bufpool_destroy(ep->pend_buf_pool); - - smr_tx_fs_free(ep->tx_fs); + if (ep->pend_pool) + ofi_bufpool_destroy(ep->pend_pool); free((void *)ep->name); free(ep); @@ -820,7 +630,8 @@ static int smr_ep_bind_cq(struct smr_ep *ep, struct util_cq *cq, uint64_t flags) return ret; } -static int smr_ep_bind_cntr(struct smr_ep *ep, struct util_cntr *cntr, uint64_t flags) +static int smr_ep_bind_cntr(struct smr_ep *ep, struct util_cntr *cntr, + uint64_t flags) { int ret; @@ -841,14 +652,25 @@ static int smr_ep_bind_cntr(struct smr_ep *ep, struct util_cntr *cntr, uint64_t static int smr_discard(struct fi_peer_rx_entry *rx_entry) { struct smr_cmd_ctx *cmd_ctx = rx_entry->peer_context; - struct smr_region *peer_smr; - struct smr_resp *resp; + struct smr_unexp_buf *sar_buf; - if (cmd_ctx->cmd.msg.hdr.src_data >= smr_src_iov) { - peer_smr = smr_peer_region(cmd_ctx->ep->region, - cmd_ctx->cmd.msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd_ctx->cmd.msg.hdr.src_data); - resp->status = SMR_STATUS_SUCCESS; + switch (cmd_ctx->cmd->hdr.proto) { + case smr_proto_inline: + break; + case smr_proto_sar: + while (!slist_empty(&cmd_ctx->buf_list)) { + slist_remove_head_container( + &cmd_ctx->buf_list, + struct smr_unexp_buf, sar_buf, + entry); + ofi_buf_free(sar_buf); + } + break; + case smr_proto_inject: + case smr_proto_iov: + case smr_proto_ipc: + smr_return_cmd(cmd_ctx->ep, cmd_ctx->cmd); + break; } ofi_buf_free(cmd_ctx); @@ -877,6 +699,7 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) int ret = 0; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); + switch (bfid->fclass) { case FI_CLASS_AV: av = container_of(bfid, struct util_av, av_fid.fid); @@ -886,6 +709,7 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) "duplicate AV binding\n"); return -FI_EINVAL; } + ep->map = &container_of(av, struct smr_av, util_av)->smr_map; break; case FI_CLASS_CQ: ret = smr_ep_bind_cq(ep, container_of(bfid, struct util_cq, @@ -898,7 +722,7 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) struct util_cntr, cntr_fid.fid), flags); break; case FI_CLASS_SRX_CTX: - ep->srx = (container_of(bfid, struct smr_domain, rx_ep.fid))->srx; + ep->srx = container_of(bfid, struct smr_domain, rx_ep.fid)->srx; break; default: FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "invalid fid class\n"); @@ -908,17 +732,26 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) return ret; } +static void smr_ep_map_all_peers(struct smr_ep *ep) +{ + int64_t i; + + ofi_genlock_lock(&ep->util_ep.av->lock); + for (i = 0; i < SMR_MAX_PEERS; i++) + smr_map_to_endpoint(ep, i); + + ofi_genlock_unlock(&ep->util_ep.av->lock); +} + static int smr_ep_ctrl(struct fid *fid, int command, void *arg) { struct smr_attr attr; struct smr_domain *domain; struct smr_ep *ep; - struct smr_av *av; struct fid_ep *srx; int ret; ep = container_of(fid, struct smr_ep, util_ep.ep_fid.fid); - av = container_of(ep->util_ep.av, struct smr_av, util_av); switch (command) { 
case FI_ENABLE: @@ -933,26 +766,37 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) attr.tx_count = ep->tx_size; attr.flags = ep->util_ep.caps & FI_HMEM ? SMR_FLAG_HMEM_ENABLED : 0; + attr.flags |= smr_env.use_xpmem ? SMR_FLAG_XPMEM_ENABLED : 0; - ret = smr_create(&smr_prov, &av->smr_map, &attr, &ep->region); + ret = smr_create(&smr_prov, &attr, &ep->region); if (ret) return ret; if (ep->util_ep.caps & FI_HMEM || smr_env.disable_cma) { - ep->region->cma_cap_peer = SMR_VMA_CAP_OFF; - ep->region->cma_cap_self = SMR_VMA_CAP_OFF; + smr_set_vma_cap(&ep->region->peer_vma_caps, + FI_SHM_P2P_CMA, false); + smr_set_vma_cap(&ep->region->self_vma_caps, + FI_SHM_P2P_CMA, false); + ep->region->flags |= SMR_FLAG_CMA_INIT; } - if (ofi_hmem_any_ipc_enabled()) - ep->smr_progress_ipc_list = smr_progress_ipc_list; - else - ep->smr_progress_ipc_list = smr_progress_ipc_list_noop; + if (ofi_hmem_any_ipc_enabled()) { + ep->smr_progress_async = smr_progress_async; + } else { +#if SHM_HAVE_DSA + ep->smr_progress_async = smr_progress_async; +#else + ep->smr_progress_async = + smr_progress_async_noop; +#endif + } if (!ep->srx) { domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain.domain_fid); - ret = util_ep_srx_context(&domain->util_domain, + ret = util_ep_srx_context( + &domain->util_domain, ep->rx_size, SMR_IOV_LIMIT, ep->min_multi_recv_size, &smr_update, &ep->util_ep.lock, &srx); @@ -972,16 +816,16 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) ep->util_ep.ep_fid.msg = &smr_no_recv_msg_ops; ep->util_ep.ep_fid.tagged = &smr_no_recv_tag_ops; } - smr_exchange_all_peers(ep->region); + smr_ep_map_all_peers(ep); if (smr_env.use_dsa_sar) smr_dsa_context_init(ep); /* if XPMEM is on after exchanging peer info, then set the - * endpoint p2p to XPMEM so it can be used on the fast - * path + * endpoint p2p to XPMEM so it can be used on the fast path */ - if (ep->region->xpmem_cap_self == SMR_VMA_CAP_ON) + if (smr_get_vma_cap(ep->region->self_vma_caps, + FI_SHM_P2P_XPMEM)) ep->p2p_type = FI_SHM_P2P_XPMEM; break; @@ -1060,13 +904,14 @@ static int smr_create_pools(struct smr_ep *ep, struct fi_info *info) if (ret) goto free2; - ret = ofi_bufpool_create(&ep->pend_buf_pool, + ret = ofi_bufpool_create(&ep->pend_pool, sizeof(struct smr_pend_entry), - 16, 0, 4, OFI_BUFPOOL_NO_TRACK); + 16, 0, ep->tx_size, OFI_BUFPOOL_NO_TRACK); if (ret) goto free1; return FI_SUCCESS; + free1: ofi_bufpool_destroy(ep->unexp_buf_pool); free2: @@ -1078,7 +923,7 @@ static int smr_create_pools(struct smr_ep *ep, struct fi_info *info) } int smr_endpoint(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep_fid, void *context) + struct fid_ep **ep_fid, void *context) { struct smr_ep *ep; int ret; @@ -1093,14 +938,15 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info, ret = smr_endpoint_name(ep, name, info->src_addr, info->src_addrlen); if (ret) goto free; + ret = smr_setname(&ep->util_ep.ep_fid.fid, name, SMR_NAME_MAX); if (ret) goto free; ep->rx_size = info->rx_attr->size; ep->tx_size = info->tx_attr->size; - ret = ofi_endpoint_init(domain, &smr_util_prov, info, &ep->util_ep, context, - smr_ep_progress); + ret = ofi_endpoint_init(domain, &smr_util_prov, info, &ep->util_ep, + context, smr_ep_progress); if (ret) goto name; @@ -1111,10 +957,8 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info, if (ret) goto ep; - ep->tx_fs = smr_tx_fs_create(info->tx_attr->size, NULL, NULL); - - dlist_init(&ep->sar_list); - dlist_init(&ep->ipc_cpy_pend_list); + 
dlist_init(&ep->async_cpy_list); + slist_init(&ep->overflow_list); ep->min_multi_recv_size = SMR_INJECT_SIZE; diff --git a/prov/shm/src/smr_fabric.c b/prov/shm/src/smr_fabric.c index fab9b2b583f..2b934746596 100644 --- a/prov/shm/src/smr_fabric.c +++ b/prov/shm/src/smr_fabric.c @@ -60,10 +60,12 @@ static int smr_fabric_close(fid_t fid) { int ret; struct util_fabric *fabric; + fabric = container_of(fid, struct util_fabric, fabric_fid.fid); ret = ofi_fabric_close(fabric); if (ret) return ret; + free(fabric); return 0; } @@ -77,24 +79,24 @@ static struct fi_ops smr_fabric_fi_ops = { }; int smr_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, - void *context) + void *context) { int ret; - struct smr_fabric *smr_fabric; + struct util_fabric *util_fabric; - smr_fabric = calloc(1, sizeof(*smr_fabric)); - if (!smr_fabric) + util_fabric = calloc(1, sizeof(*util_fabric)); + if (!util_fabric) return -FI_ENOMEM; ret = ofi_fabric_init(&smr_prov, smr_info.fabric_attr, attr, - &smr_fabric->util_fabric, context); + util_fabric, context); if (ret) { - free(smr_fabric); + free(util_fabric); return ret; } - *fabric = &smr_fabric->util_fabric.fabric_fid; + *fabric = &util_fabric->fabric_fid; (*fabric)->fid.ops = &smr_fabric_fi_ops; (*fabric)->ops = &smr_fabric_ops; return 0; -} +} \ No newline at end of file diff --git a/prov/shm/src/smr_init.c b/prov/shm/src/smr_init.c index df4472579e2..e25a5cf1b65 100644 --- a/prov/shm/src/smr_init.c +++ b/prov/shm/src/smr_init.c @@ -104,8 +104,8 @@ static int smr_shm_space_check(size_t tx_count, size_t rx_count) } shm_size_needed = num_of_core * smr_calculate_size_offsets(tx_count, rx_count, - NULL, NULL, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL, + NULL, NULL); err = statvfs(shm_fs, &stat); if (err) { FI_WARN(&smr_prov, FI_LOG_CORE, @@ -128,8 +128,7 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service, { struct fi_info *cur; uint64_t mr_mode, msg_order; - int fast_rma; - int ret; + int fast_rma, ret; mr_mode = hints && hints->domain_attr ? 
hints->domain_attr->mr_mode : FI_MR_VIRT_ADDR; @@ -141,7 +140,8 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service, if (ret) return ret; - ret = smr_shm_space_check((*info)->tx_attr->size, (*info)->rx_attr->size); + ret = smr_shm_space_check((*info)->tx_attr->size, + (*info)->rx_attr->size); if (ret) { fi_freeinfo(*info); return ret; @@ -149,15 +149,18 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service, for (cur = *info; cur; cur = cur->next) { if (!(flags & FI_SOURCE) && !cur->dest_addr) - smr_resolve_addr(node, service, (char **) &cur->dest_addr, + smr_resolve_addr(node, service, + (char **) &cur->dest_addr, &cur->dest_addrlen); if (!cur->src_addr) { if (flags & FI_SOURCE) - smr_resolve_addr(node, service, (char **) &cur->src_addr, + smr_resolve_addr(node, service, + (char **) &cur->src_addr, &cur->src_addrlen); else - smr_resolve_addr(NULL, NULL, (char **) &cur->src_addr, + smr_resolve_addr(NULL, NULL, + (char **) &cur->src_addr, &cur->src_addrlen); } if (fast_rma) { diff --git a/prov/shm/src/smr_msg.c b/prov/shm/src/smr_msg.c index 41e36cb14d2..bdf3fe57f67 100644 --- a/prov/shm/src/smr_msg.c +++ b/prov/shm/src/smr_msg.c @@ -73,28 +73,29 @@ static ssize_t smr_recv(struct fid_ep *ep_fid, void *buf, size_t len, } static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov, - void **desc, size_t iov_count, fi_addr_t addr, - uint64_t tag, uint64_t data, void *context, - uint32_t op, uint64_t op_flags) + void **desc, size_t iov_count, + fi_addr_t addr, uint64_t tag, uint64_t data, + void *context, uint32_t op, + uint64_t op_flags) { struct smr_region *peer_smr; - int64_t id, peer_id; + int64_t tx_id, rx_id, pos; ssize_t ret = 0; size_t total_len; int proto; struct smr_cmd_entry *ce; - int64_t pos; + struct smr_cmd *cmd; assert(iov_count <= SMR_IOV_LIMIT); - id = smr_verify_peer(ep, addr); - if (id < 0) + tx_id = smr_verify_peer(ep, addr); + if (tx_id < 0) return -FI_EAGAIN; - peer_id = smr_peer_data(ep->region)[id].addr.id; - peer_smr = smr_peer_region(ep->region, id); + rx_id = smr_peer_data(ep->region)[tx_id].id; + peer_smr = smr_peer_region(ep, tx_id); - if (smr_peer_data(ep->region)[id].sar_status) + if (smr_peer_data(ep->region)[tx_id].sar_status) return -FI_EAGAIN; ret = smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos); @@ -107,19 +108,36 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov, assert(!(op_flags & FI_INJECT) || total_len <= SMR_INJECT_SIZE); proto = smr_select_proto(desc, iov_count, smr_vma_enabled(ep, peer_smr), - smr_ipc_valid(ep, peer_smr, id, peer_id), op, + smr_ipc_valid(ep, peer_smr, tx_id, rx_id), op, total_len, op_flags); - ret = smr_proto_ops[proto](ep, peer_smr, id, peer_id, op, tag, data, op_flags, - (struct ofi_mr **)desc, iov, iov_count, total_len, - context, &ce->cmd); + if (proto != smr_proto_inline) { + if (smr_freestack_isempty(smr_cmd_stack(ep->region))) { + smr_cmd_queue_discard(ce, pos); + ret = -FI_EAGAIN; + goto unlock; + } + + cmd = smr_freestack_pop(smr_cmd_stack(ep->region)); + assert(cmd); + ce->ptr = smr_local_to_peer(ep, tx_id, rx_id, (uintptr_t) cmd); + } else { + cmd = &ce->cmd; + ce->ptr = smr_peer_to_peer(ep, tx_id, (uintptr_t) &ce->cmd); + } + + ret = smr_send_ops[proto](ep, peer_smr, tx_id, rx_id, op, tag, data, + op_flags, (struct ofi_mr **) desc, iov, + iov_count, total_len, context, cmd); if (ret) { smr_cmd_queue_discard(ce, pos); + if (proto != smr_proto_inline) + smr_freestack_push(smr_cmd_stack(ep->region), cmd); goto 
unlock; } smr_cmd_queue_commit(ce, pos); - if (proto != smr_src_inline && proto != smr_src_inject) + if (proto != smr_proto_inline) goto unlock; ret = smr_complete_tx(ep, context, op, op_flags); @@ -135,7 +153,7 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov, } static ssize_t smr_send(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) + void *desc, fi_addr_t dest_addr, void *context) { struct smr_ep *ep; struct iovec msg_iov; @@ -150,7 +168,8 @@ static ssize_t smr_send(struct fid_ep *ep_fid, const void *buf, size_t len, } static ssize_t smr_sendv(struct fid_ep *ep_fid, const struct iovec *iov, - void **desc, size_t count, fi_addr_t dest_addr, void *context) + void **desc, size_t count, fi_addr_t dest_addr, + void *context) { struct smr_ep *ep; @@ -169,7 +188,8 @@ static ssize_t smr_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, return smr_generic_sendmsg(ep, msg->msg_iov, msg->desc, msg->iov_count, msg->addr, 0, msg->data, msg->context, - ofi_op_msg, flags | ep->util_ep.tx_msg_flags); + ofi_op_msg, + flags | ep->util_ep.tx_msg_flags); } static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf, @@ -178,12 +198,12 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf, { struct smr_ep *ep; struct smr_region *peer_smr; - int64_t id, peer_id; + int64_t tx_id, rx_id, pos; ssize_t ret = 0; struct iovec msg_iov; int proto; struct smr_cmd_entry *ce; - int64_t pos; + struct smr_cmd *cmd; assert(len <= SMR_INJECT_SIZE); @@ -192,31 +212,57 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - id = smr_verify_peer(ep, dest_addr); - if (id < 0) + tx_id = smr_verify_peer(ep, dest_addr); + if (tx_id < 0) return -FI_EAGAIN; - peer_id = smr_peer_data(ep->region)[id].addr.id; - peer_smr = smr_peer_region(ep->region, id); + rx_id = smr_peer_data(ep->region)[tx_id].id; + peer_smr = smr_peer_region(ep, tx_id); - if (smr_peer_data(ep->region)[id].sar_status) - return -FI_EAGAIN; + ofi_genlock_lock(&ep->util_ep.lock); + if (smr_peer_data(ep->region)[tx_id].sar_status) { + ret = -FI_EAGAIN; + goto unlock; + } ret = smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos); - if (ret == -FI_ENOENT) - return -FI_EAGAIN; + if (ret == -FI_ENOENT) { + ret = -FI_EAGAIN; + goto unlock; + } + + if (len <= SMR_MSG_DATA_LEN) { + proto = smr_proto_inline; + cmd = &ce->cmd; + ce->ptr = smr_peer_to_peer(ep, tx_id, (uintptr_t) &ce->cmd); + } else { + proto = smr_proto_inject; + if (smr_freestack_isempty(smr_cmd_stack(ep->region))) { + smr_cmd_queue_discard(ce, pos); + ret = -FI_EAGAIN; + goto unlock; + } + + cmd = smr_freestack_pop(smr_cmd_stack(ep->region)); + assert(cmd); + ce->ptr = smr_local_to_peer(ep, tx_id, rx_id, (uintptr_t) cmd); + } - proto = len <= SMR_MSG_DATA_LEN ? 
smr_src_inline : smr_src_inject; - ret = smr_proto_ops[proto](ep, peer_smr, id, peer_id, op, tag, data, - op_flags, NULL, &msg_iov, 1, len, NULL, &ce->cmd); + ret = smr_send_ops[proto](ep, peer_smr, tx_id, rx_id, op, tag, data, + op_flags, NULL, &msg_iov, 1, len, NULL, cmd); if (ret) { smr_cmd_queue_discard(ce, pos); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto unlock; } smr_cmd_queue_commit(ce, pos); - ofi_ep_peer_tx_cntr_inc(&ep->util_ep, op); - return FI_SUCCESS; + if (proto == smr_proto_inline) + ofi_ep_peer_tx_cntr_inc(&ep->util_ep, op); + +unlock: + ofi_genlock_unlock(&ep->util_ep.lock); + return ret; } static ssize_t smr_inject(struct fid_ep *ep_fid, const void *buf, size_t len, @@ -320,7 +366,8 @@ static ssize_t smr_trecvmsg(struct fid_ep *ep_fid, } static ssize_t smr_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, uint64_t tag, void *context) + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) { struct smr_ep *ep; struct iovec msg_iov; @@ -336,8 +383,8 @@ static ssize_t smr_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, } static ssize_t smr_tsendv(struct fid_ep *ep_fid, const struct iovec *iov, - void **desc, size_t count, fi_addr_t dest_addr, uint64_t tag, - void *context) + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context) { struct smr_ep *ep; @@ -417,4 +464,4 @@ struct fi_ops_tagged smr_no_recv_tag_ops = { .inject = smr_tinject, .senddata = smr_tsenddata, .injectdata = smr_tinjectdata, -}; +}; \ No newline at end of file diff --git a/prov/shm/src/smr_progress.c b/prov/shm/src/smr_progress.c index 48012eb7eb2..a17b2dbc33c 100644 --- a/prov/shm/src/smr_progress.c +++ b/prov/shm/src/smr_progress.c @@ -35,147 +35,128 @@ #include "ofi_atomic.h" #include "ofi_mb.h" -static inline void -smr_try_progress_to_sar(struct smr_ep *ep, struct smr_region *smr, - struct smr_freestack *sar_pool, struct smr_resp *resp, - struct smr_cmd *cmd, struct ofi_mr **mr, - struct iovec *iov, size_t iov_count, - size_t *bytes_done, void *entry_ptr) +static void smr_progress_overflow(struct smr_ep *ep) { - if (*bytes_done < cmd->msg.hdr.size) { - if (smr_env.use_dsa_sar && ofi_mr_all_host(mr, iov_count)) { - (void) smr_dsa_copy_to_sar(ep, sar_pool, resp, cmd, iov, - iov_count, bytes_done, entry_ptr); + struct smr_cmd_entry *ce; + struct smr_region *peer_smr; + struct smr_cmd *cmd; + int64_t pos; + struct slist_entry *entry; + int ret; + + entry = ep->overflow_list.head; + while (entry) { + cmd = (struct smr_cmd *) entry; + peer_smr = smr_peer_region(ep, cmd->hdr.tx_id); + ret = smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos); + if (ret == -FI_ENOENT) return; - } else { - smr_copy_to_sar(sar_pool, resp, cmd, mr, iov, iov_count, - bytes_done); - } + + ce->ptr = smr_local_to_peer(ep, cmd->hdr.tx_id, + cmd->hdr.rx_id, (uintptr_t) cmd); + + slist_remove_head(&ep->overflow_list); + smr_cmd_queue_commit(ce, pos); + entry = ep->overflow_list.head; } } -static inline void -smr_try_progress_from_sar(struct smr_ep *ep, struct smr_region *smr, - struct smr_freestack *sar_pool, struct smr_resp *resp, - struct smr_cmd *cmd, struct ofi_mr **mr, - struct iovec *iov, size_t iov_count, - size_t *bytes_done, void *entry_ptr) +static void smr_try_send_cmd(struct smr_ep *ep, struct smr_cmd *cmd) { - if (*bytes_done < cmd->msg.hdr.size) { - if (smr_env.use_dsa_sar && ofi_mr_all_host(mr, iov_count)) { - (void) smr_dsa_copy_from_sar(ep, sar_pool, resp, cmd, - iov, iov_count, bytes_done, entry_ptr); - return; - } else { - 
smr_copy_from_sar(sar_pool, resp, cmd, mr, - iov, iov_count, bytes_done); - } - } + cmd->hdr.entry = 0; + slist_insert_tail((struct slist_entry *) &cmd->hdr.entry, + &ep->overflow_list); + + smr_progress_overflow(ep); } -static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp, - struct smr_tx_entry *pending, uint64_t *err) +static inline void smr_free_sar_bufs(struct smr_ep *ep, struct smr_cmd *cmd, + struct smr_pend_entry *pending) { int i; - struct smr_region *peer_smr; - size_t inj_offset; + + for (i = cmd->data.buf_batch_size - 1; i >= 0; i--) { + smr_freestack_push_by_index(smr_sar_pool(ep->region), + cmd->data.sar[i]); + } + smr_peer_data(ep->region)[cmd->hdr.tx_id].sar_status = SMR_SAR_FREE; +} + +static int smr_progress_return_entry(struct smr_ep *ep, struct smr_cmd *cmd, + struct smr_pend_entry *pend) +{ struct smr_inject_buf *tx_buf = NULL; - struct smr_sar_buf *sar_buf = NULL; uint8_t *src; ssize_t hmem_copy_ret; + int ret = FI_SUCCESS; - peer_smr = smr_peer_region(ep->region, pending->peer_id); - - switch (pending->cmd.msg.hdr.op_src) { - case smr_src_iov: + switch (cmd->hdr.proto) { + case smr_proto_iov: break; - case smr_src_ipc: - assert(pending->mr[0]); + case smr_proto_ipc: + assert(pend->mr[0]); break; - case smr_src_sar: - sar_buf = smr_freestack_get_entry_from_index( - smr_sar_pool(peer_smr), pending->cmd.msg.data.sar[0]); - if (pending->bytes_done == pending->cmd.msg.hdr.size && - (resp->status == SMR_STATUS_SAR_EMPTY || - resp->status == SMR_STATUS_SUCCESS)) { - resp->status = SMR_STATUS_SUCCESS; - break; + case smr_proto_sar: + if (cmd->hdr.status) { + smr_free_sar_bufs(ep, cmd, pend); + return cmd->hdr.status; } - if (pending->cmd.msg.hdr.op == ofi_op_read_req) - smr_try_progress_from_sar(ep, peer_smr, - smr_sar_pool(peer_smr), resp, - &pending->cmd, pending->mr, pending->iov, - pending->iov_count, &pending->bytes_done, - pending); - else - smr_try_progress_to_sar(ep, peer_smr, - smr_sar_pool(peer_smr), resp, - &pending->cmd, pending->mr, pending->iov, - pending->iov_count, &pending->bytes_done, - pending); - if (pending->bytes_done != pending->cmd.msg.hdr.size || - resp->status != SMR_STATUS_SAR_EMPTY) { + if (cmd->hdr.op == ofi_op_read_req) { + ret = pend->sar_copy_fn(ep, pend); + if (ret && ret != -FI_EBUSY) + return ret; + if (pend->bytes_done == cmd->hdr.size) { + smr_free_sar_bufs(ep, cmd, pend); + return FI_SUCCESS; + } + smr_peer_data(ep->region)[cmd->hdr.tx_id].sar_status = + SMR_SAR_BUSY; + smr_try_send_cmd(ep, cmd); return -FI_EAGAIN; } - resp->status = SMR_STATUS_SUCCESS; - break; - case smr_src_mmap: - if (!pending->map_name) - break; - if (pending->cmd.msg.hdr.op == ofi_op_read_req) { - if (!*err) { - hmem_copy_ret = - ofi_copy_to_mr_iov(pending->mr, - pending->iov, - pending->iov_count, - 0, pending->map_ptr, - pending->cmd.msg.hdr.size); + if (pend->bytes_done == cmd->hdr.size) { + smr_free_sar_bufs(ep, cmd, pend); + return FI_SUCCESS; + } + + ret = pend->sar_copy_fn(ep, pend); + if (ret && ret != -FI_EBUSY) + return ret; + + smr_peer_data(ep->region)[cmd->hdr.tx_id].sar_status = + SMR_SAR_BUSY; + smr_try_send_cmd(ep, cmd); + return -FI_EAGAIN; + case smr_proto_inject: + tx_buf = smr_get_inject_buf(ep->region, cmd); + if (pend) { + if (pend->bytes_done != cmd->hdr.size && + cmd->hdr.op != ofi_op_atomic) { + src = cmd->hdr.op == ofi_op_atomic_compare ? 
+ tx_buf->buf : tx_buf->data; + hmem_copy_ret = ofi_copy_to_mr_iov( + pend->mr, pend->iov, + pend->iov_count, + 0, src, cmd->hdr.size); + if (hmem_copy_ret < 0) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Copy from mmapped file failed with code %d\n", + "RMA read/fetch failed " + "with code %d\n", (int)(-hmem_copy_ret)); - *err = hmem_copy_ret; - } else if (hmem_copy_ret != pending->cmd.msg.hdr.size) { + ret = hmem_copy_ret; + } else if (hmem_copy_ret != cmd->hdr.size) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Incomplete copy from mmapped file\n"); - *err = -FI_ETRUNC; + "Incomplete rma read/fetch " + "buffer copied\n"); + ret = -FI_ETRUNC; } else { - pending->bytes_done = (size_t) hmem_copy_ret; + pend->bytes_done = + (size_t) hmem_copy_ret; } } - munmap(pending->map_ptr, pending->cmd.msg.hdr.size); - } - shm_unlink(pending->map_name->name); - dlist_remove(&pending->map_name->entry); - free(pending->map_name); - pending->map_name = NULL; - break; - case smr_src_inject: - inj_offset = (size_t) pending->cmd.msg.hdr.src_data; - tx_buf = smr_get_ptr(peer_smr, inj_offset); - if (*err || pending->bytes_done == pending->cmd.msg.hdr.size || - pending->cmd.msg.hdr.op == ofi_op_atomic) - break; - - src = pending->cmd.msg.hdr.op == ofi_op_atomic_compare ? - tx_buf->buf : tx_buf->data; - hmem_copy_ret = ofi_copy_to_mr_iov(pending->mr, - pending->iov, pending->iov_count, - 0, src, pending->cmd.msg.hdr.size); - - if (hmem_copy_ret < 0) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "RMA read/fetch failed with code %d\n", - (int)(-hmem_copy_ret)); - *err = hmem_copy_ret; - } else if (hmem_copy_ret != pending->cmd.msg.hdr.size) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Incomplete rma read/fetch buffer copied\n"); - *err = -FI_ETRUNC; - } else { - pending->bytes_done = (size_t) hmem_copy_ret; } break; default: @@ -183,327 +164,345 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp, "unidentified operation type\n"); } - if (tx_buf) { - smr_release_txbuf(peer_smr, tx_buf); - } else if (sar_buf) { - pthread_spin_lock(&peer_smr->lock); - for (i = pending->cmd.msg.data.buf_batch_size - 1; i >= 0; i--) { - smr_freestack_push_by_index(smr_sar_pool(peer_smr), - pending->cmd.msg.data.sar[i]); - } - pthread_spin_unlock(&peer_smr->lock); - smr_peer_data(ep->region)[pending->peer_id].sar_status = 0; - } - - return FI_SUCCESS; + return ret; } -static void smr_progress_resp(struct smr_ep *ep) +static void smr_progress_return(struct smr_ep *ep) { - struct smr_resp *resp; - struct smr_tx_entry *pending; + struct smr_return_entry *queue_entry; + struct smr_cmd *cmd; + struct smr_pend_entry *pending; + int64_t pos; int ret; - ofi_genlock_lock(&ep->util_ep.lock); - while (!ofi_cirque_isempty(smr_resp_queue(ep->region))) { - resp = ofi_cirque_head(smr_resp_queue(ep->region)); - if (resp->status == SMR_STATUS_BUSY) - break; - - pending = (struct smr_tx_entry *) resp->msg_id; - if (smr_progress_resp_entry(ep, resp, pending, &resp->status)) + while (1) { + ret = smr_return_queue_head(smr_return_queue(ep->region), + &queue_entry, &pos); + if (ret == -FI_ENOENT) break; - if (resp->status) { - ret = smr_write_err_comp(ep->util_ep.tx_cq, pending->context, - pending->op_flags, pending->cmd.msg.hdr.tag, - resp->status); - } else { - ret = smr_complete_tx(ep, pending->context, - pending->cmd.msg.hdr.op, pending->op_flags); - } - if (ret) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process tx completion\n"); - break; + cmd = (struct smr_cmd *) queue_entry->ptr; + pending = (struct smr_pend_entry *) 
cmd->hdr.tx_ctx; + + ret = smr_progress_return_entry(ep, cmd, pending); + if (ret != -FI_EAGAIN) { + if (pending) { + if (cmd->hdr.status) { + ret = smr_write_err_comp( + ep->util_ep.tx_cq, + pending->comp_ctx, + pending->comp_flags, + cmd->hdr.tag, + cmd->hdr.status); + } else { + ret = smr_complete_tx( + ep, pending->comp_ctx, + cmd->hdr.op, + pending->comp_flags); + } + if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process " + "tx completion\n"); + } + ofi_buf_free(pending); + } + smr_freestack_push(smr_cmd_stack(ep->region), cmd); } - ofi_freestack_push(ep->tx_fs, pending); - ofi_cirque_discard(smr_resp_queue(ep->region)); + smr_return_queue_release(smr_return_queue(ep->region), + queue_entry, pos); } - ofi_genlock_unlock(&ep->util_ep.lock); } -static int smr_progress_inline(struct smr_cmd *cmd, struct ofi_mr **mr, - struct iovec *iov, size_t iov_count, - size_t *total_len) +static ssize_t smr_progress_inline(struct smr_ep *ep, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, + struct ofi_mr **mr, struct iovec *iov, + size_t iov_count) { - ssize_t hmem_copy_ret; + ssize_t ret; - hmem_copy_ret = ofi_copy_to_mr_iov(mr, iov, iov_count, 0, - cmd->msg.data.msg, cmd->msg.hdr.size); - if (hmem_copy_ret < 0) { + ret = ofi_copy_to_mr_iov(mr, iov, iov_count, 0, cmd->data.msg, + cmd->hdr.size); + if (ret < 0) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "inline recv failed with code %d\n", - (int)(-hmem_copy_ret)); - return hmem_copy_ret; - } else if (hmem_copy_ret != cmd->msg.hdr.size) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "inline recv truncated\n"); + "inline recv failed with code %d\n", (int)(-ret)); + return ret; + } + if (ret != cmd->hdr.size) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "inline recv truncated\n"); return -FI_ETRUNC; } - - *total_len = hmem_copy_ret; - return FI_SUCCESS; } -static int smr_progress_inject(struct smr_cmd *cmd, struct ofi_mr **mr, - struct iovec *iov, size_t iov_count, - size_t *total_len, struct smr_ep *ep, int err) +static ssize_t smr_progress_inject(struct smr_ep *ep, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, + struct ofi_mr **mr, struct iovec *iov, + size_t iov_count) { + struct smr_region *peer_smr; struct smr_inject_buf *tx_buf; - size_t inj_offset; - ssize_t hmem_copy_ret; - - assert(cmd->msg.hdr.op != ofi_op_read_req); + ssize_t ret; - inj_offset = (size_t) cmd->msg.hdr.src_data; - tx_buf = smr_get_ptr(ep->region, inj_offset); + peer_smr = smr_peer_region(ep, cmd->hdr.rx_id); + tx_buf = smr_get_inject_buf(peer_smr, cmd); - if (err) { - smr_release_txbuf(ep->region, tx_buf); - return err; + if (cmd->hdr.op == ofi_op_read_req) { + ret = ofi_copy_from_mr_iov(tx_buf->data, cmd->hdr.size, mr, + iov, iov_count, 0); + } else { + ret = ofi_copy_to_mr_iov(mr, iov, iov_count, 0, tx_buf->data, + cmd->hdr.size); } - hmem_copy_ret = ofi_copy_to_mr_iov(mr, iov, iov_count, 0, tx_buf->data, - cmd->msg.hdr.size); - smr_release_txbuf(ep->region, tx_buf); - - if (hmem_copy_ret < 0) { + if (ret < 0) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "inject recv failed with code %d\n", - (int)(-hmem_copy_ret)); - return hmem_copy_ret; - } else if (hmem_copy_ret != cmd->msg.hdr.size) { + "inject recv failed with code %lu\n", ret); + } else if (ret != cmd->hdr.size) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "inject recv truncated\n"); - return -FI_ETRUNC; + ret = -FI_ETRUNC; + } else { + ret = FI_SUCCESS; } - *total_len = hmem_copy_ret; - - return FI_SUCCESS; + cmd->hdr.status = ret; + smr_return_cmd(ep, cmd); + return ret; } -static int smr_progress_iov(struct 
smr_cmd *cmd, struct iovec *iov, - size_t iov_count, size_t *total_len, - struct smr_ep *ep) +static ssize_t smr_progress_iov(struct smr_ep *ep, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, + struct ofi_mr **mr, struct iovec *iov, + size_t iov_count) { struct smr_region *peer_smr; struct ofi_xpmem_client *xpmem; - struct smr_resp *resp; int ret; - peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); + peer_smr = smr_peer_region(ep, cmd->hdr.rx_id); - xpmem = &smr_peer_data(ep->region)[cmd->msg.hdr.id].xpmem; + xpmem = &smr_peer_data(ep->region)[cmd->hdr.rx_id].xpmem; - ret = ofi_shm_p2p_copy(ep->p2p_type, iov, iov_count, cmd->msg.data.iov, - cmd->msg.data.iov_count, cmd->msg.hdr.size, - peer_smr->pid, cmd->msg.hdr.op == ofi_op_read_req, + ret = ofi_shm_p2p_copy(ep->p2p_type, iov, iov_count, cmd->data.iov, + cmd->data.iov_count, cmd->hdr.size, + peer_smr->pid, cmd->hdr.op == ofi_op_read_req, xpmem); - if (!ret) - *total_len = cmd->msg.hdr.size; - - //Status must be set last (signals peer: op done, valid resp entry) - resp->status = -ret; - + cmd->hdr.status = ret; + smr_return_cmd(ep, cmd); return ret; } -static int smr_mmap_peer_copy(struct smr_ep *ep, struct smr_cmd *cmd, - struct ofi_mr **mr, struct iovec *iov, - size_t iov_count, size_t *total_len) +static void smr_buffer_sar(struct smr_ep *ep, struct smr_pend_entry *sar_entry, + struct smr_cmd_ctx *cmd_ctx) { - char shm_name[SMR_NAME_MAX]; - void *mapped_ptr; - int fd, num; - int ret = 0; - ssize_t hmem_copy_ret; + struct smr_region *peer_smr; + struct smr_sar_buf *sar_buf; + struct smr_unexp_buf *buf; + struct smr_cmd *cmd = cmd_ctx->cmd; + size_t bytes; + int next_buf = 0; - num = smr_mmap_name(shm_name, - ep->region->map->peers[cmd->msg.hdr.id].peer.name, - cmd->msg.hdr.msg_id); - if (num < 0) { - FI_WARN(&smr_prov, FI_LOG_AV, "generating shm file name failed\n"); - return -errno; - } + peer_smr = smr_peer_region(ep, cmd->hdr.rx_id); - fd = shm_open(shm_name, O_RDWR, S_IRUSR | S_IWUSR); - if (fd < 0) { - FI_WARN(&smr_prov, FI_LOG_AV, "shm_open error\n"); - return -errno; - } + while (next_buf < cmd->data.buf_batch_size && + sar_entry->bytes_done < cmd->hdr.size) { + buf = ofi_buf_alloc(ep->unexp_buf_pool); + if (!buf) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "Error allocating buffer for unexpected SAR " + "(-FI_ENOMEM)\n"); + return; + } + slist_insert_tail(&buf->entry, &cmd_ctx->buf_list); - mapped_ptr = mmap(NULL, cmd->msg.hdr.size, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (mapped_ptr == MAP_FAILED) { - FI_WARN(&smr_prov, FI_LOG_AV, "mmap error %s\n", strerror(errno)); - ret = -errno; - goto unlink_close; - } + sar_buf = smr_freestack_get_entry_from_index( + smr_sar_pool(peer_smr), + cmd->data.sar[next_buf]); + bytes = MIN(cmd->hdr.size - sar_entry->bytes_done, + SMR_SAR_SIZE); - if (cmd->msg.hdr.op == ofi_op_read_req) { - hmem_copy_ret = ofi_copy_from_mr_iov(mapped_ptr, - cmd->msg.hdr.size, mr, iov, - iov_count, 0); - } else { - hmem_copy_ret = ofi_copy_to_mr_iov(mr, iov, iov_count, 0, - mapped_ptr, cmd->msg.hdr.size); - } + memcpy(buf->buf, sar_buf->buf, bytes); - if (hmem_copy_ret < 0) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "mmap copy iov failed with code %d\n", - (int)(-hmem_copy_ret)); - ret = hmem_copy_ret; - } else if (hmem_copy_ret != cmd->msg.hdr.size) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "mmap copy iov truncated\n"); - ret = -FI_ETRUNC; + sar_entry->bytes_done += bytes; + next_buf++; } +} - *total_len = hmem_copy_ret; +static ssize_t 
smr_try_copy_rx_sar(struct smr_ep *ep, + struct smr_pend_entry *pend) +{ + ssize_t ret; - munmap(mapped_ptr, cmd->msg.hdr.size); -unlink_close: - shm_unlink(shm_name); - close(fd); + ret = pend->sar_copy_fn(ep, pend); + if (ret) { + if (ret == -FI_EAGAIN) + dlist_insert_tail(&pend->entry, &ep->async_cpy_list); + else if (ret != -FI_EBUSY) + pend->cmd->hdr.status = ret; + } return ret; } -static int smr_progress_mmap(struct smr_cmd *cmd, struct ofi_mr **mr, - struct iovec *iov, size_t iov_count, - size_t *total_len, struct smr_ep *ep) +static int smr_progress_pending_sar(struct smr_ep *ep, struct smr_cmd *cmd) { - struct smr_region *peer_smr; - struct smr_resp *resp; + struct smr_pend_entry *pend; int ret; - peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); + pend = (struct smr_pend_entry *) cmd->hdr.rx_ctx; + if (pend->rx.rx_entry && pend->rx.rx_entry->peer_context) { + smr_buffer_sar(ep, pend, pend->rx.rx_entry->peer_context); + goto out; + } - ret = smr_mmap_peer_copy(ep, cmd, mr, iov, iov_count, total_len); + ret = smr_try_copy_rx_sar(ep, pend); + if (ret == -FI_EBUSY || ret == -FI_EAGAIN) + return FI_SUCCESS; + + if (pend->bytes_done == cmd->hdr.size || pend->cmd->hdr.status) { + if (pend->cmd->hdr.status) { + ret = smr_write_err_comp(ep->util_ep.rx_cq, + pend->comp_ctx, + pend->comp_flags, + cmd->hdr.tag, + pend->cmd->hdr.status); + } else { + ret = smr_complete_rx(ep, pend->comp_ctx, + cmd->hdr.op, + pend->comp_flags, + pend->bytes_done, + pend->iov[0].iov_base, + cmd->hdr.rx_id, cmd->hdr.tag, + cmd->hdr.cq_data); + } + if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process rx completion\n"); + } + if (pend->rx.rx_entry) + ep->srx->owner_ops->free_entry(pend->rx.rx_entry); - //Status must be set last (signals peer: op done, valid resp entry) - resp->status = -ret; + ofi_buf_free(pend); + } - return -ret; +out: + smr_return_cmd(ep, cmd); + return FI_SUCCESS; } -static struct smr_pend_entry *smr_progress_sar(struct smr_cmd *cmd, - struct fi_peer_rx_entry *rx_entry, struct ofi_mr **mr, - struct iovec *iov, size_t iov_count, - size_t *total_len, struct smr_ep *ep) +static int smr_progress_pending(struct smr_ep *ep, struct smr_cmd *cmd) { - struct smr_region *peer_smr; - struct smr_pend_entry *sar_entry; - struct smr_resp *resp; - struct iovec sar_iov[SMR_IOV_LIMIT]; - - peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); + switch (cmd->hdr.proto) { + case smr_proto_sar: + return smr_progress_pending_sar(ep, cmd); + default: + return -FI_EINVAL; + } +} - /* Nothing to do for 0 byte transfer */ - if (!cmd->msg.hdr.size) { - resp->status = SMR_STATUS_SUCCESS; - return NULL; +static void smr_init_rx_pend(struct smr_pend_entry *pend, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, + struct ofi_mr **mr, struct iovec *iov, + size_t iov_count) +{ + pend->type = SMR_RX_ENTRY; + if (rx_entry) { + pend->comp_ctx = rx_entry->context; + pend->comp_flags = smr_rx_cq_flags(rx_entry->flags, + cmd->hdr.op_flags); + } else { + pend->comp_ctx = NULL; + pend->comp_flags = smr_rx_cq_flags(0, cmd->hdr.op_flags); } + pend->cmd = cmd; + + pend->sar_dir = pend->cmd->hdr.op == ofi_op_read_req ? 
+ OFI_COPY_IOV_TO_BUF : OFI_COPY_BUF_TO_IOV; + + pend->bytes_done = 0; + memcpy(pend->iov, iov, sizeof(*iov) * iov_count); + pend->iov_count = iov_count; + pend->rx.rx_entry = rx_entry; + if (mr) + memcpy(pend->mr, mr, sizeof(*mr) * iov_count); + else + memset(pend->mr, 0, sizeof(*mr) * iov_count); +} + +static ssize_t smr_progress_sar(struct smr_ep *ep, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, + struct ofi_mr **mr, struct iovec *iov, + size_t iov_count) +{ + struct smr_pend_entry *pend = NULL; + struct iovec sar_iov[SMR_IOV_LIMIT]; + ssize_t ret = FI_SUCCESS; + memcpy(sar_iov, iov, sizeof(*iov) * iov_count); - (void) ofi_truncate_iov(sar_iov, &iov_count, cmd->msg.hdr.size); + (void) ofi_truncate_iov(sar_iov, &iov_count, cmd->hdr.size); + + pend = ofi_buf_alloc(ep->pend_pool); + assert(pend); - sar_entry = ofi_buf_alloc(ep->pend_buf_pool); - dlist_insert_tail(&sar_entry->entry, &ep->sar_list); + cmd->hdr.rx_ctx = (uintptr_t) pend; - if (cmd->msg.hdr.op == ofi_op_read_req) - smr_try_progress_to_sar(ep, peer_smr, smr_sar_pool(ep->region), - resp, cmd, mr, sar_iov, iov_count, - total_len, sar_entry); + smr_init_rx_pend(pend, cmd, rx_entry, mr, sar_iov, iov_count); + if (smr_env.use_dsa_sar && ofi_mr_all_host(pend->mr, pend->iov_count)) + pend->sar_copy_fn = &smr_dsa_copy_sar; else - smr_try_progress_from_sar(ep, peer_smr, - smr_sar_pool(ep->region), resp, cmd, mr, - sar_iov, iov_count, total_len, sar_entry); - - if (*total_len == cmd->msg.hdr.size) { - dlist_remove(&sar_entry->entry); - ofi_buf_free(sar_entry); - return NULL; + pend->sar_copy_fn = &smr_copy_sar; + + ret = smr_try_copy_rx_sar(ep, pend); + + if (pend->bytes_done == cmd->hdr.size || pend->cmd->hdr.status) { + cmd->hdr.rx_ctx = 0; + ofi_buf_free(pend); + ret = FI_SUCCESS; } - sar_entry->cmd = *cmd; - sar_entry->cmd_ctx = NULL; - sar_entry->bytes_done = *total_len; - memcpy(sar_entry->iov, sar_iov, sizeof(*sar_iov) * iov_count); - sar_entry->iov_count = iov_count; - sar_entry->rx_entry = rx_entry ? 
rx_entry : NULL; - if (mr) - memcpy(sar_entry->mr, mr, sizeof(*mr) * iov_count); - else - memset(sar_entry->mr, 0, sizeof(*mr) * iov_count); - *total_len = cmd->msg.hdr.size; - return sar_entry; + if (!ret) + smr_return_cmd(ep, cmd); + return ret; } -static int -smr_ipc_async_copy(struct smr_ep *ep, void *ptr, - struct fi_peer_rx_entry *rx_entry, - struct iovec *iov, size_t iov_count, - struct ofi_mr_entry *mr_entry, struct smr_cmd *cmd, - struct smr_pend_entry **pend) +static int smr_ipc_async_copy(struct smr_ep *ep, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, + struct ofi_mr_entry *mr_entry, + struct iovec *iov, size_t iov_count, void *ptr) { struct smr_pend_entry *ipc_entry; - enum fi_hmem_iface iface = cmd->msg.data.ipc_info.iface; - uint64_t device = cmd->msg.data.ipc_info.device; + enum fi_hmem_iface iface = cmd->data.ipc_info.iface; + uint64_t device = cmd->data.ipc_info.device; int ret; - ipc_entry = ofi_buf_alloc(ep->pend_buf_pool); + ipc_entry = ofi_buf_alloc(ep->pend_pool); if (!ipc_entry) return -FI_ENOMEM; - ipc_entry->cmd = *cmd; - ipc_entry->ipc_entry = mr_entry; - ipc_entry->bytes_done = 0; - memcpy(ipc_entry->iov, iov, sizeof(*iov) * iov_count); - ipc_entry->iov_count = iov_count; - ipc_entry->rx_entry = rx_entry; - if (rx_entry) { - ipc_entry->rx_entry->flags |= cmd->msg.hdr.op_flags; - ipc_entry->rx_entry->flags &= ~SMR_MULTI_RECV; - } + cmd->hdr.rx_ctx = (uintptr_t) ipc_entry; + smr_init_rx_pend(ipc_entry, cmd, rx_entry, NULL, iov, iov_count); + ipc_entry->rx.ipc_entry = mr_entry; ret = ofi_create_async_copy_event(iface, device, - &ipc_entry->async_event); + &ipc_entry->rx.async_event); if (ret < 0) goto fail; - if (cmd->msg.hdr.op == ofi_op_read_req) { - ret = ofi_async_copy_from_hmem_iov(ptr, cmd->msg.hdr.size, - iface, device, iov, iov_count, 0, - ipc_entry->async_event); + if (cmd->hdr.op == ofi_op_read_req) { + ret = ofi_async_copy_from_hmem_iov(ptr, cmd->hdr.size, + iface, device, iov, + iov_count, 0, + ipc_entry->rx.async_event); } else { - ret = ofi_async_copy_to_hmem_iov(iface, device, iov, iov_count, 0, - ptr, cmd->msg.hdr.size, - ipc_entry->async_event); + ret = ofi_async_copy_to_hmem_iov(iface, device, iov, iov_count, + 0, ptr, cmd->hdr.size, + ipc_entry->rx.async_event); } - if (ret < 0) goto fail; - dlist_insert_tail(&ipc_entry->entry, &ep->ipc_cpy_pend_list); - *pend = ipc_entry; - + dlist_insert_tail(&ipc_entry->entry, &ep->async_cpy_list); return FI_SUCCESS; fail: @@ -511,86 +510,79 @@ smr_ipc_async_copy(struct smr_ep *ep, void *ptr, return ret; } -static struct smr_pend_entry *smr_progress_ipc(struct smr_cmd *cmd, - struct fi_peer_rx_entry *rx_entry, - struct ofi_mr **mr, struct iovec *iov, - size_t iov_count, size_t *total_len, - struct smr_ep *ep, int *err) +static ssize_t smr_progress_ipc(struct smr_ep *ep, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, + struct ofi_mr **mr, struct iovec *iov, + size_t iov_count) { - struct smr_region *peer_smr; - struct smr_resp *resp; void *ptr; int ret; - ssize_t hmem_copy_ret; struct ofi_mr_entry *mr_entry; struct smr_domain *domain; - struct smr_pend_entry *ipc_entry; domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); - peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); - - if (cmd->msg.data.ipc_info.iface == FI_HMEM_ZE) - ze_set_pid_fd((void **) &cmd->msg.data.ipc_info.ipc_handle, - ep->region->map->peers[cmd->msg.hdr.id].pid_fd); + if (cmd->data.ipc_info.iface == FI_HMEM_ZE) + ze_set_pid_fd((void **) 
&cmd->data.ipc_info.ipc_handle, + ep->map->peers[cmd->hdr.rx_id].pid_fd); //TODO disable IPC if more than 1 interface is initialized - ret = ofi_ipc_cache_search(domain->ipc_cache, cmd->msg.hdr.id, - &cmd->msg.data.ipc_info, &mr_entry); + ret = ofi_ipc_cache_search(domain->ipc_cache, cmd->hdr.rx_id, + &cmd->data.ipc_info, &mr_entry); if (ret) goto out; ptr = (char *) (uintptr_t) mr_entry->info.mapped_addr + - (uintptr_t) cmd->msg.data.ipc_info.offset; + (uintptr_t) cmd->data.ipc_info.offset; - if (cmd->msg.data.ipc_info.iface == FI_HMEM_ROCR) { - *total_len = 0; - ipc_entry = NULL; - resp->status = SMR_STATUS_BUSY; - - ret = smr_ipc_async_copy(ep, (char*)ptr, rx_entry, iov, - iov_count, mr_entry, cmd, - &ipc_entry); + if (cmd->data.ipc_info.iface == FI_HMEM_ROCR) { + ret = smr_ipc_async_copy(ep, cmd, rx_entry, mr_entry, iov, + iov_count, ptr); if (ret) - resp->status = -ret; - - return ipc_entry; + goto uncache; + return FI_SUCCESS; } - if (cmd->msg.hdr.op == ofi_op_read_req) { - hmem_copy_ret = ofi_copy_from_hmem_iov(ptr, cmd->msg.hdr.size, - cmd->msg.data.ipc_info.iface, - cmd->msg.data.ipc_info.device, iov, - iov_count, 0); + if (cmd->hdr.op == ofi_op_read_req) { + ret = ofi_copy_from_hmem_iov(ptr, cmd->hdr.size, + cmd->data.ipc_info.iface, + cmd->data.ipc_info.device, iov, + iov_count, 0); } else { - hmem_copy_ret = ofi_copy_to_hmem_iov(cmd->msg.data.ipc_info.iface, - cmd->msg.data.ipc_info.device, iov, - iov_count, 0, ptr, cmd->msg.hdr.size); + ret = ofi_copy_to_hmem_iov(cmd->data.ipc_info.iface, + cmd->data.ipc_info.device, iov, + iov_count, 0, ptr, cmd->hdr.size); } + if (ret == cmd->hdr.size) + ret = FI_SUCCESS; + else if (ret > 0) + ret = -FI_ETRUNC; +uncache: ofi_mr_cache_delete(domain->ipc_cache, mr_entry); - - if (hmem_copy_ret < 0) - *err = hmem_copy_ret; - else if (hmem_copy_ret != cmd->msg.hdr.size) - *err = -FI_ETRUNC; - else - *err = FI_SUCCESS; - - *total_len = hmem_copy_ret; - out: - //Status must be set last (signals peer: op done, valid resp entry) - resp->status = -ret; - - return NULL; + cmd->hdr.status = ret; + smr_return_cmd(ep, cmd); + return ret; } -static void smr_do_atomic(void *src, struct ofi_mr *dst_mr, void *dst, - void *cmp, enum fi_datatype datatype, enum fi_op op, - size_t cnt, uint16_t flags) +typedef ssize_t (*smr_progress_func)( + struct smr_ep *ep, struct smr_cmd *cmd, + struct fi_peer_rx_entry *rx_entry, struct ofi_mr **mr, + struct iovec *iov, size_t iov_count); + +static smr_progress_func smr_progress_ops[smr_proto_max] = { + [smr_proto_inline] = &smr_progress_inline, + [smr_proto_inject] = &smr_progress_inject, + [smr_proto_iov] = &smr_progress_iov, + [smr_proto_sar] = &smr_progress_sar, + [smr_proto_ipc] = &smr_progress_ipc, +}; + +static void smr_do_atomic(struct smr_cmd *cmd, void *src, struct ofi_mr *dst_mr, + void *dst, void *cmp, enum fi_datatype datatype, + enum fi_op op, size_t cnt, uint16_t flags) { char tmp_result[SMR_INJECT_SIZE]; char tmp_dst[SMR_INJECT_SIZE]; @@ -611,9 +603,13 @@ static void smr_do_atomic(void *src, struct ofi_mr *dst_mr, void *dst, if (ofi_atomic_isswap_op(op)) { ofi_atomic_swap_handler(op, datatype, cpy_dst, src, cmp, tmp_result, cnt); - } else if (flags & SMR_RMA_REQ && ofi_atomic_isreadwrite_op(op)) { + memcpy(src, tmp_result, cnt * ofi_datatype_size(datatype)); + } else if (cmd->hdr.op == ofi_op_atomic_fetch || + ofi_atomic_isreadwrite_op(op)) { ofi_atomic_readwrite_handler(op, datatype, cpy_dst, src, tmp_result, cnt); + memcpy(src, op == FI_ATOMIC_READ ? 
cpy_dst : tmp_result, + cnt * ofi_datatype_size(datatype)); } else if (ofi_atomic_iswrite_op(op)) { ofi_atomic_write_handler(op, datatype, cpy_dst, src, cnt); } else { @@ -621,10 +617,6 @@ static void smr_do_atomic(void *src, struct ofi_mr *dst_mr, void *dst, "invalid atomic operation\n"); } - if (flags & SMR_RMA_REQ) - memcpy(src, op == FI_ATOMIC_READ ? cpy_dst : tmp_result, - cnt * ofi_datatype_size(datatype)); - if (cpy_dst != dst) { ret = ofi_copy_to_hmem(dst_mr->iface, dst_mr->device, dst, cpy_dst, cnt * ofi_datatype_size(datatype)); @@ -635,21 +627,22 @@ static void smr_do_atomic(void *src, struct ofi_mr *dst_mr, void *dst, } static int smr_progress_inline_atomic(struct smr_cmd *cmd, struct ofi_mr **mr, - struct fi_ioc *ioc, size_t ioc_count, size_t *len) + struct fi_ioc *ioc, size_t ioc_count, + size_t *len) { int i; - uint8_t *src = cmd->msg.data.msg; + uint8_t *src = cmd->data.msg; - assert(cmd->msg.hdr.op == ofi_op_atomic); + assert(cmd->hdr.op == ofi_op_atomic); - for (i = *len = 0; i < ioc_count && *len < cmd->msg.hdr.size; i++) { - smr_do_atomic(&src[*len], mr[i], ioc[i].addr, NULL, - cmd->msg.hdr.datatype, cmd->msg.hdr.atomic_op, - ioc[i].count, cmd->msg.hdr.op_flags); - *len += ioc[i].count * ofi_datatype_size(cmd->msg.hdr.datatype); + for (i = *len = 0; i < ioc_count && *len < cmd->hdr.size; i++) { + smr_do_atomic(cmd, &src[*len], mr[i], ioc[i].addr, NULL, + cmd->hdr.datatype, cmd->hdr.atomic_op, + ioc[i].count, cmd->hdr.op_flags); + *len += ioc[i].count * ofi_datatype_size(cmd->hdr.datatype); } - if (*len != cmd->msg.hdr.size) { + if (*len != cmd->hdr.size) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "recv truncated"); return -FI_ETRUNC; @@ -658,20 +651,19 @@ static int smr_progress_inline_atomic(struct smr_cmd *cmd, struct ofi_mr **mr, } static int smr_progress_inject_atomic(struct smr_cmd *cmd, struct ofi_mr **mr, - struct fi_ioc *ioc, size_t ioc_count, size_t *len, - struct smr_ep *ep, int err) + struct fi_ioc *ioc, size_t ioc_count, + size_t *len, struct smr_ep *ep, int err) { struct smr_inject_buf *tx_buf; - size_t inj_offset; uint8_t *src, *comp; int i; - inj_offset = (size_t) cmd->msg.hdr.src_data; - tx_buf = smr_get_ptr(ep->region, inj_offset); + tx_buf = smr_get_inject_buf(smr_peer_region(ep, cmd->hdr.rx_id), cmd); + if (err) goto out; - switch (cmd->msg.hdr.op) { + switch (cmd->hdr.op) { case ofi_op_atomic_compare: src = tx_buf->buf; comp = tx_buf->comp; @@ -682,92 +674,55 @@ static int smr_progress_inject_atomic(struct smr_cmd *cmd, struct ofi_mr **mr, break; } - for (i = *len = 0; i < ioc_count && *len < cmd->msg.hdr.size; i++) { - smr_do_atomic(&src[*len], mr[i], ioc[i].addr, - comp ? &comp[*len] : NULL, cmd->msg.hdr.datatype, - cmd->msg.hdr.atomic_op, ioc[i].count, - cmd->msg.hdr.op_flags); - *len += ioc[i].count * ofi_datatype_size(cmd->msg.hdr.datatype); + for (i = *len = 0; i < ioc_count && *len < cmd->hdr.size; i++) { + smr_do_atomic(cmd, &src[*len], mr[i], ioc[i].addr, + comp ? 
&comp[*len] : NULL, cmd->hdr.datatype, + cmd->hdr.atomic_op, ioc[i].count, + cmd->hdr.op_flags); + *len += ioc[i].count * ofi_datatype_size(cmd->hdr.datatype); } - if (*len != cmd->msg.hdr.size) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "recv truncated"); + if (*len != cmd->hdr.size) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "recv truncated"); err = -FI_ETRUNC; } out: - if (!(cmd->msg.hdr.op_flags & SMR_RMA_REQ)) - smr_release_txbuf(ep->region, tx_buf); - + smr_return_cmd(ep, cmd); return err; } static int smr_start_common(struct smr_ep *ep, struct smr_cmd *cmd, - struct fi_peer_rx_entry *rx_entry) + struct fi_peer_rx_entry *rx_entry) { - struct smr_pend_entry *pend = NULL; - size_t total_len = 0; uint64_t comp_flags; void *comp_buf; int ret; - int err = 0; - switch (cmd->msg.hdr.op_src) { - case smr_src_inline: - err = smr_progress_inline(cmd, - (struct ofi_mr **) rx_entry->desc, - rx_entry->iov, rx_entry->count, &total_len); - break; - case smr_src_inject: - err = smr_progress_inject(cmd, - (struct ofi_mr **) rx_entry->desc, - rx_entry->iov, rx_entry->count, &total_len, - ep, 0); - break; - case smr_src_iov: - err = smr_progress_iov(cmd, rx_entry->iov, rx_entry->count, - &total_len, ep); - break; - case smr_src_mmap: - err = smr_progress_mmap(cmd, (struct ofi_mr **) rx_entry->desc, - rx_entry->iov, rx_entry->count, - &total_len, ep); - break; - case smr_src_sar: - pend = smr_progress_sar(cmd, rx_entry, - (struct ofi_mr **) rx_entry->desc, - rx_entry->iov, rx_entry->count, - &total_len, ep); - break; - case smr_src_ipc: - pend = smr_progress_ipc(cmd, rx_entry, - (struct ofi_mr **) rx_entry->desc, - rx_entry->iov, rx_entry->count, - &total_len, ep, &err); - break; - default: - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unidentified operation type\n"); - err = -FI_EINVAL; - } + rx_entry->peer_context = NULL; + assert (cmd->hdr.proto < smr_proto_max); + ret = smr_progress_ops[cmd->hdr.proto]( + ep, cmd, rx_entry, + (struct ofi_mr **) rx_entry->desc, + rx_entry->iov, rx_entry->count); - if (!pend) { + if (!cmd->hdr.rx_ctx) { comp_buf = rx_entry->iov[0].iov_base; comp_flags = smr_rx_cq_flags(rx_entry->flags, - cmd->msg.hdr.op_flags); - if (err) { + cmd->hdr.op_flags); + if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing op\n"); ret = smr_write_err_comp(ep->util_ep.rx_cq, rx_entry->context, comp_flags, rx_entry->tag, - -err); + ret); } else { - ret = smr_complete_rx(ep, rx_entry->context, cmd->msg.hdr.op, - comp_flags, total_len, comp_buf, - cmd->msg.hdr.id, cmd->msg.hdr.tag, - cmd->msg.hdr.data); + ret = smr_complete_rx(ep, rx_entry->context, + cmd->hdr.op, comp_flags, + cmd->hdr.size, comp_buf, + cmd->hdr.rx_id, cmd->hdr.tag, + cmd->hdr.cq_data); } if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -780,49 +735,50 @@ static int smr_start_common(struct smr_ep *ep, struct smr_cmd *cmd, } static int smr_copy_saved(struct smr_cmd_ctx *cmd_ctx, - struct fi_peer_rx_entry *rx_entry) + struct fi_peer_rx_entry *rx_entry) { struct smr_unexp_buf *sar_buf; + struct smr_pend_entry *sar_entry; size_t bytes = 0; uint64_t comp_flags; int ret; + sar_entry = (struct smr_pend_entry *) cmd_ctx->pend; while (!slist_empty(&cmd_ctx->buf_list)) { slist_remove_head_container(&cmd_ctx->buf_list, - struct smr_unexp_buf, sar_buf, entry); + struct smr_unexp_buf, sar_buf, + entry); bytes += ofi_copy_to_mr_iov((struct ofi_mr **) rx_entry->desc, - rx_entry->iov, rx_entry->count, bytes, - sar_buf->buf, - MIN(cmd_ctx->cmd.msg.hdr.size - bytes, - SMR_SAR_SIZE)); + rx_entry->iov, rx_entry->count, + bytes, sar_buf->buf, + 
MIN(cmd_ctx->cmd->hdr.size - bytes, + SMR_SAR_SIZE)); ofi_buf_free(sar_buf); } - if (bytes != cmd_ctx->cmd.msg.hdr.size) { - assert(cmd_ctx->sar_entry); - cmd_ctx->sar_entry->cmd_ctx = NULL; - cmd_ctx->sar_entry->rx_entry = rx_entry; - memcpy(cmd_ctx->sar_entry->iov, rx_entry->iov, + if (bytes != cmd_ctx->cmd->hdr.size) { + sar_entry->rx.rx_entry = rx_entry; + rx_entry->peer_context = NULL; + memcpy(sar_entry->iov, rx_entry->iov, sizeof(*rx_entry->iov) * rx_entry->count); - cmd_ctx->sar_entry->iov_count = rx_entry->count; - (void) ofi_truncate_iov(cmd_ctx->sar_entry->iov, - &cmd_ctx->sar_entry->iov_count, - cmd_ctx->cmd.msg.hdr.size); - memcpy(cmd_ctx->sar_entry->mr, rx_entry->desc, - sizeof(*rx_entry->desc) * cmd_ctx->sar_entry->iov_count); + sar_entry->iov_count = rx_entry->count; + (void) ofi_truncate_iov(sar_entry->iov, + &sar_entry->iov_count, + cmd_ctx->cmd->hdr.size); + memcpy(sar_entry->mr, rx_entry->desc, + sizeof(*rx_entry->desc) * sar_entry->iov_count); return FI_SUCCESS; } - assert(!cmd_ctx->sar_entry); comp_flags = smr_rx_cq_flags(rx_entry->flags, - cmd_ctx->cmd.msg.hdr.op_flags); + cmd_ctx->cmd->hdr.op_flags); ret = smr_complete_rx(cmd_ctx->ep, rx_entry->context, - cmd_ctx->cmd.msg.hdr.op, comp_flags, + cmd_ctx->cmd->hdr.op, comp_flags, bytes, rx_entry->iov[0].iov_base, - cmd_ctx->cmd.msg.hdr.id, - cmd_ctx->cmd.msg.hdr.tag, - cmd_ctx->cmd.msg.hdr.data); + cmd_ctx->cmd->hdr.rx_id, + cmd_ctx->cmd->hdr.tag, + cmd_ctx->cmd->hdr.cq_data); if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "unable to process rx completion\n"); @@ -836,15 +792,26 @@ static int smr_copy_saved(struct smr_cmd_ctx *cmd_ctx, int smr_unexp_start(struct fi_peer_rx_entry *rx_entry) { struct smr_cmd_ctx *cmd_ctx = rx_entry->peer_context; - int ret; + int ret = FI_SUCCESS; - if (cmd_ctx->cmd.msg.hdr.op_src == smr_src_sar || - cmd_ctx->cmd.msg.hdr.op_src == smr_src_inject) + switch (cmd_ctx->cmd->hdr.proto) { + case smr_proto_sar: ret = smr_copy_saved(cmd_ctx, rx_entry); - else - ret = smr_start_common(cmd_ctx->ep, &cmd_ctx->cmd, rx_entry); + break; + case smr_proto_inline: + case smr_proto_inject: + case smr_proto_iov: + case smr_proto_ipc: + ret = smr_start_common(cmd_ctx->ep, cmd_ctx->cmd, rx_entry); + break; + default: + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unidentified operation type\n"); + ret = -FI_EINVAL; + } ofi_buf_free(cmd_ctx); + rx_entry->peer_context = NULL; return ret; } @@ -852,64 +819,54 @@ int smr_unexp_start(struct fi_peer_rx_entry *rx_entry) static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd) { struct smr_region *peer_smr; - struct smr_inject_buf *tx_buf; - size_t inj_offset; int64_t idx = -1; int ret = 0; - inj_offset = (size_t) cmd->msg.hdr.src_data; - tx_buf = smr_get_ptr(ep->region, inj_offset); - - ret = smr_map_add(&smr_prov, ep->region->map, - (char *) tx_buf->data, &idx); - if (ret || idx < 0) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Error processing mapping request\n"); - return; - } + ofi_genlock_lock(&ep->util_ep.av->lock); + smr_map_add(ep->map, (char *) cmd->data.msg, &idx); - peer_smr = smr_peer_region(ep->region, idx); + peer_smr = smr_peer_region(ep, idx); if (!peer_smr) { - ofi_spin_lock(&ep->region->map->lock); - ret = smr_map_to_region(&smr_prov, ep->region->map, idx); - ofi_spin_unlock(&ep->region->map->lock); + ret = smr_map_to_region(ep->map, idx); if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "Could not map peer region\n"); - return; + goto out; } - peer_smr = smr_peer_region(ep->region, idx); + peer_smr = smr_peer_region(ep, idx); } 
assert(peer_smr); - if (peer_smr->pid != (int) cmd->msg.hdr.data) { + if (peer_smr->pid != (int) cmd->hdr.cq_data) { /* TODO track and update/complete in error any transfers * to or from old mapping */ - ofi_spin_lock(&ep->region->map->lock); - smr_unmap_region(&smr_prov, ep->region->map, idx, false); - smr_map_to_region(&smr_prov, ep->region->map, idx); - ofi_spin_unlock(&ep->region->map->lock); - peer_smr = smr_peer_region(ep->region, idx); + ofi_genlock_lock(&ep->util_ep.av->lock); + smr_unmap_region(ep->map, idx, false); + smr_map_to_region(ep->map, idx); + ofi_genlock_unlock(&ep->util_ep.av->lock); + peer_smr = smr_peer_region(ep, idx); } - smr_set_ipc_valid(ep->region, idx); - smr_peer_data(peer_smr)[cmd->msg.hdr.id].addr.id = idx; - smr_peer_data(ep->region)[idx].addr.id = cmd->msg.hdr.id; + smr_set_ipc_valid(ep, idx); + smr_peer_data(peer_smr)[cmd->hdr.tx_id].id = idx; + smr_peer_data(ep->region)[idx].id = cmd->hdr.tx_id; + smr_peer_data(ep->region)[idx].local_region = (uintptr_t) peer_smr; - smr_release_txbuf(ep->region, tx_buf); - assert(ep->region->map->num_peers > 0); - ep->region->max_sar_buf_per_peer = SMR_MAX_PEERS / - ep->region->map->num_peers; + assert(ep->map->num_peers > 0); + ep->region->max_sar_buf_per_peer = MIN( + SMR_BUF_BATCH_MAX, + SMR_MAX_PEERS / ep->map->num_peers); +out: + ofi_genlock_unlock(&ep->util_ep.av->lock); } static int smr_alloc_cmd_ctx(struct smr_ep *ep, - struct fi_peer_rx_entry *rx_entry, struct smr_cmd *cmd) + struct fi_peer_rx_entry *rx_entry, + struct smr_cmd *cmd) { struct smr_cmd_ctx *cmd_ctx; struct smr_pend_entry *sar_entry; - struct smr_inject_buf *tx_buf; - struct smr_unexp_buf *buf; cmd_ctx = ofi_buf_alloc(ep->cmd_ctx_pool); if (!cmd_ctx) { @@ -918,56 +875,56 @@ static int smr_alloc_cmd_ctx(struct smr_ep *ep, return -FI_ENOMEM; } cmd_ctx->ep = ep; + cmd_ctx->cmd = cmd; - rx_entry->msg_size = cmd->msg.hdr.size; - if (cmd->msg.hdr.op_flags & SMR_REMOTE_CQ_DATA) { + rx_entry->msg_size = cmd->hdr.size; + if (cmd->hdr.op_flags & SMR_REMOTE_CQ_DATA) { rx_entry->flags |= FI_REMOTE_CQ_DATA; - rx_entry->cq_data = cmd->msg.hdr.data; + rx_entry->cq_data = cmd->hdr.cq_data; } - if (cmd->msg.hdr.op_src == smr_src_inline) { - memcpy(&cmd_ctx->cmd, cmd, sizeof(cmd->msg.hdr) + cmd->msg.hdr.size); - } else if (cmd->msg.hdr.op_src == smr_src_inject) { - memcpy(&cmd_ctx->cmd, cmd, sizeof(cmd->msg.hdr)); - buf = ofi_buf_alloc(ep->unexp_buf_pool); - if (!buf) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Error allocating buffer\n"); - ofi_buf_free(cmd_ctx); - return -FI_ENOMEM; - } - cmd_ctx->sar_entry = NULL; - slist_init(&cmd_ctx->buf_list); - slist_insert_tail(&buf->entry, &cmd_ctx->buf_list); - tx_buf = smr_get_ptr(ep->region, (size_t) cmd->msg.hdr.src_data); - memcpy(buf->buf, tx_buf->buf, cmd->msg.hdr.size); - smr_release_txbuf(ep->region, tx_buf); - } else if (cmd->msg.hdr.op_src == smr_src_sar) { - memcpy(&cmd_ctx->cmd, cmd, sizeof(*cmd)); - slist_init(&cmd_ctx->buf_list); - - if (cmd->msg.hdr.size) { - sar_entry = ofi_buf_alloc(ep->pend_buf_pool); + switch(cmd->hdr.proto) { + case smr_proto_inline: + cmd_ctx->cmd = &cmd_ctx->cmd_cpy; + memcpy(&cmd_ctx->cmd_cpy, cmd, + sizeof(cmd->hdr) + cmd->hdr.size); + goto out; + case smr_proto_inject: + case smr_proto_ipc: + case smr_proto_iov: + cmd_ctx->cmd = cmd; + goto out; + case smr_proto_sar: + cmd_ctx->cmd = &cmd_ctx->cmd_cpy; + memcpy(&cmd_ctx->cmd_cpy, cmd, + sizeof(cmd->hdr) + cmd->hdr.size); + + if (cmd->hdr.size) { + sar_entry = ofi_buf_alloc(ep->pend_pool); if (!sar_entry) { FI_WARN(&smr_prov, 
FI_LOG_EP_CTRL, "Error allocating sar entry\n"); ofi_buf_free(cmd_ctx); return -FI_ENOMEM; } + cmd->hdr.rx_ctx = (uintptr_t) sar_entry; - memcpy(&sar_entry->cmd, cmd, sizeof(*cmd)); - sar_entry->cmd_ctx = cmd_ctx; + slist_init(&cmd_ctx->buf_list); sar_entry->bytes_done = 0; - sar_entry->rx_entry = rx_entry; - - dlist_insert_tail(&sar_entry->entry, &ep->sar_list); + sar_entry->rx.rx_entry = rx_entry; - cmd_ctx->sar_entry = sar_entry; + smr_buffer_sar(ep, sar_entry, cmd_ctx); } - } else { - memcpy(&cmd_ctx->cmd, cmd, sizeof(*cmd)); + smr_return_cmd(ep, cmd); + break; + default: + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unidentified operation type\n"); + ofi_buf_free(cmd_ctx); + return -FI_EINVAL; } +out: rx_entry->peer_context = cmd_ctx; return FI_SUCCESS; } @@ -978,10 +935,13 @@ static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) struct fi_peer_rx_entry *rx_entry; int ret; - attr.addr = ep->region->map->peers[cmd->msg.hdr.id].fiaddr; - attr.msg_size = cmd->msg.hdr.size; - attr.tag = cmd->msg.hdr.tag; - if (cmd->msg.hdr.op == ofi_op_tagged) { + if (cmd->hdr.rx_ctx) + return smr_progress_pending(ep, cmd); + + attr.addr = ep->map->peers[cmd->hdr.rx_id].fiaddr; + attr.msg_size = cmd->hdr.size; + attr.tag = cmd->hdr.tag; + if (cmd->hdr.op == ofi_op_tagged) { ret = ep->srx->owner_ops->get_tag(ep->srx, &attr, &rx_entry); if (ret == -FI_ENOENT) { ret = smr_alloc_cmd_ctx(ep, rx_entry, cmd); @@ -1024,140 +984,109 @@ static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) return ret < 0 ? ret : 0; } -static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd, - struct smr_cmd *rma_cmd) +static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd) { - struct smr_region *peer_smr; struct smr_domain *domain; - struct smr_resp *resp; struct iovec iov[SMR_IOV_LIMIT]; + struct fi_rma_iov *rma_iov; size_t iov_count; - size_t total_len = 0; - int err = 0, ret = 0; + int ret = 0; struct ofi_mr *mr[SMR_IOV_LIMIT]; + if (cmd->hdr.rx_ctx) + return smr_progress_pending(ep, cmd); + domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); ofi_genlock_lock(&domain->util_domain.lock); - for (iov_count = 0; iov_count < rma_cmd->rma.rma_count; iov_count++) { + for (iov_count = 0; iov_count < cmd->rma.rma_count; iov_count++) { + rma_iov = &cmd->rma.rma_iov[iov_count]; ret = ofi_mr_map_verify(&domain->util_domain.mr_map, - (uintptr_t *) &(rma_cmd->rma.rma_iov[iov_count].addr), - rma_cmd->rma.rma_iov[iov_count].len, - rma_cmd->rma.rma_iov[iov_count].key, - ofi_rx_mr_reg_flags(cmd->msg.hdr.op, 0), - (void **) &mr[iov_count]); + (uintptr_t *) &(rma_iov->addr), + rma_iov->len, rma_iov->key, + ofi_rx_mr_reg_flags(cmd->hdr.op, 0), + (void **) &mr[iov_count]); if (ret) break; - iov[iov_count].iov_base = (void *) rma_cmd->rma.rma_iov[iov_count].addr; - iov[iov_count].iov_len = rma_cmd->rma.rma_iov[iov_count].len; + iov[iov_count].iov_base = (void *) rma_iov->addr; + iov[iov_count].iov_len = rma_iov->len; } ofi_genlock_unlock(&domain->util_domain.lock); if (ret) goto out; - switch (cmd->msg.hdr.op_src) { - case smr_src_inline: - err = smr_progress_inline(cmd, mr, iov, iov_count, &total_len); - break; - case smr_src_inject: - err = smr_progress_inject(cmd, mr, iov, iov_count, &total_len, - ep, ret); - if (cmd->msg.hdr.op == ofi_op_read_req && cmd->msg.hdr.data) { - peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data); - resp->status = -err; - } - break; - case smr_src_iov: - err = smr_progress_iov(cmd, 
iov, iov_count, &total_len, ep); - break; - case smr_src_mmap: - err = smr_progress_mmap(cmd, mr, iov, iov_count, &total_len, - ep); - break; - case smr_src_sar: - if (smr_progress_sar(cmd, NULL, mr, iov, iov_count, &total_len, - ep)) - return ret; - break; - case smr_src_ipc: - if (smr_progress_ipc(cmd, NULL, mr, iov, iov_count, &total_len, - ep, &ret)) - return ret; - break; - default: - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unidentified operation type\n"); - err = -FI_EINVAL; - } + assert(cmd->hdr.proto < smr_proto_max); + ret = smr_progress_ops[cmd->hdr.proto](ep, cmd, NULL, mr, iov, + iov_count); - if (err) { + if (cmd->hdr.rx_ctx) + goto out; + + if (ret && ret != -FI_EBUSY) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing rma op\n"); ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, - smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), - 0, -err); + smr_rx_cq_flags(0, cmd->hdr.op_flags), + 0, ret); } else { - ret = smr_complete_rx(ep, (void *) cmd->msg.hdr.msg_id, - cmd->msg.hdr.op, smr_rx_cq_flags(0, - cmd->msg.hdr.op_flags), total_len, - iov_count ? iov[0].iov_base : NULL, - cmd->msg.hdr.id, 0, cmd->msg.hdr.data); + ret = smr_complete_rx(ep, NULL, cmd->hdr.op, + smr_rx_cq_flags(0, cmd->hdr.op_flags), + cmd->hdr.size, + iov_count ? iov[0].iov_base : NULL, + cmd->hdr.rx_id, 0, cmd->hdr.cq_data); } if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process rx completion\n"); + "unable to process rx completion\n"); } out: return ret; } -static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd, - struct smr_cmd *rma_cmd) +static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd) { - struct smr_region *peer_smr; struct smr_domain *domain; - struct smr_resp *resp; struct ofi_mr *mr[SMR_IOV_LIMIT]; struct fi_ioc ioc[SMR_IOV_LIMIT]; - size_t ioc_count; - size_t total_len = 0; + struct fi_rma_ioc *rma_ioc; + size_t ioc_count, dt_size, total_len = 0; int err = 0, ret = 0; domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); + dt_size = ofi_datatype_size(cmd->hdr.datatype); ofi_genlock_lock(&domain->util_domain.lock); - for (ioc_count = 0; ioc_count < rma_cmd->rma.rma_count; ioc_count++) { + for (ioc_count = 0; ioc_count < cmd->rma.rma_count; ioc_count++) { + rma_ioc = &cmd->rma.rma_ioc[ioc_count]; ret = ofi_mr_map_verify(&domain->util_domain.mr_map, - (uintptr_t *) &(rma_cmd->rma.rma_ioc[ioc_count].addr), - rma_cmd->rma.rma_ioc[ioc_count].count * - ofi_datatype_size(cmd->msg.hdr.datatype), - rma_cmd->rma.rma_ioc[ioc_count].key, - ofi_rx_mr_reg_flags(cmd->msg.hdr.op, - cmd->msg.hdr.atomic_op), - (void **) &mr[ioc_count]); + (uintptr_t *) &(rma_ioc->addr), + rma_ioc->count * dt_size, + rma_ioc->key, + ofi_rx_mr_reg_flags(cmd->hdr.op, + cmd->hdr.atomic_op), + (void **) &mr[ioc_count]); if (ret) break; - ioc[ioc_count].addr = (void *) rma_cmd->rma.rma_ioc[ioc_count].addr; - ioc[ioc_count].count = rma_cmd->rma.rma_ioc[ioc_count].count; + ioc[ioc_count].addr = (void *) rma_ioc->addr; + ioc[ioc_count].count = rma_ioc->count; } ofi_genlock_unlock(&domain->util_domain.lock); if (ret) goto out; - switch (cmd->msg.hdr.op_src) { - case smr_src_inline: + switch (cmd->hdr.proto) { + case smr_proto_inline: err = smr_progress_inline_atomic(cmd, mr, ioc, ioc_count, &total_len); break; - case smr_src_inject: + case smr_proto_inject: err = smr_progress_inject_atomic(cmd, mr, ioc, ioc_count, &total_len, ep, ret); break; @@ -1166,30 +1095,20 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd, "unidentified operation type\n"); 
err = -FI_EINVAL; } - if (cmd->msg.hdr.data) { - peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); - resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data); - /* - * smr_do_atomic will do memcpy when flags has SMR_RMA_REQ. - * Add a memory barrier before updating resp status to ensure - * the buffer is ready before the status update. - */ - if (cmd->msg.hdr.op_flags & SMR_RMA_REQ) - ofi_wmb(); - resp->status = -err; - } + cmd->hdr.status = -err; if (err) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing atomic op\n"); ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, - smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), - 0, err); + smr_rx_cq_flags(0, + cmd->hdr.op_flags), 0, err); } else { - ret = smr_complete_rx(ep, NULL, cmd->msg.hdr.op, - smr_rx_cq_flags(0, cmd->msg.hdr.op_flags), - total_len, ioc_count ? ioc[0].addr : NULL, - cmd->msg.hdr.id, 0, cmd->msg.hdr.data); + ret = smr_complete_rx(ep, NULL, cmd->hdr.op, + smr_rx_cq_flags(0, + cmd->hdr.op_flags), total_len, + ioc_count ? ioc[0].addr : NULL, + cmd->hdr.rx_id, 0, cmd->hdr.cq_data); } if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -1204,49 +1123,36 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd, static void smr_progress_cmd(struct smr_ep *ep) { struct smr_cmd_entry *ce; + struct smr_cmd *cmd; int ret = 0; int64_t pos; - /* ep->util_ep.lock is used to serialize the message/tag matching. - * We keep the lock until the matching is complete. This will - * ensure that commands are matched in the order they are - * received, if there are multiple progress threads. - * - * This lock should be low cost because it's only used by this - * single process. It is also optimized to be a noop if - * multi-threading is disabled. - * - * Other processes are free to post on the queue without the need - * for locking the queue. 
- */ - ofi_genlock_lock(&ep->util_ep.lock); while (1) { ret = smr_cmd_queue_head(smr_cmd_queue(ep->region), &ce, &pos); if (ret == -FI_ENOENT) break; - switch (ce->cmd.msg.hdr.op) { + + cmd = (struct smr_cmd *) ce->ptr; + switch (cmd->hdr.op) { case ofi_op_msg: case ofi_op_tagged: - ret = smr_progress_cmd_msg(ep, &ce->cmd); + ret = smr_progress_cmd_msg(ep, cmd); break; case ofi_op_write: case ofi_op_read_req: - ret = smr_progress_cmd_rma(ep, &ce->cmd, - &ce->rma_cmd); + ret = smr_progress_cmd_rma(ep, cmd); break; case ofi_op_write_async: case ofi_op_read_async: - ofi_ep_peer_rx_cntr_inc(&ep->util_ep, - ce->cmd.msg.hdr.op); + ofi_ep_peer_rx_cntr_inc(&ep->util_ep, cmd->hdr.op); break; case ofi_op_atomic: case ofi_op_atomic_fetch: case ofi_op_atomic_compare: - ret = smr_progress_cmd_atomic(ep, &ce->cmd, - &ce->rma_cmd); + ret = smr_progress_cmd_atomic(ep, cmd); break; case SMR_OP_MAX + ofi_ctrl_connreq: - smr_progress_connreq(ep, &ce->cmd); + smr_progress_connreq(ep, cmd); break; default: FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -1262,182 +1168,87 @@ static void smr_progress_cmd(struct smr_ep *ep) break; } } - ofi_genlock_unlock(&ep->util_ep.lock); } -void smr_progress_ipc_list(struct smr_ep *ep) +static void smr_progress_async_ipc(struct smr_ep *ep, + struct smr_pend_entry *ipc_entry) { - struct smr_pend_entry *ipc_entry; - struct smr_region *peer_smr; struct smr_domain *domain; enum fi_hmem_iface iface; - struct dlist_entry *tmp; - struct smr_resp *resp; uint64_t device; - uint64_t flags; - void *context; int ret; domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); - /* after the synchronize all operations should be complete */ - dlist_foreach_container_safe(&ep->ipc_cpy_pend_list, - struct smr_pend_entry, - ipc_entry, entry, tmp) { - iface = ipc_entry->cmd.msg.data.ipc_info.iface; - device = ipc_entry->cmd.msg.data.ipc_info.device; - peer_smr = smr_peer_region(ep->region, ipc_entry->cmd.msg.hdr.id); - resp = smr_get_ptr(peer_smr, ipc_entry->cmd.msg.hdr.src_data); - - if (ofi_async_copy_query(iface, ipc_entry->async_event)) - continue; - - if (ipc_entry->rx_entry) { - context = ipc_entry->rx_entry->context; - flags = smr_rx_cq_flags(ipc_entry->rx_entry->flags, - ipc_entry->cmd.msg.hdr.op_flags); - } else { - context = NULL; - flags = smr_rx_cq_flags(0, ipc_entry->cmd.msg.hdr.op_flags); - } + iface = ipc_entry->cmd->data.ipc_info.iface; + device = ipc_entry->cmd->data.ipc_info.device; - ret = smr_complete_rx(ep, context, ipc_entry->cmd.msg.hdr.op, - flags, ipc_entry->cmd.msg.hdr.size, - ipc_entry->iov[0].iov_base, - ipc_entry->cmd.msg.hdr.id, - ipc_entry->cmd.msg.hdr.tag, - ipc_entry->cmd.msg.hdr.data); - if (ret) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process rx completion\n"); - } + if (ofi_async_copy_query(iface, ipc_entry->rx.async_event)) + return; - /* indicate that the operation is completed only after we - * have confirmed that the write has finished. 
This is to - * ensure that the tx_complete occurs after the sending - * buffer is now free to be reused - */ - resp->status = SMR_STATUS_SUCCESS; - - ofi_mr_cache_delete(domain->ipc_cache, ipc_entry->ipc_entry); - ofi_free_async_copy_event(iface, device, - ipc_entry->async_event); - dlist_remove(&ipc_entry->entry); - if (ipc_entry->rx_entry) - ep->srx->owner_ops->free_entry(ipc_entry->rx_entry); - ofi_buf_free(ipc_entry); + ofi_mr_cache_delete(domain->ipc_cache, ipc_entry->rx.ipc_entry); + ofi_free_async_copy_event(iface, device, ipc_entry->rx.async_event); + + ret = smr_complete_rx(ep, ipc_entry->comp_ctx, ipc_entry->cmd->hdr.op, + ipc_entry->comp_flags, ipc_entry->cmd->hdr.size, + ipc_entry->iov[0].iov_base, + ipc_entry->cmd->hdr.rx_id, + ipc_entry->cmd->hdr.tag, + ipc_entry->cmd->hdr.cq_data); + if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process rx completion\n"); } + if (ipc_entry->rx.rx_entry) + ep->srx->owner_ops->free_entry(ipc_entry->rx.rx_entry); + + smr_return_cmd(ep, ipc_entry->cmd); + dlist_remove(&ipc_entry->entry); + ofi_buf_free(ipc_entry); } -static void smr_buffer_sar(struct smr_ep *ep, struct smr_region *peer_smr, - struct smr_resp *resp, struct smr_pend_entry *sar_entry) +static void smr_progress_async_sar(struct smr_ep *ep, + struct smr_pend_entry *pend) { - struct smr_sar_buf *sar_buf; - struct smr_unexp_buf *buf; - size_t bytes; - int next_buf = 0; + ssize_t ret; - while (next_buf < sar_entry->cmd.msg.data.buf_batch_size && - sar_entry->bytes_done < sar_entry->cmd.msg.hdr.size) { - buf = ofi_buf_alloc(ep->unexp_buf_pool); - if (!buf) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Error allocating buffer for unexpected SAR " - "(-FI_ENOMEM)\n"); + ret = pend->sar_copy_fn(ep, pend); + if (ret) { + if (ret == -FI_EAGAIN) + return; + /* -FI_EBUSY indicates copy was submitted successfully but will + * complete asynchronously through DSA progress + */ + if (ret == -FI_EBUSY) { + dlist_remove(&pend->entry); return; } - slist_insert_tail(&buf->entry, - &sar_entry->cmd_ctx->buf_list); - - sar_buf = smr_freestack_get_entry_from_index( - smr_sar_pool(ep->region), - sar_entry->cmd.msg.data.sar[next_buf]); - bytes = MIN(sar_entry->cmd.msg.hdr.size - - sar_entry->bytes_done, - SMR_SAR_SIZE); - - memcpy(buf->buf, sar_buf->buf, bytes); - - sar_entry->bytes_done += bytes; - next_buf++; + pend->cmd->hdr.status = ret; } - ofi_wmb(); - resp->status = SMR_STATUS_SAR_EMPTY; } -static void smr_progress_sar_list(struct smr_ep *ep) +void smr_progress_async(struct smr_ep *ep) { - struct smr_region *peer_smr; - struct smr_pend_entry *sar_entry; - struct smr_resp *resp; + struct smr_pend_entry *async_entry; struct dlist_entry *tmp; - void *comp_ctx; - uint64_t comp_flags; - int ret; - - ofi_genlock_lock(&ep->util_ep.lock); - dlist_foreach_container_safe(&ep->sar_list, struct smr_pend_entry, - sar_entry, entry, tmp) { - peer_smr = smr_peer_region(ep->region, sar_entry->cmd.msg.hdr.id); - resp = smr_get_ptr(peer_smr, sar_entry->cmd.msg.hdr.src_data); - if (sar_entry->cmd.msg.hdr.op == ofi_op_read_req) { - smr_try_progress_to_sar(ep, peer_smr, smr_sar_pool(ep->region), - resp, &sar_entry->cmd, sar_entry->mr, - sar_entry->iov, sar_entry->iov_count, - &sar_entry->bytes_done, sar_entry); - } else { - if (sar_entry->cmd_ctx) { - if (resp->status != SMR_STATUS_SAR_FULL) - continue; - smr_buffer_sar(ep, peer_smr, resp, sar_entry); - } else { - smr_try_progress_from_sar(ep, peer_smr, smr_sar_pool(ep->region), - resp, &sar_entry->cmd, sar_entry->mr, - sar_entry->iov, - sar_entry->iov_count, - 
&sar_entry->bytes_done, - sar_entry); - } - } - - if (sar_entry->bytes_done == sar_entry->cmd.msg.hdr.size) { - if (sar_entry->cmd_ctx) { - sar_entry->cmd_ctx->sar_entry = NULL; - dlist_remove(&sar_entry->entry); - ofi_buf_free(sar_entry); - continue; - } - if (sar_entry->rx_entry) { - comp_ctx = sar_entry->rx_entry->context; - comp_flags = smr_rx_cq_flags( - sar_entry->rx_entry->flags, - sar_entry->cmd.msg.hdr.op_flags); - } else { - comp_ctx = NULL; - comp_flags = smr_rx_cq_flags(0, - sar_entry->cmd.msg.hdr.op_flags); - } - ret = smr_complete_rx(ep, comp_ctx, - sar_entry->cmd.msg.hdr.op, comp_flags, - sar_entry->bytes_done, - sar_entry->iov[0].iov_base, - sar_entry->cmd.msg.hdr.id, - sar_entry->cmd.msg.hdr.tag, - sar_entry->cmd.msg.hdr.data); - if (ret) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process rx completion\n"); - } - if (sar_entry->rx_entry) - ep->srx->owner_ops->free_entry(sar_entry->rx_entry); - - dlist_remove(&sar_entry->entry); - ofi_buf_free(sar_entry); + dlist_foreach_container_safe(&ep->async_cpy_list, + struct smr_pend_entry, + async_entry, entry, tmp) { + switch (async_entry->cmd->hdr.proto) { + case smr_proto_ipc: + smr_progress_async_ipc(ep, async_entry); + break; + case smr_proto_sar: + smr_progress_async_sar(ep, async_entry); + break; + default: + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unidentified operation type\n"); + assert(0); } } - ofi_genlock_unlock(&ep->util_ep.lock); } void smr_ep_progress(struct util_ep *util_ep) @@ -1446,13 +1257,33 @@ void smr_ep_progress(struct util_ep *util_ep) ep = container_of(util_ep, struct smr_ep, util_ep); + /* ep->util_ep.lock is used to serialize the message/tag matching. + * We keep the lock until the matching is complete. This will + * ensure that commands are matched in the order they are + * received, if there are multiple progress threads. + * + * This lock should be low cost because it's only used by this + * single process. It is also optimized to be a noop if + * multi-threading is disabled. + * + * Other processes are free to post on the queue without the need + * for locking the queue. 
+ */ + ofi_genlock_lock(&ep->util_ep.lock); + if (smr_env.use_dsa_sar) smr_dsa_progress(ep); - smr_progress_resp(ep); - smr_progress_sar_list(ep); + + smr_progress_return(ep); + + if (!slist_empty(&ep->overflow_list)) + smr_progress_overflow(ep); + smr_progress_cmd(ep); /* always drive forward the ipc list since the completion is * independent of any action by the provider */ - ep->smr_progress_ipc_list(ep); -} + ep->smr_progress_async(ep); + + ofi_genlock_unlock(&ep->util_ep.lock); +} \ No newline at end of file diff --git a/prov/shm/src/smr_rma.c b/prov/shm/src/smr_rma.c index 08b201d1864..bf370cf0c68 100644 --- a/prov/shm/src/smr_rma.c +++ b/prov/shm/src/smr_rma.c @@ -34,26 +34,27 @@ #include "smr.h" static void smr_add_rma_cmd(struct smr_region *peer_smr, - const struct fi_rma_iov *rma_iov, size_t iov_count, - struct smr_cmd_entry *ce) + const struct fi_rma_iov *rma_iov, size_t iov_count, + struct smr_cmd *cmd) { - ce->rma_cmd.rma.rma_count = iov_count; - memcpy(ce->rma_cmd.rma.rma_iov, rma_iov, sizeof(*rma_iov) * iov_count); + cmd->rma.rma_count = iov_count; + memcpy(cmd->rma.rma_iov, rma_iov, sizeof(*rma_iov) * iov_count); } -static void smr_format_rma_resp(struct smr_cmd *cmd, fi_addr_t peer_id, +static void smr_format_rma_resp(struct smr_cmd *cmd, int64_t peer_id, const struct fi_rma_iov *rma_iov, size_t count, - size_t total_len, uint32_t op, uint64_t op_flags) + size_t total_len, uint32_t op, + uint64_t op_flags) { - smr_generic_format(cmd, peer_id, op, 0, 0, op_flags); - cmd->msg.hdr.size = total_len; + smr_generic_format(cmd, 0, peer_id, op, 0, 0, op_flags); + cmd->hdr.size = total_len; } static ssize_t smr_rma_fast(struct smr_ep *ep, struct smr_region *peer_smr, - const struct iovec *iov, size_t iov_count, - const struct fi_rma_iov *rma_iov, size_t rma_count, - void **desc, int peer_id, int id, void *context, - uint32_t op, uint64_t op_flags) + const struct iovec *iov, size_t iov_count, + const struct fi_rma_iov *rma_iov, size_t rma_count, + void **desc, int rx_id, int tx_id, void *context, + uint32_t op, uint64_t op_flags) { struct iovec vma_iovec[SMR_IOV_LIMIT], rma_iovec[SMR_IOV_LIMIT]; struct ofi_xpmem_client *xpmem; @@ -74,36 +75,65 @@ static ssize_t smr_rma_fast(struct smr_ep *ep, struct smr_region *peer_smr, total_len = ofi_total_iov_len(iov, iov_count); - xpmem = &smr_peer_data(ep->region)[id].xpmem; + xpmem = &smr_peer_data(ep->region)[tx_id].xpmem; ret = ofi_shm_p2p_copy(ep->p2p_type, vma_iovec, iov_count, rma_iovec, rma_count, total_len, peer_smr->pid, op == ofi_op_write, xpmem); if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error doing fast RMA\n"); + ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, op_flags, 0, + ret); smr_cmd_queue_discard(ce, pos); return -FI_EAGAIN; } - smr_format_rma_resp(&ce->cmd, peer_id, rma_iov, rma_count, total_len, + smr_format_rma_resp(&ce->cmd, rx_id, rma_iov, rma_count, total_len, (op == ofi_op_write) ? 
ofi_op_write_async : ofi_op_read_async, op_flags); + + ce->ptr = smr_peer_to_peer(ep, tx_id, (uintptr_t) &ce->cmd); smr_cmd_queue_commit(ce, pos); + + ret = smr_complete_tx(ep, context, op, op_flags); + if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process tx completion\n"); + } + return FI_SUCCESS; } -static ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov, - size_t iov_count, const struct fi_rma_iov *rma_iov, size_t rma_count, - void **desc, fi_addr_t addr, void *context, uint32_t op, uint64_t data, - uint64_t op_flags) +static inline bool smr_do_fast_rma(struct smr_ep *ep, uint64_t op_flags, + size_t rma_count, size_t total_len, + struct smr_region *peer_smr) { struct smr_domain *domain; + + domain = container_of(ep->util_ep.domain, struct smr_domain, + util_domain); + + return domain->fast_rma && !(op_flags & + (FI_REMOTE_CQ_DATA | FI_DELIVERY_COMPLETE)) && + rma_count == 1 && smr_vma_enabled(ep, peer_smr) && + total_len > SMR_INJECT_SIZE; + +} + +static ssize_t smr_generic_rma( + struct smr_ep *ep, const struct iovec *iov, + size_t iov_count, const struct fi_rma_iov *rma_iov, + size_t rma_count, void **desc, fi_addr_t addr, void *context, + uint32_t op, uint64_t data, uint64_t op_flags) +{ struct smr_region *peer_smr; - int64_t id, peer_id; - int cmds, err = 0, proto = smr_src_inline; + int64_t tx_id, rx_id; + int proto = smr_proto_inline; ssize_t ret = 0; size_t total_len; struct smr_cmd_entry *ce; + struct smr_cmd *cmd; int64_t pos; assert(iov_count <= SMR_IOV_LIMIT); @@ -111,75 +141,64 @@ static ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov, assert(ofi_total_iov_len(iov, iov_count) == ofi_total_rma_iov_len(rma_iov, rma_count)); - domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); - - id = smr_verify_peer(ep, addr); - if (id < 0) + tx_id = smr_verify_peer(ep, addr); + if (tx_id < 0) return -FI_EAGAIN; - peer_id = smr_peer_data(ep->region)[id].addr.id; - peer_smr = smr_peer_region(ep->region, id); + rx_id = smr_peer_data(ep->region)[tx_id].id; + peer_smr = smr_peer_region(ep, tx_id); - cmds = 1 + !(domain->fast_rma && !(op_flags & - (FI_REMOTE_CQ_DATA | FI_DELIVERY_COMPLETE)) && - rma_count == 1 && smr_vma_enabled(ep, peer_smr)); - - if (smr_peer_data(ep->region)[id].sar_status) + if (smr_peer_data(ep->region)[tx_id].sar_status) return -FI_EAGAIN; ofi_genlock_lock(&ep->util_ep.lock); - if (cmds == 1) { - err = smr_rma_fast(ep, peer_smr, iov, iov_count, rma_iov, - rma_count, desc, peer_id, id, context, op, + total_len = ofi_total_iov_len(iov, iov_count); + if (smr_do_fast_rma(ep, op_flags, rma_count, total_len, peer_smr)) { + ret = smr_rma_fast(ep, peer_smr, iov, iov_count, rma_iov, + rma_count, desc, rx_id, tx_id, context, op, op_flags); - if (err) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "error doing fast RMA\n"); - if (err == -FI_EAGAIN) { - ret = -FI_EAGAIN; - goto unlock; - } - - ret = smr_write_err_comp(ep->util_ep.rx_cq, NULL, - op_flags, 0, -err); - } else { - ret = smr_complete_tx(ep, context, op, op_flags); - } - - if (ret) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process tx completion\n"); - } goto unlock; } ret = smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos); if (ret == -FI_ENOENT) { - /* kick the peer to process any outstanding commands */ ret = -FI_EAGAIN; goto unlock; } - total_len = ofi_total_iov_len(iov, iov_count); assert(!(op_flags & FI_INJECT) || total_len <= SMR_INJECT_SIZE); proto = smr_select_proto(desc, iov_count, smr_vma_enabled(ep, peer_smr), - smr_ipc_valid(ep, 
peer_smr, id, peer_id), op, + smr_ipc_valid(ep, peer_smr, tx_id, rx_id), op, total_len, op_flags); - - ret = smr_proto_ops[proto](ep, peer_smr, id, peer_id, op, 0, data, - op_flags, (struct ofi_mr **)desc, iov, - iov_count, total_len, context, &ce->cmd); + if (proto != smr_proto_inline) { + if (smr_freestack_isempty(smr_cmd_stack(ep->region))) { + smr_cmd_queue_discard(ce, pos); + ret = -FI_EAGAIN; + goto unlock; + } + cmd = smr_freestack_pop(smr_cmd_stack(ep->region)); + assert(cmd); + ce->ptr = smr_local_to_peer(ep, tx_id, rx_id, (uintptr_t) cmd); + } else { + cmd = &ce->cmd; + ce->ptr = smr_peer_to_peer(ep, tx_id, (uintptr_t) &ce->cmd); + } + ret = smr_send_ops[proto](ep, peer_smr, tx_id, rx_id, op, 0, data, + op_flags, (struct ofi_mr **)desc, iov, + iov_count, total_len, context, cmd); if (ret) { + if (proto != smr_proto_inline) + smr_freestack_push(smr_cmd_stack(ep->region), cmd); smr_cmd_queue_discard(ce, pos); goto unlock; } - smr_add_rma_cmd(peer_smr, rma_iov, rma_count, ce); + smr_add_rma_cmd(peer_smr, rma_iov, rma_count, cmd); smr_cmd_queue_commit(ce, pos); - if (proto != smr_src_inline && proto != smr_src_inject) + if (proto != smr_proto_inline || op == ofi_op_read_req) goto unlock; ret = smr_complete_tx(ep, context, op, op_flags); @@ -300,36 +319,33 @@ static ssize_t smr_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, flags | ep->util_ep.tx_msg_flags); } -static ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf, - size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key, - uint64_t data, uint64_t flags) +static ssize_t smr_generic_rma_inject( + struct fid_ep *ep_fid, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, uint64_t data, + uint64_t flags) { struct smr_ep *ep; - struct smr_domain *domain; struct smr_region *peer_smr; struct iovec iov; struct fi_rma_iov rma_iov; - int64_t id, peer_id; - int cmds, proto = smr_src_inline; + int64_t tx_id, rx_id; + int proto = smr_proto_inline; ssize_t ret = 0; + struct smr_cmd *cmd; struct smr_cmd_entry *ce; int64_t pos; assert(len <= SMR_INJECT_SIZE); ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); - id = smr_verify_peer(ep, dest_addr); - if (id < 0) + tx_id = smr_verify_peer(ep, dest_addr); + if (tx_id < 0) return -FI_EAGAIN; - peer_id = smr_peer_data(ep->region)[id].addr.id; - peer_smr = smr_peer_region(ep->region, id); - - cmds = 1 + !(domain->fast_rma && !(flags & FI_REMOTE_CQ_DATA) && - smr_vma_enabled(ep, peer_smr)); + rx_id = smr_peer_data(ep->region)[tx_id].id; + peer_smr = smr_peer_region(ep, tx_id); - if (smr_peer_data(ep->region)[id].sar_status) + if (smr_peer_data(ep->region)[tx_id].sar_status) return -FI_EAGAIN; iov.iov_base = (void *) buf; @@ -338,29 +354,46 @@ static ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf, rma_iov.len = len; rma_iov.key = key; - if (cmds == 1) { - ret = smr_rma_fast(ep, peer_smr, &iov, 1, &rma_iov, 1, NULL, - peer_id, id, NULL, ofi_op_write, flags); - goto out; - } + ofi_genlock_lock(&ep->util_ep.lock); ret = smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos); - if (ret == -FI_ENOENT) - return -FI_EAGAIN; + if (ret == -FI_ENOENT) { + ret = -FI_EAGAIN; + goto unlock; + } - proto = len <= SMR_MSG_DATA_LEN ? 
smr_src_inline : smr_src_inject; - ret = smr_proto_ops[proto](ep, peer_smr, id, peer_id, ofi_op_write, 0, - data, flags, NULL, &iov, 1, len, NULL, &ce->cmd); + if (len <= SMR_MSG_DATA_LEN) { + proto = smr_proto_inline; + cmd = &ce->cmd; + ce->ptr = smr_peer_to_peer(ep, tx_id, (uintptr_t) &ce->cmd); + } else { + proto = smr_proto_inject; + if (smr_freestack_isempty(smr_cmd_stack(ep->region))) { + smr_cmd_queue_discard(ce, pos); + ret = -FI_EAGAIN; + goto unlock; + } + + cmd = smr_freestack_pop(smr_cmd_stack(ep->region)); + assert(cmd); + ce->ptr = smr_local_to_peer(ep, tx_id, rx_id, (uintptr_t) cmd); + } + + ret = smr_send_ops[proto](ep, peer_smr, tx_id, rx_id, ofi_op_write, 0, + data, flags, NULL, &iov, 1, len, NULL, cmd); if (ret) { + if (proto != smr_proto_inline) + smr_freestack_push(smr_cmd_stack(ep->region), cmd); smr_cmd_queue_discard(ce, pos); - return -FI_EAGAIN; + goto unlock; } - smr_add_rma_cmd(peer_smr, &rma_iov, 1, ce); + smr_add_rma_cmd(peer_smr, &rma_iov, 1, cmd); smr_cmd_queue_commit(ce, pos); -out: - if (!ret) + if (proto == smr_proto_inline) ofi_ep_peer_tx_cntr_inc(&ep->util_ep, ofi_op_write); +unlock: + ofi_genlock_unlock(&ep->util_ep.lock); return ret; } @@ -380,8 +413,8 @@ static ssize_t smr_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, rma_iov.len = len; rma_iov.key = key; - return smr_generic_rma(ep, &iov, 1, &rma_iov, 1, &desc, dest_addr, context, - ofi_op_write, data, + return smr_generic_rma(ep, &iov, 1, &rma_iov, 1, &desc, dest_addr, + context, ofi_op_write, data, FI_REMOTE_CQ_DATA | smr_ep_tx_flags(ep)); } @@ -394,8 +427,9 @@ static ssize_t smr_rma_inject(struct fid_ep *ep_fid, const void *buf, } static ssize_t smr_inject_writedata(struct fid_ep *ep_fid, const void *buf, - size_t len, uint64_t data, fi_addr_t dest_addr, - uint64_t addr, uint64_t key) + size_t len, uint64_t data, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key) { return smr_generic_rma_inject(ep_fid, buf, len, dest_addr, addr, key, data, FI_REMOTE_CQ_DATA); @@ -412,4 +446,4 @@ struct fi_ops_rma smr_rma_ops = { .inject = smr_rma_inject, .writedata = smr_writedata, .injectdata = smr_inject_writedata, -}; +}; \ No newline at end of file diff --git a/prov/shm/src/smr_signal.h b/prov/shm/src/smr_signal.h index c356f8db611..5be75ab8cc7 100644 --- a/prov/shm/src/smr_signal.h +++ b/prov/shm/src/smr_signal.h @@ -62,7 +62,6 @@ static void smr_handle_signal(int signum, siginfo_t *info, void *ucontext) old_action[signum].sa_sigaction(signum, info, ucontext); else raise(signum); - } static inline void smr_reg_sig_handler(int signum) @@ -80,4 +79,4 @@ static inline void smr_reg_sig_handler(int signum) "Unable to register handler for sig %d\n", signum); } -#endif /* _SMR_SIGNAL_H_ */ +#endif /* _SMR_SIGNAL_H_ */ \ No newline at end of file diff --git a/prov/shm/src/smr_util.c b/prov/shm/src/smr_util.c index ba1fe7c2243..5563f0d89a5 100644 --- a/prov/shm/src/smr_util.c +++ b/prov/shm/src/smr_util.c @@ -30,7 +30,8 @@ * SOFTWARE. 
*/ -#include "smr.h" +#include "smr_util.h" +#include "ofi_shm_p2p.h" #include struct dlist_entry ep_name_list; @@ -55,8 +56,11 @@ void smr_cma_check(struct smr_region *smr, struct smr_region *peer_smr) int remote_pid; int ret; - if (smr != peer_smr && peer_smr->cma_cap_peer != SMR_VMA_CAP_NA) { - smr->cma_cap_peer = peer_smr->cma_cap_peer; + if (smr != peer_smr && peer_smr->flags & SMR_FLAG_CMA_INIT) { + smr_set_vma_cap(&smr->peer_vma_caps, FI_SHM_P2P_CMA, + smr_get_vma_cap(peer_smr->peer_vma_caps, + FI_SHM_P2P_CMA)); + smr->flags |= SMR_FLAG_CMA_INIT; return; } remote_pid = peer_smr->pid; @@ -70,20 +74,26 @@ void smr_cma_check(struct smr_region *smr, struct smr_region *peer_smr) assert(remote_pid == peer_smr->pid); if (smr == peer_smr) { - smr->cma_cap_self = (ret == -1) ? SMR_VMA_CAP_OFF : SMR_VMA_CAP_ON; + smr_set_vma_cap(&smr->self_vma_caps, FI_SHM_P2P_CMA, + (ret == -1) ? false : true); } else { - smr->cma_cap_peer = (ret == -1) ? SMR_VMA_CAP_OFF : SMR_VMA_CAP_ON; - peer_smr->cma_cap_peer = smr->cma_cap_peer; + smr_set_vma_cap(&smr->peer_vma_caps, FI_SHM_P2P_CMA, + (ret == -1) ? false : true); + smr_set_vma_cap(&peer_smr->peer_vma_caps, FI_SHM_P2P_CMA, + smr_get_vma_cap(smr->peer_vma_caps, + FI_SHM_P2P_CMA)); + smr->flags |= SMR_FLAG_CMA_INIT; + peer_smr->flags |= SMR_FLAG_CMA_INIT; } } size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count, - size_t *cmd_offset, size_t *resp_offset, - size_t *inject_offset, size_t *sar_offset, - size_t *peer_offset, size_t *name_offset) + size_t *cmd_offset, size_t *cs_offset, + size_t *inject_offset, size_t *rq_offset, + size_t *sar_offset, size_t *peer_offset) { - size_t cmd_queue_offset, resp_queue_offset, inject_pool_offset; - size_t sar_pool_offset, peer_data_offset, ep_name_offset; + size_t cmd_queue_offset, cmd_stack_offset, inject_pool_offset; + size_t ret_queue_offset, sar_pool_offset, peer_data_offset; size_t tx_size, rx_size, total_size; tx_size = roundup_power_of_two(tx_count); @@ -91,31 +101,31 @@ size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count, /* Align cmd_queue offset to cache line */ cmd_queue_offset = ofi_get_aligned_size(sizeof(struct smr_region), 64); - resp_queue_offset = cmd_queue_offset + sizeof(struct smr_cmd_queue) + + cmd_stack_offset = cmd_queue_offset + sizeof(struct smr_cmd_queue) + sizeof(struct smr_cmd_queue_entry) * rx_size; - inject_pool_offset = resp_queue_offset + sizeof(struct smr_resp_queue) + - sizeof(struct smr_resp) * tx_size; - sar_pool_offset = inject_pool_offset + - freestack_size(sizeof(struct smr_inject_buf), rx_size); + inject_pool_offset = cmd_stack_offset + + freestack_size(sizeof(struct smr_cmd), tx_size); + ret_queue_offset = inject_pool_offset + sizeof(struct smr_inject_buf) * tx_size; + ret_queue_offset = ofi_get_aligned_size(ret_queue_offset, 64); + sar_pool_offset = ret_queue_offset + sizeof(struct smr_return_queue) + + sizeof(struct smr_return_queue_entry) * tx_size; peer_data_offset = sar_pool_offset + freestack_size(sizeof(struct smr_sar_buf), SMR_MAX_PEERS); - ep_name_offset = peer_data_offset + sizeof(struct smr_peer_data) * + total_size = peer_data_offset + sizeof(struct smr_peer_data) * SMR_MAX_PEERS; - total_size = ep_name_offset + SMR_NAME_MAX; - if (cmd_offset) *cmd_offset = cmd_queue_offset; - if (resp_offset) - *resp_offset = resp_queue_offset; + if (cs_offset) + *cs_offset = cmd_stack_offset; if (inject_offset) *inject_offset = inject_pool_offset; + if (rq_offset) + *rq_offset = ret_queue_offset; if (sar_offset) *sar_offset = sar_pool_offset; if 
(peer_offset) *peer_offset = peer_data_offset; - if (name_offset) - *name_offset = ep_name_offset; /* * Revisit later to see if we really need the size adjustment, or @@ -165,29 +175,24 @@ static int smr_retry_map(const char *name, int *fd) return -FI_EBUSY; } -static void smr_lock_init(pthread_spinlock_t *lock) -{ - pthread_spin_init(lock, PTHREAD_PROCESS_SHARED); -} - /* TODO: Determine if aligning SMR data helps performance */ -int smr_create(const struct fi_provider *prov, struct smr_map *map, - const struct smr_attr *attr, struct smr_region *volatile *smr) +int smr_create(const struct fi_provider *prov, const struct smr_attr *attr, + struct smr_region *volatile *smr) { struct smr_ep_name *ep_name; - size_t total_size, cmd_queue_offset, peer_data_offset; - size_t resp_queue_offset, inject_pool_offset, name_offset; - size_t sar_pool_offset; + size_t total_size, cmd_queue_offset, ret_queue_offset, peer_data_offset; + size_t cmd_stack_offset, inject_pool_offset, sar_pool_offset; int fd, ret, i; void *mapped_addr; size_t tx_size, rx_size; tx_size = roundup_power_of_two(attr->tx_count); rx_size = roundup_power_of_two(attr->rx_count); - total_size = smr_calculate_size_offsets(tx_size, rx_size, &cmd_queue_offset, - &resp_queue_offset, &inject_pool_offset, - &sar_pool_offset, &peer_data_offset, - &name_offset); + total_size = smr_calculate_size_offsets( + tx_size, rx_size, &cmd_queue_offset, + &cmd_stack_offset, &inject_pool_offset, + &ret_queue_offset, &sar_pool_offset, + &peer_data_offset); fd = shm_open(attr->name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { @@ -248,9 +253,7 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map, pthread_mutex_unlock(&ep_list_lock); *smr = mapped_addr; - smr_lock_init(&(*smr)->lock); - (*smr)->map = map; (*smr)->version = SMR_VERSION; (*smr)->flags = attr->flags; @@ -261,12 +264,8 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map, (*smr)->flags |= SMR_FLAG_DEBUG; #endif - (*smr)->cma_cap_peer = SMR_VMA_CAP_NA; - (*smr)->cma_cap_self = SMR_VMA_CAP_NA; - - (*smr)->xpmem_cap_self = SMR_VMA_CAP_OFF; - if (xpmem && smr_env.use_xpmem) { - (*smr)->xpmem_cap_self = SMR_VMA_CAP_ON; + if (xpmem && attr->flags & SMR_FLAG_XPMEM_ENABLED) { + smr_set_vma_cap(&(*smr)->self_vma_caps, FI_SHM_P2P_XPMEM, true); (*smr)->xpmem_self = xpmem->pinfo; } @@ -274,27 +273,28 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map, (*smr)->total_size = total_size; (*smr)->cmd_queue_offset = cmd_queue_offset; - (*smr)->resp_queue_offset = resp_queue_offset; + (*smr)->cmd_stack_offset = cmd_stack_offset; (*smr)->inject_pool_offset = inject_pool_offset; + (*smr)->ret_queue_offset = ret_queue_offset; (*smr)->sar_pool_offset = sar_pool_offset; (*smr)->peer_data_offset = peer_data_offset; - (*smr)->name_offset = name_offset; (*smr)->max_sar_buf_per_peer = SMR_BUF_BATCH_MAX; smr_cmd_queue_init(smr_cmd_queue(*smr), rx_size); - smr_resp_queue_init(smr_resp_queue(*smr), tx_size); - smr_freestack_init(smr_inject_pool(*smr), rx_size, - sizeof(struct smr_inject_buf)); + smr_return_queue_init(smr_return_queue(*smr), tx_size); + + smr_freestack_init(smr_cmd_stack(*smr), tx_size, + sizeof(struct smr_cmd)); smr_freestack_init(smr_sar_pool(*smr), SMR_MAX_PEERS, - sizeof(struct smr_sar_buf)); + sizeof(struct smr_sar_buf)); for (i = 0; i < SMR_MAX_PEERS; i++) { - smr_peer_data(*smr)[i].addr.id = -1; - smr_peer_data(*smr)[i].sar_status = 0; + smr_peer_data(*smr)[i].id = -1; + smr_peer_data(*smr)[i].sar_status = SMR_SAR_FREE; 
smr_peer_data(*smr)[i].name_sent = 0; smr_peer_data(*smr)[i].xpmem.avail = false; } - strncpy((char *) smr_name(*smr), attr->name, total_size - name_offset); + strcpy((*smr)->name, attr->name); /* Must be set last to signal full initialization to peers */ (*smr)->pid = getpid(); @@ -314,293 +314,6 @@ void smr_free(struct smr_region *smr) { if (smr->flags & SMR_FLAG_HMEM_ENABLED) (void) ofi_hmem_host_unregister(smr); - shm_unlink(smr_name(smr)); + shm_unlink(smr->name); munmap(smr, smr->total_size); -} - -static int smr_match_name(struct dlist_entry *item, const void *args) -{ - return !strcmp(container_of(item, struct smr_ep_name, entry)->name, - (char *) args); -} - -int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, - int64_t id) -{ - struct smr_peer *peer_buf = &map->peers[id]; - struct smr_region *peer; - struct util_ep *util_ep; - struct smr_ep *smr_ep; - struct smr_av *av; - size_t size; - int fd, ret = 0; - struct stat sts; - struct dlist_entry *entry; - const char *name = smr_no_prefix(peer_buf->peer.name); - char tmp[SMR_PATH_MAX]; - - pthread_mutex_lock(&ep_list_lock); - entry = dlist_find_first_match(&ep_name_list, smr_match_name, name); - if (entry) { - peer_buf->region = container_of(entry, struct smr_ep_name, - entry)->region; - pthread_mutex_unlock(&ep_list_lock); - return FI_SUCCESS; - } - pthread_mutex_unlock(&ep_list_lock); - - if (peer_buf->region) - return FI_SUCCESS; - - assert(ofi_spin_held(&map->lock)); - fd = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); - if (fd < 0) { - FI_WARN_ONCE(prov, FI_LOG_AV, - "shm_open error: name %s errno %d\n", name, errno); - return -errno; - } - - memset(tmp, 0, sizeof(tmp)); - snprintf(tmp, sizeof(tmp), "%s%s", SMR_DIR, name); - if (stat(tmp, &sts) == -1) { - ret = -errno; - goto out; - } - - if (sts.st_size < sizeof(*peer)) { - ret = -FI_ENOENT; - goto out; - } - - peer = mmap(NULL, sizeof(*peer), PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (peer == MAP_FAILED) { - FI_WARN(prov, FI_LOG_AV, "mmap error\n"); - ret = -errno; - goto out; - } - - if (!peer->pid) { - FI_WARN(prov, FI_LOG_AV, "peer not initialized\n"); - munmap(peer, sizeof(*peer)); - ret = -FI_ENOENT; - goto out; - } - - size = peer->total_size; - munmap(peer, sizeof(*peer)); - - peer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - peer_buf->region = peer; - - if (map->flags & SMR_FLAG_HMEM_ENABLED) { - ret = ofi_hmem_host_register(peer, peer->total_size); - if (ret) - FI_WARN(prov, FI_LOG_EP_CTRL, - "unable to register shm with iface\n"); - if (ofi_hmem_is_initialized(FI_HMEM_ZE)) { - peer_buf->pid_fd = ofi_pidfd_open(peer->pid, 0); - if (peer_buf->pid_fd < 0) { - FI_WARN(prov, FI_LOG_EP_CTRL, - "unable to open pidfd\n"); - } - } else { - peer_buf->pid_fd = -1; - } - } - - av = container_of(map, struct smr_av, smr_map); - dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep, - av_entry) { - smr_ep = container_of(util_ep, struct smr_ep, util_ep); - smr_map_to_endpoint(smr_ep->region, id); - } - -out: - close(fd); - return ret; -} - -void smr_map_to_endpoint(struct smr_region *region, int64_t id) -{ - int ret; - struct smr_region *peer_smr; - struct smr_peer_data *local_peers; - - assert(ofi_spin_held(®ion->map->lock)); - peer_smr = smr_peer_region(region, id); - if (region->map->peers[id].peer.id < 0 || !peer_smr) - return; - - local_peers = smr_peer_data(region); - - if ((region != peer_smr && region->cma_cap_peer == SMR_VMA_CAP_NA) || - (region == peer_smr && region->cma_cap_self == SMR_VMA_CAP_NA)) - 
smr_cma_check(region, peer_smr); - - /* enable xpmem locally if the peer also has it enabled */ - if (peer_smr->xpmem_cap_self == SMR_VMA_CAP_ON && - region->xpmem_cap_self == SMR_VMA_CAP_ON) { - ret = ofi_xpmem_enable(&peer_smr->xpmem_self, - &local_peers[id].xpmem); - if (ret) { - local_peers[id].xpmem.avail = false; - region->xpmem_cap_self = SMR_VMA_CAP_OFF; - return; - } - local_peers[id].xpmem.avail = true; - local_peers[id].xpmem.addr_max = peer_smr->xpmem_self.address_max; - } else { - local_peers[id].xpmem.avail = false; - } - - smr_set_ipc_valid(region, id); - - return; -} - -void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map, - int64_t peer_id, bool local) -{ - struct smr_region *peer_region; - struct smr_peer *peer; - struct util_ep *util_ep; - struct smr_ep *smr_ep; - struct smr_av *av; - int ret = 0; - - assert(ofi_spin_held(&map->lock)); - peer_region = map->peers[peer_id].region; - if (!peer_region) - return; - - peer = &map->peers[peer_id]; - av = container_of(map, struct smr_av, smr_map); - dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep, - av_entry) { - smr_ep = container_of(util_ep, struct smr_ep, util_ep); - smr_unmap_from_endpoint(smr_ep->region, peer_id); - } - - /* Don't unmap memory owned by this pid because the endpoint it belongs - * to might still be active. - */ - if (local) - return; - - if (map->flags & SMR_FLAG_HMEM_ENABLED) { - ret = ofi_hmem_host_unregister(peer_region); - if (ret) - FI_WARN(prov, FI_LOG_EP_CTRL, - "unable to unregister shm with iface\n"); - - if (peer->pid_fd != -1) { - close(peer->pid_fd); - peer->pid_fd = -1; - } - } - - munmap(peer_region, peer_region->total_size); - peer->region = NULL; -} - -void smr_unmap_from_endpoint(struct smr_region *region, int64_t id) -{ - struct smr_region *peer_smr; - struct smr_peer_data *local_peers, *peer_peers; - int64_t peer_id; - - if (region->map->peers[id].peer.id < 0) - return; - - peer_smr = smr_peer_region(region, id); - assert(peer_smr); - peer_peers = smr_peer_data(peer_smr); - peer_id = smr_peer_data(region)[id].addr.id; - - peer_peers[peer_id].addr.id = -1; - peer_peers[peer_id].name_sent = 0; - - local_peers = smr_peer_data(region); - ofi_xpmem_release(&local_peers[peer_id].xpmem); -} - -void smr_exchange_all_peers(struct smr_region *region) -{ - int64_t i; - - ofi_spin_lock(®ion->map->lock); - for (i = 0; i < SMR_MAX_PEERS; i++) - smr_map_to_endpoint(region, i); - - ofi_spin_unlock(®ion->map->lock); -} - -int smr_map_add(const struct fi_provider *prov, struct smr_map *map, - const char *name, int64_t *id) -{ - struct ofi_rbnode *node; - const char *shm_name = smr_no_prefix(name); - int tries = 0, ret = 0; - - ofi_spin_lock(&map->lock); - ret = ofi_rbmap_insert(&map->rbmap, (void *) shm_name, - (void *) (intptr_t) *id, &node); - if (ret) { - assert(ret == -FI_EALREADY); - *id = (intptr_t) node->data; - goto out; - } - - while (map->peers[map->cur_id].peer.id != -1 && tries < SMR_MAX_PEERS) { - if (++map->cur_id == SMR_MAX_PEERS) - map->cur_id = 0; - tries++; - } - - assert(map->cur_id < SMR_MAX_PEERS && tries < SMR_MAX_PEERS); - *id = map->cur_id; - if (++map->cur_id == SMR_MAX_PEERS) - map->cur_id = 0; - node->data = (void *) (intptr_t) *id; - strncpy(map->peers[*id].peer.name, shm_name, SMR_NAME_MAX); - map->peers[*id].peer.name[SMR_NAME_MAX - 1] = '\0'; - map->peers[*id].region = NULL; - map->num_peers++; - map->peers[*id].peer.id = *id; - -out: - ofi_spin_unlock(&map->lock); - return FI_SUCCESS; -} - -void smr_map_del(struct smr_map *map, 
int64_t id) -{ - struct smr_ep_name *name; - bool local = false; - - assert(id >= 0 && id < SMR_MAX_PEERS); - pthread_mutex_lock(&ep_list_lock); - dlist_foreach_container(&ep_name_list, struct smr_ep_name, name, entry) { - if (!strcmp(name->name, map->peers[id].peer.name)) { - local = true; - break; - } - } - pthread_mutex_unlock(&ep_list_lock); - ofi_spin_lock(&map->lock); - smr_unmap_region(&smr_prov, map, id, local); - map->peers[id].fiaddr = FI_ADDR_NOTAVAIL; - map->peers[id].peer.id = -1; - map->num_peers--; - ofi_rbmap_find_delete(&map->rbmap, map->peers[id].peer.name); - ofi_spin_unlock(&map->lock); -} - -struct smr_region *smr_map_get(struct smr_map *map, int64_t id) -{ - if (id < 0 || id >= SMR_MAX_PEERS) - return NULL; - - return map->peers[id].region; -} +} \ No newline at end of file diff --git a/prov/shm/src/smr_util.h b/prov/shm/src/smr_util.h index 533381f25d1..13764a184ee 100644 --- a/prov/shm/src/smr_util.h +++ b/prov/shm/src/smr_util.h @@ -36,68 +36,74 @@ #include "ofi.h" #include "ofi_atomic_queue.h" #include "ofi_xpmem.h" +#include #ifdef __cplusplus extern "C" { #endif -#define SMR_VERSION 8 +#define SMR_VERSION 9 -#define SMR_FLAG_ATOMIC (1 << 0) -#define SMR_FLAG_DEBUG (1 << 1) -#define SMR_FLAG_IPC_SOCK (1 << 2) -#define SMR_FLAG_HMEM_ENABLED (1 << 3) +#define SMR_FLAG_ATOMIC (1 << 0) +#define SMR_FLAG_DEBUG (1 << 1) +#define SMR_FLAG_HMEM_ENABLED (1 << 2) +#define SMR_FLAG_CMA_INIT (1 << 3) +#define SMR_FLAG_XPMEM_ENABLED (1 << 4) -#define SMR_CMD_SIZE 256 /* align with 64-byte cache line */ - -/* SMR op_src: Specifies data source location */ -enum { - smr_src_inline, /* command data */ - smr_src_inject, /* inject buffers */ - smr_src_iov, /* reference iovec via CMA */ - smr_src_mmap, /* mmap-based fallback protocol */ - smr_src_sar, /* segmentation fallback protocol */ - smr_src_ipc, /* device IPC handle protocol */ - smr_src_max, -}; +/* SMR_CMD_SIZE refers to the total bytes dedicated for use in shm headers and + * data. The entire atomic queue entry will be cache aligned (512) but this also + * includes the cmd aq header (16) + cmd entry ptr (8) + * 512 (total entry size) - 16 (aq header) - 8 (entry ptr) = 488 + * This maximizes the inline payload. Increasing this value will increase the + * atomic queue entry to 576 bytes. + */ +#define SMR_CMD_SIZE 488 -//reserves 0-255 for defined ops and room for new ops -//256 and beyond reserved for ctrl ops +/* reserves 0-255 for defined ops and room for new ops + * 256 and beyond reserved for ctrl ops + */ #define SMR_OP_MAX (1 << 8) #define SMR_REMOTE_CQ_DATA (1 << 0) -#define SMR_RMA_REQ (1 << 1) -#define SMR_TX_COMPLETION (1 << 2) -#define SMR_RX_COMPLETION (1 << 3) -#define SMR_MULTI_RECV (1 << 4) -/* CMA/XPMEM capability. Generic acronym used: - * VMA: Virtual Memory Address */ enum { - SMR_VMA_CAP_NA, - SMR_VMA_CAP_ON, - SMR_VMA_CAP_OFF, + smr_proto_inline, /* inline payload */ + smr_proto_inject, /* inject buffers */ + smr_proto_iov, /* iovec copy via CMA or xpmem */ + smr_proto_sar, /* segmentation fallback */ + smr_proto_ipc, /* device IPC handle */ + smr_proto_max, }; /* - * Unique smr_op_hdr for smr message protocol: - * addr - local shm_id of peer sending msg (for shm lookup) - * op - type of op (ex. ofi_op_msg, defined in ofi_proto.h) - * op_src - msg src (ex. smr_src_inline, defined above) - * op_flags - operation flags (ex. 
SMR_REMOTE_CQ_DATA, defined above)
- * src_data - src of additional op data (inject offset / resp offset)
- * data - remote CQ data
+ * Unique smr_cmd_hdr for smr message protocol:
+ * entry	for internal use managing commands (must be kept first)
+ * tx_ctx	source side context (unused by target side)
+ * rx_ctx	target side context (unused by source side)
+ * tx_id	local shm_id of peer sending msg (unused by target)
+ * rx_id	remote shm_id of peer sending msg (unused by source)
+ * op		type of op (ex. ofi_op_msg, defined in ofi_proto.h)
+ * proto	smr protocol (ex. smr_proto_inline, defined above)
+ * op_flags	operation flags (ex. SMR_REMOTE_CQ_DATA, defined above)
+ * size	size of data transfer
+ * status	returned status of operation
+ * cq_data	remote CQ data
+ * tag		tag for FI_TAGGED API only
+ * datatype	atomic datatype for FI_ATOMIC API only
+ * atomic_op	atomic operation for FI_ATOMIC API only
 */
-struct smr_msg_hdr {
-	uint64_t		msg_id;
-	int64_t			id;
+struct smr_cmd_hdr {
+	uint64_t		entry;
+	uint64_t		tx_ctx;
+	uint64_t		rx_ctx;
+	int64_t			rx_id;
+	int64_t			tx_id;
 	uint32_t		op;
-	uint16_t		op_src;
+	uint16_t		proto;
 	uint16_t		op_flags;
 	uint64_t		size;
-	uint64_t		src_data;
-	uint64_t		data;
+	int64_t			status;
+	uint64_t		cq_data;
 	union {
 		uint64_t	tag;
 		struct {
@@ -108,66 +114,64 @@
 } __attribute__ ((aligned(16)));
 #define SMR_BUF_BATCH_MAX	64
-#define SMR_MSG_DATA_LEN	(SMR_CMD_SIZE - sizeof(struct smr_msg_hdr))
-
-union smr_cmd_data {
-	uint8_t			msg[SMR_MSG_DATA_LEN];
-	struct {
-		size_t		iov_count;
-		struct iovec	iov[(SMR_MSG_DATA_LEN - sizeof(size_t)) /
-				    sizeof(struct iovec)];
-	};
-	struct {
-		uint32_t	buf_batch_size;
-		int16_t		sar[SMR_BUF_BATCH_MAX];
-	};
-	struct ipc_info		ipc_info;
-};
+#define SMR_MSG_DATA_LEN	(SMR_CMD_SIZE - \
+				 (sizeof(struct smr_cmd_hdr) + \
+				  sizeof(struct smr_cmd_rma)))
+#define SMR_IOV_LIMIT		4
-struct smr_cmd_msg {
-	struct smr_msg_hdr	hdr;
-	union smr_cmd_data	data;
-};
-
-#define SMR_RMA_DATA_LEN	(128 - sizeof(uint64_t))
 struct smr_cmd_rma {
-	uint64_t		rma_count;
+	uint64_t	rma_count;
 	union {
-		struct fi_rma_iov	rma_iov[SMR_RMA_DATA_LEN /
-						sizeof(struct fi_rma_iov)];
-		struct fi_rma_ioc	rma_ioc[SMR_RMA_DATA_LEN /
-						sizeof(struct fi_rma_ioc)];
+		struct fi_rma_iov	rma_iov[SMR_IOV_LIMIT];
+		struct fi_rma_ioc	rma_ioc[SMR_IOV_LIMIT];
 	};
 };
-struct smr_cmd {
+struct smr_cmd_data {
 	union {
-		struct smr_cmd_msg	msg;
-		struct smr_cmd_rma	rma;
+		uint8_t			msg[SMR_MSG_DATA_LEN];
+		struct {
+			size_t		iov_count;
+			struct iovec	iov[SMR_IOV_LIMIT];
+		};
+		struct {
+			uint32_t	buf_batch_size;
+			int16_t		sar[SMR_BUF_BATCH_MAX];
+		};
+		struct ipc_info		ipc_info;
 	};
 };
+#ifdef static_assert
+static_assert(sizeof(struct smr_cmd_data) == SMR_MSG_DATA_LEN,
+	      "Insufficient cmd data");
+#endif
+
+struct smr_cmd {
+	struct smr_cmd_hdr	hdr;
+	struct smr_cmd_data	data;
+	struct smr_cmd_rma	rma;
+};
 #define SMR_INJECT_SIZE		4096
 #define SMR_COMP_INJECT_SIZE	(SMR_INJECT_SIZE / 2)
 #define SMR_SAR_SIZE		32768
-#define SMR_DIR "/dev/shm/"
+#define SMR_DIR		"/dev/shm/"
 #define SMR_NAME_MAX	256
 #define SMR_PATH_MAX	(SMR_NAME_MAX + sizeof(SMR_DIR))
-/* On next version update remove this struct to make id a bool in the smr_peer
- * remove name from smr_peer_data because it is unused.
- */ -struct smr_addr { - char name[SMR_NAME_MAX]; - int64_t id; +enum smr_sar_status { + SMR_SAR_FREE = 0, + SMR_SAR_BUSY, + SMR_SAR_READY, }; struct smr_peer_data { - struct smr_addr addr; + int64_t id; uint32_t sar_status; uint16_t name_sent; uint16_t ipc_valid; + uintptr_t local_region; struct ofi_xpmem_client xpmem; }; @@ -177,9 +181,9 @@ extern pthread_mutex_t ep_list_lock; struct smr_region; struct smr_ep_name { - char name[SMR_NAME_MAX]; - struct smr_region *region; - struct dlist_entry entry; + char name[SMR_NAME_MAX]; + struct smr_region *region; + struct dlist_entry entry; }; static inline const char *smr_no_prefix(const char *addr) @@ -190,7 +194,8 @@ static inline const char *smr_no_prefix(const char *addr) } struct smr_peer { - struct smr_addr peer; + char name[SMR_NAME_MAX]; + bool id_assigned; fi_addr_t fiaddr; struct smr_region *region; int pid_fd; @@ -198,51 +203,45 @@ struct smr_peer { #define SMR_MAX_PEERS 256 -struct smr_map { - ofi_spin_t lock; - int64_t cur_id; - int num_peers; - uint16_t flags; - struct ofi_rbmap rbmap; - struct smr_peer peers[SMR_MAX_PEERS]; -}; - struct smr_region { - uint8_t version; - uint8_t resv; - uint16_t flags; - int pid; - uint8_t cma_cap_peer; - uint8_t cma_cap_self; - uint8_t xpmem_cap_self; - uint8_t resv2; + uint8_t version; + uint8_t resv; + uint16_t flags; + uint8_t self_vma_caps; + uint8_t peer_vma_caps; - uint32_t max_sar_buf_per_peer; + uint16_t max_sar_buf_per_peer; struct ofi_xpmem_pinfo xpmem_self; struct ofi_xpmem_pinfo xpmem_peer; - void *base_addr; - pthread_spinlock_t lock; /* lock for shm access - if both ep->tx_lock and this lock need to - held, then ep->tx_lock needs to be held - first */ - struct smr_map *map; + int pid; + int resv2; + + void *base_addr; + + char name[SMR_NAME_MAX]; - size_t total_size; + size_t total_size; /* offsets from start of smr_region */ - size_t cmd_queue_offset; - size_t resp_queue_offset; - size_t inject_pool_offset; - size_t sar_pool_offset; - size_t peer_data_offset; - size_t name_offset; + size_t cmd_queue_offset; + size_t cmd_stack_offset; + size_t inject_pool_offset; + size_t ret_queue_offset; + size_t sar_pool_offset; + size_t peer_data_offset; }; -struct smr_resp { - uint64_t msg_id; - uint64_t status; -}; +static inline void smr_set_vma_cap(uint8_t *vma_cap, uint8_t type, bool avail) +{ + (*vma_cap) &= ~(1 << type); + (*vma_cap) |= (uint8_t) avail << type; +} + +static inline uint8_t smr_get_vma_cap(uint8_t vma_cap, uint8_t type) +{ + return vma_cap & (1 << type); +} struct smr_inject_buf { union { @@ -254,50 +253,42 @@ struct smr_inject_buf { }; }; -enum smr_status { - SMR_STATUS_SUCCESS = 0, /* success*/ - SMR_STATUS_BUSY = FI_EBUSY, /* busy */ - - SMR_STATUS_OFFSET = 1024, /* Beginning of shm-specific codes */ - SMR_STATUS_SAR_EMPTY, /* buffer can be written into */ - SMR_STATUS_SAR_FULL, /* buffer can be read from */ -}; - struct smr_sar_buf { uint8_t buf[SMR_SAR_SIZE]; }; -/* TODO it is expected that a future patch will expand the smr_cmd - * structure to also include the rma information, thereby removing the - * need to have two commands in the cmd_entry. 
We can also remove the - * command entry completely and just use the smr_cmd - */ struct smr_cmd_entry { - struct smr_cmd cmd; - struct smr_cmd rma_cmd; + uintptr_t ptr; + struct smr_cmd cmd; +}; + +struct smr_return_entry { + uintptr_t ptr; }; +OFI_DECLARE_ATOMIC_Q(struct smr_cmd_entry, smr_cmd_queue); +OFI_DECLARE_ATOMIC_Q(struct smr_return_entry, smr_return_queue); + /* Queue of offsets of the command blocks obtained from the command pool * freestack */ -OFI_DECLARE_CIRQUE(struct smr_resp, smr_resp_queue); -OFI_DECLARE_ATOMIC_Q(struct smr_cmd_entry, smr_cmd_queue); - -static inline struct smr_region *smr_peer_region(struct smr_region *smr, int i) -{ - return smr->map->peers[i].region; -} static inline struct smr_cmd_queue *smr_cmd_queue(struct smr_region *smr) { return (struct smr_cmd_queue *) ((char *) smr + smr->cmd_queue_offset); } -static inline struct smr_resp_queue *smr_resp_queue(struct smr_region *smr) +static inline struct smr_freestack *smr_cmd_stack(struct smr_region *smr) +{ + return (struct smr_freestack *) ((char *) smr + smr->cmd_stack_offset); +} +static inline struct smr_inject_buf *smr_inject_pool(struct smr_region *smr) { - return (struct smr_resp_queue *) ((char *) smr + smr->resp_queue_offset); + return (struct smr_inject_buf *) + ((char *) smr + smr->inject_pool_offset); } -static inline struct smr_freestack *smr_inject_pool(struct smr_region *smr) +static inline struct smr_return_queue *smr_return_queue(struct smr_region *smr) { - return (struct smr_freestack *) ((char *) smr + smr->inject_pool_offset); + return (struct smr_return_queue *) + ((char *) smr + smr->ret_queue_offset); } static inline struct smr_peer_data *smr_peer_data(struct smr_region *smr) { @@ -307,14 +298,12 @@ static inline struct smr_freestack *smr_sar_pool(struct smr_region *smr) { return (struct smr_freestack *) ((char *) smr + smr->sar_pool_offset); } -static inline const char *smr_name(struct smr_region *smr) -{ - return (const char *) smr + smr->name_offset; -} -static inline void smr_set_map(struct smr_region *smr, struct smr_map *map) +static inline struct smr_inject_buf *smr_get_inject_buf(struct smr_region *smr, + struct smr_cmd *cmd) { - smr->map = map; + return &smr_inject_pool(smr)[smr_freestack_get_index(smr_cmd_stack(smr), + (char *) cmd)]; } struct smr_attr { @@ -325,30 +314,18 @@ struct smr_attr { }; size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count, - size_t *cmd_offset, size_t *resp_offset, - size_t *inject_offset, size_t *sar_offset, - size_t *peer_offset, size_t *name_offset); -void smr_cma_check(struct smr_region *region, struct smr_region *peer_region); -void smr_cleanup(void); -int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, - int64_t id); -void smr_map_to_endpoint(struct smr_region *region, int64_t id); -void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map, - int64_t id, bool found); -void smr_unmap_from_endpoint(struct smr_region *region, int64_t id); -void smr_exchange_all_peers(struct smr_region *region); -int smr_map_add(const struct fi_provider *prov, struct smr_map *map, - const char *name, int64_t *id); -void smr_map_del(struct smr_map *map, int64_t id); - -struct smr_region *smr_map_get(struct smr_map *map, int64_t id); - -int smr_create(const struct fi_provider *prov, struct smr_map *map, - const struct smr_attr *attr, struct smr_region *volatile *smr); -void smr_free(struct smr_region *smr); + size_t *cmd_offset, size_t *cs_offset, + size_t *inject_offset, size_t *rq_offset, + size_t *sar_offset, 
			size_t *peer_offset);
+void smr_cma_check(struct smr_region *region,
+		   struct smr_region *peer_region);
+void smr_cleanup(void);
+int smr_create(const struct fi_provider *prov, const struct smr_attr *attr,
+	       struct smr_region *volatile *smr);
+void smr_free(struct smr_region *smr);
 #ifdef __cplusplus
 }
 #endif
-#endif /* _SMR_UTIL_H_ */
+#endif /* _SMR_UTIL_H_ */
\ No newline at end of file
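
Editor's illustration (not part of the patch): smr_generic_rma() and smr_generic_rma_inject() in the smr_rma.c hunks above share the same pattern for posting a command to a peer, differing only in how the inline case is chosen. The sketch below consolidates that pattern purely for readability; smr_post_cmd_sketch() is a hypothetical name and the error handling and command formatting are trimmed, but the smr_* helpers, types, and enum values are the ones introduced by this patch.

/* Illustration only -- not applied by this patch. */
static ssize_t smr_post_cmd_sketch(struct smr_ep *ep,
				   struct smr_region *peer_smr,
				   int64_t tx_id, int64_t rx_id, int proto)
{
	struct smr_cmd_entry *ce;
	struct smr_cmd *cmd;
	int64_t pos;

	/* Reserve a slot in the receiver's command queue. */
	if (smr_cmd_queue_next(smr_cmd_queue(peer_smr), &ce, &pos) ==
	    -FI_ENOENT)
		return -FI_EAGAIN;

	if (proto == smr_proto_inline) {
		/* Inline commands live in the queue entry itself; the peer
		 * gets a pointer valid in its own mapping of that entry. */
		cmd = &ce->cmd;
		ce->ptr = smr_peer_to_peer(ep, tx_id, (uintptr_t) cmd);
	} else {
		/* Larger transfers borrow a command from the sender's local
		 * command stack and hand the peer a translated pointer. */
		if (smr_freestack_isempty(smr_cmd_stack(ep->region))) {
			smr_cmd_queue_discard(ce, pos);
			return -FI_EAGAIN;
		}
		cmd = smr_freestack_pop(smr_cmd_stack(ep->region));
		ce->ptr = smr_local_to_peer(ep, tx_id, rx_id, (uintptr_t) cmd);
	}

	/* ... fill in cmd via smr_send_ops[proto]() before committing ... */
	smr_cmd_queue_commit(ce, pos);
	return FI_SUCCESS;
}

The point of the split is that non-inline commands stay in the sender's region and reach the peer only as a translated pointer, which is why the receive side must eventually hand each command back through the return queue (smr_return_cmd()/smr_progress_return() above) before the sender can reuse it.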