Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prov/efa: Control plane AV operation locking fix #10823

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 36 additions & 19 deletions prov/efa/src/efa_av.c
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ int efa_av_update_reverse_av(struct efa_av *av, struct efa_ep_addr *raw_addr,

/**
* @brief allocate an efa_conn object
* caller of this function must obtain av->util_av.lock
* caller of this function must obtain av->domain->progress_lock
*
* @param[in] av efa address vector
* @param[in] raw_addr raw efa address
Expand Down Expand Up @@ -456,7 +456,9 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr,
return NULL;
}

ofi_genlock_lock(&av->util_av.lock);
err = ofi_av_insert_addr(&av->util_av, raw_addr, &fi_addr);
ofi_genlock_unlock(&av->util_av.lock);
if (err) {
EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n",
fi_strerror(err));
Expand Down Expand Up @@ -501,7 +503,9 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr,
efa_ah_release(av, conn->ah);

conn->ep_addr = NULL;
ofi_genlock_lock(&av->util_av.lock);
err = ofi_av_remove_addr(&av->util_av, fi_addr);
ofi_genlock_unlock(&av->util_av.lock);
if (err)
EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n",
err);
Expand All @@ -511,7 +515,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr,

/**
* @brief release an efa conn object
* Caller of this function must obtain av->util_av.lock
* Caller of this function must obtain av->domain->progress_lock
*
* @param[in] av address vector
* @param[in] conn efa_conn object pointer
Expand Down Expand Up @@ -555,7 +559,9 @@ void efa_conn_release(struct efa_av *av, struct efa_conn *conn)
assert(util_av_entry);
efa_av_entry = (struct efa_av_entry *)util_av_entry->data;

ofi_genlock_lock(&av->util_av.lock);
err = ofi_av_remove_addr(&av->util_av, conn->fi_addr);
ofi_genlock_unlock(&av->util_av.lock);
if (err) {
EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err);
}
Expand Down Expand Up @@ -593,7 +599,6 @@ int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
if (av->ep_type == FI_EP_DGRAM)
addr->qkey = EFA_DGRAM_CONNID;

ofi_genlock_lock(&av->util_av.lock);
memset(raw_gid_str, 0, sizeof(raw_gid_str));
if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) {
EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d\n", errno);
Expand Down Expand Up @@ -629,7 +634,6 @@ int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
ret = 0;
out:
ofi_genlock_unlock(&av->util_av.lock);
return ret;
}

Expand All @@ -643,18 +647,20 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr,
struct efa_ep_addr *addr_i;
fi_addr_t fi_addr_res;

ofi_genlock_lock(&av->domain->progress_lock);

if (av->util_av.flags & FI_EVENT)
return -FI_ENOEQ;
goto out;

if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT)))
return -FI_EINVAL;
goto out;

/*
* Providers are allowed to ignore FI_MORE.
*/
flags &= ~FI_MORE;
if (flags)
return -FI_ENOSYS;
goto out;

for (i = 0; i < count; i++) {
addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
Expand All @@ -677,6 +683,8 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr,
fi_addr[i] = FI_ADDR_NOTAVAIL;
}

out:
ofi_genlock_unlock(&av->domain->progress_lock);
return success_cnt;
}

Expand All @@ -685,21 +693,29 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
{
struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid);
struct efa_conn *conn = NULL;
int ret = 0;

if (av->type != FI_AV_TABLE)
return -FI_EINVAL;
ofi_genlock_lock(&av->domain->progress_lock);

if (fi_addr == FI_ADDR_NOTAVAIL)
return -FI_EINVAL;
if (fi_addr == FI_ADDR_NOTAVAIL) {
ret = -FI_EINVAL;
goto out;
}

conn = efa_av_addr_to_conn(av, fi_addr);
if (!conn)
return -FI_EINVAL;

if (!conn) {
ret = -FI_EINVAL;
goto out;
}

memcpy(addr, (void *)conn->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen));
if (*addrlen > EFA_EP_ADDR_LEN)
*addrlen = EFA_EP_ADDR_LEN;
return 0;

out:
ofi_genlock_unlock(&av->domain->progress_lock);
return ret;
}

/*
Expand Down Expand Up @@ -742,7 +758,7 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
if (av->type != FI_AV_TABLE)
return -FI_EINVAL;

ofi_genlock_lock(&av->util_av.lock);
ofi_genlock_lock(&av->domain->progress_lock);
for (i = 0; i < count; i++) {
conn = efa_av_addr_to_conn(av, fi_addr[i]);
if (!conn) {
Expand All @@ -758,7 +774,7 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
assert(err);
}

ofi_genlock_unlock(&av->util_av.lock);
ofi_genlock_unlock(&av->domain->progress_lock);
return err;
}

Expand All @@ -783,7 +799,6 @@ static void efa_av_close_reverse_av(struct efa_av *av)
struct efa_cur_reverse_av *cur_entry, *curtmp;
struct efa_prv_reverse_av *prv_entry, *prvtmp;

ofi_genlock_lock(&av->util_av.lock);

HASH_ITER(hh, av->cur_reverse_av, cur_entry, curtmp) {
efa_conn_release(av, cur_entry->conn);
Expand All @@ -792,8 +807,6 @@ static void efa_av_close_reverse_av(struct efa_av *av)
HASH_ITER(hh, av->prv_reverse_av, prv_entry, prvtmp) {
efa_conn_release(av, prv_entry->conn);
}

ofi_genlock_unlock(&av->util_av.lock);
}

static int efa_av_close(struct fid *fid)
Expand All @@ -803,6 +816,8 @@ static int efa_av_close(struct fid *fid)

av = container_of(fid, struct efa_av, util_av.av_fid.fid);

ofi_genlock_lock(&av->domain->progress_lock);

efa_av_close_reverse_av(av);

err = ofi_av_close(&av->util_av);
Expand All @@ -820,7 +835,9 @@ static int efa_av_close(struct fid *fid)
}
}
}
ofi_genlock_unlock(&av->domain->progress_lock);
free(av);

return err;
}

Expand Down
12 changes: 6 additions & 6 deletions prov/efa/src/efa_cntr.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time
cntr = container_of(cntr_fid, struct util_cntr, cntr_fid);
domain = container_of(cntr->domain, struct efa_domain, util_domain);

ofi_genlock_lock(&domain->srx_lock);
ofi_genlock_lock(&domain->progress_lock);

assert(cntr->wait);
errcnt = ofi_atomic_get64(&cntr->err);
Expand Down Expand Up @@ -54,7 +54,7 @@ static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time
}

unlock:
ofi_genlock_unlock(&domain->srx_lock);
ofi_genlock_unlock(&domain->progress_lock);
return ret;
}

Expand All @@ -68,13 +68,13 @@ static uint64_t efa_cntr_read(struct fid_cntr *cntr_fid)

domain = container_of(efa_cntr->util_cntr.domain, struct efa_domain, util_domain);

ofi_genlock_lock(&domain->srx_lock);
ofi_genlock_lock(&domain->progress_lock);

if (efa_cntr->shm_cntr)
fi_cntr_read(efa_cntr->shm_cntr);
ret = ofi_cntr_read(cntr_fid);

ofi_genlock_unlock(&domain->srx_lock);
ofi_genlock_unlock(&domain->progress_lock);

return ret;
}
Expand All @@ -89,12 +89,12 @@ static uint64_t efa_cntr_readerr(struct fid_cntr *cntr_fid)

domain = container_of(efa_cntr->util_cntr.domain, struct efa_domain, util_domain);

ofi_genlock_lock(&domain->srx_lock);
ofi_genlock_lock(&domain->progress_lock);
if (efa_cntr->shm_cntr)
fi_cntr_read(efa_cntr->shm_cntr);
ret = ofi_cntr_readerr(cntr_fid);

ofi_genlock_unlock(&domain->srx_lock);
ofi_genlock_unlock(&domain->progress_lock);

return ret;
}
Expand Down
6 changes: 3 additions & 3 deletions prov/efa/src/efa_domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,10 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
efa_domain->ibv_mr_reg_ct = 0;
efa_domain->ibv_mr_reg_sz = 0;

err = ofi_genlock_init(&efa_domain->srx_lock, efa_domain->util_domain.threading != FI_THREAD_SAFE ?
err = ofi_genlock_init(&efa_domain->progress_lock, efa_domain->util_domain.threading != FI_THREAD_SAFE ?
OFI_LOCK_NOOP : OFI_LOCK_MUTEX);
if (err) {
EFA_WARN(FI_LOG_DOMAIN, "srx lock init failed! err: %d\n", err);
EFA_WARN(FI_LOG_DOMAIN, "domain progress lock init failed! err: %d\n", err);
ret = err;
goto err_free;
}
Expand Down Expand Up @@ -354,7 +354,7 @@ static int efa_domain_close(fid_t fid)
if (efa_domain->info)
fi_freeinfo(efa_domain->info);

ofi_genlock_destroy(&efa_domain->srx_lock);
ofi_genlock_destroy(&efa_domain->progress_lock);
free(efa_domain->qp_table);
free(efa_domain);
return 0;
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/efa_domain.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ struct efa_domain {
size_t addrlen;
bool mr_local;
struct dlist_entry list_entry; /* linked to g_efa_domain_list */
struct ofi_genlock srx_lock; /* shared among peer providers */
struct ofi_genlock progress_lock; /* shared among peer providers */
/* Total count of ibv memory registrations */
size_t ibv_mr_reg_ct;
/* Total size of memory registrations (in bytes) */
Expand Down
4 changes: 2 additions & 2 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun

domain = container_of(cq->efa_cq.util_cq.domain, struct efa_domain, util_domain);

ofi_genlock_lock(&domain->srx_lock);
ofi_genlock_lock(&domain->progress_lock);

if (cq->shm_cq) {
fi_cq_read(cq->shm_cq, NULL, 0);
Expand All @@ -582,7 +582,7 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun
ret = ofi_cq_readfrom(&cq->efa_cq.util_cq.cq_fid, buf, count, src_addr);

out:
ofi_genlock_unlock(&domain->srx_lock);
ofi_genlock_unlock(&domain->progress_lock);

return ret;
}
Expand Down
4 changes: 2 additions & 2 deletions prov/efa/src/rdm/efa_rdm_ep_fiops.c
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,7 @@ void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep)
{
struct efa_cq *tx_cq, *rx_cq;

ofi_genlock_lock(&efa_rdm_ep_domain(efa_rdm_ep)->srx_lock);
ofi_genlock_lock(&efa_rdm_ep_domain(efa_rdm_ep)->progress_lock);

tx_cq = efa_base_ep_get_tx_cq(&efa_rdm_ep->base_ep);
rx_cq = efa_base_ep_get_rx_cq(&efa_rdm_ep->base_ep);
Expand All @@ -828,7 +828,7 @@ void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep)
efa_domain_progress_rdm_peers_and_queues(efa_rdm_ep_domain(efa_rdm_ep));
}

ofi_genlock_unlock(&efa_rdm_ep_domain(efa_rdm_ep)->srx_lock);
ofi_genlock_unlock(&efa_rdm_ep_domain(efa_rdm_ep)->progress_lock);
}

static inline
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_srx.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ int efa_rdm_peer_srx_construct(struct efa_rdm_ep *ep)
ep->base_ep.info->rx_attr->size, EFA_RDM_IOV_LIMIT,
ep->min_multi_recv_size,
&efa_rdm_srx_update_mr,
&efa_rdm_ep_domain(ep)->srx_lock,
&efa_rdm_ep_domain(ep)->progress_lock,
&ep->peer_srx_ep);
if (ret) {
EFA_WARN(FI_LOG_EP_CTRL, "util_ep_srx_context failed, err: %d\n", ret);
Expand Down
6 changes: 3 additions & 3 deletions prov/efa/test/efa_unit_test_srx.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ void test_efa_srx_cq(struct efa_resource **state)
assert_true((void *) &srx_ctx->cq->cq_fid == (void *) resource->cq);
}

/* This test verified that srx_lock created in efa_domain is correctly passed to srx */
void test_efa_srx_lock(struct efa_resource **state)
/* This test verified that progress_lock created in efa_domain is correctly passed to srx */
void test_efa_progress_lock(struct efa_resource **state)
{
struct efa_resource *resource = *state;
struct efa_rdm_ep *efa_rdm_ep;
Expand All @@ -64,7 +64,7 @@ void test_efa_srx_lock(struct efa_resource **state)
srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep);
efa_domain = container_of(resource->domain, struct efa_domain,
util_domain.domain_fid.fid);
assert_true(((void *) srx_ctx->lock == (void *) &efa_domain->srx_lock));
assert_true(((void *) srx_ctx->lock == (void *) &efa_domain->progress_lock));
}


Expand Down
2 changes: 1 addition & 1 deletion prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ int main(void)
cmocka_unit_test_setup_teardown(test_efa_hmem_info_disable_p2p_cuda, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_srx_min_multi_recv_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_srx_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_srx_lock, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_progress_lock, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_srx_unexp_pkt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rnr_queue_and_resend, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_efa_rdm_ope_prepare_to_post_send_with_no_enough_tx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ void test_efa_use_device_rdma_opt_old();

void test_efa_srx_min_multi_recv_size();
void test_efa_srx_cq();
void test_efa_srx_lock();
void test_efa_progress_lock();
void test_efa_srx_unexp_pkt();
void test_efa_rnr_queue_and_resend();
void test_efa_rdm_ope_prepare_to_post_send_with_no_enough_tx_pkts();
Expand Down
Loading