From 5638e3295a88730747fc93b8bef93fca4e189251 Mon Sep 17 00:00:00 2001 From: snordmann Date: Sun, 9 Jul 2023 14:13:57 +0300 Subject: [PATCH 01/16] TL/MLX5: set up ci --- .ci/scripts/build_ucc.sh | 3 ++- .ci/scripts/run_tests_ucc_mpi.sh | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh index 58bb7ffdcb..13d2bb0c54 100755 --- a/.ci/scripts/build_ucc.sh +++ b/.ci/scripts/build_ucc.sh @@ -8,7 +8,8 @@ cd "${UCC_SRC_DIR}" mkdir -p "${UCC_SRC_DIR}/build" cd "${UCC_SRC_DIR}/build" "${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \ - --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi + --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi \ + --with-tls=cuda,nccl,self,ucp,mlx5,sharp,rccl make -j install echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf ldconfig diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 73a4eaca6a..2621b1152a 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -21,13 +21,20 @@ export MASTER_ADDR=${HEAD_NODE} NNODES=$(wc --lines "$HOSTFILE" | awk '{print $1}') DEV="" +CX7_DEV="" # Find first available active device for d in $(ssh $HEAD_NODE "ibstat -l"); do state=$(ssh $HEAD_NODE "ibstat $d" | grep 'State:' | awk '{print $2}') + type=$(ssh $HEAD_NODE "ibstat $d" | grep 'CA type:' | awk '{print $2}') if [ $state == 'Active' ]; then - DEV=$d - break + if [ "x$DEV" == "x" ]; then + DEV=$d + fi + if [ $state == 'MT4129' ]; then + CX7_DEV=$d + break + fi fi done @@ -101,6 +108,16 @@ for MT in "" "-T"; do mpirun $(mpi_params $PPN 1) $ucx_tls_no_cuda_ipc $tlcuda_args $EXE $MT $TG --mtypes cuda -c $tlcuda_colls echo "INFO: UCC MPI unit tests (TL/CUDA) ... DONE" + echo "INFO: UCC MPI unit tests (TL/MLX5) ..." + # shellcheck disable=SC2086 + if [ "x$CX7_DEV" == "x" ]; then + echo "No active CX7 devices found on $HEAD_NODE" + else + tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLXS_NET_DEVICES=$CX7_DEV -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " + tlmlx5_colls="alltoall" + mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 + fi + echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" echo "INFO: UCC MPI unit tests (CL/HIER) ..." # shellcheck disable=SC2086 From 4f63ca202cfe45a2754ae1e44dbf07f2a6c239e6 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 7 Nov 2023 16:11:14 +0200 Subject: [PATCH 02/16] TL/MLX5: adding ci check that nnodes>2 --- .ci/scripts/run_tests_ucc_mpi.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 2621b1152a..f37e831457 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -112,6 +112,8 @@ for MT in "" "-T"; do # shellcheck disable=SC2086 if [ "x$CX7_DEV" == "x" ]; then echo "No active CX7 devices found on $HEAD_NODE" + elif [ $NNODES -lt 2 ]; then + echo "At least two nodes are required, but only $NNODES are available" else tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLXS_NET_DEVICES=$CX7_DEV -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " tlmlx5_colls="alltoall" From 539d09f992a9c2f0ed6f5a9f2203eb62c72c8a47 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 7 Nov 2023 19:14:31 +0200 Subject: [PATCH 03/16] TL/MLX5: fix bug --- .ci/scripts/run_tests_ucc_mpi.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index f37e831457..7467d4d09f 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -26,12 +26,12 @@ CX7_DEV="" # Find first available active device for d in $(ssh $HEAD_NODE "ibstat -l"); do state=$(ssh $HEAD_NODE "ibstat $d" | grep 'State:' | awk '{print $2}') - type=$(ssh $HEAD_NODE "ibstat $d" | grep 'CA type:' | awk '{print $2}') + type=$(ssh $HEAD_NODE "ibstat $d" | grep 'CA type:' | awk '{print $3}') if [ $state == 'Active' ]; then if [ "x$DEV" == "x" ]; then DEV=$d fi - if [ $state == 'MT4129' ]; then + if [ $type == 'MT4129' ]; then CX7_DEV=$d break fi From 46ac1969dc16da5b2263293302d5a65ab1bee3d2 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 23 Jan 2024 13:06:37 +0200 Subject: [PATCH 04/16] TL/MLX5: add port --- .ci/scripts/run_tests_ucc_mpi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 7467d4d09f..54750d575a 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -115,7 +115,7 @@ for MT in "" "-T"; do elif [ $NNODES -lt 2 ]; then echo "At least two nodes are required, but only $NNODES are available" else - tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLXS_NET_DEVICES=$CX7_DEV -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " + tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLXS_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " tlmlx5_colls="alltoall" mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 fi From 97c7635f8ad7715869c7dc80f4a1b7be7d43958f Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 23 Jan 2024 13:07:23 +0200 Subject: [PATCH 05/16] TL/MLX5: add debug print for ci --- .ci/scripts/run_tests_ucc_mpi.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 54750d575a..8cf15db0e5 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -116,6 +116,8 @@ for MT in "" "-T"; do echo "At least two nodes are required, but only $NNODES are available" else tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLXS_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " + tlmlx5_args+=" -x UCC_LOG_LEVEL=debug -x UCC_COLL_TRACE=info " + echo $CX7_DEV tlmlx5_colls="alltoall" mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 fi From f2566280c0a5d066083ec081b4fd6a2b05f7fb7c Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 11 Mar 2024 16:31:21 +0200 Subject: [PATCH 06/16] TL/MLX5: simplify CI test to debug --- .ci/scripts/run_tests_ucc_mpi.sh | 2 +- src/components/tl/mlx5/alltoall/alltoall.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 8cf15db0e5..e9257f7fc0 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -119,7 +119,7 @@ for MT in "" "-T"; do tlmlx5_args+=" -x UCC_LOG_LEVEL=debug -x UCC_COLL_TRACE=info " echo $CX7_DEV tlmlx5_colls="alltoall" - mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 + mpirun $(mpi_params $PPN) $tlmlx5_args /opt/nvidia/src/ucc/build/test/mpi/ucc_test_mpi --mtypes host -c $tlmlx5_colls -t world -d uint8 -O 0 -m 128 fi echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" diff --git a/src/components/tl/mlx5/alltoall/alltoall.c b/src/components/tl/mlx5/alltoall/alltoall.c index 5afc7c7d30..81d18230b7 100644 --- a/src/components/tl/mlx5/alltoall/alltoall.c +++ b/src/components/tl/mlx5/alltoall/alltoall.c @@ -83,7 +83,7 @@ ucc_status_t ucc_tl_mlx5_team_init_alltoall(ucc_tl_mlx5_team_t *team) node_size = node->group_size; nnodes = ucc_topo_nnodes(topo); team_size = UCC_TL_TEAM_SIZE(team); - + // while(1) {;}; if (!ucc_topo_isoppn(topo)) { tl_debug(ctx->super.super.lib, "disabling mlx5 a2a for team with non-uniform ppn, " @@ -93,7 +93,7 @@ ucc_status_t ucc_tl_mlx5_team_init_alltoall(ucc_tl_mlx5_team_t *team) } ppn = ucc_topo_max_ppn(topo); - if (net->status == UCC_SBGP_NOT_EXISTS) { + if (nnodes == 1) { tl_debug(ctx->super.super.lib, "disabling mlx5 a2a for single node team"); goto non_fatal_error; From ee5c33c5515c082c43c2995b320f738918f736d3 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 12 Mar 2024 16:03:38 +0200 Subject: [PATCH 07/16] TL/MLX5: deactivate mcast bcast --- src/components/tl/mlx5/tl_mlx5_team.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c index 16c85b54b9..b57e3d1362 100644 --- a/src/components/tl/mlx5/tl_mlx5_team.c +++ b/src/components/tl/mlx5/tl_mlx5_team.c @@ -223,7 +223,7 @@ ucc_status_t ucc_tl_mlx5_team_get_scores(ucc_base_team_t * tl_team, team_info.supported_mem_types = mt; team_info.supported_colls = (UCC_COLL_TYPE_ALLTOALL * (team->a2a_status.local == UCC_OK)) | - UCC_COLL_TYPE_BCAST; + UCC_COLL_TYPE_BCAST * 0; team_info.size = UCC_TL_TEAM_SIZE(team); status = ucc_coll_score_build_default( From aab6ba30f2ebe2733d63ef2d59cc0b5a2ba017b6 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 13 Mar 2024 22:21:45 +0200 Subject: [PATCH 08/16] TL/MLX5: deactivate a2a also --- src/components/tl/mlx5/tl_mlx5_team.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c index b57e3d1362..a1e6267d72 100644 --- a/src/components/tl/mlx5/tl_mlx5_team.c +++ b/src/components/tl/mlx5/tl_mlx5_team.c @@ -222,7 +222,7 @@ ucc_status_t ucc_tl_mlx5_team_get_scores(ucc_base_team_t * tl_team, team_info.num_mem_types = 2; team_info.supported_mem_types = mt; team_info.supported_colls = - (UCC_COLL_TYPE_ALLTOALL * (team->a2a_status.local == UCC_OK)) | + (UCC_COLL_TYPE_ALLTOALL * 0) | UCC_COLL_TYPE_BCAST * 0; team_info.size = UCC_TL_TEAM_SIZE(team); From db1aadd8bacefe646be76f919b29655167952db5 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 14 Mar 2024 17:02:45 +0200 Subject: [PATCH 09/16] TL/MLX5: deactivating a2a only --- .ci/scripts/run_tests_ucc_mpi.sh | 2 +- src/components/tl/mlx5/alltoall/alltoall.c | 4 ++-- src/components/tl/mlx5/tl_mlx5_team.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index e9257f7fc0..8cf15db0e5 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -119,7 +119,7 @@ for MT in "" "-T"; do tlmlx5_args+=" -x UCC_LOG_LEVEL=debug -x UCC_COLL_TRACE=info " echo $CX7_DEV tlmlx5_colls="alltoall" - mpirun $(mpi_params $PPN) $tlmlx5_args /opt/nvidia/src/ucc/build/test/mpi/ucc_test_mpi --mtypes host -c $tlmlx5_colls -t world -d uint8 -O 0 -m 128 + mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 fi echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" diff --git a/src/components/tl/mlx5/alltoall/alltoall.c b/src/components/tl/mlx5/alltoall/alltoall.c index 81d18230b7..5afc7c7d30 100644 --- a/src/components/tl/mlx5/alltoall/alltoall.c +++ b/src/components/tl/mlx5/alltoall/alltoall.c @@ -83,7 +83,7 @@ ucc_status_t ucc_tl_mlx5_team_init_alltoall(ucc_tl_mlx5_team_t *team) node_size = node->group_size; nnodes = ucc_topo_nnodes(topo); team_size = UCC_TL_TEAM_SIZE(team); - // while(1) {;}; + if (!ucc_topo_isoppn(topo)) { tl_debug(ctx->super.super.lib, "disabling mlx5 a2a for team with non-uniform ppn, " @@ -93,7 +93,7 @@ ucc_status_t ucc_tl_mlx5_team_init_alltoall(ucc_tl_mlx5_team_t *team) } ppn = ucc_topo_max_ppn(topo); - if (nnodes == 1) { + if (net->status == UCC_SBGP_NOT_EXISTS) { tl_debug(ctx->super.super.lib, "disabling mlx5 a2a for single node team"); goto non_fatal_error; diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c index a1e6267d72..bfc837908a 100644 --- a/src/components/tl/mlx5/tl_mlx5_team.c +++ b/src/components/tl/mlx5/tl_mlx5_team.c @@ -223,7 +223,7 @@ ucc_status_t ucc_tl_mlx5_team_get_scores(ucc_base_team_t * tl_team, team_info.supported_mem_types = mt; team_info.supported_colls = (UCC_COLL_TYPE_ALLTOALL * 0) | - UCC_COLL_TYPE_BCAST * 0; + UCC_COLL_TYPE_BCAST; team_info.size = UCC_TL_TEAM_SIZE(team); status = ucc_coll_score_build_default( From 9ba2c2939c9224e9078273de7b294dc67804d550 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 14 Mar 2024 22:43:25 +0200 Subject: [PATCH 10/16] TL/MLX5: CI cleaning --- .ci/scripts/run_tests_ucc_mpi.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 8cf15db0e5..54750d575a 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -116,8 +116,6 @@ for MT in "" "-T"; do echo "At least two nodes are required, but only $NNODES are available" else tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLXS_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " - tlmlx5_args+=" -x UCC_LOG_LEVEL=debug -x UCC_COLL_TRACE=info " - echo $CX7_DEV tlmlx5_colls="alltoall" mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 fi From 0528dbdd4f96ada621075f3e6d565331accba319 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 15 Mar 2024 14:19:37 +0000 Subject: [PATCH 11/16] TL/MLX5: minor revisions in CI --- .ci/scripts/build_ucc.sh | 3 +-- .ci/scripts/run_tests_ucc_mpi.sh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh index 13d2bb0c54..58bb7ffdcb 100755 --- a/.ci/scripts/build_ucc.sh +++ b/.ci/scripts/build_ucc.sh @@ -8,8 +8,7 @@ cd "${UCC_SRC_DIR}" mkdir -p "${UCC_SRC_DIR}/build" cd "${UCC_SRC_DIR}/build" "${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \ - --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi \ - --with-tls=cuda,nccl,self,ucp,mlx5,sharp,rccl + --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi make -j install echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf ldconfig diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 54750d575a..ef0f5a0070 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -115,7 +115,7 @@ for MT in "" "-T"; do elif [ $NNODES -lt 2 ]; then echo "At least two nodes are required, but only $NNODES are available" else - tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLXS_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " + tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLX5_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " tlmlx5_colls="alltoall" mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 fi From 26bbfdaf1be0454da3746f22c4f8ad9390201326 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 18 Mar 2024 18:55:13 +0200 Subject: [PATCH 12/16] TL/MLX5: CI minor comments --- .ci/scripts/run_tests_ucc_mpi.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index ef0f5a0070..dbd298927b 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -28,10 +28,10 @@ for d in $(ssh $HEAD_NODE "ibstat -l"); do state=$(ssh $HEAD_NODE "ibstat $d" | grep 'State:' | awk '{print $2}') type=$(ssh $HEAD_NODE "ibstat $d" | grep 'CA type:' | awk '{print $3}') if [ $state == 'Active' ]; then - if [ "x$DEV" == "x" ]; then + if [ "$DEV" == "" ]; then DEV=$d fi - if [ $type == 'MT4129' ]; then + if [ "$type" == 'MT4129' ]; then CX7_DEV=$d break fi @@ -111,15 +111,17 @@ for MT in "" "-T"; do echo "INFO: UCC MPI unit tests (TL/MLX5) ..." # shellcheck disable=SC2086 if [ "x$CX7_DEV" == "x" ]; then - echo "No active CX7 devices found on $HEAD_NODE" + echo "WARNING: No active CX7 devices found on ${HEAD_NODE}" + echo "INFO: UCC MPI unit tests (TL/MLX5) ... SKIPPED" elif [ $NNODES -lt 2 ]; then - echo "At least two nodes are required, but only $NNODES are available" + echo "WARNING: At least two nodes are required, but only $NNODES are available" + echo "INFO: UCC MPI unit tests (TL/MLX5) ... SKIPPED" else tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLX5_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " tlmlx5_colls="alltoall" mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 + echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" fi - echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" echo "INFO: UCC MPI unit tests (CL/HIER) ..." # shellcheck disable=SC2086 From df699b02b4c185db692c76ba4147ed53c889680c Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 18 Mar 2024 19:17:10 +0200 Subject: [PATCH 13/16] TL/MLX5: remove setting UCX's dm to zero --- .ci/scripts/run_tests_ucc_mpi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index dbd298927b..53e9258d88 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -117,7 +117,7 @@ for MT in "" "-T"; do echo "WARNING: At least two nodes are required, but only $NNODES are available" echo "INFO: UCC MPI unit tests (TL/MLX5) ... SKIPPED" else - tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLX5_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf -x UCX_RC_MLX5_DM_COUNT=0 -x UCX_DC_MLX5_DM_COUNT=0 " + tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLX5_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf " tlmlx5_colls="alltoall" mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" From c81d1dbbe0a6d8725f05634020ab6b71ffa25a54 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 19 Mar 2024 16:53:50 +0200 Subject: [PATCH 14/16] TL/MLX5: add tls in cmake config --- .ci/scripts/build_ucc.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh index 58bb7ffdcb..2c570f808e 100755 --- a/.ci/scripts/build_ucc.sh +++ b/.ci/scripts/build_ucc.sh @@ -8,7 +8,8 @@ cd "${UCC_SRC_DIR}" mkdir -p "${UCC_SRC_DIR}/build" cd "${UCC_SRC_DIR}/build" "${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \ - --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi + --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi \ + --with-tls=cuda,nccl,self,sharp,shm,ucp,mlx5 make -j install echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf ldconfig From 09a71bf37470f13a1dca5a63c58bdcc2f5d96558 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 19 Mar 2024 18:22:39 +0200 Subject: [PATCH 15/16] TL/MLX5: empty commit to trigger CI From b9f5efa2c65e05a6229cfcaf09c756f066a0d08f Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 3 May 2024 11:12:29 +0300 Subject: [PATCH 16/16] TL/MLX5: empty commit for triggering CI