# Commentary only: everything from here to the first "diff --git" header is
# explanatory text and is not part of any hunk.
#
# NOTE(review): the patch text below is stored on two physical lines rather
# than one record per line, and the second line starts in the middle of the
# quoted string `... SKIPPED"`. Indentation and blank lines are not recoverable
# from this form; regenerate the patch from the source tree before applying.
#
# What the visible records change:
#
#   .ci/scripts/build_ucc.sh
#     - The configure invocation gains a continuation line with
#       --with-tls=cuda,nccl,self,sharp,shm,ucp,mlx5.
#
#   .ci/scripts/run_tests_ucc_mpi.sh
#     - Introduces CX7_DEV (initially empty) next to DEV.
#     - The ibstat loop additionally reads the 'CA type:' field of each device
#       (a second `ssh $HEAD_NODE "ibstat $d"` call per device).
#     - For a device whose state is 'Active': DEV is assigned only while it is
#       still empty; if the CA type equals 'MT4129', CX7_DEV is assigned and
#       the loop breaks. The old unconditional `DEV=$d; break` is removed, so
#       the scan now continues past the first Active device until an MT4129
#       one is found or the device list ends.
#     - Adds a "TL/MLX5" step inside the `for MT in "" "-T"` loop: it prints
#       SKIPPED when CX7_DEV is empty or when NNODES is less than 2; otherwise
#       it runs mpirun with UCC_CLS=basic, UCC_CL_BASIC_TLS=ucp,mlx5,
#       UCC_TL_MLX5_NET_DEVICES=$CX7_DEV:1 and UCC_TL_MLX5_TUNE=inf, for the
#       alltoall collective with
#       `--mtypes host,cuda -t world -d uint8 -O 0 -m 1:128`.
#
#   src/components/tl/mlx5/tl_mlx5_team.c (ucc_tl_mlx5_team_get_scores)
#     - The ALLTOALL term of team_info.supported_colls is multiplied by a
#       literal 0, so that term always evaluates to 0 regardless of a2a_state;
#       the BCAST term is unchanged.
#     - NOTE(review): presumably alltoall is then expected to be selected only
#       through explicit tuning (the new test sets UCC_TL_MLX5_TUNE=inf) --
#       confirm with the author; the intent is not stated in the patch.
diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh index 58bb7ffdcb..2c570f808e 100755 --- a/.ci/scripts/build_ucc.sh +++ b/.ci/scripts/build_ucc.sh @@ -8,7 +8,8 @@ cd "${UCC_SRC_DIR}" mkdir -p "${UCC_SRC_DIR}/build" cd "${UCC_SRC_DIR}/build" "${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \ - --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi + --prefix="${UCC_INSTALL_DIR}" --enable-gtest --with-mpi \ + --with-tls=cuda,nccl,self,sharp,shm,ucp,mlx5 make -j install echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf ldconfig diff --git a/.ci/scripts/run_tests_ucc_mpi.sh b/.ci/scripts/run_tests_ucc_mpi.sh index 73a4eaca6a..53e9258d88 100755 --- a/.ci/scripts/run_tests_ucc_mpi.sh +++ b/.ci/scripts/run_tests_ucc_mpi.sh @@ -21,13 +21,20 @@ export MASTER_ADDR=${HEAD_NODE} NNODES=$(wc --lines "$HOSTFILE" | awk '{print $1}') DEV="" +CX7_DEV="" # Find first available active device for d in $(ssh $HEAD_NODE "ibstat -l"); do state=$(ssh $HEAD_NODE "ibstat $d" | grep 'State:' | awk '{print $2}') + type=$(ssh $HEAD_NODE "ibstat $d" | grep 'CA type:' | awk '{print $3}') if [ $state == 'Active' ]; then - DEV=$d - break + if [ "$DEV" == "" ]; then + DEV=$d + fi + if [ "$type" == 'MT4129' ]; then + CX7_DEV=$d + break + fi fi done @@ -101,6 +108,20 @@ for MT in "" "-T"; do mpirun $(mpi_params $PPN 1) $ucx_tls_no_cuda_ipc $tlcuda_args $EXE $MT $TG --mtypes cuda -c $tlcuda_colls echo "INFO: UCC MPI unit tests (TL/CUDA) ... DONE" + echo "INFO: UCC MPI unit tests (TL/MLX5) ..." + # shellcheck disable=SC2086 + if [ "x$CX7_DEV" == "x" ]; then + echo "WARNING: No active CX7 devices found on ${HEAD_NODE}" + echo "INFO: UCC MPI unit tests (TL/MLX5) ... SKIPPED" + elif [ $NNODES -lt 2 ]; then + echo "WARNING: At least two nodes are required, but only $NNODES are available" + echo "INFO: UCC MPI unit tests (TL/MLX5) ... 
SKIPPED" + else + tlmlx5_args=" -x UCC_CLS=basic -x UCC_CL_BASIC_TLS=ucp,mlx5 -x UCC_TL_MLX5_NET_DEVICES=$CX7_DEV:1 -x UCC_TL_MLX5_TUNE=inf " + tlmlx5_colls="alltoall" + mpirun $(mpi_params $PPN) $tlmlx5_args $EXE $MT $TG --mtypes host,cuda -c $tlmlx5_colls -t world -d uint8 -O 0 -m 1:128 + echo "INFO: UCC MPI unit tests (TL/MLX5) ... DONE" + fi echo "INFO: UCC MPI unit tests (CL/HIER) ..." # shellcheck disable=SC2086 diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c index 614ead348b..edc0b95d90 100644 --- a/src/components/tl/mlx5/tl_mlx5_team.c +++ b/src/components/tl/mlx5/tl_mlx5_team.c @@ -313,7 +313,7 @@ ucc_status_t ucc_tl_mlx5_team_get_scores(ucc_base_team_t * tl_team, team_info.num_mem_types = 2; team_info.supported_mem_types = mt; team_info.supported_colls = - (UCC_COLL_TYPE_ALLTOALL * (team->a2a_state == TL_MLX5_TEAM_STATE_ALLTOALL_READY)) | + (UCC_COLL_TYPE_ALLTOALL * (team->a2a_state == TL_MLX5_TEAM_STATE_ALLTOALL_READY) * 0) | UCC_COLL_TYPE_BCAST * (team->mcast_state == TL_MLX5_TEAM_STATE_MCAST_READY); team_info.size = UCC_TL_TEAM_SIZE(team);