Skip to content

Commit

Permalink
Merge branch 'master' into topic/fix_rank_reordering
Browse files Browse the repository at this point in the history
  • Loading branch information
artemry-nv authored Jan 25, 2024
2 parents c16f2bd + 1bfa3f3 commit dcc7dcd
Show file tree
Hide file tree
Showing 11 changed files with 162 additions and 82 deletions.
69 changes: 45 additions & 24 deletions config/m4/rocm.m4
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,28 @@ AC_DEFUN([ROCM_BUILD_FLAGS],
# Parse value of ARG into appropriate LIBS, LDFLAGS, and
# CPPFLAGS variables.
AC_DEFUN([HIP_BUILD_FLAGS],
$4="-D__HIP_PLATFORM_AMD__ -I$1/include/hip -I$1/include"
$3="-L$1/lib"
$4="-D__HIP_PLATFORM_AMD__ -I$1/include/hip -I$1/include -I$1/llvm/include"
$3="-L$1/lib -L$1/llvm/lib"
$2="-lamdhip64"
)

# CHECK_ROCM_VERSION(HIP_VERSION_MAJOR, ROCM_VERSION_CONDITION)
# ----------------------------------------------------------
# Checks ROCm version and marks condition as 1 (TRUE) or 0 (FALSE).
#
# Arguments:
#   HIP_VERSION_MAJOR      - minimum major version to test for. It is compared
#                            against the HIP_VERSION_MAJOR preprocessor symbol
#                            after including
#                            ${with_rocm}/include/hip/hip_version.h
#   ROCM_VERSION_CONDITION - name of the shell variable that receives the
#                            result: 1 if the test program compiles,
#                            0 otherwise
#
# This is a compile-only probe, nothing is executed. When the version is
# lower than requested, the preprocessor selects a line that is not valid C,
# so the compilation fails and the condition is set to 0. Any other
# compilation failure of the test program, for example when hip_version.h
# cannot be found under the with_rocm prefix, also results in 0.
AC_DEFUN([CHECK_ROCM_VERSION], [
AC_COMPILE_IFELSE(
[AC_LANG_PROGRAM([[#include <${with_rocm}/include/hip/hip_version.h>
]], [[
#if HIP_VERSION_MAJOR >= $1
return 0;
#else
intr make+compilation_fail()
#endif
]])],
[$2=1],
[$2=0])
])

#
# Check for ROCm support
#
Expand Down Expand Up @@ -102,28 +119,25 @@ AS_IF([test "x$with_rocm" != "xno"],
LDFLAGS="$SAVE_LDFLAGS"
LIBS="$SAVE_LIBS"
#Check whether we run on ROCm 5.0 or higher
AC_COMPILE_IFELSE(
[AC_LANG_PROGRAM([[#include <${with_rocm}/include/rocm_version.h>
]], [[
#if ROCM_VERSION_MAJOR >= 5
return 0;
#else
intr make+compilation_fail()
#endif
]])],
[ROCM_VERSION_50_OR_GREATER=1],
[ROCM_VERSION_50_OR_GREATER=0])
HIP_BUILD_FLAGS([$with_rocm], [HIP_LIBS], [HIP_LDFLAGS], [HIP_CPPFLAGS])
AC_MSG_CHECKING([if ROCm version is 5.0 or above])
if test "$ROCM_VERSION_50_OR_GREATER" = "1" ; then
# Check whether we run on ROCm 6.0 or higher
CHECK_ROCM_VERSION(6, ROCM_VERSION_60_OR_GREATER)
AC_MSG_CHECKING([if ROCm version is 6.0 or above])
if test "$ROCM_VERSION_60_OR_GREATER" = "1" ; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
HIP_CPPFLAGS="${HIP_CPPFLAGS} -I${with_rocm}/hip/include"
HIP_LDFLAGS="${HIP_LDFLAGS} -L${with_rocm}/hip/lib"
# Check whether we run on ROCm 5.0-5.7
CHECK_ROCM_VERSION(5, ROCM_VERSION_50_OR_GREATER)
AC_MSG_CHECKING([if ROCm version is 5.0 - 5.7])
if test "$ROCM_VERSION_50_OR_GREATER" = "1" ; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
HIP_CPPFLAGS="${HIP_CPPFLAGS} -I${with_rocm}/hip/include"
HIP_LDFLAGS="${HIP_LDFLAGS} -L${with_rocm}/hip/lib"
fi
fi
CPPFLAGS="$HIP_CPPFLAGS $CPPFLAGS"
Expand All @@ -142,10 +156,17 @@ intr make+compilation_fail()
LDFLAGS="$SAVE_LDFLAGS"
LIBS="$SAVE_LIBS"
AS_IF([test "x$hip_happy" = "xyes"],
[AC_PATH_PROG([HIPCC], [hipcc], [notfound], [$PATH:$with_rocm/bin])])
AS_IF([test "$HIPCC" = "notfound"], [hip_happy="no"])
if test "$ROCM_VERSION_60_OR_GREATER" = "1" ; then
AC_MSG_NOTICE([using amdclang as ROCm version is 6.0 or above])
AS_IF([test "x$hip_happy" = "xyes"],
[AC_PATH_PROG([HIPCC], [amdclang], [notfound], [$PATH:$with_rocm/bin])])
AS_IF([test "$HIPCC" = "notfound"], [hip_happy="no"])
else
AC_MSG_NOTICE([using hipcc as ROCm version is 3.7.0 to ROCm 5.7.1])
AS_IF([test "x$hip_happy" = "xyes"],
[AC_PATH_PROG([HIPCC], [hipcc], [notfound], [$PATH:$with_rocm/bin])])
AS_IF([test "$HIPCC" = "notfound"], [hip_happy="no"])
fi
AS_IF([test "x$hip_happy" = "xyes"],
[AC_DEFINE([HAVE_HIP], 1, [Enable HIP support])
Expand Down
10 changes: 8 additions & 2 deletions cuda_lt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,21 @@ local_npic_filepath="${local_npic_dir}${o_filename}"
mkdir -p $pic_dir

tmpcmd="${@:3}"
if [[ "$tmpcmd" == *"hipcc"* ]]; then
if [[ "$tmpcmd" == *"amdclang"* ]]; then
cmd="${@:3:2} -x hip -target x86_64-unknown-linux-gnu --offload-arch=gfx908:xnack- --offload-arch=gfx90a:xnack- --offload-arch=gfx90a:xnack+ --offload-arch=gfx940 --offload-arch=gfx941 --offload-arch=gfx942 --offload-arch=gfx1030 --offload-arch=gfx1100 --offload-arch=gfx1101 --offload-arch=gfx1102 --offload-arch=native ${@:5} -fPIC -o ${pic_filepath}"
elif [[ "$tmpcmd" == *"hipcc"* ]]; then
cmd="${@:3} -fPIC -o ${pic_filepath}"
else
cmd="${@:3} -Xcompiler -fPIC -o ${pic_filepath}"
fi
echo $cmd
$cmd

cmd="${@:3} -o ${npic_filepath}"
if [[ "$tmpcmd" == *"amdclang"* ]]; then
cmd="${@:3:2} -x hip -target x86_64-unknown-linux-gnu --offload-arch=gfx908:xnack- --offload-arch=gfx90a:xnack- --offload-arch=gfx90a:xnack+ --offload-arch=gfx940 --offload-arch=gfx941 --offload-arch=gfx942 --offload-arch=gfx1030 --offload-arch=gfx1100 --offload-arch=gfx1101 --offload-arch=gfx1102 --offload-arch=native ${@:5} -o ${npic_filepath}"
else
cmd="${@:3} -o ${npic_filepath}"
fi
echo $cmd
$cmd

Expand Down
15 changes: 6 additions & 9 deletions src/components/tl/mlx5/mcast/tl_mlx5_mcast_rcache.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -130,7 +130,7 @@ ucc_tl_mlx5_mcast_mem_deregister(ucc_tl_mlx5_mcast_coll_context_t *ctx,
return UCC_OK;
}

static ucc_rcache_ops_t ucc_rcache_ops = {
static ucc_rcache_ops_t ucc_tl_mlx5_rcache_ops = {
.mem_reg = ucc_tl_mlx5_mcast_rcache_mem_reg_cb,
.mem_dereg = ucc_tl_mlx5_mcast_rcache_mem_dereg_cb,
.dump_region = ucc_tl_mlx5_mcast_rcache_dump_region_cb
Expand All @@ -140,15 +140,12 @@ ucc_status_t ucc_tl_mlx5_mcast_setup_rcache(ucc_tl_mlx5_mcast_coll_context_t *ct
{
ucc_rcache_params_t rcache_params;

rcache_params.ucm_event_priority = 1000;
rcache_params.max_regions = ULONG_MAX;
rcache_params.max_size = SIZE_MAX;
ucc_rcache_set_default_params(&rcache_params);
rcache_params.region_struct_size = sizeof(ucc_tl_mlx5_mcast_rcache_region_t);
rcache_params.context = ctx;
rcache_params.ops = &ucc_tl_mlx5_rcache_ops;
rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED |
UCM_EVENT_MEM_TYPE_FREE;
rcache_params.context = ctx;
rcache_params.ops = &ucc_rcache_ops;
rcache_params.flags = 0;

return ucc_rcache_create(&rcache_params, "MCAST", &ctx->rcache);
return ucc_rcache_create(&rcache_params, "MLX5_MCAST", &ctx->rcache);
}
18 changes: 9 additions & 9 deletions src/components/tl/mlx5/tl_mlx5_rcache.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -52,22 +52,22 @@ static void ucc_tl_mlx5_rcache_dump_region_cb(void *context, //NOLINT
snprintf(buf, max, "bar ptr:%p", mlx5_rregion->reg.mr);
}

static ucc_rcache_ops_t ucc_rcache_ops = {
static ucc_rcache_ops_t ucc_tl_mlx5_rcache_ops = {
.mem_reg = rcache_reg_mr,
.mem_dereg = rcache_dereg_mr,
.dump_region = ucc_tl_mlx5_rcache_dump_region_cb
};

ucc_status_t tl_mlx5_rcache_create(ucc_tl_mlx5_context_t *ctx)
{
ucc_rcache_params_t rcache_params;
ucc_rcache_params_t rcache_params;

ucc_rcache_set_default_params(&rcache_params);
rcache_params.region_struct_size = sizeof(ucc_tl_mlx5_rcache_region_t);
rcache_params.ucm_event_priority = 1000;
rcache_params.context = (void *)ctx;
rcache_params.ops = &ucc_rcache_ops;
rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED
| UCM_EVENT_MEM_TYPE_FREE;
rcache_params.context = ctx;
rcache_params.ops = &ucc_tl_mlx5_rcache_ops;
rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED |
UCM_EVENT_MEM_TYPE_FREE;

return ucc_rcache_create(&rcache_params, "MLX5", &ctx->rcache);
return ucc_rcache_create(&rcache_params, "MLX5_A2A", &ctx->rcache);
}
11 changes: 4 additions & 7 deletions src/components/tl/sharp/tl_sharp_context.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -269,15 +269,12 @@ ucc_status_t ucc_tl_sharp_rcache_create(struct sharp_coll_context *context,
{
ucc_rcache_params_t rcache_params;

rcache_params.ucm_event_priority = 1000;
rcache_params.max_regions = ULONG_MAX;
rcache_params.max_size = SIZE_MAX;
ucc_rcache_set_default_params(&rcache_params);
rcache_params.region_struct_size = sizeof(ucc_tl_sharp_rcache_region_t);
rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED |
UCM_EVENT_MEM_TYPE_FREE;
rcache_params.context = context;
rcache_params.ops = &ucc_tl_sharp_rcache_ops;
rcache_params.flags = 0;
rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED |
UCM_EVENT_MEM_TYPE_FREE;

return ucc_rcache_create(&rcache_params, "SHARP", rcache);
}
Expand Down
35 changes: 30 additions & 5 deletions src/components/tl/ucp/alltoall/alltoall_pairwise.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -11,6 +11,10 @@
#include "utils/ucc_math.h"
#include "tl_ucp_sendrecv.h"

/* TODO: make these thresholds configurable parameters instead of constants */
#define MSG_MEDIUM 66000
#define NP_THRESH 32

static inline ucc_rank_t get_recv_peer(ucc_rank_t rank, ucc_rank_t size,
ucc_rank_t step)
{
Expand All @@ -23,6 +27,29 @@ static inline ucc_rank_t get_send_peer(ucc_rank_t rank, ucc_rank_t size,
return (rank - step + size) % size;
}

static ucc_rank_t get_num_posts(const ucc_tl_ucp_team_t *team,
const ucc_coll_args_t *args)
{
unsigned long posts = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoall_pairwise_num_posts;
ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team);
size_t data_size;

data_size = (size_t)args->src.info.count *
ucc_dt_size(args->src.info.datatype);
if (posts == UCC_ULUNITS_AUTO) {
if ((data_size > MSG_MEDIUM) && (tsize > NP_THRESH)) {
/* use pairwise algorithm */
posts = 1;
} else {
/* use linear algorithm */
posts = 0;
}
}

posts = (posts > tsize || posts == 0) ? tsize: posts;
return posts;
}

void ucc_tl_ucp_alltoall_pairwise_progress(ucc_coll_task_t *coll_task)
{
ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
Expand All @@ -34,12 +61,10 @@ void ucc_tl_ucp_alltoall_pairwise_progress(ucc_coll_task_t *coll_task)
ucc_rank_t grank = UCC_TL_TEAM_RANK(team);
ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team);
int polls = 0;
ucc_rank_t peer;
int posts, nreqs;
ucc_rank_t peer, nreqs;
size_t data_size;

posts = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoall_pairwise_num_posts;
nreqs = (posts > gsize || posts == 0) ? gsize : posts;
nreqs = get_num_posts(team, &TASK_ARGS(task));
data_size = (size_t)(TASK_ARGS(task).src.info.count / gsize) *
ucc_dt_size(TASK_ARGS(task).src.info.datatype);
while ((task->tagged.send_posted < gsize ||
Expand Down
30 changes: 25 additions & 5 deletions src/components/tl/ucp/alltoallv/alltoallv_pairwise.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -12,6 +12,9 @@
#include "utils/ucc_coll_utils.h"
#include "tl_ucp_sendrecv.h"

/* TODO: make this threshold a configurable parameter instead of a constant */
#define NP_THRESH 32

static inline ucc_rank_t get_recv_peer(ucc_rank_t rank, ucc_rank_t size,
ucc_rank_t step)
{
Expand All @@ -24,6 +27,25 @@ static inline ucc_rank_t get_send_peer(ucc_rank_t rank, ucc_rank_t size,
return (rank - step + size) % size;
}

static ucc_rank_t get_num_posts(const ucc_tl_ucp_team_t *team)
{
unsigned long posts = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_pairwise_num_posts;
ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team);

if (posts == UCC_ULUNITS_AUTO) {
if (UCC_TL_TEAM_SIZE(team) <= NP_THRESH) {
/* use linear algorithm */
posts = 0;
} else {
/* use pairwise algorithm */
posts = 1;
}
}

posts = (posts > tsize || posts == 0) ? tsize: posts;
return posts;
}

static void ucc_tl_ucp_alltoallv_pairwise_progress(ucc_coll_task_t *coll_task)
{
ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, ucc_tl_ucp_task_t);
Expand All @@ -35,12 +57,10 @@ static void ucc_tl_ucp_alltoallv_pairwise_progress(ucc_coll_task_t *coll_task)
ucc_rank_t grank = UCC_TL_TEAM_RANK(team);
ucc_rank_t gsize = UCC_TL_TEAM_SIZE(team);
int polls = 0;
ucc_rank_t peer;
int posts, nreqs;
ucc_rank_t peer, nreqs;
size_t rdt_size, sdt_size, data_size, data_displ;

posts = UCC_TL_UCP_TEAM_LIB(team)->cfg.alltoallv_pairwise_num_posts;
nreqs = (posts > gsize || posts == 0) ? gsize : posts;
nreqs = get_num_posts(team);
rdt_size = ucc_dt_size(TASK_ARGS(task).dst.info_v.datatype);
sdt_size = ucc_dt_size(TASK_ARGS(task).src.info_v.datatype);
while ((task->tagged.send_posted < gsize ||
Expand Down
10 changes: 5 additions & 5 deletions src/components/tl/ucp/tl_ucp.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -36,17 +36,17 @@ ucc_config_field_t ucc_tl_ucp_lib_config_table[] = {
{"", "", NULL, ucc_offsetof(ucc_tl_ucp_lib_config_t, super),
UCC_CONFIG_TYPE_TABLE(ucc_tl_lib_config_table)},

{"ALLTOALL_PAIRWISE_NUM_POSTS", "1",
{"ALLTOALL_PAIRWISE_NUM_POSTS", "auto",
"Maximum number of outstanding send and receive messages in alltoall "
"pairwise algorithm",
ucc_offsetof(ucc_tl_ucp_lib_config_t, alltoall_pairwise_num_posts),
UCC_CONFIG_TYPE_UINT},
UCC_CONFIG_TYPE_ULUNITS},

{"ALLTOALLV_PAIRWISE_NUM_POSTS", "1",
{"ALLTOALLV_PAIRWISE_NUM_POSTS", "auto",
"Maximum number of outstanding send and receive messages in alltoallv "
"pairwise algorithm",
ucc_offsetof(ucc_tl_ucp_lib_config_t, alltoallv_pairwise_num_posts),
UCC_CONFIG_TYPE_UINT},
UCC_CONFIG_TYPE_ULUNITS},

/* TODO: add radix to config once it's fully supported by the algorithm
{"ALLTOALLV_HYBRID_RADIX", "2",
Expand Down
6 changes: 3 additions & 3 deletions src/components/tl/ucp/tl_ucp.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -60,8 +60,8 @@ typedef struct ucc_tl_ucp_lib_config {
uint32_t scatter_kn_radix;
ucc_on_off_auto_value_t scatter_kn_enable_recv_zcopy;
uint32_t scatterv_linear_num_posts;
uint32_t alltoall_pairwise_num_posts;
uint32_t alltoallv_pairwise_num_posts;
unsigned long alltoall_pairwise_num_posts;
unsigned long alltoallv_pairwise_num_posts;
ucc_pipeline_params_t allreduce_sra_kn_pipeline;
int reduce_avg_pre_op;
int reduce_scatter_ring_bidirectional;
Expand Down
1 change: 0 additions & 1 deletion src/utils/ucc_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ typedef struct ucc_file_config ucc_file_config_t;
#define UCC_CONFIG_TYPE_ULUNITS UCS_CONFIG_TYPE_ULUNITS
#define UCC_CONFIG_TYPE_ENUM UCS_CONFIG_TYPE_ENUM
#define UCC_CONFIG_TYPE_MEMUNITS UCS_CONFIG_TYPE_MEMUNITS
#define UCC_CONFIG_TYPE_ULUNITS UCS_CONFIG_TYPE_ULUNITS
#define UCC_ULUNITS_AUTO UCS_ULUNITS_AUTO
#define UCC_CONFIG_TYPE_BITMAP UCS_CONFIG_TYPE_BITMAP
#define UCC_CONFIG_TYPE_MEMUNITS UCS_CONFIG_TYPE_MEMUNITS
Expand Down
Loading

0 comments on commit dcc7dcd

Please sign in to comment.