Skip to content

Commit

Permalink
Merge branch 'master' into nsarka/bcast-active-set
Browse files Browse the repository at this point in the history
  • Loading branch information
nsarka authored Feb 26, 2024
2 parents e86c6da + 7930478 commit fc31e64
Show file tree
Hide file tree
Showing 20 changed files with 401 additions and 35 deletions.
12 changes: 11 additions & 1 deletion src/coll_score/ucc_coll_score.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -9,6 +9,16 @@
#include "utils/ucc_log.h"
#include "utils/ucc_coll_utils.h"

/* Writes a textual form of @score into the caller-provided buffer @buf.
 *
 * A score equal to UCC_SCORE_MAX is rendered as the literal string "inf";
 * every other value is formatted with "%d". @max is forwarded unchanged to
 * the ucc_*_safe helpers as the size limit for @buf.
 *
 * Returns @buf.
 *
 * NOTE(review): the definition of ucc_score_t is not visible here; confirm
 * that "%d" matches its signedness.
 */
char *ucc_score_to_str(ucc_score_t score, char *buf, size_t max)
{
    if (score != UCC_SCORE_MAX) {
        /* Regular score: print the numeric value. */
        ucc_snprintf_safe(buf, max, "%d", score);
        return buf;
    }

    /* Maximum score is shown symbolically. */
    ucc_strncpy_safe(buf, "inf", max);
    return buf;
}

ucc_status_t ucc_coll_score_alloc(ucc_coll_score_t **score)
{
ucc_coll_score_t *s = ucc_malloc(sizeof(*s), "ucc_coll_score");
Expand Down
14 changes: 8 additions & 6 deletions src/coll_score/ucc_coll_score.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -63,6 +63,8 @@ typedef struct ucc_coll_score {

typedef struct ucc_score_map ucc_score_map_t;

char *ucc_score_to_str(ucc_score_t score, char *buf, size_t max);

/* Allocates empty score data structure */
ucc_status_t ucc_coll_score_alloc(ucc_coll_score_t **score);

Expand All @@ -77,7 +79,7 @@ ucc_status_t ucc_coll_score_add_range(ucc_coll_score_t *score,

/* Releases the score data structure and all the score ranges stored
there */
void ucc_coll_score_free(ucc_coll_score_t *score);
void ucc_coll_score_free(ucc_coll_score_t *score);

/* Merges two scores, score1 and score2, into the new score "rst", selecting
the larger score. I.e., rst will contain a range from score1 if either
Expand All @@ -87,9 +89,9 @@ void ucc_coll_score_free(ucc_coll_score_t *score);
This function is used by CL to merge scores from multiple TLs and produce
a score map. As a result, the produced score map will select the TL with
the higher score. */
ucc_status_t ucc_coll_score_merge(ucc_coll_score_t * score1,
ucc_coll_score_t * score2,
ucc_coll_score_t **rst, int free_inputs);
ucc_status_t ucc_coll_score_merge(ucc_coll_score_t * score1,
ucc_coll_score_t * score2,
ucc_coll_score_t **rst, int free_inputs);


/* Parses SCORE string (see ucc_base_iface.c for pattern description)
Expand Down Expand Up @@ -147,7 +149,7 @@ ucc_status_t ucc_coll_score_build_default(ucc_base_team_t *team,
ucc_status_t ucc_coll_score_build_map(ucc_coll_score_t *score,
ucc_score_map_t **map);

void ucc_coll_score_free_map(ucc_score_map_t *map);
void ucc_coll_score_free_map(ucc_score_map_t *map);

/* Initializes task based on args selection and score map.
Checks fallbacks if necessary. */
Expand Down
19 changes: 11 additions & 8 deletions src/coll_score/ucc_coll_score_map.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -160,11 +160,12 @@ ucc_status_t ucc_coll_init(ucc_score_map_t *map,

void ucc_coll_score_map_print_info(const ucc_score_map_t *map)
{
size_t left;
ucc_msg_range_t *range;
int i, j, all_empty;
char range_str[128];
char coll_str[1024];
size_t left;
ucc_msg_range_t *range;
int i, j, all_empty;
char score_str[32];
char range_str[128];
char coll_str[1024];

for (i = 0; i < UCC_COLL_TYPE_NUM; i++) {
all_empty = 1;
Expand All @@ -191,10 +192,12 @@ void ucc_coll_score_map_print_info(const ucc_score_map_t *map)
super.list_elem) {
ucc_memunits_range_str(range->start, range->end, range_str,
sizeof(range_str));
STR_APPEND(coll_str, left, 256, "{%s}:%s:%u ",
ucc_score_to_str(range->super.score, score_str,
sizeof(score_str));
STR_APPEND(coll_str, left, 256, "{%s}:%s:%s ",
range_str,
range->super.team->context->lib->log_component.name,
range->super.score);
score_str);
}
STR_APPEND(coll_str, left, 4, "\n");
}
Expand Down
10 changes: 7 additions & 3 deletions src/components/cl/hier/cl_hier.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) Meta Platforms, Inc. and affiliates. 2022.
*
* See file LICENSE for terms.
Expand Down Expand Up @@ -109,8 +109,12 @@ typedef struct ucc_cl_hier_team {
UCC_CLASS_DECLARE(ucc_cl_hier_team_t, ucc_base_context_t *,
const ucc_base_team_params_t *);

#define UCC_CL_HIER_SUPPORTED_COLLS \
(UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV)
/* Bitwise OR of the collective types that CL/HIER reports as supported.
   ucc_cl_hier_team_get_scores() assigns this value to
   team_info.supported_colls. */
#define UCC_CL_HIER_SUPPORTED_COLLS \
(UCC_COLL_TYPE_ALLTOALL | \
UCC_COLL_TYPE_ALLTOALLV | \
UCC_COLL_TYPE_ALLREDUCE | \
UCC_COLL_TYPE_BARRIER | \
UCC_COLL_TYPE_BCAST)

ucc_status_t ucc_cl_hier_coll_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
Expand Down
2 changes: 1 addition & 1 deletion src/components/cl/hier/cl_hier_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ ucc_status_t ucc_cl_hier_team_get_scores(ucc_base_team_t *cl_team,
team_info.init = ucc_cl_hier_coll_init;
team_info.num_mem_types = 0;
team_info.supported_mem_types = NULL; /* all memory types supported*/
team_info.supported_colls = UCC_COLL_TYPE_ALL;
team_info.supported_colls = UCC_CL_HIER_SUPPORTED_COLLS;
team_info.size = UCC_CL_TEAM_SIZE(team);

status = ucc_coll_score_alloc(&score);
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t)
tl_debug(self->super.super.lib, "failed to free ib ctx and pd");
};

if (!self->sock) {
if (self->sock) {
close(self->sock);
}

Expand Down
1 change: 1 addition & 0 deletions src/components/tl/ucp/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ allgather = \
allgather/allgather.c \
allgather/allgather_ring.c \
allgather/allgather_neighbor.c \
allgather/allgather_bruck.c \
allgather/allgather_knomial.c

allgatherv = \
Expand Down
4 changes: 4 additions & 0 deletions src/components/tl/ucp/allgather/allgather.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ ucc_base_coll_alg_info_t
{.id = UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
.name = "neighbor",
.desc = "O(N) Neighbor Exchange N/2 steps"},
[UCC_TL_UCP_ALLGATHER_ALG_BRUCK] =
{.id = UCC_TL_UCP_ALLGATHER_ALG_BRUCK,
.name = "bruck",
.desc = "O(log(N)) Variation of Bruck algorithm"},
[UCC_TL_UCP_ALLGATHER_ALG_LAST] = {
.id = 0, .name = NULL, .desc = NULL}};

Expand Down
12 changes: 12 additions & 0 deletions src/components/tl/ucp/allgather/allgather.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ enum {
UCC_TL_UCP_ALLGATHER_ALG_KNOMIAL,
UCC_TL_UCP_ALLGATHER_ALG_RING,
UCC_TL_UCP_ALLGATHER_ALG_NEIGHBOR,
UCC_TL_UCP_ALLGATHER_ALG_BRUCK,
UCC_TL_UCP_ALLGATHER_ALG_LAST
};

Expand Down Expand Up @@ -56,6 +57,17 @@ void ucc_tl_ucp_allgather_neighbor_progress(ucc_coll_task_t *task);

ucc_status_t ucc_tl_ucp_allgather_neighbor_start(ucc_coll_task_t *task);

/* Bruck */
ucc_status_t ucc_tl_ucp_allgather_bruck_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
ucc_coll_task_t **task_h);

void ucc_tl_ucp_allgather_bruck_progress(ucc_coll_task_t *task);

ucc_status_t ucc_tl_ucp_allgather_bruck_start(ucc_coll_task_t *task);

ucc_status_t ucc_tl_ucp_allgather_bruck_finalize(ucc_coll_task_t *coll_task);

/* Uses allgather_kn_radix from config */
ucc_status_t ucc_tl_ucp_allgather_knomial_init(ucc_base_coll_args_t *coll_args,
ucc_base_team_t *team,
Expand Down
Loading

0 comments on commit fc31e64

Please sign in to comment.