Skip to content

Commit

Permalink
prov/hook: Proposal for updating the hook provider
Browse files Browse the repository at this point in the history
As far as I can tell, the hook provider dumps all of its trace output to
the console, which is a lot to look at. This is a proposal to store the
tracing information internally and then dump it when shutting down.

This patch makes a change to how the send/recv operation tracing is done.

This example code stores a record in a database. The record has a key
consisting of addr/data len/operation. This way we can keep track of the
number of operations of a specific len per address. I then dump it to a
csv file, which is a lot easier to parse. Typically we don't need to see
each operation on the console. Summary statistics for the job would be
more useful (IMHO).

This is just a proposal to get some feedback.

Signed-off-by: Amir Shehata <shehataa@ornl.gov>
  • Loading branch information
amirshehataornl committed Feb 25, 2024
1 parent a6f4a66 commit 1fcdd43
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 31 deletions.
28 changes: 28 additions & 0 deletions include/ofi_hook.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include <rdma/fi_rma.h>
#include <rdma/fi_tagged.h>

#include <ofi_tree.h>
#include <ofi.h>
#include <rdma/providers/fi_prov.h>

Expand Down Expand Up @@ -159,12 +160,39 @@ static inline struct fi_provider *hook_to_hprov(const struct fid *fid)
}

struct ofi_ops_flow_ctrl;
struct hook_ep;

/*
 * Classes of data-transfer operation tracked by the hook trace database.
 * Numbering starts at 1, so 0 is never a valid operation.
 */
enum hook_op {
	FI_HOOK_TRECV     = 1,	/* tagged receive */
	FI_HOOK_TSEND     = 2,	/* tagged send */
	FI_HOOK_RECV      = 3,	/* message receive */
	FI_HOOK_SEND      = 4,	/* message send */
	FI_HOOK_RMA_WRITE = 5,	/* RMA write */
	FI_HOOK_RMA_READ  = 6,	/* RMA read */
};

/*
 * Lookup key for one trace record: the (address, operation, length) triple.
 * One record exists per distinct triple.
 *
 * NOTE(review): hook_addr_compare() compares this struct with memcmp() over
 * sizeof(struct hook_db_record_key), so any compiler-inserted padding between
 * members takes part in the comparison.
 */
struct hook_db_record_key {
/* address passed to hook_db_insert() for the operation */
fi_addr_t addr;
/* kind of operation, see enum hook_op */
enum hook_op op;
/* data length passed to hook_db_insert() */
size_t len;
};

/*
 * One entry of the per-domain trace database: counts how many operations
 * matching 'key' were reported through hook_db_insert().
 */
struct hook_db_record {
/* its address is handed to ofi_rbmap_insert(); presumably receives the
 * tree node that refers to this record -- TODO confirm */
struct ofi_rbnode *node;
/* (addr, op, len) triple identifying this record */
struct hook_db_record_key key;
/* number of matching operations recorded so far */
size_t count;
};

/*
 * Record one operation of kind 'op' and size 'len' for address 'addr' in the
 * trace database of the domain that 'ep' belongs to. Returns nothing; the
 * definition logs a warning when the record cannot be created.
 */
void hook_db_insert(struct hook_ep *ep, size_t len, fi_addr_t addr, enum hook_op op);

/*
 * Hook provider wrapper around a domain. 'domain' is the fid handed to the
 * application and 'hdomain' is the domain opened on the hooked fabric.
 */
struct hook_domain {
struct fid_domain domain;
/* domain returned by fi_domain() on the hooked fabric */
struct fid_domain *hdomain;
struct hook_fabric *fabric;
struct ofi_ops_flow_ctrl *base_ops_flow_ctrl;
/* buffer pool of struct hook_db_record entries for the trace database */
struct ofi_bufpool *trace_pool;
/* red-black map of trace records, ordered by hook_addr_compare() */
struct ofi_rbmap trace_map;

ssize_t (*base_credit_handler)(struct fid_ep *ep_fid, uint64_t credits);
};

Expand Down
3 changes: 3 additions & 0 deletions include/ofi_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ void ofi_rbmap_destroy(struct ofi_rbmap *map);
void ofi_rbmap_init(struct ofi_rbmap *map,
int (*compare)(struct ofi_rbmap *map, void *key, void *data));
void ofi_rbmap_cleanup(struct ofi_rbmap *map);
void ofi_rbmap_iterate(struct ofi_rbmap *map,
struct ofi_rbnode *node, void *context,
void (*handle_node)(struct ofi_rbnode *node, void *context));

struct ofi_rbnode *ofi_rbmap_get_root(struct ofi_rbmap *map);
struct ofi_rbnode *ofi_rbmap_find(struct ofi_rbmap *map, void *key);
Expand Down
151 changes: 149 additions & 2 deletions prov/hook/src/hook_domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,86 @@ static int hook_domain_ops_open(struct fid *fid, const char *name,
return 0;
}

/*
 * Map a hook operation to the label used in the CSV dump.
 *
 * Returns a pointer to a string literal, hence the const qualifier: the
 * result must not be modified or freed. Values outside enum hook_op map
 * to "UNKNOWN".
 */
static const char *op2str(enum hook_op op)
{
	switch (op) {
	case FI_HOOK_TRECV:
		return "TAGGED_RECV";
	case FI_HOOK_TSEND:
		return "TAGGED_SEND";
	case FI_HOOK_RECV:
		return "MSG_RECV";
	case FI_HOOK_SEND:
		return "MSG_SEND";
	case FI_HOOK_RMA_WRITE:
		return "RMA_WRITE";
	case FI_HOOK_RMA_READ:
		return "RMA_READ";
	default:
		return "UNKNOWN";
	}
}

static void write2csv(struct ofi_rbnode *node, void *context)
{
struct hook_db_record *rec;
FILE *f = context;

rec = node->data;

if (!rec)
return;

fprintf(f, "%ld, %s, %ld, %ld\n",
rec->key.addr, op2str(rec->key.op), rec->key.len, rec->count);
}

/*
 * Dump every record of the trace map to /tmp/<pid>.hook_out as CSV.
 *
 * Best effort: if the file name cannot be built or the file cannot be
 * opened, nothing is written and no error is reported.
 *
 * NOTE(review): the name depends only on the pid and the file is opened
 * with "w", so a second dump from the same process replaces the first;
 * the path in /tmp is also predictable.
 */
static void hook_write_db2csv(struct ofi_rbmap *map)
{
	char fname[64];
	FILE *f;
	int n;

	/* Bounded formatting; pid_t is cast to match the %d specifier. */
	n = snprintf(fname, sizeof(fname), "/tmp/%d.hook_out", (int) getpid());
	if (n < 0 || (size_t) n >= sizeof(fname))
		return;

	f = fopen(fname, "w");
	if (!f)
		return;

	fprintf(f, "addr, operation, data_len, count\n");
	ofi_rbmap_iterate(map, map->root, f, write2csv);
	fclose(f);
}

/*
 * Close a hook domain.
 *
 * The trace database is first dumped to CSV, then the hooked domain is
 * closed. Only when that close succeeds are the trace map, the record pool
 * and the hook_domain itself released, so a failed close leaves the object
 * fully intact for a retry.
 *
 * Returns 0 on success, -FI_EINVAL if the hooked fid or provider context
 * cannot be resolved, or the error returned by the hooked domain's close.
 */
int hook_domain_close(struct fid *fid)
{
	struct hook_domain *dom;
	struct hook_prov_ctx *prov_ctx;
	struct fid *hfid;
	int ret;

	dom = container_of(fid, struct hook_domain, domain.fid);

	hook_write_db2csv(&dom->trace_map);

	hfid = hook_to_hfid(fid);
	if (!hfid)
		return -FI_EINVAL;

	prov_ctx = hook_to_prov_ctx(fid);
	if (!prov_ctx)
		return -FI_EINVAL;

	hook_fini_fid(prov_ctx, fid);

	ret = hfid->ops->close(hfid);
	if (ret)
		return ret;

	/*
	 * Release the trace database created in hook_domain_init(): tree
	 * nodes first (they refer to records in the pool), then the pool.
	 */
	ofi_rbmap_cleanup(&dom->trace_map);
	ofi_bufpool_destroy(dom->trace_pool);

	free(fid);
	return 0;
}

struct fi_ops hook_domain_fid_ops = {
.size = sizeof(struct fi_ops),
.close = hook_close,
.close = hook_domain_close,
.bind = hook_bind,
.control = hook_control,
.ops_open = hook_domain_ops_open,
Expand Down Expand Up @@ -211,6 +288,66 @@ struct fi_ops_domain hook_domain_ops = {
.query_collective = hook_query_collective,
};

/*
 * Allocate a trace record for 'key' from the domain's pool and insert it
 * into the trace map.
 *
 * The new record starts with a count of zero; the caller accounts for the
 * operation that triggered the allocation.
 *
 * Returns the record, or NULL if the pool is exhausted or the map insert
 * fails (the buffer is handed back to the pool in that case).
 */
static struct hook_db_record *
hook_alloc_rec(struct hook_domain *dom, struct hook_db_record_key *key)
{
	struct hook_db_record *rec;

	rec = ofi_ibuf_alloc(dom->trace_pool);
	if (!rec)
		return NULL;

	/*
	 * memcpy rather than struct assignment: the map comparator may look
	 * at the raw bytes of the key, so padding must be copied verbatim.
	 */
	memcpy(&rec->key, key, sizeof(*key));

	/* Pool buffers are not guaranteed to be zeroed. */
	rec->count = 0;

	if (ofi_rbmap_insert(&dom->trace_map, &rec->key, rec, &rec->node)) {
		ofi_ibuf_free(rec);
		return NULL;
	}

	return rec;
}

/*
 * Account for one operation matching 'key' in the domain's trace map.
 *
 * If a record for the key exists its count is incremented; otherwise a new
 * record is created with a count of 1.
 *
 * Returns the updated record, or NULL if a new record could not be created.
 */
static struct hook_db_record *
hook_update_db_record(struct hook_domain *dom, struct hook_db_record_key *key)
{
	struct hook_db_record *rec;
	struct ofi_rbnode *node;

	node = ofi_rbmap_find(&dom->trace_map, key);
	if (node) {
		rec = node->data;
		rec->count++;
		return rec;
	}

	rec = hook_alloc_rec(dom, key);
	if (rec) {
		/*
		 * Assign rather than increment so the result does not depend
		 * on the initial contents of the freshly allocated buffer.
		 */
		rec->count = 1;
	}

	return rec;
}

/*
 * Record one data-transfer operation in the trace database of the domain
 * that owns 'ep'.
 *
 * ep   - endpoint the operation was issued on
 * len  - data length of the operation
 * addr - address associated with the operation
 * op   - kind of operation
 *
 * Failures are not propagated; a warning is logged instead so tracing never
 * disturbs the data path.
 */
void hook_db_insert(struct hook_ep *ep, size_t len, fi_addr_t addr, enum hook_op op)
{
	struct hook_db_record_key key;

	/*
	 * Zero the whole key first: the map comparator may compare raw bytes,
	 * and padding between members would otherwise hold stack garbage,
	 * making identical keys look different.
	 */
	memset(&key, 0, sizeof(key));
	key.addr = addr;
	key.op = op;
	key.len = len;

	if (!hook_update_db_record(ep->domain, &key)) {
		/* Casts keep the arguments in line with the specifiers. */
		FI_WARN(ep->domain->fabric->hprov, FI_LOG_EP_DATA,
			"Failed to insert op %d addr %llx size %zu\n",
			(int) op, (unsigned long long) addr, len);
	}
}

static int hook_addr_compare(struct ofi_rbmap *map, void *key, void *data)
{
return memcmp(&((struct hook_db_record *) data)->key, key,
sizeof(struct hook_db_record_key));
}

int hook_domain_init(struct fid_fabric *fabric, struct fi_info *info,
struct fid_domain **domain, void *context,
Expand All @@ -226,10 +363,20 @@ int hook_domain_init(struct fid_fabric *fabric, struct fi_info *info,
dom->domain.ops = &hook_domain_ops;
dom->domain.mr = &hook_mr_ops;

ret = fi_domain(fab->hfabric, info, &dom->hdomain, &dom->domain.fid);
ret = ofi_bufpool_create(&dom->trace_pool, sizeof(struct hook_db_record),
0, 0, 0, OFI_BUFPOOL_INDEXED |
OFI_BUFPOOL_NO_TRACK);
if (ret)
return ret;

ofi_rbmap_init(&dom->trace_map, hook_addr_compare);

ret = fi_domain(fab->hfabric, info, &dom->hdomain, &dom->domain.fid);
if (ret) {
ofi_bufpool_destroy(dom->trace_pool);
return ret;
}

*domain = &dom->domain;

return 0;
Expand Down
Loading

0 comments on commit 1fcdd43

Please sign in to comment.