
Commit bc2540f

gcramer23 authored and facebook-github-bot committed on May 8, 2021
benchmark rpc ps (pytorch#57454)
Summary: Pull Request resolved: pytorch#57454

DDP with NCCL AllReduce for the entire model; experiment from Quip https://fb.quip.com/iQUtAeKIxWpF

I have been testing this on the AI cluster. There seem to be some connection problems with RPC when using multiple trainers or parameter servers.

```
Namespace(bconfig_id='3', dconfig_id='DummyData', mconfig_id='DummyModel', pconfig_id='None', tconfig_id='DdpNcclTrainer')
benchmark warmup done

metrics for trainer=0
+-----------------------------------+----------+---------+----------+------------+-----------+
| name                              | min      | max     | mean     | variance   | stdev     |
+===================================+==========+=========+==========+============+===========+
| backward_metric,backward          | 2.45248  | 4.18304 | 3.972    | 0.097122   | 0.311644  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| batch_level_metric,batch_all      | 4.11955  | 4.58138 | 4.31439  | 0.00229848 | 0.0479424 |
+-----------------------------------+----------+---------+----------+------------+-----------+
| foward_metric,forward_pass        | 0.141312 | 1.4807  | 0.222566 | 0.0555432  | 0.235676  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| hook_future_metric,nccl_allreduce | 0.191488 | 3.54099 | 3.11694  | 0.557106   | 0.746395  |
+-----------------------------------+----------+---------+----------+------------+-----------+

metrics for trainer=1
+-----------------------------------+----------+---------+----------+-------------+------------+
| name                              | min      | max     | mean     | variance    | stdev      |
+===================================+==========+=========+==========+=============+============+
| backward_metric,backward          | 2.4617   | 2.59174 | 2.51196  | 0.000938276 | 0.0306313  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| batch_level_metric,batch_all      | 4.22605  | 4.71757 | 4.27921  | 0.00468424  | 0.0684415  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| foward_metric,forward_pass        | 0.807936 | 1.50118 | 0.846008 | 0.00601693  | 0.0775688  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| hook_future_metric,nccl_allreduce | 0.108544 | 0.1536  | 0.11222  | 2.16726e-05 | 0.00465538 |
+-----------------------------------+----------+---------+----------+-------------+------------+

metrics for all trainer
+-----------------------------------+----------+---------+----------+------------+-----------+
| name                              | min      | max     | mean     | variance   | stdev     |
+===================================+==========+=========+==========+============+===========+
| backward_metric,backward          | 2.45248  | 4.18304 | 3.24198  | 0.584391   | 0.764455  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| batch_level_metric,batch_all      | 4.11955  | 4.71757 | 4.2968   | 0.00378467 | 0.0615197 |
+-----------------------------------+----------+---------+----------+------------+-----------+
| foward_metric,forward_pass        | 0.141312 | 1.50118 | 0.534287 | 0.128284   | 0.358167  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| hook_future_metric,nccl_allreduce | 0.108544 | 3.54099 | 1.61458  | 2.5456     | 1.59549   |
+-----------------------------------+----------+---------+----------+------------+-----------+
```

Test Plan: Imported from OSS

Reviewed By: H-Huang, ngimel

Differential Revision: D28296175

Pulled By: gcramer23

fbshipit-source-id: 5dd208fc86f8b5558d7c8860d685bb25c2e09fe7
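For context, the Namespace line above maps onto the configuration files added in this commit: bconfig_id='3' selects the "3" entry in configurations/benchmark_configurations.json (2 trainers, no parameter server, batch_size 5, rpc_async_timeout 15), and the remaining ids select the DummyData, DummyModel, and DdpNcclTrainer entries in their respective configuration files.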
1 parent 94080f4 commit bc2540f

21 files changed: 1,071 additions, 0 deletions
 
BenchmarkConfigurations.py (new file, +15)

```
from dataclasses import dataclass


@dataclass
class BenchmarkConfigurations:
    trainer_count: int = 1
    ps_count: int = 0
    batch_size: int = 1
    print_metrics_to_dir: bool = False
    master_addr: str = "localhost"
    master_port: str = "29500"
    rpc_async_timeout: int = 5
    rpc_init_method: str = "tcp://localhost:29501"
    trainer_config: dict = None
    ps_config: dict = None
```
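For orientation, a minimal sketch of how launcher.py (added below) ends up materializing this dataclass; the dict mirrors the "3" entry in benchmark_configurations.json plus the DdpNcclTrainer entry from trainer_configurations.json, with defaults filling the rest (run from the benchmark directory):

```
from BenchmarkConfigurations import BenchmarkConfigurations

benchmark_config = {
    "trainer_count": 2,
    "ps_count": 0,
    "rpc_async_timeout": 15,
    "batch_size": 5,
    "trainer_config": {"trainer_class": "DdpNcclTrainer", "configurations": {"epochs": 10}},
    "ps_config": None,
}
config = BenchmarkConfigurations(**benchmark_config)
assert config.master_addr == "localhost"  # unset fields keep their defaults
```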
README.md (new file, +56)

# RPC PS Benchmark

## How to add your experiment

1. Data
   - Create a data class and add it to the data directory
   - Update benchmark_class_helper.py to include your data class in the data_map
   - Add configurations to data_configurations.json in the configurations directory
2. Model
   - Create a model class and add it to the model directory
   - Update benchmark_class_helper.py to include your model class in the model_map
   - Add configurations to model_configurations.json in the configurations directory
3. Trainer
   - Create a trainer class and add it to the trainer directory
   - Update benchmark_class_helper.py to include your trainer class in the trainer_map
   - Add configurations to trainer_configurations.json in the configurations directory
4. Parameter Server
   - Create a parameter server class and add it to the parameter_servers directory
   - Update benchmark_class_helper.py to include your parameter_server class in the ps_map
   - Add configurations to parameter_server_configurations.json in the configurations directory
5. Script
   - Create a bash script for your experiment and add it to the bash_experiment_scripts directory

## Trainer class

The trainer directory contains base classes that provide a starting point for implementing a trainer.
Inherit from a base class and implement your trainer. The benchmark has two requirements for trainers.

1. It must implement an `__init__` method whose leading arguments are rank, trainer_count, and ps_rref

```python
def __init__(self, rank, trainer_count, ps_rref, backend, use_cuda_rpc):
```

2. It must implement a train method that takes model and data as arguments.

```python
def train(self, model, data):
```

## Parameter Server class

The parameter_server directory contains base classes that provide a starting point for implementing a parameter server.
Inherit from a base class and implement your parameter server. The benchmark has two requirements for parameter servers.

1. It must implement an `__init__` method whose leading arguments are rank and ps_trainer_count

```python
def __init__(self, rank, ps_trainer_count, backend, use_cuda_rpc):
```

2. It must implement a reset_state method

```python
def reset_state(ps_rref):
```
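A minimal trainer sketch satisfying the two requirements above, using the TrainerBase helpers added later in this commit; the loop body is illustrative only, not part of the benchmark:

```
from trainers.TrainerBase import TrainerBase


class MyTrainer(TrainerBase):
    def __init__(self, rank, trainer_count, ps_rref, backend, use_cuda_rpc):
        super().__init__(rank)
        self.rank = rank
        self.trainer_count = trainer_count
        self.ps_rref = ps_rref

    def train(self, model, data):
        for index, (input, target) in enumerate(data):
            # record per-batch timings through the TrainerBase helpers
            self.record_batch_start(str(index), cuda=False)
            # forward / backward / optimizer step would go here
            self.record_batch_end(str(index))
```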
Experiment script in bash_experiment_scripts/ (new file, +13)

```
#!/bin/sh

# requires slurm
# configuration ids
benchmark=3
data="DummyData"
model="DummyModel"
trainer="DdpNcclTrainer"
server="None"
# move to the benchmark directory and run with the selected configurations
cd "$(dirname "$(dirname "$0")")"
source ./bash_experiment_scripts/helper_functions.sh
run_benchmark_basic "$benchmark" "$data" "$model" "$trainer" "$server"
```
bash_experiment_scripts/helper_functions.sh (new file, +7)

```
#!/bin/sh

run_benchmark_basic() {
    # requires slurm
    gpurun='srun -p q2 --cpus-per-task=16 -t 5:00:00 --gpus-per-node=4'
    $gpurun python launcher.py --benchmark="$1" --data="$2" --model="$3" --trainer="$4" --server="$5"
}
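```

Without slurm, run_benchmark_basic reduces to invoking the launcher directly from the benchmark directory, e.g. `python launcher.py --benchmark=3 --data=DummyData --model=DummyModel --trainer=DdpNcclTrainer --server=None` (assuming the required GPUs are available locally).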
benchmark_class_helper.py (new file, +33)

```
from data.DummyData import DummyData
from models.DummyModel import DummyModel
from trainers.DdpNcclTrainer import DdpNcclTrainer

trainer_map = {
    "DdpNcclTrainer": DdpNcclTrainer
}

ps_map = {}

model_map = {
    "DummyModel": DummyModel
}

data_map = {
    "DummyData": DummyData
}


def get_benchmark_trainer_map():
    return trainer_map


def get_benchmark_ps_map():
    return ps_map


def get_benchmark_model_map():
    return model_map


def get_benchmark_data_map():
    return data_map
```
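Hypothetical example of step 3 from the README: after adding trainers/MyTrainer.py (a placeholder module, not shipped in this commit), register it here so --trainer=MyTrainer resolves:

```
from trainers.MyTrainer import MyTrainer  # placeholder, not in this commit

trainer_map = {
    "DdpNcclTrainer": DdpNcclTrainer,
    "MyTrainer": MyTrainer,  # plus a matching "MyTrainer" entry in trainer_configurations.json
}
```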
configurations/benchmark_configurations.json (new file, +8)

```
{
    "3": {
        "trainer_count": 2,
        "ps_count": 0,
        "rpc_async_timeout": 15,
        "batch_size": 5
    }
}
```
configurations/data_configurations.json (new file, +20)

```
{
    "DummyData": {
        "data_class": "DummyData",
        "configurations": {
            "max_val": 100,
            "input_samples": 100,
            "input_dim": 100,
            "sparsity_percentage": 20
        }
    },
    "DummyData2": {
        "data_class": "DummyData",
        "configurations": {
            "max_val": 100,
            "input_samples": 100,
            "input_dim": 100,
            "sparsity_percentage": 80
        }
    }
}
```
configurations/model_configurations.json (new file, +22)

```
{
    "DummyModel": {
        "model_class": "DummyModel",
        "configurations": {
            "num_embeddings": 100,
            "embedding_dim": 100,
            "dense_input_size": 100,
            "dense_output_size": 100,
            "sparse": false
        }
    },
    "DummyModelSparse": {
        "model_class": "DummyModel",
        "configurations": {
            "num_embeddings": 100,
            "embedding_dim": 100,
            "dense_input_size": 100,
            "dense_output_size": 100,
            "sparse": true
        }
    }
}
```
configurations/server_configurations.json (new file, +1)

```
{}
```
configurations/trainer_configurations.json (new file, +8)

```
{
    "DdpNcclTrainer": {
        "trainer_class": "DdpNcclTrainer",
        "configurations": {
            "epochs": 10
        }
    }
}
```
data/DummyData.py (new file, +46)

```
import random

import numpy as np
import torch
from torch.utils.data import Dataset


class DummyData(Dataset):

    def __init__(
        self,
        max_val: int,
        input_samples: int,
        input_dim: int,
        sparsity_percentage: int
    ):
        self.max_val = max_val
        self.input_samples = input_samples
        self.input_dim = input_dim
        self.sparsity_percentage = sparsity_percentage

        def generate_input():
            percentage_of_elements = (100 - self.sparsity_percentage) / float(100)
            index_count = int(self.max_val * percentage_of_elements)
            elements = list(range(self.max_val))
            random.shuffle(elements)
            elements = elements[:index_count]
            data = [
                [
                    elements[random.randint(0, index_count - 1)]
                    for _ in range(self.input_dim)
                ]
                for _ in range(self.input_samples)
            ]
            return torch.from_numpy(np.array(data))

        self.input = generate_input()
        self.target = torch.randint(0, max_val, [input_samples])
        self.start = 0
        self.end = max_val

    def __len__(self):
        return len(self.input)

    def __getitem__(self, index):
        return self.input[index], self.target[index]
```
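A quick sanity check of the sparsity arithmetic, using the "DummyData" configuration values from this commit (run from the benchmark directory):

```
from data.DummyData import DummyData

# sparsity_percentage=20 -> (100 - 20)% of max_val = 80 distinct values may appear
data = DummyData(max_val=100, input_samples=100, input_dim=100, sparsity_percentage=20)
assert len(data) == 100
sample, label = data[0]
assert sample.shape == (100,)
assert 0 <= int(label) < 100
```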
launcher.py (new file, +343)

```
import argparse
import copy
import json
import os
from pathlib import Path

import torch.distributed.rpc as rpc
import torch.multiprocessing as mp
from torch.distributed.rpc import TensorPipeRpcBackendOptions
from torch.utils.data import DataLoader

from benchmark_class_helper import (get_benchmark_data_map,
                                    get_benchmark_model_map,
                                    get_benchmark_ps_map,
                                    get_benchmark_trainer_map)
from BenchmarkConfigurations import BenchmarkConfigurations
from metrics.ProcessedMetricsPrinter import ProcessedMetricsPrinter

USE_CUDA_RPC = "use_cuda_rpc"


def get_name(rank, configs):
    t_count = configs.trainer_count
    ps_count = configs.ps_count
    if rank < t_count:
        return f"trainer{rank}"
    elif rank < (t_count + ps_count):
        return f"ps{rank}"
    else:
        return "master"


def get_parameter_server_rank(rank, config):
    # rank mod parameter server count to get parameter server number
    # add trainer_count to get parameter server rank
    rank_mod_ps_count = rank % config.ps_count
    return rank_mod_ps_count + config.trainer_count


def get_ps_rref(parameter_server_rank, config):
    ps_config = config.ps_config
    ps = get_benchmark_ps_map()[str(ps_config["ps_class"])]
    name = get_name(
        parameter_server_rank,
        config
    )
    ps_args = ps_config["configurations"].values()
    # each server handles trainer_count // ps_count trainers; the first
    # (trainer_count % ps_count) servers take one extra
    ps_trainer_count = config.trainer_count // config.ps_count
    rem = config.trainer_count % config.ps_count
    if parameter_server_rank - config.trainer_count < rem:
        ps_trainer_count += 1
    return rpc.remote(
        name,
        ps,
        args=(
            parameter_server_rank,
            ps_trainer_count,
            *ps_args,
        ),
    )


def run_trainer(
    config, model, data, rank, ps_rref
):
    trainer_config = config.trainer_config
    trainer_class = get_benchmark_trainer_map()[str(trainer_config["trainer_class"])]
    trainer_args = trainer_config["configurations"].values()
    trainer = trainer_class(
        rank,
        config.trainer_count,
        ps_rref,
        *trainer_args
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]


def call_trainers(config, model, train_data, parameter_server_rrefs):
    futs = []
    for trainer_rank in range(0, config.trainer_count):
        trainer_name = get_name(
            trainer_rank,
            config
        )
        ps_rref = None
        if parameter_server_rrefs:
            ps_rank = get_parameter_server_rank(trainer_rank, config)
            ps_rref = parameter_server_rrefs[ps_rank]
        fut = rpc.rpc_async(
            trainer_name,
            run_trainer,
            args=(
                config,
                copy.deepcopy(model),
                train_data[trainer_rank],
                trainer_rank,
                ps_rref,
            ),
            timeout=config.rpc_async_timeout
        )
        futs.append(fut)
    return futs


def benchmark_warmup(
    config, model, data, parameter_server_rrefs
):
    if config.ps_count > 0:
        ps_config = config.ps_config
        ps = get_benchmark_ps_map()[str(ps_config["ps_class"])]
    futs = call_trainers(config, model, data, parameter_server_rrefs)
    for fut in futs:
        fut.wait()
    for ps_rref in parameter_server_rrefs.values():
        rpc.rpc_sync(
            ps_rref.owner(),
            ps.reset_state,
            args=(ps_rref,)
        )
    print("benchmark warmup done\n")


def split_list(arr, n):
    return [arr[i::n] for i in range(n)]


def run_master(rank, model, data, config, rpc_backend_options):
    world_size = config.trainer_count + config.ps_count + 1
    rpc.init_rpc(
        get_name(
            rank,
            config
        ),
        rank=rank,
        world_size=world_size,
        rpc_backend_options=rpc_backend_options
    )
    parameter_server_rrefs = {}
    for i in range(
        config.trainer_count, world_size - 1
    ):
        parameter_server_rrefs[i] = get_ps_rref(i, config)

    train_data = split_list(
        list(DataLoader(data, batch_size=config.batch_size)),
        config.trainer_count
    )

    # warmup run the benchmark
    benchmark_warmup(
        config, model, train_data, parameter_server_rrefs
    )
    # run the benchmark
    trainer_futs = call_trainers(
        config, model, train_data, parameter_server_rrefs
    )
    # collect metrics and print
    metrics_printer = ProcessedMetricsPrinter()
    rank_metrics_list = [fut.wait() for fut in trainer_futs]
    metrics_printer.print_metrics("trainer", rank_metrics_list)


def run_benchmark(rank, model, data, config):

    world_size = config.trainer_count + config.ps_count + 1
    os.environ['MASTER_ADDR'] = config.master_addr
    os.environ['MASTER_PORT'] = config.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = config.rpc_init_method
    if rank == world_size - 1:
        # master is the last rank: trainer_count + parameter_server_count
        run_master(rank, model, data, config, rpc_backend_options)
    elif rank >= config.trainer_count:
        # parameter_servers = [trainer_count, trainer_count + parameter_server_count)
        rpc.init_rpc(
            get_name(
                rank,
                config
            ),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options
        )
    else:
        # trainers = [0, trainer_count)
        trainer_config = config.trainer_config
        ps_config = config.ps_config
        # ps_config is None when no parameter server is configured,
        # so check ps_count first
        if (config.ps_count > 0 and
                USE_CUDA_RPC in trainer_config and
                trainer_config[USE_CUDA_RPC] and
                USE_CUDA_RPC in ps_config and
                ps_config[USE_CUDA_RPC]):
            ps_rank = get_parameter_server_rank(rank, config)
            ps_name = get_name(
                ps_rank,
                config
            )
            rpc_backend_options.set_device_map(
                ps_name,
                {rank: ps_rank}
            )
        trainer_name = get_name(
            rank,
            config
        )
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options
        )
    rpc.shutdown()


def get_json_config(file_name, id):
    with open(
        os.path.join(
            Path(__file__).parent, file_name
        ),
        "r"
    ) as f:
        json_config = json.load(f)[id]
    return json_config


def load_configurations(args):
    trainer_config_file = args.trainer_config_path
    ps_config_file = args.server_config_path
    benchmark_config = get_json_config(args.benchmark_config_path, args.benchmark)
    benchmark_config["trainer_config"] = get_json_config(trainer_config_file, args.trainer)
    if args.server != "None":
        benchmark_config["ps_config"] = get_json_config(ps_config_file, args.server)
    else:
        benchmark_config["ps_config"] = None
    return BenchmarkConfigurations(**benchmark_config)


def get_data(data_class, data_config):
    data_class = get_benchmark_data_map()[data_class]
    return data_class(**data_config)


def load_data(args):
    data_config_file = args.data_config_path
    data_config = get_json_config(data_config_file, args.data)
    return get_data(data_config["data_class"], data_config["configurations"])


def get_model(model_class, model_config):
    model_class = get_benchmark_model_map()[model_class]
    return model_class(**model_config)


def load_model(args):
    model_config_file = args.model_config_path
    model_config = get_json_config(model_config_file, args.model)
    return get_model(model_config["model_class"], model_config["configurations"])


def main():
    parser = argparse.ArgumentParser(description="RPC PS Benchmark")

    parser.add_argument(
        "--benchmark_config_path",
        type=str,
        default="configurations/benchmark_configurations.json",
        help="path to benchmark configuration file"
    )
    parser.add_argument(
        "--data_config_path",
        type=str,
        default="configurations/data_configurations.json",
        help="path to data configuration file"
    )
    parser.add_argument(
        "--model_config_path",
        type=str,
        default="configurations/model_configurations.json",
        help="path to model configuration file"
    )
    parser.add_argument(
        "--server_config_path",
        type=str,
        default="configurations/server_configurations.json",
        help="path to server configuration file"
    )
    parser.add_argument(
        "--trainer_config_path",
        type=str,
        default="configurations/trainer_configurations.json",
        help="path to trainer configuration file"
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        help="id for benchmark configuration"
    )
    parser.add_argument(
        "--data",
        type=str,
        help="id for data configuration"
    )
    parser.add_argument(
        "--model",
        type=str,
        help="id for model configuration"
    )
    parser.add_argument(
        "--server",
        type=str,
        help="id for parameter server configuration"
    )
    parser.add_argument(
        "--trainer",
        type=str,
        help="id for trainer configuration"
    )
    args = parser.parse_args()
    print(f"{args}\n")

    config = load_configurations(args)
    data = load_data(args)
    model = load_model(args)

    world_size = config.trainer_count + config.ps_count + 1

    mp.spawn(
        run_benchmark,
        args=(
            model,
            data,
            config,
        ),
        nprocs=world_size,
        join=True
    )


if __name__ == "__main__":
    main()
```
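A worked example of the rank layout in run_benchmark, using this commit's "3" benchmark configuration (trainer_count=2) plus a hypothetical ps_count=1 (this commit ships ps_count=0 and an empty ps_map); run from the benchmark directory:

```
from BenchmarkConfigurations import BenchmarkConfigurations
from launcher import get_name

config = BenchmarkConfigurations(trainer_count=2, ps_count=1)
world_size = config.trainer_count + config.ps_count + 1  # 4: 2 trainers, 1 ps, 1 master
assert [get_name(r, config) for r in range(world_size)] == \
    ["trainer0", "trainer1", "ps2", "master"]
```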
metrics/CPUMetric.py (new file, +23)

```
import time

from .MetricBase import MetricBase


class CPUMetric(MetricBase):
    def __init__(self, name: str):
        self.name = name
        self.start = None
        self.end = None

    def record_start(self):
        self.start = time.time()

    def record_end(self):
        self.end = time.time()

    def elapsed_time(self):
        if self.start is None:
            raise RuntimeError("start is None")
        if self.end is None:
            raise RuntimeError("end is None")
        return self.end - self.start
```
metrics/CUDAMetric.py (new file, +32)

```
import torch

from .MetricBase import MetricBase


class CUDAMetric(MetricBase):
    def __init__(self, rank: int, name: str):
        self.rank = rank
        self.name = name
        self.start = None
        self.end = None

    def record_start(self):
        self.start = torch.cuda.Event(enable_timing=True)
        with torch.cuda.device(self.rank):
            self.start.record()

    def record_end(self):
        self.end = torch.cuda.Event(enable_timing=True)
        with torch.cuda.device(self.rank):
            self.end.record()

    def elapsed_time(self):
        if not self.start.query():
            raise RuntimeError("start event did not complete")
        if not self.end.query():
            raise RuntimeError("end event did not complete")
        return self.start.elapsed_time(self.end)

    def synchronize(self):
        self.start.synchronize()
        self.end.synchronize()
```
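A minimal usage sketch (assumes a CUDA-capable GPU on device 0); the events are recorded asynchronously, so synchronize() must run before the timing is read:

```
import torch

from metrics.CUDAMetric import CUDAMetric

metric = CUDAMetric(rank=0, name="matmul")
metric.record_start()
a = torch.randn(1024, 1024, device="cuda:0")
b = torch.randn(1024, 1024, device="cuda:0")
c = a @ b  # work timed between the two events
metric.record_end()
metric.synchronize()  # wait for both events before reading
print(f"{metric.elapsed_time():.3f} ms")
```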
metrics/MetricBase.py (new file, +26)

```
from abc import ABC, abstractmethod


class MetricBase(ABC):
    def __init__(self, name):
        self.name = name
        self.start = None
        self.end = None

    @abstractmethod
    def record_start(self):
        return

    @abstractmethod
    def record_end(self):
        return

    @abstractmethod
    def elapsed_time(self):
        return

    def get_name(self):
        return self.name

    def get_end(self):
        return self.end
```
metrics/MetricsLogger.py (new file, +78)

```
from .CPUMetric import CPUMetric
from .CUDAMetric import CUDAMetric


class MetricsLogger:

    def __init__(self, rank=None):
        self.rank = rank
        self.metrics = {}

    def record_start(self, type, key, name, cuda):
        if type in self.metrics and key in self.metrics[type]:
            raise RuntimeError(f"metric_type={type} with key={key} already exists")
        if cuda:
            if self.rank is None:
                raise RuntimeError("rank is required for cuda")
            metric = CUDAMetric(self.rank, name)
        else:
            metric = CPUMetric(name)
        if type not in self.metrics:
            self.metrics[type] = {}
        self.metrics[type][key] = metric
        metric.record_start()

    def record_end(self, type, key):
        if type not in self.metrics or key not in self.metrics[type]:
            raise RuntimeError(f"metric_type={type} with key={key} not found")
        if self.metrics[type][key].get_end() is not None:
            raise RuntimeError(f"end for metric_type={type} with key={key} already exists")
        self.metrics[type][key].record_end()

    def clear_metrics(self):
        self.metrics.clear()

    def get_metrics(self):
        return self.metrics

    def get_processed_metrics(self):
        r"""
        A method that processes the metrics recorded during the benchmark.

        Returns::
            A dictionary whose keys are the processed metric names
            and whose values are lists of elapsed times.

        Examples::

            >>> instance = MetricsLogger(rank)
            >>> instance.record_start("forward_metric_type", "1", "forward_pass", cuda=True)
            >>> instance.record_end("forward_metric_type", "1")
            >>> instance.record_start("forward_metric_type", "2", "forward_pass", cuda=True)
            >>> instance.record_end("forward_metric_type", "2")
            >>> print(instance.metrics)
            {
                "forward_metric_type": {
                    "1": metric1,
                    "2": metric2
                }
            }

            >>> print(instance.get_processed_metrics())
            {
                "forward_metric_type,forward_pass" : [.0429, .0888]
            }
        """
        processed_metrics = {}
        for metric_type in self.metrics.keys():
            for metric_key in self.metrics[metric_type].keys():
                metric = self.metrics[metric_type][metric_key]
                if isinstance(metric, CUDAMetric):
                    metric.synchronize()
                metric_name = metric.get_name()
                elapsed_time = metric.elapsed_time()
                processed_metric_name = f"{metric_type},{metric_name}"
                if processed_metric_name not in processed_metrics:
                    processed_metrics[processed_metric_name] = []
                processed_metrics[processed_metric_name].append(elapsed_time)
        return processed_metrics
```
metrics/ProcessedMetricsPrinter.py (new file, +82)

```
import statistics

import pandas as pd
from tabulate import tabulate


class ProcessedMetricsPrinter:

    def print_data_frame(self, name, processed_metrics):
        print(f"metrics for {name}")
        data_frame = self.get_data_frame(processed_metrics)
        print(tabulate(data_frame, showindex=False, headers=data_frame.columns, tablefmt="grid"))

    def combine_processed_metrics(self, processed_metrics_list):
        r"""
        A method that merges the value arrays of the keys in the dictionary
        of processed metrics.

        Args:
            processed_metrics_list (list): a list containing dictionaries with
                recorded metrics as keys, and the values are lists of elapsed times.

        Returns::
            A merged dictionary that is created from the list of dictionaries passed
            into the method.

        Examples::
            >>> instance = ProcessedMetricsPrinter()
            >>> dict_1 = trainer1.get_processed_metrics()
            >>> dict_2 = trainer2.get_processed_metrics()
            >>> print(dict_1)
            {
                "forward_metric_type,forward_pass" : [.0429, .0888]
            }
            >>> print(dict_2)
            {
                "forward_metric_type,forward_pass" : [.0111, .0222]
            }
            >>> processed_metrics_list = [dict_1, dict_2]
            >>> result = instance.combine_processed_metrics(processed_metrics_list)
            >>> print(result)
            {
                "forward_metric_type,forward_pass" : [.0429, .0888, .0111, .0222]
            }
        """
        processed_metric_totals = {}
        for processed_metrics in processed_metrics_list:
            for metric_name, values in processed_metrics.items():
                if metric_name not in processed_metric_totals:
                    processed_metric_totals[metric_name] = []
                processed_metric_totals[metric_name] += values
        return processed_metric_totals

    def get_data_frame(self, processed_metrics):
        df = pd.DataFrame(
            columns=['name', 'min', 'max', 'mean', 'variance', 'stdev']
        )
        for metric_name in sorted(processed_metrics.keys()):
            values = processed_metrics[metric_name]
            row = {
                "name": metric_name,
                "min": min(values),
                "max": max(values),
                "mean": statistics.mean(values),
                "variance": statistics.variance(values),
                "stdev": statistics.stdev(values)
            }
            df = df.append(row, ignore_index=True)
        return df

    def print_metrics(self, name, rank_metrics_list):
        if rank_metrics_list:
            metrics_list = []
            for rank, metric in rank_metrics_list:
                self.print_data_frame(f"{name}={rank}", metric)
                metrics_list.append(metric)
            combined_metrics = self.combine_processed_metrics(metrics_list)
            self.print_data_frame(f"all {name}", combined_metrics)

    def save_to_file(self, data_frame, file_name):
        file_name = f"data_frames/{file_name}.csv"
        data_frame.to_csv(file_name, encoding='utf-8', index=False)
```
models/DummyModel.py (new file, +22)

```
import torch.nn as nn
import torch.nn.functional as F


class DummyModel(nn.Module):
    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        dense_input_size: int,
        dense_output_size: int,
        sparse: bool
    ):
        super().__init__()
        self.embedding = nn.EmbeddingBag(
            num_embeddings, embedding_dim, sparse=sparse
        )
        self.dense = nn.Sequential(*[nn.Linear(dense_input_size, dense_output_size) for _ in range(10)])

    def forward(self, x):
        x = self.embedding(x)
        return F.softmax(self.dense(x), dim=1)
```
trainers/DdpNcclTrainer.py (new file, +107)

```
import torch
import torch.distributed as c10d
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

from .DdpTrainerBase import DdpTrainerBase


class DdpNcclTrainer(DdpTrainerBase):

    class HookState:

        def __init__(self, cref, process_group):
            self.cref = cref
            self.process_group = process_group
            self.process_group_size = process_group.size()
            self.param_location = 0
            self.batch_number = -1

        def get_key(self):
            return f"{self.batch_number},{self.param_location}"

        def next_batch_state(self):
            self.param_location = 0
            self.batch_number += 1

    def __init__(self, rank, trainer_count, ps_rref, epochs):
        super().__init__(rank)
        self.rank = rank
        self.trainer_count = trainer_count
        self.epochs = epochs

    @staticmethod
    def hook(state, bucket):
        cref = state.cref
        tensors_count = len(cref.bucket_to_parameters(bucket))
        tensors = [bucket.get_tensor() / state.process_group_size]
        key = state.get_key()
        cref.record_hook_fut_start(key, cref.NCCL_ALLREDUCE)
        fut = state.process_group.allreduce(tensors).get_future()
        state.param_location += tensors_count

        def callback(fut):
            cref.record_hook_fut_end(key)
            return fut.wait()

        return fut.then(callback)

    def train(self, model, data):
        torch.manual_seed(0)
        model = model.cuda(self.rank)
        for i in range(len(data)):
            data[i][0] = data[i][0].cuda(self.rank)
            data[i][1] = data[i][1].cuda(self.rank)
        torch.cuda.synchronize(self.rank)

        process_group_size = self.trainer_count

        store = c10d.FileStore("/tmp/tmpn_k_8so02", process_group_size)

        process_group = c10d.ProcessGroupNCCL(
            store, self.rank, process_group_size
        )

        ddp_model = DDP(
            model, device_ids=[self.rank], process_group=process_group
        )

        hook_state = self.HookState(self, process_group)

        ddp_model.register_comm_hook(hook_state, DdpNcclTrainer.hook)

        criterion = nn.CrossEntropyLoss().cuda(self.rank)

        optimizer = torch.optim.SGD(ddp_model.parameters(), 1e-4)

        def epoch_key(epoch, index):
            return f"{epoch},{index}"

        for epoch in range(self.epochs):
            for index, batch in enumerate(data):
                hook_state.next_batch_state()
                input, target = batch[0], batch[1]

                self.record_batch_start(epoch_key(epoch, index))

                optimizer.zero_grad()

                self.record_forward_start(epoch_key(epoch, index))

                out = ddp_model(input)

                self.record_forward_end(epoch_key(epoch, index))

                loss = criterion(out, target)

                self.record_backward_start(epoch_key(epoch, index))

                loss.backward()

                self.record_backward_end(epoch_key(epoch, index))

                optimizer.step()

                self.record_batch_end(epoch_key(epoch, index))

        torch.cuda.synchronize(self.rank)
```
@@ -0,0 +1,32 @@
1+
from abc import abstractmethod
2+
3+
from .TrainerBase import TrainerBase
4+
5+
6+
class DdpTrainerBase(TrainerBase):
7+
8+
HOOK_FUTURE_METRIC = "hook_future_metric"
9+
NCCL_ALLREDUCE = "nccl_allreduce"
10+
GLOO_ALLREDUCE = "gloo_allreduce"
11+
12+
def __init__(self, rank):
13+
super().__init__(rank)
14+
15+
@staticmethod
16+
@abstractmethod
17+
def hook(state, bucket):
18+
return
19+
20+
def record_hook_fut_start(self, key, name, cuda=True):
21+
self.record_start(self.HOOK_FUTURE_METRIC, key, name, cuda)
22+
23+
def record_hook_fut_end(self, key):
24+
self.record_end(self.HOOK_FUTURE_METRIC, key)
25+
26+
def bucket_to_parameters(self, bucket):
27+
parameter_tensors = bucket.get_per_parameter_tensors()
28+
parameter_tensors_count = len(parameter_tensors)
29+
if parameter_tensors_count > 0:
30+
return parameter_tensors
31+
else:
32+
return [bucket.get_tensor()]
trainers/TrainerBase.py (new file, +97)

```
import functools
import time
from abc import ABC, abstractmethod

from metrics.MetricsLogger import MetricsLogger


class TrainerBase(ABC):

    BATCH_LEVEL_METRIC = "batch_level_metric"
    BATCH_ALL = "batch_all"
    FORWARD_METRIC = "foward_metric"  # "foward" (sic) matches the metric names in the recorded output
    FORWARD_PASS = "forward_pass"
    BACKWARD_METRIC = "backward_metric"
    BACKWARD = "backward"

    def __init__(self, rank):
        self.__metrics_logger = MetricsLogger(rank)

    @abstractmethod
    def train(self):
        return

    def record_start(self, type, key, name, cuda=True):
        self.__metrics_logger.record_start(
            type,
            key,
            name,
            cuda
        )

    def record_end(self, type, key):
        self.__metrics_logger.record_end(
            type,
            key
        )

    def record_batch_start(self, key, cuda=True):
        self.__metrics_logger.record_start(
            self.BATCH_LEVEL_METRIC,
            key,
            self.BATCH_ALL,
            cuda
        )

    def record_batch_end(self, key):
        self.__metrics_logger.record_end(
            self.BATCH_LEVEL_METRIC,
            key
        )

    def record_forward_start(self, key, cuda=True):
        self.__metrics_logger.record_start(
            self.FORWARD_METRIC,
            key,
            self.FORWARD_PASS,
            cuda
        )

    def record_forward_end(self, key):
        self.__metrics_logger.record_end(
            self.FORWARD_METRIC,
            key
        )

    def record_backward_start(self, key, cuda=True):
        self.__metrics_logger.record_start(
            self.BACKWARD_METRIC,
            key,
            self.BACKWARD,
            cuda
        )

    def record_backward_end(self, key):
        self.__metrics_logger.record_end(
            self.BACKWARD_METRIC,
            key
        )

    @staticmethod
    def methodmetric(name, type="method_metric", cuda=True):
        def decorator(function):
            @functools.wraps(function)
            def wrapper(self, *args):
                key = time.time()
                self.__metrics_logger.record_start(type, key, name, cuda)
                result = function(self, *args)
                self.__metrics_logger.record_end(type, key)
                return result
            return wrapper
        return decorator

    def get_metrics(self):
        return self.__metrics_logger.get_processed_metrics()

    def clear_metrics(self):
        return self.__metrics_logger.clear_metrics()
```
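A small usage sketch of the methodmetric decorator above; the subclass and method names are hypothetical, and each call is timed under a fresh time.time() key (cuda=False keeps it CPU-only):

```
from trainers.TrainerBase import TrainerBase


class ExampleTrainer(TrainerBase):
    def train(self, model, data):
        pass

    @TrainerBase.methodmetric("sync_step", cuda=False)
    def sync_step(self):
        pass


t = ExampleTrainer(rank=0)
t.sync_step()
print(t.get_metrics())  # {'method_metric,sync_step': [<elapsed seconds>]}
```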
