Skip to content

Commit 97d59ad

Browse files
slayton58 and drnikolaev
authored and committed
NCCL integration
1 parent d988833 commit 97d59ad

26 files changed

+487
-292
lines changed

CMakeLists.txt

+15
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,21 @@ include(cmake/ConfigGen.cmake)
2828
# ---[ Options
2929
caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA
3030
caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY)
31+
32+
# USE_NCCL: Build Caffe with NCCL Library support
33+
# Regular ON/OFF option doesn't work here because we need to recognize 3 states:
34+
# 1. User didn't set USE_NCCL option =>
35+
# 1.1 If CPU_ONLY is ON we do nothing.
36+
# 1.2 If CPU_ONLY is OFF we *quietly* try to find it and use if found; do nothing otherwise.
37+
# 2. User explicitly set USE_NCCL=ON option =>
38+
# 2.1 If CPU_ONLY is ON we do nothing (it's higher priority).
39+
# 2.2 If CPU_ONLY is OFF we try to find it with *required* option, thus CMake fails if not found.
40+
# 3. User explicitly set USE_NCCL=OFF option => we do nothing.
41+
SET(USE_NCCL)
42+
if(DEFINED USE_NCCL)
43+
STRING(TOUPPER "${USE_NCCL}" USE_NCCL)
44+
endif()
45+
3146
caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
3247
caffe_option(BUILD_python "Build Python wrapper" ON)
3348
set(python_version "2" CACHE STRING "Specify which Python version to use")

Makefile

+6
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,12 @@ ifeq ($(USE_CUDNN), 1)
334334
COMMON_FLAGS += -DUSE_CUDNN
335335
endif
336336

337+
# NCCL acceleration configuration
338+
ifeq ($(USE_NCCL), 1)
339+
LIBRARIES += nccl
340+
COMMON_FLAGS += -DUSE_NCCL
341+
endif
342+
337343
# configure IO libraries
338344
ifeq ($(USE_OPENCV), 1)
339345
COMMON_FLAGS += -DUSE_OPENCV

Makefile.config.example

+4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
# cuDNN version 4 or higher is required.
66
# USE_CUDNN := 1
77

8+
# NCCL acceleration switch (uncomment to build with NCCL)
9+
# See https://github.com/NVIDIA/nccl
10+
# USE_NCCL := 1
11+
812
# CPU-only switch (uncomment to build without GPU support).
913
# cuDNN version 4 or higher is required.
1014
# CPU_ONLY := 1

cmake/ConfigGen.cmake

+4
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ function(caffe_generate_export_configs)
8181
list(APPEND Caffe_DEFINITIONS -DUSE_MKL)
8282
endif()
8383

84+
if(NCCL_FOUND)
85+
list(APPEND Caffe_DEFINITIONS -DUSE_NCCL)
86+
endif()
87+
8488
configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY)
8589

8690
# Add targets to the build-tree export set

cmake/Dependencies.cmake

+16
Original file line numberDiff line numberDiff line change
@@ -170,3 +170,19 @@ endif()
170170
if(BUILD_docs)
171171
find_package(Doxygen)
172172
endif()
173+
174+
# ---[ NCCL
175+
if(DEFINED USE_NCCL)
176+
if(${USE_NCCL} STREQUAL "ON" AND NOT CPU_ONLY)
177+
find_package(NCCL REQUIRED)
178+
endif()
179+
else()
180+
if(NOT CPU_ONLY)
181+
find_package(NCCL)
182+
endif()
183+
endif()
184+
if(NCCL_FOUND)
185+
add_definitions(-DUSE_NCCL)
186+
include_directories(SYSTEM ${NCCL_INCLUDE})
187+
list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES})
188+
endif()

cmake/Modules/FindNCCL.cmake

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Find the NCCL libraries
2+
#
3+
# The following variables are optionally searched for defaults
4+
# NCCL_ROOT_DIR: Base directory where all NCCL components are found
5+
#
6+
# The following are set after configuration is done:
7+
# NCCL_FOUND
8+
# NCCL_INCLUDE_DIR
9+
# NCCL_LIBRARIES
10+
11+
find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_ROOT_DIR})
12+
13+
find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_ROOT_DIR})
14+
15+
include(FindPackageHandleStandardArgs)
16+
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)
17+
18+
if(NCCL_FOUND)
19+
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
20+
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
21+
endif()
22+

cmake/Summary.cmake

+7
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,13 @@ function(caffe_print_configuration_summary)
146146
else()
147147
caffe_status(" cuDNN : Disabled")
148148
endif()
149+
if(NOT DEFINED USE_NCCL)
150+
caffe_status(" NCCL : " NCCL_FOUND THEN "Yes" ELSE "Not found")
151+
elseif(${USE_NCCL} STREQUAL "ON")
152+
caffe_status(" NCCL : " NCCL_FOUND THEN "Yes" ELSE "Not found")
153+
else()
154+
caffe_status(" NCCL : Disabled")
155+
endif()
149156
caffe_status("")
150157
endif()
151158
if(HAVE_PYTHON)

cmake/Templates/caffe_config.h.in

+7-3
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,19 @@
44
/* Binaries directory */
55
#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"
66

7-
/* NVIDA Cuda */
7+
/* NVIDIA Cuda */
88
#cmakedefine HAVE_CUDA
99

10-
/* NVIDA cuDNN */
10+
/* NVIDIA cuDNN */
1111
#cmakedefine HAVE_CUDNN
1212
#cmakedefine USE_CUDNN
1313

14-
/* NVIDA cuDNN */
14+
/* NVIDIA cuDNN */
1515
#cmakedefine CPU_ONLY
16+
if(NCCL_FOUND)
17+
#cmakedefine USE_NCCL
18+
endif()
19+
1620

1721
/* Test device */
1822
#define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}

include/caffe/net.hpp

+12
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414

1515
namespace caffe {
1616

17+
template <typename Dtype>
18+
class Solver;
19+
1720
/**
1821
* @brief Connects Layer%s together into a directed acyclic graph (DAG)
1922
* specified by a NetParameter.
@@ -227,6 +230,11 @@ class Net {
227230
static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
228231
const string& layer_name);
229232

233+
/// @brief set a Solver for this net
234+
void SetSolver(Solver<Dtype>* s) {
235+
solver_ = s;
236+
}
237+
230238
protected:
231239
// Helpers for Init.
232240
/// @brief Append a new top blob to the net.
@@ -278,6 +286,8 @@ class Net {
278286
vector<int> param_owners_;
279287
vector<string> param_display_names_;
280288
vector<pair<int, int> > param_layer_indices_;
289+
/// (layer, blob) -> param_id map
290+
map<pair<int, int>, int> layer_index_params_;
281291
map<string, int> param_names_index_;
282292
/// blob indices for the input and the output of the net
283293
vector<int> net_input_blob_indices_;
@@ -307,6 +317,8 @@ class Net {
307317
bool debug_info_;
308318
/// The root net that actually holds the shared layers in data parallelism
309319
const Net* const root_net_;
320+
/// Pointer to the solver being used with this net
321+
Solver<Dtype>* solver_;
310322
DISABLE_COPY_AND_ASSIGN(Net);
311323
};
312324

include/caffe/parallel.hpp

+37-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
#include "caffe/syncedmem.hpp"
1515
#include "caffe/util/blocking_queue.hpp"
1616

17+
#ifdef USE_NCCL
18+
#include "caffe/util/nccl.hpp"
19+
#endif
20+
1721
namespace caffe {
1822

1923
// Represents a net parameters. Once a net is created, its parameter buffers can
@@ -89,7 +93,7 @@ class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
8993
public InternalThread {
9094
public:
9195
explicit P2PSync(shared_ptr<Solver<Dtype> > root_solver,
92-
P2PSync<Dtype>* parent, const SolverParameter& param);
96+
int rank, int nranks, const SolverParameter& param);
9397
virtual ~P2PSync();
9498

9599
inline const shared_ptr<Solver<Dtype> >& solver() const {
@@ -104,18 +108,47 @@ class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
104108
// Divide the batch size by the number of solvers
105109
static void divide_batch_size(NetParameter* net);
106110

111+
#ifdef USE_NCCL
112+
// set the NCCL communicator
113+
void setNCCLComm(ncclComm_t comm);
114+
#endif
115+
116+
public:
117+
void allreduce(int param_id);
118+
void syncCommStream();
119+
107120
protected:
121+
void SetupP2PAccess();
122+
void soft_barrier();
108123
void on_start();
109-
void on_gradients_ready();
110-
124+
void allreduce();
125+
void syncAllStreams();
126+
#ifndef CPU_ONLY
127+
#ifdef USE_NCCL
128+
ncclComm_t getNCCLComm();
129+
#endif
130+
cudaStream_t getCommStream();
131+
#endif
111132
void InternalThreadEntry();
112133

134+
const int rank_;
135+
const int nranks_;
113136
P2PSync<Dtype>* parent_;
114137
vector<P2PSync<Dtype>*> children_;
138+
#ifndef CPU_ONLY
139+
#ifdef USE_NCCL
140+
std::vector<ncclComm_t> nccl_comms_;
141+
#endif
142+
vector<cudaStream_t> comm_streams_;
143+
#endif
115144
BlockingQueue<P2PSync<Dtype>*> queue_;
116145
const int initial_iter_;
117-
Dtype* parent_grads_;
146+
118147
shared_ptr<Solver<Dtype> > solver_;
148+
const SolverParameter& params_;
149+
150+
// per-parameter reduction enabled
151+
bool per_parameter_reduce_;
119152

120153
using Params<Dtype>::size_;
121154
using Params<Dtype>::data_;

include/caffe/sgd_solvers.hpp

+39-23
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ namespace caffe {
1515
template <typename Dtype>
1616
class SGDSolver : public Solver<Dtype> {
1717
public:
18-
explicit SGDSolver(const SolverParameter& param)
19-
: Solver<Dtype>(param) { PreSolve(); }
20-
explicit SGDSolver(const string& param_file)
18+
explicit SGDSolver(const SolverParameter& param,
19+
Solver<Dtype> *root_solver = NULL)
20+
: Solver<Dtype>(param, root_solver) { PreSolve(); }
21+
explicit SGDSolver(const string& param_file,
22+
Solver<Dtype> *root_solver = NULL)
2123
: Solver<Dtype>(param_file) { PreSolve(); }
2224
virtual inline const char* type() const { return "SGD"; }
2325

@@ -48,10 +50,12 @@ class SGDSolver : public Solver<Dtype> {
4850
template <typename Dtype>
4951
class NesterovSolver : public SGDSolver<Dtype> {
5052
public:
51-
explicit NesterovSolver(const SolverParameter& param)
52-
: SGDSolver<Dtype>(param) {}
53-
explicit NesterovSolver(const string& param_file)
54-
: SGDSolver<Dtype>(param_file) {}
53+
explicit NesterovSolver(const SolverParameter& param,
54+
Solver<Dtype> *root_solver = NULL)
55+
: SGDSolver<Dtype>(param, root_solver) {}
56+
explicit NesterovSolver(const string& param_file,
57+
Solver<Dtype> *root_solver = NULL)
58+
: SGDSolver<Dtype>(param_file, root_solver) {}
5559
virtual inline const char* type() const { return "Nesterov"; }
5660

5761
protected:
@@ -63,10 +67,14 @@ class NesterovSolver : public SGDSolver<Dtype> {
6367
template <typename Dtype>
6468
class AdaGradSolver : public SGDSolver<Dtype> {
6569
public:
66-
explicit AdaGradSolver(const SolverParameter& param)
67-
: SGDSolver<Dtype>(param) { constructor_sanity_check(); }
68-
explicit AdaGradSolver(const string& param_file)
69-
: SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
70+
explicit AdaGradSolver(const SolverParameter& param,
71+
Solver<Dtype> *root_solver = NULL)
72+
: SGDSolver<Dtype>(param, root_solver)
73+
{ constructor_sanity_check(); }
74+
explicit AdaGradSolver(const string& param_file,
75+
Solver<Dtype> *root_solver = NULL)
76+
: SGDSolver<Dtype>(param_file, root_solver)
77+
{ constructor_sanity_check(); }
7078
virtual inline const char* type() const { return "AdaGrad"; }
7179

7280
protected:
@@ -83,10 +91,14 @@ class AdaGradSolver : public SGDSolver<Dtype> {
8391
template <typename Dtype>
8492
class RMSPropSolver : public SGDSolver<Dtype> {
8593
public:
86-
explicit RMSPropSolver(const SolverParameter& param)
87-
: SGDSolver<Dtype>(param) { constructor_sanity_check(); }
88-
explicit RMSPropSolver(const string& param_file)
89-
: SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
94+
explicit RMSPropSolver(const SolverParameter& param,
95+
Solver<Dtype> *root_solver = NULL)
96+
: SGDSolver<Dtype>(param, root_solver)
97+
{ constructor_sanity_check(); }
98+
explicit RMSPropSolver(const string& param_file,
99+
Solver<Dtype> *root_solver = NULL)
100+
: SGDSolver<Dtype>(param_file, root_solver)
101+
{ constructor_sanity_check(); }
90102
virtual inline const char* type() const { return "RMSProp"; }
91103

92104
protected:
@@ -106,10 +118,12 @@ class RMSPropSolver : public SGDSolver<Dtype> {
106118
template <typename Dtype>
107119
class AdaDeltaSolver : public SGDSolver<Dtype> {
108120
public:
109-
explicit AdaDeltaSolver(const SolverParameter& param)
110-
: SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
111-
explicit AdaDeltaSolver(const string& param_file)
112-
: SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }
121+
explicit AdaDeltaSolver(const SolverParameter& param,
122+
Solver<Dtype> *root_solver = NULL)
123+
: SGDSolver<Dtype>(param, root_solver) { AdaDeltaPreSolve(); }
124+
explicit AdaDeltaSolver(const string& param_file,
125+
Solver<Dtype> *root_solver = NULL)
126+
: SGDSolver<Dtype>(param_file, root_solver) { AdaDeltaPreSolve(); }
113127
virtual inline const char* type() const { return "AdaDelta"; }
114128

115129
protected:
@@ -130,10 +144,12 @@ class AdaDeltaSolver : public SGDSolver<Dtype> {
130144
template <typename Dtype>
131145
class AdamSolver : public SGDSolver<Dtype> {
132146
public:
133-
explicit AdamSolver(const SolverParameter& param)
134-
: SGDSolver<Dtype>(param) { AdamPreSolve();}
135-
explicit AdamSolver(const string& param_file)
136-
: SGDSolver<Dtype>(param_file) { AdamPreSolve(); }
147+
explicit AdamSolver(const SolverParameter& param,
148+
Solver<Dtype> *root_solver = NULL)
149+
: SGDSolver<Dtype>(param, root_solver) { AdamPreSolve();}
150+
explicit AdamSolver(const string& param_file,
151+
Solver<Dtype> *root_solver = NULL)
152+
: SGDSolver<Dtype>(param_file, root_solver) { AdamPreSolve(); }
137153
virtual inline const char* type() const { return "Adam"; }
138154

139155
protected:

include/caffe/solver.hpp

+11-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "caffe/net.hpp"
88
#include "caffe/solver_factory.hpp"
9+
#include "caffe/util/benchmark.hpp"
910

1011
namespace caffe {
1112

@@ -76,9 +77,14 @@ class Solver {
7677

7778
// Invoked at specific points during an iteration
7879
class Callback {
80+
public:
81+
virtual void allreduce(int param_id) = 0;
82+
virtual void syncCommStream() = 0;
83+
7984
protected:
8085
virtual void on_start() = 0;
81-
virtual void on_gradients_ready() = 0;
86+
virtual void allreduce() = 0;
87+
virtual void soft_barrier() = 0;
8288

8389
template <typename T>
8490
friend class Solver;
@@ -129,6 +135,10 @@ class Solver {
129135
// True iff a request to stop early was received.
130136
bool requested_early_exit_;
131137

138+
// Timing information
139+
Timer iteration_timer_;
140+
float iterations_last_;
141+
132142
DISABLE_COPY_AND_ASSIGN(Solver);
133143
};
134144

0 commit comments

Comments
 (0)