
Commit 3e31492

Fixed issue with redundant memory allocations/deallocations
1 parent 5fb4c72 commit 3e31492

6 files changed: +107 -73 lines changed

include/caffe/layer.hpp

+1 -1

@@ -341,7 +341,7 @@ class Layer {
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
     // LOG(WARNING) << "Using CPU code as backup.";
-    return Forward_cpu(bottom, top);
+    Forward_cpu(bottom, top);
   }

   /**

include/caffe/layers/cudnn_conv_layer.hpp

+35 -1

@@ -31,6 +31,23 @@ namespace caffe {
  */
 template <typename Dtype>
 class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
+  // In iteration 0, use a small amount of memory in order to leave
+  // most of memory for allocating layer blobs.
+  // NOLINT_NEXT_LINE(build/storage_class)
+  const static size_t INITIAL_WORKSPACE_SIZE;
+  // Use 95% of available memory.
+  // Using all of memory may result in failure of workspace.reserve.
+  // NOLINT_NEXT_LINE(build/storage_class)
+  const static float MAX_WORKSPACE_RATIO;
+  // We update it on the second Fwd/Bwd pass and we allocate it *once*
+  // when we start the third pass. We might recompute it later if demand grows
+  // and/or we suddenly need extra memory for other needs.
+  static size_t WORKSPACE_SIZE;
+  // This is the workspace used by all Convolution layers, one after another.
+  // We keep it global to prevent the unnecessary allocations/deallocations
+  // that hurt performance.
+  static GPUMemory::Workspace WORKSPACE;
+
  public:
   explicit CuDNNConvolutionLayer(const LayerParameter& param)
       : ConvolutionLayer<Dtype>(param), handles_setup_(false),

@@ -64,7 +81,6 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;
-  GPUMemory::Workspace workspace;

  private:
   bool use_algo_seeker_;

@@ -85,7 +101,25 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {

   bool use_reshape_;
   bool initialized_cached_descs_;
+
+  // Updates the current workspace *demand*; it might not be allocated yet.
+  void UpdateWorkspaceDemand(int size);
+
 };
+
+template<typename Dtype>
+size_t CuDNNConvolutionLayer<Dtype>::WORKSPACE_SIZE = 0UL;
+
+template<typename Dtype>
+const size_t CuDNNConvolutionLayer<Dtype>::INITIAL_WORKSPACE_SIZE =
+    4*1024*1024;
+
+template<typename Dtype>
+GPUMemory::Workspace CuDNNConvolutionLayer<Dtype>::WORKSPACE;
+
+template<typename Dtype>
+const float CuDNNConvolutionLayer<Dtype>::MAX_WORKSPACE_RATIO = 0.95F;
+
 #endif

 } // namespace caffe

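The pattern this header introduces can be summarized outside of Caffe. The sketch below is illustrative only (the class name and the std::vector-backed buffer are assumptions, not the GPUMemory::Workspace implementation): every instance of the template shares one static buffer that is grown to the largest demand recorded so far, so repeated passes reuse a single allocation instead of allocating and freeing per layer.

    // Minimal sketch; a host-side std::vector stands in for the GPU buffer.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    template <typename Dtype>
    class SharedWorkspaceUser {
     public:
      // Record a new requirement; nothing is allocated yet.
      static void UpdateDemand(std::size_t bytes) {
        demand_ = std::max(demand_, bytes);
      }
      // Grow the shared buffer once, right before it is needed.
      static char* Reserve() {
        if (buffer_.size() < demand_) buffer_.resize(demand_);
        return buffer_.data();
      }
     private:
      static std::size_t demand_;            // current *demand*, may exceed capacity
      static std::vector<char> buffer_;      // one buffer shared by all instances
    };

    template <typename Dtype> std::size_t SharedWorkspaceUser<Dtype>::demand_ = 0;
    template <typename Dtype> std::vector<char> SharedWorkspaceUser<Dtype>::buffer_;

Because the members are static and keyed by the template parameter, all layers of a given Dtype see the same buffer, which mirrors how WORKSPACE and WORKSPACE_SIZE are used in this commit.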
include/caffe/util/gpu_memory.hpp

+8 -2

@@ -19,7 +19,9 @@ struct GPUMemory {
   template <class Any>
   static void allocate(Any** ptr, size_t size,
       cudaStream_t stream = cudaStreamDefault) {
-    CHECK(try_allocate(reinterpret_cast<void**>(ptr), size, stream));
+    if (!try_allocate(reinterpret_cast<void**>(ptr), size, stream)) {
+      LOG(FATAL) << "Out of memory: failed to allocate " << size << " bytes";
+    }
   }

   static void deallocate(void* ptr,

@@ -74,7 +76,11 @@ struct GPUMemory {
       return status;
     }

-    void reserve(size_t size) { CHECK(try_reserve(size)); }
+    void reserve(size_t size) {
+      if (!try_reserve(size)) {
+        LOG(FATAL) << "Out of memory: failed to allocate " << size << " bytes";
+      }
+    }

     void release() {
       if (mgr_.using_pool()) {

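A self-contained sketch of the error-handling split used above (the function names below are illustrative, not Caffe's): the try_ variant reports failure so the caller can fall back, while the plain variant aborts after logging the requested byte count, which is the behavior the LOG(FATAL) change adds.

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    bool try_reserve_bytes(void** ptr, std::size_t size) {
      *ptr = std::malloc(size);        // stand-in for the GPU allocation
      return *ptr != nullptr;          // caller decides how to recover
    }

    void reserve_bytes(void** ptr, std::size_t size) {
      if (!try_reserve_bytes(ptr, size)) {
        std::fprintf(stderr, "Out of memory: failed to allocate %zu bytes\n", size);
        std::abort();                  // mirrors LOG(FATAL): no recovery possible
      }
    }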
src/caffe/layers/cudnn_conv_layer.cpp

+32 -19

@@ -90,6 +90,8 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   use_reshape_ = true;
   // When true, cached bottom and conv descriptors need to be set.
   initialized_cached_descs_ = false;
+  // In case of reusing it
+  WORKSPACE.release();
 }

 template <typename Dtype>

@@ -179,17 +181,15 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   if (use_modest_workspace_) {
     // In iteration 0, use a small amount of memory in order to leave
     // most of memory for allocating layer blobs.
-    // TODO: Read 8*1024*1024 from a data member variable.
-    workspace_bytes = 8*1024*1024;
+    workspace_bytes = INITIAL_WORKSPACE_SIZE;
   } else {
-    // Use 90% of available memory.
+    // Use 95% of available memory.
     // Using all of memory may result in failure of workspace.reserve.
-    // TODO: Since 90% of memory might be too large, we can allocate
+    // TODO: Since 95% of memory might be too large, we can allocate
     // exactly how much FindEx needs by taking the maximum
     // workspace among all algorithms (requires an initial call
     // to FindEx with workspace size 0).
-    // TODO: Read 0.9 from a data member variable.
-    workspace_bytes = workspace_limit_bytes * 0.9;
+    workspace_bytes = workspace_limit_bytes * MAX_WORKSPACE_RATIO;
     // Avoid seeking for an algorithm in subsequent iterations
     use_algo_seeker_ = false;
   }

@@ -233,8 +233,9 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
     CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
         Caffe::cudnn_handle(),
         filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
-        bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
+        bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]));
   }
+  UpdateWorkspaceDemand(bottom.size());  // update WORKSPACE_SIZE

   // Tensor descriptor for bias.
   if (this->bias_term_) {

@@ -292,11 +293,7 @@ void CuDNNConvolutionLayer<Dtype>::FindExConvAlgo(
   void *tmp_weights;
   const int tmp_weights_size = sizeof(Dtype) * weight_offset_;
   GPUMemory::allocate(&tmp_weights, tmp_weights_size);
-
-  // TODO: Try reducing workspace_bytes if it fails.
-  // In case, workspace_bytes is 90% of available memory,
-  // reduce it to 75%; if it fails again, reduce it to 50% and so on.
-  workspace.reserve(workspace_bytes);
+  WORKSPACE.reserve(workspace_bytes);

   for (int i = 0; i < bottom.size(); i++) {
     // Find forward algorithm

@@ -312,8 +309,8 @@ void CuDNNConvolutionLayer<Dtype>::FindExConvAlgo(
         kRequestAlgoCount,
         &fwd_algo_count,
         fwd_results,
-        workspace.data(),
-        workspace.size()));
+        WORKSPACE.data(),
+        WORKSPACE.size()));
     fwd_algo_[i] = fwd_results[0].algo;
     workspace_fwd_sizes_[i] = fwd_results[0].memory;


@@ -332,8 +329,8 @@ void CuDNNConvolutionLayer<Dtype>::FindExConvAlgo(
         kRequestAlgoCount,
         &filter_algo_count,
         bwd_filter_results,
-        workspace.data(),
-        workspace.size()));
+        WORKSPACE.data(),
+        WORKSPACE.size()));
     bwd_filter_algo_[i] = bwd_filter_results[0].algo;
     workspace_bwd_filter_sizes_[i] = bwd_filter_results[0].memory;


@@ -350,15 +347,14 @@ void CuDNNConvolutionLayer<Dtype>::FindExConvAlgo(
         kRequestAlgoCount,
         &data_algo_count,
         bwd_data_results,
-        workspace.data(),
-        workspace.size()));
+        WORKSPACE.data(),
+        WORKSPACE.size()));

     bwd_data_algo_[i] = bwd_data_results[0].algo;
     workspace_bwd_data_sizes_[i] = bwd_data_results[0].memory;
     }
   }
   GPUMemory::deallocate(tmp_weights);
-  workspace.release();
 }
 #endif


@@ -453,8 +449,25 @@ bool CuDNNConvolutionLayer<Dtype>::IsConvDescChanged(
   return false;
 }

+template <typename Dtype>
+void CuDNNConvolutionLayer<Dtype>::UpdateWorkspaceDemand(int size) {
+  // Updating the maximum WORKSPACE_SIZE
+  for (int i = 0; i < size; ++i) {
+    if (workspace_fwd_sizes_[i] > WORKSPACE_SIZE) {
+      WORKSPACE_SIZE = workspace_fwd_sizes_[i];
+    }
+    if (workspace_bwd_filter_sizes_[i] > WORKSPACE_SIZE) {
+      WORKSPACE_SIZE = workspace_bwd_filter_sizes_[i];
+    }
+    if (workspace_bwd_data_sizes_[i] > WORKSPACE_SIZE) {
+      WORKSPACE_SIZE = workspace_bwd_data_sizes_[i];
+    }
+  }
+}
+
 template <typename Dtype>
 CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
+  WORKSPACE.release();
   // Check that handles have been setup before destroying.
   if (!handles_setup_) { return; }

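A hedged worked example of what UpdateWorkspaceDemand computes (the sizes are made up for illustration): WORKSPACE_SIZE ends up as the running maximum over every layer's forward, backward-filter, and backward-data workspace requirements, so one shared buffer of that size can serve any of them.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main() {
      // Hypothetical requirements for two convolution layers, in bytes.
      const std::size_t fwd[]        = {12u << 20,  8u << 20};
      const std::size_t bwd_filter[] = {20u << 20, 40u << 20};
      const std::size_t bwd_data[]   = {16u << 20, 24u << 20};
      std::size_t workspace_size = 0;  // running maximum, as in UpdateWorkspaceDemand
      for (int i = 0; i < 2; ++i) {
        workspace_size = std::max({workspace_size, fwd[i], bwd_filter[i], bwd_data[i]});
      }
      std::printf("shared workspace demand: %zu MiB\n", workspace_size >> 20);  // prints 40
      return 0;
    }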
src/caffe/layers/cudnn_conv_layer.cu

+13 -31

@@ -20,20 +20,14 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();

-    // Test free space and force reshape if allocations have changed
-    size_t workspace_limit_bytes, total_memory;
-    GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
-    if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
-      use_algo_seeker_ = true;
-      this->Reshape(bottom, top);
-    }
     // Sometimes closer to zero we might have memory info diverged from reality
     // If try_reserve fails, it updates the info internally and we proceed with
     // Reshape one more time
-    if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+    // Note: if WORKSPACE_SIZE is already allocated next line does nothing.
+    if (!WORKSPACE.try_reserve(WORKSPACE_SIZE)) {
       use_algo_seeker_ = true;
       this->Reshape(bottom, top);
-      workspace.reserve(workspace_fwd_sizes_[i]);
+      WORKSPACE.reserve(WORKSPACE_SIZE);
     }

     // Forward through cuDNN in parallel over groups.

@@ -44,7 +38,7 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
           bottom_descs_[i], bottom_data + bottom_offset_ * g,
           filter_desc_, weight + this->weight_offset_ * g,
           conv_descs_[i],
-          fwd_algo_[i], workspace.data(), workspace.size(),
+          fwd_algo_[i], WORKSPACE.data(), WORKSPACE.size(),
           cudnn::dataType<Dtype>::zero,
           top_descs_[i], top_data + top_offset_ * g));


@@ -59,14 +53,11 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
       }
     }

-    workspace.release();
     // Synchronize the work across groups, each of which went into its own
     // stream, by launching an empty kernel into the default (null) stream.
     // NOLINT_NEXT_LINE(whitespace/operators)
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
   }
-  // Possibly use faster algorithms by allowing larger workspace.
-  use_modest_workspace_ = false;
 }

 template<typename Dtype>

@@ -84,25 +75,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
-    // Test free space and force reshape if allocations have changed
-    size_t workspace_limit_bytes, total_memory;
-    GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
-    if (workspace_bwd_filter_sizes_[i] > workspace_limit_bytes ||
-        workspace_bwd_data_sizes_[i] > workspace_limit_bytes) {
-      use_algo_seeker_ = true;
-      this->Reshape(bottom, top);
-    }
-    // To remove pressure on allocator, allocate the larger of the
-    // workspaces needed for the following steps
+
     // Sometimes closer to zero we might have memory info diverged from reality
     // If try_reserve fails, it updates the info internally and we proceed with
-    // Reshape one more time
-    if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
-                                        workspace_bwd_data_sizes_[i]))) {
+    // Reshape one more time.
+    // Note: if WORKSPACE_SIZE is already allocated next line does nothing.
+    if (!WORKSPACE.try_reserve(WORKSPACE_SIZE)) {
       use_algo_seeker_ = true;
       this->Reshape(bottom, top);
-      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-                                 workspace_bwd_data_sizes_[i]));
+      WORKSPACE.reserve(WORKSPACE_SIZE);
     }

     // Backward through cuDNN in parallel over groups and gradients.

@@ -123,7 +104,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
             bottom_descs_[i], bottom_data + bottom_offset_ * g,
             top_descs_[i], top_diff + top_offset_ * g,
             conv_descs_[i],
-            bwd_filter_algo_[i], workspace.data(), workspace.size(),
+            bwd_filter_algo_[i], WORKSPACE.data(), WORKSPACE.size(),
             cudnn::dataType<Dtype>::one,
             filter_desc_, weight_diff + this->weight_offset_ * g));
       }

@@ -138,18 +119,19 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
             filter_desc_, weight + this->weight_offset_ * g,
             top_descs_[i], top_diff + top_offset_ * g,
             conv_descs_[i],
-            bwd_data_algo_[i], workspace.data(), workspace.size(),
+            bwd_data_algo_[i], WORKSPACE.data(), WORKSPACE.size(),
             cudnn::dataType<Dtype>::zero,
             bottom_descs_[i], bottom_diff + bottom_offset_ * g));
       }
     }

-    workspace.release();
     // Synchronize the work across groups, each of which went into its own
     // stream, by launching an empty kernel into the default (null) stream.
     // NOLINT_NEXT_LINE(whitespace/operators)
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
   }
+  // Possibly use faster algorithms by allowing larger workspace.
+  use_modest_workspace_ = false;
 }

 INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer);

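The per-iteration try_reserve(WORKSPACE_SIZE) call above stays cheap because reserving a size that already fits does nothing. A minimal host-side sketch of that behavior (illustrative only; not the GPUMemory::Workspace implementation, which manages device memory):

    #include <cstddef>
    #include <cstdlib>

    struct LazyWorkspace {
      void*       ptr = nullptr;
      std::size_t capacity = 0;

      // Returns false on allocation failure so the caller can fall back
      // (here, CuDNNConvolutionLayer re-runs Reshape and tries again).
      bool try_reserve(std::size_t size) {
        if (size <= capacity) return true;   // already big enough: no-op
        void* p = std::realloc(ptr, size);   // stand-in for a device allocation
        if (p == nullptr) return false;
        ptr = p;
        capacity = size;
        return true;
      }

      void release() {
        std::free(ptr);
        ptr = nullptr;
        capacity = 0;
      }
    };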
tools/caffe.cpp

+18 -19

@@ -357,6 +357,7 @@ int time() {
   // Do a number of clean forward and backward pass,
   // so that memory allocation are done,
   // and future iterations will be more stable.
+  Timer init_timer;
   Timer forward_timer;
   Timer backward_timer;
   double forward_time = 0.0;

@@ -365,30 +366,28 @@ int time() {
   LOG(INFO) << "Initialization for " << kInitIterations << " iterations.";
   // Note that for the speed benchmark, we will assume that the network does
   // not take any input blobs.
-  LOG(INFO) << "Performing Forward";
-  float initial_loss;
-  forward_timer.Start();
-  for (int j = 0; j < kInitIterations; ++j) {
-    caffe_net.Forward(&initial_loss);
-  }
-  forward_time += forward_timer.MicroSeconds();
-  LOG(INFO) << "Initial loss: " << initial_loss;
-  LOG(INFO) << "Performing Backward";
-  backward_timer.Start();
-  for (int j = 0; j < kInitIterations; ++j) {
-    caffe_net.Backward();
-  }
-  backward_time += backward_timer.MicroSeconds();
-  LOG(INFO) << "Average Initialization Forward pass: " << forward_time /
-      1000 / kInitIterations << " ms.";
-  LOG(INFO) << "Average Initialization Backward pass: " << backward_time /
-      1000 / kInitIterations << " ms.";
-
+  LOG(INFO) << "Performing initial Forward/Backward";
   const vector<shared_ptr<Layer<float> > >& layers = caffe_net.layers();
   const vector<vector<Blob<float>*> >& bottom_vecs = caffe_net.bottom_vecs();
   const vector<vector<Blob<float>*> >& top_vecs = caffe_net.top_vecs();
   const vector<vector<bool> >& bottom_need_backward =
       caffe_net.bottom_need_backward();
+  float initial_loss = 0.F;
+  init_timer.Start();
+  for (int j = 0; j < kInitIterations; ++j) {
+    for (int i = 0; i < layers.size(); ++i) {
+      initial_loss += layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
+    }
+    for (int i = layers.size() - 1; i >= 0; --i) {
+      layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
+                          bottom_vecs[i]);
+    }
+  }
+  double init_time = init_timer.MilliSeconds();
+  LOG(INFO) << "Initial Forward/Backward complete, loss: " << initial_loss;
+  LOG(INFO) << "Average Initialization Forward/Backward pass: " << init_time /
+      kInitIterations << " ms.";
+
   LOG(INFO) << "*** Benchmark begins ***";
   LOG(INFO) << "Testing for " << FLAGS_iterations << " iterations.";
   Timer total_timer;

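The benchmarking change folds the warm-up Forward and Backward passes into one timed initialization phase. A generic sketch of that pattern with no Caffe dependency (the helper below is an assumption for illustration, not part of the commit):

    #include <chrono>
    #include <cstdio>

    template <typename F>
    double timed_ms(int iterations, F&& one_pass) {
      const auto start = std::chrono::steady_clock::now();
      for (int j = 0; j < iterations; ++j) one_pass();
      const auto stop = std::chrono::steady_clock::now();
      return std::chrono::duration<double, std::milli>(stop - start).count();
    }

    int main() {
      const int kInitIterations = 5;
      // One warm-up pass = forward over all layers, then backward in reverse,
      // so one-time allocations (such as the shared cuDNN workspace) happen
      // before the measured benchmark iterations.
      const double init_ms = timed_ms(kInitIterations, [] { /* forward + backward */ });
      std::printf("Average Initialization Forward/Backward pass: %.3f ms.\n",
                  init_ms / kInitIterations);
      return 0;
    }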