
Commit d988833

Merge pull request #153 from drnikolaev/caffe-0.15-oom
Thanks @lukeyeager for reviewing the code. This PR will be followed by another one implementing a better memory distribution algorithm for the cuDNN Convolution Layer (as per our discussion).
2 parents cf75318 + 1d4882b commit d988833

File tree

3 files changed (+45, -21 lines)


include/caffe/util/gpu_memory.hpp

+16, -4
@@ -58,14 +58,22 @@ class GPUMemoryManager {
   }
 
   // Memory allocation/release
-  void reserve(size_t size) {
+  bool try_reserve(size_t size) {
+    bool status = true;
     if (size > size_) {
       if (ptr_) {
         GPUMemoryManager::deallocate(ptr_, stream_);
       }
-      GPUMemoryManager::allocate(&ptr_, size, stream_);
-      size_ = size;
+      status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+      if (status) {
+        size_ = size;
+      }
     }
+    return status;
+  }
+
+  void reserve(size_t size) {
+    CHECK(try_reserve(size));
   }
 
 /*
@@ -111,8 +119,12 @@ class GPUMemoryManager {
 
  public:
   typedef void* pointer;
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
       cudaStreamDefault);
+  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+      cudaStreamDefault) {
+    CHECK(try_allocate(ptr, size, stream));
+  }
   static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
   static void GetInfo(size_t* free_mem, size_t* used_mem);
 
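
The header change splits reservation into a recoverable path and a fail-fast path: try_reserve reports failure to the caller, while reserve keeps the old aborting behavior by CHECK-ing it. Below is a minimal standalone sketch of the same grow-only buffer pattern; Buffer, the malloc-based allocator, and assert are illustrative stand-ins for the real GPUMemoryManager, the CUB pool, and glog's CHECK, not the PR's code.

#include <cassert>
#include <cstdlib>

// Illustrative stand-ins for GPUMemoryManager::try_allocate / deallocate,
// which take a CUDA stream in the real code.
static bool try_allocate(void** ptr, size_t size) {
  *ptr = std::malloc(size);        // may fail and yield a null pointer
  return *ptr != nullptr;
}
static void deallocate(void* ptr) { std::free(ptr); }

class Buffer {
 public:
  // Grow-only reservation that reports failure instead of aborting.
  bool try_reserve(size_t size) {
    bool status = true;
    if (size > size_) {
      if (ptr_) {
        deallocate(ptr_);          // release the smaller block first
      }
      status = try_allocate(&ptr_, size);
      if (status) {
        size_ = size;              // commit capacity only on success
      }
    }
    return status;
  }
  // Fail-fast wrapper, mirroring reserve() -> CHECK(try_reserve(size)).
  void reserve(size_t size) { assert(try_reserve(size)); }

 private:
  void* ptr_ = nullptr;
  size_t size_ = 0;
};

int main() {
  Buffer ws;
  if (!ws.try_reserve(1 << 20)) {
    // the caller can recover here, e.g. free caches and retry
  }
  ws.reserve(1 << 10);             // or demand success outright
}

Note that a failed grow leaves the buffer without its old block, since the smaller allocation is released before the larger one is attempted; a false return means the previous contents are gone.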

src/caffe/layers/cudnn_conv_layer.cu

+16, -3
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
       this->Reshape(bottom, top);
     }
-    workspace.reserve(workspace_fwd_sizes_[i]);
+    // Near zero free memory, the cached memory info may have diverged
+    // from reality. If try_reserve fails, it refreshes that info
+    // internally, so we run Reshape once more and retry the reservation.
+    if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+      this->Reshape(bottom, top);
+      workspace.reserve(workspace_fwd_sizes_[i]);
+    }
 
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
     // To remove pressure on allocator, allocate the larger of the
     // workspaces needed for the following steps
-    workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-        workspace_bwd_data_sizes_[i]));
+    // Near zero free memory, the cached memory info may have diverged
+    // from reality. If try_reserve fails, it refreshes that info
+    // internally, so we run Reshape once more and retry the reservation.
+    if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
+        workspace_bwd_data_sizes_[i]))) {
+      this->Reshape(bottom, top);
+      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
+          workspace_bwd_data_sizes_[i]));
+    }
 
     // Backward through cuDNN in parallel over groups and gradients.
     for (int g = 0; g < this->group_; g++) {
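
Both passes now use the same two-step recovery: attempt the reservation, and only if it fails run Reshape again so the layer can re-pick cuDNN algorithms against the refreshed memory info, then insist on the (now smaller) workspace. A condensed, runnable sketch of that control flow; the budget, sizes, and Reshape behavior here are faked for illustration and are not the layer's actual code.

#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Fake numbers standing in for the layer's workspace bookkeeping.
static size_t bwd_filter_size = 64, bwd_data_size = 96;
static size_t budget = 80;  // pretend the device can only grant this much

struct Workspace {
  size_t size = 0;
  bool try_reserve(size_t s) {   // succeeds only within the fake budget
    if (s <= budget) { size = s; return true; }
    return false;
  }
  void reserve(size_t s) {       // fail-fast variant (CHECK in the real code)
    if (!try_reserve(s)) std::abort();
  }
};

static void Reshape() {
  // In the layer, Reshape re-selects cuDNN algorithms under the refreshed
  // memory info; here we just pretend cheaper algorithms were chosen.
  bwd_filter_size = 32;
  bwd_data_size = 48;
}

int main() {
  Workspace ws;
  if (!ws.try_reserve(std::max(bwd_filter_size, bwd_data_size))) {
    Reshape();                   // shrink the requirement near OOM...
    ws.reserve(std::max(bwd_filter_size, bwd_data_size));  // ...then insist
  }
  std::printf("reserved %zu bytes\n", ws.size);
}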

src/caffe/util/gpu_memory.cpp

+13, -14
@@ -56,19 +56,21 @@ void GPUMemoryManager::destroy() {
   mode_ = NO_POOL;
 }
 
-void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
+bool GPUMemoryManager::try_allocate(void** ptr, size_t size,
+    cudaStream_t stream) {
   CHECK((ptr) != NULL);
+  cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_POOL:
-    if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
+    // The clean-cache-and-retry logic now lives inside DeviceAllocate.
+    status = cub_allocator->DeviceAllocate(ptr, size, stream);
+    // If an internal retry succeeded, status is good here, but we still
+    // need to clear the sticky last error...
+    last_err = cudaGetLastError();
+    // ...and refresh the device info if anything went wrong.
+    if (status != cudaSuccess || last_err != cudaSuccess) {
       int cur_device;
       CUDA_CHECK(cudaGetDevice(&cur_device));
-      // free all cached memory (for all devices), synchronize
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
-      cub_allocator->FreeAllCached();
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
       // Refresh per-device saved values.
       for (int i = 0; i < dev_info_.size(); ++i) {
         // only query devices that were initialized
@@ -80,16 +82,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
         }
       }
     }
-      // Retry once
-      CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
     }
-    // If retry succeeds we need to clean up last error
-    cudaGetLastError();
     break;
   default:
-    CUDA_CHECK(cudaMalloc(ptr, size));
+    status = cudaMalloc(ptr, size);
     break;
   }
+  return status == cudaSuccess;
 }
 
 void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +171,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
   CUDA_CHECK(cudaGetDevice(&cur_device));
   *total_mem = dev_info_[cur_device].total_;
   // Free memory is initial free memory minus outstanding allocations.
-  // Assuming we only allocate via GPUMemoryManager since its constructon.
+  // Assuming we only allocate via GPUMemoryManager since its construction.
   *free_mem = dev_info_[cur_device].free_ -
       cub_allocator->cached_bytes[cur_device].live;
   break;
break;
