
Commit d988833

Merge pull request #153 from drnikolaev/caffe-0.15-oom
Thanks @lukeyeager for reviewing the code. This PR will be followed by another one implementing a better memory distribution algorithm for the cuDNN Convolution Layer (as per our discussion).
2 parents cf75318 + 1d4882b commit d988833

File tree

3 files changed (+45, -21 lines)


include/caffe/util/gpu_memory.hpp

+16, -4
@@ -58,14 +58,22 @@ class GPUMemoryManager {
   }
 
   // Memory allocation/release
-  void reserve(size_t size) {
+  bool try_reserve(size_t size) {
+    bool status = true;
     if (size > size_) {
       if (ptr_) {
         GPUMemoryManager::deallocate(ptr_, stream_);
       }
-      GPUMemoryManager::allocate(&ptr_, size, stream_);
-      size_ = size;
+      status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+      if (status) {
+        size_ = size;
+      }
     }
+    return status;
+  }
+
+  void reserve(size_t size) {
+    CHECK(try_reserve(size));
   }
 
 /*
@@ -111,8 +119,12 @@ class GPUMemoryManager {
 
  public:
   typedef void* pointer;
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
       cudaStreamDefault);
+  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+      cudaStreamDefault) {
+    CHECK(try_allocate(ptr, size, stream));
+  }
   static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
   static void GetInfo(size_t* free_mem, size_t* used_mem);
 
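
The header change splits reservation into a recoverable path and a fail-fast path: try_reserve reports failure to the caller, while reserve keeps the old aborting behavior by CHECK-ing it. Below is a minimal standalone sketch of the same grow-only buffer pattern; Buffer, the malloc-based allocator, and assert are illustrative stand-ins for the real GPUMemoryManager, the CUB pool, and glog's CHECK, not the PR's code.

#include <cassert>
#include <cstdlib>

// Illustrative stand-ins for GPUMemoryManager::try_allocate / deallocate,
// which take a CUDA stream in the real code.
static bool try_allocate(void** ptr, size_t size) {
  *ptr = std::malloc(size);        // may fail and yield a null pointer
  return *ptr != nullptr;
}
static void deallocate(void* ptr) { std::free(ptr); }

class Buffer {
 public:
  // Grow-only reservation that reports failure instead of aborting.
  bool try_reserve(size_t size) {
    bool status = true;
    if (size > size_) {
      if (ptr_) {
        deallocate(ptr_);          // release the smaller block first
      }
      status = try_allocate(&ptr_, size);
      if (status) {
        size_ = size;              // commit capacity only on success
      }
    }
    return status;
  }
  // Fail-fast wrapper, mirroring reserve() -> CHECK(try_reserve(size)).
  void reserve(size_t size) { assert(try_reserve(size)); }

 private:
  void* ptr_ = nullptr;
  size_t size_ = 0;
};

int main() {
  Buffer ws;
  if (!ws.try_reserve(1 << 20)) {
    // the caller can recover here, e.g. free caches and retry
  }
  ws.reserve(1 << 10);             // or demand success outright
}

Note that a failed grow leaves the buffer without its old block, since the smaller allocation is released before the larger one is attempted; a false return means the previous contents are gone.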

src/caffe/layers/cudnn_conv_layer.cu

+16, -3
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
       this->Reshape(bottom, top);
     }
-    workspace.reserve(workspace_fwd_sizes_[i]);
+    // Near zero free memory, the cached memory info may have diverged
+    // from reality. If try_reserve fails, it refreshes that info
+    // internally, so we run Reshape once more and retry the reservation.
+    if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+      this->Reshape(bottom, top);
+      workspace.reserve(workspace_fwd_sizes_[i]);
+    }
 
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
     // To remove pressure on allocator, allocate the larger of the
     // workspaces needed for the following steps
-    workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-        workspace_bwd_data_sizes_[i]));
+    // Near zero free memory, the cached memory info may have diverged
+    // from reality. If try_reserve fails, it refreshes that info
+    // internally, so we run Reshape once more and retry the reservation.
+    if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
+        workspace_bwd_data_sizes_[i]))) {
+      this->Reshape(bottom, top);
+      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
+          workspace_bwd_data_sizes_[i]));
+    }
 
     // Backward through cuDNN in parallel over groups and gradients.
     for (int g = 0; g < this->group_; g++) {
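
Both passes now use the same two-step recovery: attempt the reservation, and only if it fails run Reshape again so the layer can re-pick cuDNN algorithms against the refreshed memory info, then insist on the (now smaller) workspace. A condensed, runnable sketch of that control flow; the budget, sizes, and Reshape behavior here are faked for illustration and are not the layer's actual code.

#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Fake numbers standing in for the layer's workspace bookkeeping.
static size_t bwd_filter_size = 64, bwd_data_size = 96;
static size_t budget = 80;  // pretend the device can only grant this much

struct Workspace {
  size_t size = 0;
  bool try_reserve(size_t s) {   // succeeds only within the fake budget
    if (s <= budget) { size = s; return true; }
    return false;
  }
  void reserve(size_t s) {       // fail-fast variant (CHECK in the real code)
    if (!try_reserve(s)) std::abort();
  }
};

static void Reshape() {
  // In the layer, Reshape re-selects cuDNN algorithms under the refreshed
  // memory info; here we just pretend cheaper algorithms were chosen.
  bwd_filter_size = 32;
  bwd_data_size = 48;
}

int main() {
  Workspace ws;
  if (!ws.try_reserve(std::max(bwd_filter_size, bwd_data_size))) {
    Reshape();                   // shrink the requirement near OOM...
    ws.reserve(std::max(bwd_filter_size, bwd_data_size));  // ...then insist
  }
  std::printf("reserved %zu bytes\n", ws.size);
}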

src/caffe/util/gpu_memory.cpp

+13, -14
@@ -56,19 +56,21 @@ void GPUMemoryManager::destroy() {
   mode_ = NO_POOL;
 }
 
-void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
+bool GPUMemoryManager::try_allocate(void** ptr, size_t size,
+    cudaStream_t stream) {
   CHECK((ptr) != NULL);
+  cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_POOL:
-    if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
+    // The clean-cache-and-retry logic now lives inside DeviceAllocate.
+    status = cub_allocator->DeviceAllocate(ptr, size, stream);
+    // If an internal retry succeeded, status is good here, but we still
+    // need to clear the sticky last error...
+    last_err = cudaGetLastError();
+    // ...and refresh the device info if anything went wrong.
+    if (status != cudaSuccess || last_err != cudaSuccess) {
       int cur_device;
       CUDA_CHECK(cudaGetDevice(&cur_device));
-      // free all cached memory (for all devices), synchronize
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
-      cub_allocator->FreeAllCached();
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
       // Refresh per-device saved values.
       for (int i = 0; i < dev_info_.size(); ++i) {
         // only query devices that were initialized
@@ -80,16 +82,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
         }
       }
     }
-      // Retry once
-      CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
     }
-    // If retry succeeds we need to clean up last error
-    cudaGetLastError();
     break;
   default:
-    CUDA_CHECK(cudaMalloc(ptr, size));
+    status = cudaMalloc(ptr, size);
     break;
   }
+  return status == cudaSuccess;
 }
 
 void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +171,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
   CUDA_CHECK(cudaGetDevice(&cur_device));
   *total_mem = dev_info_[cur_device].total_;
   // Free memory is initial free memory minus outstanding allocations.
-  // Assuming we only allocate via GPUMemoryManager since its constructon.
+  // Assuming we only allocate via GPUMemoryManager since its construction.
   *free_mem = dev_info_[cur_device].free_ -
       cub_allocator->cached_bytes[cur_device].live;
   break;
break;
