
Commit 057ace8

Merge pull request #159 from drnikolaev/caffe-0.15-mem
GPU Memory Manager refactored
2 parents: daa511d + 7ba7ad6

13 files changed, +183 -180 lines
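At a glance, the commit replaces the old static GPUMemoryManager interface with a slimmer GPUMemory facade whose state lives in a single internal Manager instance. The rename map below is reconstructed from the per-file diffs that follow, not quoted from the commit itself:

// Before this commit (caffe-0.15):
//   GPUMemoryManager::Arena  arena(gpus);       // pool lifetime
//   GPUMemoryManager::Buffer workspace;         // per-layer scratch buffer
//   GPUMemoryManager::allocate(&ptr, bytes);
//   GPUMemoryManager::deallocate(ptr);
//
// After this commit:
//   caffe::GPUMemory::Scope     scope(gpus);    // pool lifetime
//   caffe::GPUMemory::Workspace workspace;      // per-layer scratch buffer
//   caffe::GPUMemory::allocate(&ptr, bytes);
//   caffe::GPUMemory::deallocate(ptr);
//
// The enum values are renamed as well:
//   PoolMode { NO_POOL, CUB_POOL }  ->  Mode { CUDA_MALLOC, CUB_ALLOCATOR }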

include/caffe/layers/cudnn_conv_layer.hpp

+1 -1

@@ -64,7 +64,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;
-  GPUMemoryManager::Buffer workspace;
+  GPUMemory::Workspace workspace;
   int backward_passed_ctr_;
 };
 #endif
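The workspace member declared above is sized on demand during Forward/Backward. A minimal sketch of the intended pattern, based on the Workspace API shown in gpu_memory.hpp further down this page; the helper below is hypothetical and not taken from the commit (the real call sites are in cudnn_conv_layer.cu):

#include "caffe/util/gpu_memory.hpp"

namespace caffe {

// Hypothetical helper: grow a scratch buffer for a cuDNN call and fall
// back gracefully when the pool cannot satisfy the preferred size.
inline void* prepare_workspace(GPUMemory::Workspace& workspace,
                               size_t preferred_bytes,
                               size_t minimum_bytes) {
  // try_reserve() keeps the current allocation when it is already large
  // enough and returns false (instead of aborting) on allocation failure.
  if (!workspace.try_reserve(preferred_bytes)) {
    workspace.reserve(minimum_bytes);  // CHECK-fails if even this fails
  }
  return workspace.data();  // handed to cudnnConvolutionForward(), etc.
}

}  // namespace caffe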

include/caffe/layers/cudnn_lcn_layer.hpp

+1 -1

@@ -41,7 +41,7 @@ class CuDNNLCNLayer : public LRNLayer<Dtype> {
   Dtype alpha_, beta_, k_;
 
   size_t tempDataSize_;
-  GPUMemoryManager::Buffer temp1_, temp2_;
+  GPUMemory::Workspace temp1_, temp2_;
 };
 #endif

include/caffe/util/gpu_memory.hpp

+82 -85

@@ -3,87 +3,82 @@
 
 #include <vector>
 #include "caffe/common.hpp"
+#ifndef CPU_ONLY
+
+namespace cub {
+  class CachingDeviceAllocator;
+}
 
 namespace caffe {
 
-class GPUMemoryManager {
- public:
-  enum PoolMode {
-    NO_POOL,       // Straight CUDA malloc/free (may be expensive)
-    CUB_POOL,      // CUB caching allocator
-#ifdef CPU_ONLY
-    DEFAULT_POOL = NO_POOL
-#else
-    DEFAULT_POOL = CUB_POOL  // CUB pool is able to use unified memory properly
-#endif
-  };
+struct GPUMemory {
+  static void GetInfo(size_t* free_mem, size_t* used_mem) {
+    return mgr_.GetInfo(free_mem, used_mem);
+  }
 
-  static const char* pool_name();
-  static bool using_pool() {
-    return mode_ != NO_POOL;
+  template <class Any>
+  static void allocate(Any** ptr, size_t size,
+      cudaStream_t stream = cudaStreamDefault) {
+    CHECK(try_allocate(reinterpret_cast<void**>(ptr), size, stream));
   }
 
-  class Arena {
-   public:
-    Arena(const std::vector<int>& gpus, PoolMode m = DEFAULT_POOL, bool debug =
-        false) {
-      init(gpus, m, debug);
-    }
-    ~Arena() {
-      destroy();
-    }
+  static void deallocate(void* ptr,
+      cudaStream_t stream = cudaStreamDefault) {
+    mgr_.deallocate(ptr, stream);
+  }
+
+  static bool try_allocate(void** ptr, size_t size,
+      cudaStream_t stream = cudaStreamDefault) {
+    return mgr_.try_allocate(ptr, size, stream);
+  }
+
+  enum Mode {
+    CUDA_MALLOC,   // Straight CUDA malloc/free (may be expensive)
+    CUB_ALLOCATOR  // CUB caching allocator
   };
 
-#ifndef CPU_ONLY
-  class Buffer {
-   public:
-    // Construction/destruction
-    Buffer() :
-        ptr_(NULL), stream_(), size_(0) {
+  // Scope initializes global Memory Manager for a given scope.
+  // It's instantiated in test(), train() and time() Caffe brewing functions
+  // as well as in unit tests main().
+  struct Scope {
+    Scope(const std::vector<int>& gpus, Mode m = CUB_ALLOCATOR,
+          bool debug = false) {
+      mgr_.init(gpus, m, debug);
     }
-    Buffer(size_t size, cudaStream_t s = cudaStreamDefault) :
-        stream_(s) {
+  };
+
+  // Workspace's release() functionality depends on global pool availability
+  // If pool is available, it returns memory to the pool and sets ptr to NULL
+  // If pool is not available, it retains memory.
+  struct Workspace {
+    Workspace() : ptr_(NULL), stream_(), size_(0) {}
+    Workspace(size_t size, cudaStream_t s = cudaStreamDefault) : stream_(s) {
       reserve(size);
     }
-    ~Buffer() {
-      GPUMemoryManager::deallocate(ptr_, stream_);
-    }
+    ~Workspace() { mgr_.deallocate(ptr_, stream_); }
 
-    // Accessors
-    void* data() const {
-      return ptr_;
-    }
-    size_t size() const {
-      return size_;
-    }
+    void* data() const { return ptr_; }
+    size_t size() const { return size_; }
 
-    // Memory allocation/release
     bool try_reserve(size_t size) {
       bool status = true;
       if (size > size_) {
         if (ptr_) {
-          GPUMemoryManager::deallocate(ptr_, stream_);
+          mgr_.deallocate(ptr_, stream_);
         }
-        status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+        status = mgr_.try_allocate(&ptr_, size, stream_);
         if (status) {
           size_ = size;
         }
       }
       return status;
     }
 
-    void reserve(size_t size) {
-      CHECK(try_reserve(size));
-    }
+    void reserve(size_t size) { CHECK(try_reserve(size)); }
 
-    /*
-     * This method behaves differently depending on pool availability:
-     * If pool is available, it returns memory to the pool and sets ptr to NULL
-     * If pool is not available, it does nothing (retaining memory)
-     */
     void release() {
-      if (GPUMemoryManager::using_pool()) {
-        GPUMemoryManager::deallocate(ptr_, stream_);
+      if (mgr_.using_pool()) {
+        mgr_.deallocate(ptr_, stream_);
         ptr_ = NULL;
         size_ = 0;
       }
@@ -95,44 +90,46 @@ class GPUMemoryManager {
     cudaStream_t stream_;
     size_t size_;
   };
-  static void update_dev_info(int device);
-#endif  // CPU_ONLY
 
  private:
-  static void init(const std::vector<int>&, PoolMode, bool);
-  static void destroy();
+  struct Manager {
+    Manager();
+    ~Manager();
+    void GetInfo(size_t* free_mem, size_t* used_mem);
+    void deallocate(void* ptr, cudaStream_t stream);
+    bool try_allocate(void** ptr, size_t size, cudaStream_t);
+    const char* pool_name() const;
+    bool using_pool() const { return mode_ != CUDA_MALLOC; }
+    void init(const std::vector<int>&, Mode, bool);
+
+    Mode mode_;
+    bool debug_;
 
-  static bool initialized_;
-  static PoolMode mode_;
-  static bool debug_;
-
-#ifndef CPU_ONLY
-  struct MemInfo {
-    MemInfo() {
-      free_ = total_ = flush_count_ = 0;
-    }
-    size_t free_;
-    size_t total_;
-    unsigned flush_count_;
+   private:
+    struct DevInfo {
+      DevInfo() {
+        free_ = total_ = flush_count_ = 0;
+      }
+      size_t free_;
+      size_t total_;
+      unsigned flush_count_;
+    };
+    void update_dev_info(int device);
+    vector<DevInfo> dev_info_;
+    bool initialized_;
+    cub::CachingDeviceAllocator* cub_allocator_;
+
+    static unsigned int BIN_GROWTH;  ///< Geometric growth factor for bin-sizes
+    static unsigned int MIN_BIN;     ///< Minimum bin
+    static unsigned int MAX_BIN;     ///< Maximum bin
+    static size_t MAX_CACHED_BYTES;  ///< Maximum aggregate cached bytes
   };
-  static vector<MemInfo> dev_info_;
-
- public:
-  typedef void* pointer;
-  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
-      cudaStreamDefault);
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
-      cudaStreamDefault) {
-    CHECK(try_allocate(ptr, size, stream));
-  }
-  static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
-  static void GetInfo(size_t* free_mem, size_t* used_mem);
 
- private:
-  static void InitMemory(const std::vector<int>& gpus, PoolMode m);
-#endif
+  static Manager mgr_;
 };
 
 }  // namespace caffe
 
 #endif
+
+#endif
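Taken together, the new header is a small facade: a process-wide Scope that initializes the pool, static allocate/deallocate/GetInfo helpers, and a Workspace RAII scratch buffer. A minimal usage sketch against this header, assuming a GPU build (under CPU_ONLY the header body is compiled out); the device list and sizes below are made up for illustration:

#include <vector>
#include "caffe/util/gpu_memory.hpp"

int main() {
  std::vector<int> gpus(1, 0);  // use device 0 only
  // Initialize the pool for the lifetime of this scope
  // (CUB_ALLOCATOR is the default Mode).
  caffe::GPUMemory::Scope gpu_memory_scope(gpus);

  size_t free_mem = 0, used_mem = 0;
  caffe::GPUMemory::GetInfo(&free_mem, &used_mem);

  // Raw allocation through the pool.
  float* buf = NULL;
  caffe::GPUMemory::allocate(&buf, 1024 * sizeof(float));
  caffe::GPUMemory::deallocate(buf);

  // RAII scratch buffer: grows on demand; release() and the destructor
  // return the block to the pool when a pooling allocator is active.
  caffe::GPUMemory::Workspace ws;
  ws.reserve(16 * 1024 * 1024);
  // ... use ws.data() and ws.size() ...
  ws.release();
  return 0;
}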

python/caffe/_caffe.cpp

+18 -1

@@ -18,6 +18,7 @@
 #include "caffe/layers/memory_data_layer.hpp"
 #include "caffe/layers/python_layer.hpp"
 #include "caffe/sgd_solvers.hpp"
+#include "caffe/util/gpu_memory.hpp"
 
 // Temporary solution for numpy < 1.7 versions: old macro, no promises.
 // You're strongly advised to upgrade to >= 1.7.
@@ -51,9 +52,25 @@ namespace caffe {
 typedef float Dtype;
 const int NPY_DTYPE = NPY_FLOAT32;
 
+#ifndef CPU_ONLY
+shared_ptr<GPUMemory::Scope> gpu_memory_scope;
+#endif
+
 // Selecting mode.
 void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); }
-void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); }
+void set_mode_gpu() {
+  Caffe::set_mode(Caffe::GPU);
+#ifndef CPU_ONLY
+  vector<int> gpus;
+  int count = 0;
+  CUDA_CHECK(cudaGetDeviceCount(&count));
+  for (int i = 0; i < count; ++i) {
+    gpus.push_back(i);
+  }
+  CHECK_GT(gpus.size(), 0);
+  gpu_memory_scope.reset(new GPUMemory::Scope(gpus));
+#endif
+}
 
 // For convenience, check that input files can be opened, and raise an
 // exception that boost will send to Python if not (caffe could still crash

src/caffe/layers/cudnn_conv_layer.cpp

+1 -1

@@ -98,7 +98,7 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
   // Specify workspace limit for kernels directly until we have a
   // planning strategy and a rewrite of Caffe's GPU memory mangagement
   size_t workspace_limit_bytes, total_memory;
-  GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+  GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
 
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],

src/caffe/layers/cudnn_conv_layer.cu

+2 -2

@@ -22,7 +22,7 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
 
     // Test free space and force reshape if allocations have changed
     size_t workspace_limit_bytes, total_memory;
-    GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+    GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
     if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
       this->Reshape(bottom, top);
     }
@@ -82,7 +82,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* top_diff = top[i]->gpu_diff();
     // Test free space and force reshape if allocations have changed
     size_t workspace_limit_bytes, total_memory;
-    GPUMemoryManager::GetInfo(&workspace_limit_bytes, &total_memory);
+    GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
     if (workspace_bwd_filter_sizes_[i] > workspace_limit_bytes ||
         workspace_bwd_data_sizes_[i] > workspace_limit_bytes ||
         // We need to get workspace sizes for the default algos at 1st run

src/caffe/parallel.cpp

+4 -4

@@ -90,15 +90,15 @@ GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
   // Allocate device buffers
   CUDA_CHECK(cudaSetDevice(device));
   buffer_device_ = device;
-  GPUMemoryManager::allocate(reinterpret_cast<void **>(&data_),
+  GPUMemory::allocate(reinterpret_cast<void **>(&data_),
      size_ * sizeof(Dtype));
 
   // Copy blob values
   const vector<Blob<Dtype>*>& net =
      root_solver->net()->learnable_params();
   apply_buffers(net, data_, size_, copy);
 
-  GPUMemoryManager::allocate(reinterpret_cast<void **>(&diff_),
+  GPUMemory::allocate(reinterpret_cast<void **>(&diff_),
      size_ * sizeof(Dtype));
   caffe_gpu_set(size_, Dtype(0), diff_);
 
@@ -114,8 +114,8 @@ GPUParams<Dtype>::~GPUParams() {
   int initial_device;
   CUDA_CHECK(cudaGetDevice(&initial_device));
   CUDA_CHECK(cudaSetDevice(buffer_device_));
-  GPUMemoryManager::deallocate(data_);
-  GPUMemoryManager::deallocate(diff_);
+  GPUMemory::deallocate(data_);
+  GPUMemory::deallocate(diff_);
   data_ = NULL;
   diff_ = NULL;
   CUDA_CHECK(cudaSetDevice(initial_device));

src/caffe/syncedmem.cpp

+5 -6

@@ -17,7 +17,7 @@ SyncedMemory::~SyncedMemory() {
     if (gpu_device_ != -1) {
       CUDA_CHECK(cudaSetDevice(gpu_device_));
     }
-    GPUMemoryManager::deallocate(gpu_ptr_);
+    GPUMemory::deallocate(gpu_ptr_);
     cudaSetDevice(initial_device);
   }
 #endif  // CPU_ONLY
@@ -54,15 +54,15 @@ inline void SyncedMemory::to_gpu() {
   switch (head_) {
   case UNINITIALIZED:
     CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    GPUMemoryManager::allocate(&gpu_ptr_, size_);
+    GPUMemory::allocate(&gpu_ptr_, size_);
     caffe_gpu_memset(size_, 0, gpu_ptr_);
     head_ = HEAD_AT_GPU;
     own_gpu_data_ = true;
     break;
   case HEAD_AT_CPU:
     if (gpu_ptr_ == NULL) {
       CUDA_CHECK(cudaGetDevice(&gpu_device_));
-      GPUMemoryManager::allocate(&gpu_ptr_, size_);
+      GPUMemory::allocate(&gpu_ptr_, size_);
       own_gpu_data_ = true;
     }
     caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
@@ -111,7 +111,7 @@ void SyncedMemory::set_gpu_data(void* data) {
     if (gpu_device_ != -1) {
      CUDA_CHECK(cudaSetDevice(gpu_device_));
     }
-    GPUMemoryManager::deallocate(gpu_ptr_);
+    GPUMemory::deallocate(gpu_ptr_);
     cudaSetDevice(initial_device);
   }
   gpu_ptr_ = data;
@@ -144,7 +144,7 @@ void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
   CHECK(head_ == HEAD_AT_CPU);
   if (gpu_ptr_ == NULL) {
     CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    GPUMemoryManager::allocate(&gpu_ptr_, size_);
+    GPUMemory::allocate(&gpu_ptr_, size_);
     own_gpu_data_ = true;
   }
   const cudaMemcpyKind put = cudaMemcpyHostToDevice;
@@ -155,4 +155,3 @@ void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
 #endif
 
 }  // namespace caffe
-
158-

src/caffe/test/test_blob.cpp

+2

@@ -35,12 +35,14 @@ TYPED_TEST(BlobSimpleTest, TestInitialization) {
   EXPECT_EQ(this->blob_->count(), 0);
 }
 
+#ifndef CPU_ONLY
 TYPED_TEST(BlobSimpleTest, TestPointersCPUGPU) {
   EXPECT_TRUE(this->blob_preshaped_->gpu_data());
   EXPECT_TRUE(this->blob_preshaped_->cpu_data());
   EXPECT_TRUE(this->blob_preshaped_->mutable_gpu_data());
   EXPECT_TRUE(this->blob_preshaped_->mutable_cpu_data());
 }
+#endif
 
 TYPED_TEST(BlobSimpleTest, TestReshape) {
   this->blob_->Reshape(2, 3, 4, 5);

src/caffe/test/test_caffe_main.cpp

+1 -1

@@ -50,7 +50,7 @@ int main(int argc, char** argv) {
   cout << "Current device id: " << device << endl;
   cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device);
   cout << "Current device name: " << CAFFE_TEST_CUDA_PROP.name << endl;
-  caffe::GPUMemoryManager::Arena arena(devices);
+  caffe::GPUMemory::Scope gpu_memory_scope(devices);
 
 #endif
   // invoke the test.
