@@ -20,20 +20,14 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
 
-    // Test free space and force reshape if allocations have changed
-    size_t workspace_limit_bytes, total_memory;
-    GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
-    if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
-      use_algo_seeker_ = true;
-      this->Reshape(bottom, top);
-    }
     // Sometimes closer to zero we might have memory info diverged from reality
     // If try_reserve fails, it updates the info internally and we proceed with
     // Reshape one more time
-    if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+    // Note: if WORKSPACE_SIZE is already allocated next line does nothing.
+    if (!WORKSPACE.try_reserve(WORKSPACE_SIZE)) {
       use_algo_seeker_ = true;
       this->Reshape(bottom, top);
-      workspace.reserve(workspace_fwd_sizes_[i]);
+      WORKSPACE.reserve(WORKSPACE_SIZE);
     }
 
     // Forward through cuDNN in parallel over groups.
@@ -44,7 +38,7 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
           bottom_descs_[i], bottom_data + bottom_offset_ * g,
           filter_desc_, weight + this->weight_offset_ * g,
           conv_descs_[i],
-          fwd_algo_[i], workspace.data(), workspace.size(),
+          fwd_algo_[i], WORKSPACE.data(), WORKSPACE.size(),
           cudnn::dataType<Dtype>::zero,
           top_descs_[i], top_data + top_offset_ * g));
 
@@ -59,14 +53,11 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
       }
     }
 
-    workspace.release();
     // Synchronize the work across groups, each of which went into its own
     // stream, by launching an empty kernel into the default (null) stream.
     // NOLINT_NEXT_LINE(whitespace/operators)
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
   }
-  // Possibly use faster algorithms by allowing larger workspace.
-  use_modest_workspace_ = false;
 }
 
 template <typename Dtype>
@@ -84,25 +75,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
-    // Test free space and force reshape if allocations have changed
-    size_t workspace_limit_bytes, total_memory;
-    GPUMemory::GetInfo(&workspace_limit_bytes, &total_memory);
-    if (workspace_bwd_filter_sizes_[i] > workspace_limit_bytes ||
-        workspace_bwd_data_sizes_[i] > workspace_limit_bytes) {
-      use_algo_seeker_ = true;
-      this->Reshape(bottom, top);
-    }
-    // To remove pressure on allocator, allocate the larger of the
-    // workspaces needed for the following steps
+
     // Sometimes closer to zero we might have memory info diverged from reality
     // If try_reserve fails, it updates the info internally and we proceed with
-    // Reshape one more time
-    if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
-                               workspace_bwd_data_sizes_[i]))) {
+    // Reshape one more time.
+    // Note: if WORKSPACE_SIZE is already allocated next line does nothing.
+    if (!WORKSPACE.try_reserve(WORKSPACE_SIZE)) {
       use_algo_seeker_ = true;
       this->Reshape(bottom, top);
-      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-                                 workspace_bwd_data_sizes_[i]));
+      WORKSPACE.reserve(WORKSPACE_SIZE);
     }
 
     // Backward through cuDNN in parallel over groups and gradients.
@@ -123,7 +104,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           bottom_descs_[i], bottom_data + bottom_offset_ * g,
           top_descs_[i], top_diff + top_offset_ * g,
           conv_descs_[i],
-          bwd_filter_algo_[i], workspace.data(), workspace.size(),
+          bwd_filter_algo_[i], WORKSPACE.data(), WORKSPACE.size(),
          cudnn::dataType<Dtype>::one,
           filter_desc_, weight_diff + this->weight_offset_ * g));
       }
@@ -138,18 +119,19 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           filter_desc_, weight + this->weight_offset_ * g,
           top_descs_[i], top_diff + top_offset_ * g,
           conv_descs_[i],
-          bwd_data_algo_[i], workspace.data(), workspace.size(),
+          bwd_data_algo_[i], WORKSPACE.data(), WORKSPACE.size(),
           cudnn::dataType<Dtype>::zero,
           bottom_descs_[i], bottom_diff + bottom_offset_ * g));
       }
     }
 
-    workspace.release();
     // Synchronize the work across groups, each of which went into its own
     // stream, by launching an empty kernel into the default (null) stream.
     // NOLINT_NEXT_LINE(whitespace/operators)
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamLegacy));
   }
+  // Possibly use faster algorithms by allowing larger workspace.
+  use_modest_workspace_ = false;
 }
 
 INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer);
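
For context, below is a minimal sketch of the try_reserve()/reserve() contract this diff relies on. It assumes WORKSPACE resolves to a persistent workspace object and WORKSPACE_SIZE to the largest byte count needed by the chosen forward/backward algorithms; the real definitions live elsewhere in this change, and the Workspace class here is hypothetical, written only to illustrate why try_reserve() can fail softly (letting the caller re-run Reshape and pick cheaper algorithms) while reserve() treats failure as fatal, and why re-reserving an already-allocated size is a no-op, as the "already allocated" note in the diff states.

// Hypothetical sketch only: the real GPUMemory workspace and the WORKSPACE /
// WORKSPACE_SIZE macros are defined elsewhere in this change. This class just
// models the try_reserve()/reserve() behavior the layer code above depends on.
#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

class Workspace {
 public:
  Workspace() : data_(nullptr), size_(0) {}
  ~Workspace() { release(); }

  void* data() const { return data_; }
  size_t size() const { return size_; }

  // Grow the buffer to at least 'bytes'. Returns false instead of aborting so
  // the caller can fall back (e.g. re-run Reshape with a smaller request).
  // If the buffer is already large enough this is a no-op.
  bool try_reserve(size_t bytes) {
    if (bytes <= size_) return true;
    release();
    if (cudaMalloc(&data_, bytes) != cudaSuccess) {
      data_ = nullptr;
      return false;
    }
    size_ = bytes;
    return true;
  }

  // Same as try_reserve(), but out-of-memory is fatal.
  void reserve(size_t bytes) {
    if (!try_reserve(bytes)) {
      std::fprintf(stderr, "Failed to reserve %zu bytes of GPU workspace\n",
                   bytes);
      std::abort();
    }
  }

  void release() {
    if (data_ != nullptr) cudaFree(data_);
    data_ = nullptr;
    size_ = 0;
  }

 private:
  void* data_;   // device pointer returned by cudaMalloc
  size_t size_;  // current capacity in bytes
};

Keeping one workspace sized once to the maximum requirement, instead of reserving per-algorithm sizes and calling workspace.release() inside every forward/backward pass as the removed lines did, appears intended to cut per-iteration allocator traffic while still allowing the Reshape fallback when memory is tight.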