Commit 566ac69
Create conv_dw_layer.cu
1 parent d47292d commit 566ac69

File tree

1 file changed: conv_dw_layer.cu (+212, -0 lines)

conv_dw_layer.cu

@@ -0,0 +1,212 @@
#include <vector>
#include "caffe/layers/conv_dw_layer.hpp"
#include "caffe/util/gpu_util.cuh"

namespace caffe {

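// For reference (a sketch, not part of this commit): CUDA_KERNEL_LOOP is
// Caffe's grid-stride loop macro from caffe/util/device_alternate.hpp,
// approximately
//
//   for (int index = blockIdx.x * blockDim.x + threadIdx.x;
//        index < nthreads;
//        index += blockDim.x * gridDim.x) { ... }
//
// so each thread may process several elements when the launched grid is
// smaller than nthreads, and CAFFE_GET_BLOCKS(count) sizes the grid to
// roughly count / CAFFE_CUDA_NUM_THREADS blocks.
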
// Forward pass: one thread per output element. Both blobs are NCHW, and
// each channel is convolved with its own kernel_h x kernel_w filter.
template <typename Dtype>
__global__ void ConvolutionDepthwiseWeightForward(const int nthreads,
    const Dtype* const bottom_data, const Dtype* const weight_data, const int num, const int channels,
    const int top_height, const int top_width, const int bottom_height, const int bottom_width,
    const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
    Dtype* const top_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Decompose the linear index into (n, c, h, w) output coordinates.
    const int n = index / channels / top_height / top_width;
    const int c = (index / top_height / top_width) % channels;
    const int h = (index / top_width) % top_height;
    const int w = index % top_width;
    const Dtype* weight = weight_data + c * kernel_h * kernel_w;
    Dtype value = 0;
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        const int h_in = -pad_h + h * stride_h + kh * dilation_h;
        const int w_in = -pad_w + w * stride_w + kw * dilation_w;
        // Taps that fall into the zero padding contribute nothing.
        if ((h_in >= 0) && (h_in < bottom_height) && (w_in >= 0) && (w_in < bottom_width)) {
          const int offset = ((n * channels + c) * bottom_height + h_in) * bottom_width + w_in;
          value += (*weight) * bottom_data[offset];
        }
        ++weight;
      }
    }
    top_data[index] = value;
  }
}

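// Worked example of the index decomposition above (illustrative numbers,
// not from this commit): with channels = 3, top_height = 4, top_width = 5,
// each image holds 3 * 4 * 5 = 60 outputs, and index = 137 decomposes as
//
//   n = 137 / 3 / 4 / 5    = 2   (third image)
//   c = (137 / 4 / 5) % 3  = 0   (first channel)
//   h = (137 / 5) % 4      = 3
//   w = 137 % 5            = 2
//
// i.e. element (n, c, h, w) = (2, 0, 3, 2), matching offset
// 137 - 2 * 60 = 17 = 3 * 5 + 2 within the channel.
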
// Adds the per-channel bias to every output element in place.
template <typename Dtype>
__global__ void ConvolutionDepthwiseBiasForward(const int nthreads,
    const Dtype* const bias_data, const int num, const int channels,
    const int top_height, const int top_width, Dtype* const top_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int c = (index / top_height / top_width) % channels;
    top_data[index] += bias_data[c];
  }
}

template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  const Dtype* weight_data = this->blobs_[0]->gpu_data();
  const int count = top[0]->count();
  const int num = top[0]->num();
  const int channels = top[0]->channels();
  const int top_height = top[0]->height();
  const int top_width = top[0]->width();
  const int bottom_height = bottom[0]->height();
  const int bottom_width = bottom[0]->width();
  // Launch one thread per output element.
  ConvolutionDepthwiseWeightForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, bottom_data, weight_data, num, channels,
      top_height, top_width, bottom_height, bottom_width,
      kernel_h_, kernel_w_, stride_h_, stride_w_,
      pad_h_, pad_w_, dilation_h_, dilation_w_, top_data);
  if (this->layer_param_.convolution_param().bias_term()) {
    const Dtype* bias_data = this->blobs_[1]->gpu_data();
    ConvolutionDepthwiseBiasForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, bias_data, num, channels,
        top_height, top_width, top_data);
  }
}

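// Shape note (an assumption about code outside this file): the top blob
// dimensions used above are expected to follow the standard Caffe
// convolution shape rule, computed in the layer's Reshape, e.g.
//
//   top_height = (bottom_height + 2 * pad_h
//                 - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
//
// so a 32x32 bottom with a 3x3 kernel, stride 1, pad 1, dilation 1 yields
// a 32x32 top, and count = num * channels * top_height * top_width.
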
// Gradient w.r.t. the weights: each thread writes one product
// top_diff * bottom_data into a buffer laid out as
// (channels * kernel_h * kernel_w) x (num * top_height * top_width);
// the reduction over the second dimension is done later with a gemv.
template <typename Dtype>
__global__ void ConvolutionDepthwiseWeightBackward(const int nthreads,
    const Dtype* const top_diff, const Dtype* const bottom_data, const int num, const int channels,
    const int top_height, const int top_width, const int bottom_height, const int bottom_width,
    const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
    Dtype* const buffer_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int h = (index / top_width) % top_height;
    const int w = index % top_width;
    const int kh = (index / kernel_w / num / top_height / top_width) % kernel_h;
    const int kw = (index / num / top_height / top_width) % kernel_w;
    const int h_in = -pad_h + h * stride_h + kh * dilation_h;
    const int w_in = -pad_w + w * stride_w + kw * dilation_w;
    if ((h_in >= 0) && (h_in < bottom_height) && (w_in >= 0) && (w_in < bottom_width)) {
      const int c = index / kernel_h / kernel_w / num / top_height / top_width;
      const int n = (index / top_height / top_width) % num;
      const int top_offset = ((n * channels + c) * top_height + h) * top_width + w;
      const int bottom_offset = ((n * channels + c) * bottom_height + h_in) * bottom_width + w_in;
      buffer_data[index] = top_diff[top_offset] * bottom_data[bottom_offset];
    } else {
      buffer_data[index] = 0;
    }
  }
}

// Gradient w.r.t. the input: each thread accumulates, for one bottom
// element, the contributions of every output position whose receptive
// field covers it (the transpose of the forward pass).
template <typename Dtype>
__global__ void ConvolutionDepthwiseBottomBackward(const int nthreads,
    const Dtype* const top_diff, const Dtype* const weight_data, const int num, const int channels,
    const int top_height, const int top_width, const int bottom_height, const int bottom_width,
    const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
    Dtype* const bottom_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int n = index / channels / bottom_height / bottom_width;
    const int c = (index / bottom_height / bottom_width) % channels;
    const int h = (index / bottom_width) % bottom_height;
    const int w = index % bottom_width;
    const Dtype* weight = weight_data + c * kernel_h * kernel_w;
    Dtype value = 0;
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        const int h_out_s = h + pad_h - kh * dilation_h;
        const int w_out_s = w + pad_w - kw * dilation_w;
        // Only output positions that align with the stride contribute.
        if (((h_out_s % stride_h) == 0) && ((w_out_s % stride_w) == 0)) {
          const int h_out = h_out_s / stride_h;
          const int w_out = w_out_s / stride_w;
          if ((h_out >= 0) && (h_out < top_height) && (w_out >= 0) && (w_out < top_width)) {
            const int offset = ((n * channels + c) * top_height + h_out) * top_width + w_out;
            value += (*weight) * top_diff[offset];
          }
        }
        ++weight;
      }
    }
    bottom_diff[index] += value;
  }
}

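// Example of the stride-alignment test above (illustrative numbers): with
// stride_h = 2, pad_h = 1, dilation_h = 1 and bottom row h = 4,
//
//   kh = 0: h_out_s = 4 + 1 - 0 = 5, 5 % 2 != 0 -> skipped
//   kh = 1: h_out_s = 4 + 1 - 1 = 4, h_out = 2  -> contributes
//   kh = 2: h_out_s = 4 + 1 - 2 = 3, 3 % 2 != 0 -> skipped
//
// so only filter taps whose output position lands exactly on the stride
// grid feed gradient back into this bottom element.
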
// Gradient w.r.t. the bias: reorders top_diff into a buffer of shape
// channels x (num * top_height * top_width) so a gemv against a ones
// vector can reduce it to one value per channel.
template <typename Dtype>
__global__ void ConvolutionDepthwiseBiasBackward(const int nthreads,
    const Dtype* const top_diff, const int num, const int channels,
    const int top_height, const int top_width, Dtype* const buffer_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int c = index / num / top_height / top_width;
    const int n = (index / top_height / top_width) % num;
    const int h = (index / top_width) % top_height;
    const int w = index % top_width;
    const int offset = ((n * channels + c) * top_height + h) * top_width + w;
    buffer_data[index] = top_diff[offset];
  }
}

template <typename Dtype>
void ConvolutionDepthwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff = top[0]->gpu_diff();
  const int bottom_count = bottom[0]->count();
  const int num = top[0]->num();
  const int channels = top[0]->channels();
  const int top_height = top[0]->height();
  const int top_width = top[0]->width();
  const int bottom_height = bottom[0]->height();
  const int bottom_width = bottom[0]->width();
  const int length = num * top_height * top_width;
  caffe_gpu_set(bottom_count, Dtype(0), bottom[0]->mutable_gpu_diff());
  // Bias gradient: scatter top_diff into bias_buffer_, then reduce each
  // channel row with a gemv against the ones vector bias_multiplier_.
  if (this->layer_param_.convolution_param().bias_term() && this->param_propagate_down_[1]) {
    const int bias_buffer_count = bias_buffer_.count();
    Dtype* bias_buffer_mutable_data = bias_buffer_.mutable_gpu_data();
    ConvolutionDepthwiseBiasBackward<Dtype><<<CAFFE_GET_BLOCKS(bias_buffer_count), CAFFE_CUDA_NUM_THREADS>>>(
        bias_buffer_count, top_diff, num, channels,
        top_height, top_width, bias_buffer_mutable_data);
    const int bias_count = this->blobs_[1]->count();
    const Dtype* bias_buffer_data = bias_buffer_.gpu_data();
    Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
    const Dtype* bias_multiplier_data = bias_multiplier_.gpu_data();
    caffe_gpu_gemv(CblasNoTrans, bias_count, length, Dtype(1), bias_buffer_data, bias_multiplier_data, Dtype(1), bias_diff);
  }
  // Weight gradient: same buffer-then-gemv pattern, one row per weight.
  if (this->param_propagate_down_[0]) {
    const int weight_buffer_count = weight_buffer_.count();
    const Dtype* bottom_data = bottom[0]->gpu_data();
    Dtype* weight_buffer_mutable_data = weight_buffer_.mutable_gpu_data();
    ConvolutionDepthwiseWeightBackward<Dtype><<<CAFFE_GET_BLOCKS(weight_buffer_count), CAFFE_CUDA_NUM_THREADS>>>(
        weight_buffer_count, top_diff, bottom_data, num, channels,
        top_height, top_width, bottom_height, bottom_width,
        kernel_h_, kernel_w_, stride_h_, stride_w_,
        pad_h_, pad_w_, dilation_h_, dilation_w_, weight_buffer_mutable_data);
    const int weight_count = this->blobs_[0]->count();
    const Dtype* weight_buffer_data = weight_buffer_.gpu_data();
    Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
    const Dtype* weight_multiplier_data = weight_multiplier_.gpu_data();
    caffe_gpu_gemv(CblasNoTrans, weight_count, length, Dtype(1), weight_buffer_data, weight_multiplier_data, Dtype(1), weight_diff);
  }
  // Input gradient: accumulate into the zero-initialized bottom diff.
  if (propagate_down[0]) {
    const Dtype* weight_data = this->blobs_[0]->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    ConvolutionDepthwiseBottomBackward<Dtype><<<CAFFE_GET_BLOCKS(bottom_count), CAFFE_CUDA_NUM_THREADS>>>(
        bottom_count, top_diff, weight_data, num, channels,
        top_height, top_width, bottom_height, bottom_width,
        kernel_h_, kernel_w_, stride_h_, stride_w_,
        pad_h_, pad_w_, dilation_h_, dilation_w_, bottom_diff);
  }
}

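// Reduction note (a sketch; bias_multiplier_ and weight_multiplier_ are
// assumed to be filled with ones during layer setup, which is not part of
// this file): with each buffer laid out as rows of length
// length = num * top_height * top_width, the call
//
//   caffe_gpu_gemv(CblasNoTrans, bias_count, length, Dtype(1),
//                  bias_buffer_data, bias_multiplier_data,
//                  Dtype(1), bias_diff);
//
// computes bias_diff += bias_buffer_ * ones(length), i.e. it sums each
// channel's row to accumulate the per-channel bias gradient; the weight
// gradient uses the same pattern with one row per weight element.
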
INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionDepthwiseLayer);

}  // namespace caffe

0 commit comments