Skip to content

Commit ffda2d5

Browse files
committed
Unit-tested ND-SK convolution layer, improved CPU memory model on OpenCL, preparations for ND-SK pooling layer.
1 parent c1948cb commit ffda2d5

20 files changed

Lines changed: 835 additions & 277 deletions

include/caffe/syncedmem.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,9 @@ namespace caffe {
3030
inline void CaffeMallocHost(void** ptr, size_t size) {
3131
// Make sure the memory is zero-copy usable in OpenCL
3232
// All OpenCL/CUDA memory copy operations might profit from this.
33-
posix_memalign(ptr, OPENCL_PAGE_ALIGN,
34-
((size - 1)/OPENCL_CACHE_ALIGN + 1) * OPENCL_CACHE_ALIGN);
33+
CHECK_EQ(0, posix_memalign(ptr, OPENCL_PAGE_ALIGN,
34+
((size - 1)/OPENCL_CACHE_ALIGN + 1) * OPENCL_CACHE_ALIGN))
35+
<< "Host memory allocation error";
3536
CHECK(*ptr) << "host allocation of size " << size << " failed";
3637
}
3738

include/caffe/vision_layers.hpp

Lines changed: 57 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -518,57 +518,6 @@ class ConvolutionSKLayer : public Layer<Dtype> {
518518
int M_, K_, N_;
519519
};
520520

521-
522-
/**
523-
* @brief Convolves the input image for pixelwise classification.
524-
*
525-
* Layer introduced by Hongsheng et al.
526-
*/
527-
template<typename Dtype>
528-
class ConvolutionNDSKLayer : public Layer<Dtype> {
529-
public:
530-
explicit ConvolutionNDSKLayer(const LayerParameter& param)
531-
: Layer<Dtype>(param) {
532-
}
533-
534-
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
535-
const vector<Blob<Dtype>*>& top);
536-
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
537-
const vector<Blob<Dtype>*>& top);
538-
539-
virtual inline const char* type() const {
540-
return "ConvolutionNDSK";
541-
}
542-
543-
protected:
544-
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
545-
const vector<Blob<Dtype>*>& top);
546-
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
547-
const vector<Blob<Dtype>*>& top);
548-
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
549-
const vector<bool>& propagate_down,
550-
const vector<Blob<Dtype>*>& bottom);
551-
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
552-
const vector<bool>& propagate_down,
553-
const vector<Blob<Dtype>*>& bottom);
554-
555-
shared_ptr< Blob<Dtype> > col_buffer();
556-
557-
int kernel_h_, kernel_w_;
558-
int stride_h_, stride_w_;
559-
int channels_;
560-
int group_;
561-
int height_, width_;
562-
int pad_h_, pad_w_;
563-
int kstride_h_, kstride_w_;
564-
int num_, num_output_;
565-
Blob<Dtype> col_buffer_;
566-
Blob<Dtype> bias_multiplier_;
567-
bool bias_term_;
568-
int M_, K_, N_;
569-
};
570-
571-
572521
/**
573522
* @brief Convolves the input image with a bank of learned filters,
574523
* and (optionally) adds biases.
@@ -925,6 +874,63 @@ class PoolingSKLayer : public Layer<Dtype> {
925874
Blob<int> max_idx_;
926875
};
927876

877+
878+
/**
879+
* @brief Pools the input image by taking the max, average, etc. within regions.
880+
*
881+
* For whole image processing, reducing redundancy.
882+
*/
883+
template<typename Dtype>
884+
class PoolingNDLayer : public Layer<Dtype> {
885+
public:
886+
explicit PoolingNDLayer(const LayerParameter& param)
887+
: Layer<Dtype>(param) {
888+
}
889+
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
890+
const vector<Blob<Dtype>*>& top);
891+
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
892+
const vector<Blob<Dtype>*>& top);
893+
894+
protected:
895+
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
896+
const vector<Blob<Dtype>*>& top);
897+
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
898+
const vector<Blob<Dtype>*>& top);
899+
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
900+
const vector<bool>& propagate_down,
901+
const vector<Blob<Dtype>*>& bottom);
902+
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
903+
const vector<bool>& propagate_down,
904+
const vector<Blob<Dtype>*>& bottom);
905+
906+
virtual inline const char* type() const {
907+
return "PoolingND";
908+
}
909+
virtual inline int ExactNumBottomBlobs() const {
910+
return 1;
911+
}
912+
virtual inline int MinTopBlobs() const {
913+
return 1;
914+
}
915+
// MAX POOL layers can output an extra top blob for the mask;
916+
// others can only output the pooled inputs.
917+
virtual inline int MaxTopBlobs() const {
918+
return
919+
(this->layer_param_.pooling_param().pool()
920+
== PoolingParameter_PoolMethod_MAX) ? 2 : 1;
921+
}
922+
923+
int max_top_blobs_;
924+
int pad_h_, pad_w_;
925+
int channels_;
926+
int height_, width_;
927+
int pooled_height_, pooled_width_;
928+
int kernel_h_, kernel_w_;
929+
int stride_h_, stride_w_;
930+
int kstride_h_, kstride_w_;
931+
Blob<int> max_idx_;
932+
};
933+
928934
/**
929935
* @brief Pools the input image by taking the max, average, etc. within regions.
930936
*

src/caffe/greentea/cl_kernels.cpp

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

src/caffe/greentea/cl_kernels/im2col_nd.cl

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,19 @@ __kernel void TEMPLATE(col2im_nd, Dtype)(const int n, const int num_axes,
9595
__global Dtype* data_im,
9696
const int data_off) {
9797
int d_im[6];
98+
int d_col_size[6];
9899
int d_col_iter[6];
99100
int d_col_start[6];
100101
int d_col_end[6];
102+
int d_ext_patch[6];
103+
int d_idx[6];
104+
105+
for (int i = num_axes - 1; i >= 0; --i) {
106+
d_ext_patch[i] = (kernel_shape[i] - 1) * kstride[i] + 1;
107+
d_col_size[i] = (im_shape[i + 1] + 2 * pad[i] - d_ext_patch[i])
108+
/ stride[i] + 1;
109+
}
110+
101111
for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
102112
// Initialize channel_in, computed in the loop below, with intermediate
103113
// computations used to compute the spatial indices.
@@ -110,51 +120,60 @@ __kernel void TEMPLATE(col2im_nd, Dtype)(const int n, const int num_axes,
110120
// Calculate col start/end indices.
111121
bool done = false;
112122
for (int i = 0; i < num_axes; ++i) {
113-
d_col_start[i] = d_col_iter[i] =
123+
// Old:
124+
/*d_col_start[i] = d_col_iter[i] =
114125
(d_im[i] < kernel_shape[i]) ?
115-
0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
116-
d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);
117-
if (d_col_start[i] >= d_col_end[i]) {
126+
0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
127+
d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);*/
128+
// New:
129+
d_col_start[i] = (d_im[i] < d_ext_patch[i]) ?
130+
d_im[i] % kstride[i] : (d_im[i] - d_ext_patch[i]) + 1;
131+
d_col_iter[i] = d_col_start[i];
132+
d_idx[i] = (d_im[i] - d_col_start[i]) / kstride[i];
133+
d_col_end[i] = (d_im[i] >= d_col_size[i]) ?
134+
(d_col_size[i] - 1) - ((d_col_size[i] - 1) - d_col_start[i])
135+
% kstride[i] : d_im[i];
136+
if (d_col_start[i] > d_col_end[i]) {
118137
// Skip computation if the dimension is 0 at any spatial axis --
119138
// final val will be 0.
120-
data_im[index + data_off] = 0;
139+
data_im[index] = 0;
121140
done = true;
122-
break; // for (int i = 0; i < num_axes; ++i)
141+
break; // for (int i = 0; i < num_axes; ++i)
123142
}
124143
}
125144
if (done) {
126-
continue;
145+
continue; // CUDA_KERNEL_LOOP(index, n)
127146
}
128147
// Loop over the col to compute the output val.
129148
Dtype val = 0;
130149
bool incremented = true;
131150
do {
132151
// Compute the final offset.
133152
int final_offset = 0;
134-
int kernel_shape_prod = 1;
153+
int coeff_prod = 1;
135154
for (int i = num_axes - 1; i >= 0; --i) {
136-
final_offset += (d_im[i] - d_col_iter[i] * stride[i])
137-
* kernel_shape_prod;
138-
kernel_shape_prod *= kernel_shape[i];
155+
final_offset += d_col_iter[i] * coeff_prod;
156+
coeff_prod *= d_col_size[i];
139157
}
140-
final_offset += kernel_shape_prod * channel_im;
141-
for (int i = 0; i < num_axes; ++i) {
142-
final_offset *= col_shape[i + 1];
143-
final_offset += d_col_iter[i];
158+
for (int i = num_axes - 1; i >= 0; --i) {
159+
final_offset += d_idx[i] * coeff_prod;
160+
coeff_prod *= kernel_shape[i];
144161
}
145-
val += data_col[final_offset + data_col_off];
162+
final_offset += channel_im * coeff_prod;
163+
val += data_col[final_offset];
146164
incremented = false;
147165
for (int i = num_axes - 1; i >= 0; --i) {
148-
const int d_max = d_col_end[i];
149-
if (d_col_iter[i] == d_max - 1) {
166+
if (d_col_iter[i] > d_col_end[i] - kstride[i]) {
150167
d_col_iter[i] = d_col_start[i];
151-
} else { // d_col_iter[i] < d_max - 1
152-
++d_col_iter[i];
168+
d_idx[i] = (d_im[i] - d_col_start[i]) / kstride[i];
169+
} else { // d_col_iter[i] <= d_max - kstride[1]
170+
d_col_iter[i] += kstride[i];
171+
--d_idx[i];
153172
incremented = true;
154173
break; // for (int i = num_axes - 1; i >= 0; --i)
155174
}
156175
} // for (int i = num_axes - 1; i >= 0; --i)
157-
} while (incremented);
158-
data_im[index + data_off] = val;
159-
}
176+
} while (incremented);
177+
data_im[index] = val;
178+
} // CUDA_KERNEL_LOOP(index, n)
160179
}

src/caffe/greentea/greentea_im2col.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ template void greentea_col2im_nd_gpu<float>(viennacl::ocl::program *prog,
302302
cl_mem col_shape, cl_mem kernel_shape, cl_mem pad,
303303
cl_mem stride, cl_mem kstride, cl_mem data_im,
304304
int data_off);
305+
305306
template void greentea_col2im_nd_gpu<double>(viennacl::ocl::program *prog,
306307
viennacl::ocl::context *ctx, cl_mem data_col,
307308
const int data_col_off, const int num_spatial_axes,

0 commit comments

Comments
 (0)