DeepLearningCode
diff --git a/‎include/caffe/syncedmem.hpp‎
Lines changed: 3 additions & 2 deletions b/‎include/caffe/syncedmem.hpp‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎include/caffe/vision_layers.hpp‎
Lines changed: 57 additions & 51 deletions b/‎include/caffe/vision_layers.hpp‎
Lines changed: 57 additions & 51 deletions
diff --git a/‎src/caffe/greentea/cl_kernels.cpp‎
Lines changed: 2 additions & 2 deletions b/‎src/caffe/greentea/cl_kernels.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/caffe/greentea/cl_kernels/im2col_nd.cl‎
Lines changed: 42 additions & 23 deletions b/‎src/caffe/greentea/cl_kernels/im2col_nd.cl‎
Lines changed: 42 additions & 23 deletions
diff --git a/‎src/caffe/greentea/greentea_im2col.cpp‎
Lines changed: 1 addition & 0 deletions b/‎src/caffe/greentea/greentea_im2col.cpp‎
Lines changed: 1 addition & 0 deletions
@@ -30,8 +30,9 @@ namespace caffe {
 inline void CaffeMallocHost(void** ptr, size_t size) {
   // Make sure the memory is zero-copy usable in OpenCL
   // All OpenCL/CUDA memory copy operations might profit from this.
-  posix_memalign(ptr, OPENCL_PAGE_ALIGN,
-                 ((size - 1)/OPENCL_CACHE_ALIGN + 1) * OPENCL_CACHE_ALIGN);
+  CHECK_EQ(0, posix_memalign(ptr, OPENCL_PAGE_ALIGN,
+                 ((size - 1)/OPENCL_CACHE_ALIGN + 1) * OPENCL_CACHE_ALIGN))
+        << "Host memory allocation error";
   CHECK(*ptr) << "host allocation of size " << size << " failed";
 }
 
 
@@ -518,57 +518,6 @@ class ConvolutionSKLayer : public Layer<Dtype> {
   int M_, K_, N_;
 };
 
-
-/**
- * @brief Convolves the input image for pixelwise classification.
- *
- *   Layer introduced by Hongsheng et al.
- */
-template<typename Dtype>
-class ConvolutionNDSKLayer : public Layer<Dtype> {
- public:
-  explicit ConvolutionNDSKLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {
-  }
-
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-                          const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-                       const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const {
-    return "ConvolutionNDSK";
-  }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-                           const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-                           const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-                            const vector<bool>& propagate_down,
-                            const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-                            const vector<bool>& propagate_down,
-                            const vector<Blob<Dtype>*>& bottom);
-
-  shared_ptr< Blob<Dtype> > col_buffer();
-
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
-  int channels_;
-  int group_;
-  int height_, width_;
-  int pad_h_, pad_w_;
-  int kstride_h_, kstride_w_;
-  int num_, num_output_;
-  Blob<Dtype> col_buffer_;
-  Blob<Dtype> bias_multiplier_;
-  bool bias_term_;
-  int M_, K_, N_;
-};
-
-
 /**
  * @brief Convolves the input image with a bank of learned filters,
  *        and (optionally) adds biases.
@@ -925,6 +874,63 @@ class PoolingSKLayer : public Layer<Dtype> {
   Blob<int> max_idx_;
 };
 
+
+/**
+ * @brief Pools the input image by taking the max, average, etc. within regions.
+ *        Reports type "PoolingND"; one bottom blob, one or two top blobs.
+ * For whole image processing, reducing redundancy.
+ */
+template<typename Dtype>
+class PoolingNDLayer : public Layer<Dtype> {
+ public:
+  explicit PoolingNDLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {
+  }
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+                          const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+                       const vector<Blob<Dtype>*>& top);
+
+ protected:  // NOTE(review): type() and blob-count overrides below are protected
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+                           const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+                           const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+                            const vector<bool>& propagate_down,
+                            const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+                            const vector<bool>& propagate_down,
+                            const vector<Blob<Dtype>*>& bottom);
+
+  virtual inline const char* type() const {
+    return "PoolingND";
+  }
+  virtual inline int ExactNumBottomBlobs() const {
+    return 1;
+  }
+  virtual inline int MinTopBlobs() const {
+    return 1;
+  }
+  // MAX POOL layers can output an extra top blob for the mask;
+  // others can only output the pooled inputs.
+  virtual inline int MaxTopBlobs() const {
+    return
+        (this->layer_param_.pooling_param().pool()
+            == PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+  }
+
+  int max_top_blobs_;  // NOTE(review): not used in this header; confirm in .cpp
+  int pad_h_, pad_w_;  // NOTE(review): h/w-only fields in a layer named "ND"
+  int channels_;
+  int height_, width_;
+  int pooled_height_, pooled_width_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
+  int kstride_h_, kstride_w_;  // presumably kernel-element spacing - confirm
+  Blob<int> max_idx_;  // presumably indices of the max elements - confirm
+};
+
 /**
  * @brief Pools the input image by taking the max, average, etc. within regions.
  *
 
@@ -95,9 +95,19 @@ __kernel void TEMPLATE(col2im_nd, Dtype)(const int n, const int num_axes,
                                   __global Dtype* data_im,
                                   const int data_off) {
   int d_im[6];
+  int d_col_size[6];
   int d_col_iter[6];
   int d_col_start[6];
   int d_col_end[6];
+  int d_ext_patch[6];
+  int d_idx[6];
+
+  for (int i = num_axes - 1; i >= 0; --i) {
+    d_ext_patch[i] = (kernel_shape[i] - 1) * kstride[i] + 1;
+    d_col_size[i] = (im_shape[i + 1] + 2 * pad[i] - d_ext_patch[i])
+        / stride[i] + 1;
+  }
+
   for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
     // Initialize channel_in, computed in the loop below, with intermediate
     // computations used to compute the spatial indices.
@@ -110,51 +120,60 @@ __kernel void TEMPLATE(col2im_nd, Dtype)(const int n, const int num_axes,
     // Calculate col start/end indices.
     bool done = false;
     for (int i = 0; i < num_axes; ++i) {
-      d_col_start[i] = d_col_iter[i] =
+      // Old:
+      /*d_col_start[i] = d_col_iter[i] =
           (d_im[i] < kernel_shape[i]) ?
-              0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
-      d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);
-      if (d_col_start[i] >= d_col_end[i]) {
+          0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
+      d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);*/
+      // New:
+      d_col_start[i] = (d_im[i] < d_ext_patch[i]) ?
+          d_im[i] % kstride[i] : (d_im[i] - d_ext_patch[i]) + 1;
+      d_col_iter[i] = d_col_start[i];
+      d_idx[i] = (d_im[i] - d_col_start[i]) / kstride[i];
+      d_col_end[i] = (d_im[i] >= d_col_size[i]) ?
+          (d_col_size[i] - 1) - ((d_col_size[i] - 1) - d_col_start[i])
+          % kstride[i] : d_im[i];
+      if (d_col_start[i] > d_col_end[i]) {
         // Skip computation if the dimension is 0 at any spatial axis --
         // final val will be 0.
-        data_im[index + data_off] = 0;
+        data_im[index] = 0;
         done = true;
-        break;        // for (int i = 0; i < num_axes; ++i)
+        break;  // for (int i = 0; i < num_axes; ++i)
       }
     }
     if (done) {
-      continue;
+      continue;  // for (int index = get_global_id(0); ...)
     }
     // Loop over the col to compute the output val.
     Dtype val = 0;
     bool incremented = true;
     do {
       // Compute the final offset.
       int final_offset = 0;
-      int kernel_shape_prod = 1;
+      int coeff_prod = 1;
       for (int i = num_axes - 1; i >= 0; --i) {
-        final_offset += (d_im[i] - d_col_iter[i] * stride[i])
-            * kernel_shape_prod;
-        kernel_shape_prod *= kernel_shape[i];
+        final_offset +=  d_col_iter[i] * coeff_prod;
+        coeff_prod *= d_col_size[i];
       }
-      final_offset += kernel_shape_prod * channel_im;
-      for (int i = 0; i < num_axes; ++i) {
-        final_offset *= col_shape[i + 1];
-        final_offset += d_col_iter[i];
+      for (int i = num_axes - 1; i >= 0; --i) {
+        final_offset += d_idx[i] * coeff_prod;
+        coeff_prod *= kernel_shape[i];
       }
-      val += data_col[final_offset + data_col_off];
+      final_offset += channel_im * coeff_prod;
+      val += data_col[final_offset];
       incremented = false;
       for (int i = num_axes - 1; i >= 0; --i) {
-        const int d_max = d_col_end[i];
-        if (d_col_iter[i] == d_max - 1) {
+        if (d_col_iter[i] > d_col_end[i] - kstride[i]) {
           d_col_iter[i] = d_col_start[i];
-        } else {  // d_col_iter[i] < d_max - 1
-          ++d_col_iter[i];
+          d_idx[i] = (d_im[i] - d_col_start[i]) / kstride[i];
+        } else {  // d_col_iter[i] <= d_col_end[i] - kstride[i]
+          d_col_iter[i] += kstride[i];
+          --d_idx[i];
           incremented = true;
           break;  // for (int i = num_axes - 1; i >= 0; --i)
         }
       }  // for (int i = num_axes - 1; i >= 0; --i)
-    } while (incremented);
-    data_im[index + data_off] = val;
-  }
+    }  while (incremented);
+    data_im[index] = val;
+  }  // for (int index = get_global_id(0); ...)
 }
@@ -302,6 +302,7 @@ template void greentea_col2im_nd_gpu<float>(viennacl::ocl::program *prog,
                             cl_mem col_shape, cl_mem kernel_shape, cl_mem pad,
                             cl_mem stride, cl_mem kstride, cl_mem data_im,
                             int data_off);
+
 template void greentea_col2im_nd_gpu<double>(viennacl::ocl::program *prog,
                             viennacl::ocl::context *ctx, cl_mem data_col,
                             const int data_col_off, const int num_spatial_axes,