Skip to content

Commit 921ba76

Browse files
committed
Rework of the call protocol for kernel to lower overhead and allow
more possibilities. You can now allocate dynamic shared memory and use any number of dimensions in the call (subject to backend limits). Also, this removes the scheduling feature from GpuKernel_call() itself and provides it as GpuKernel_sched() that you can call if you need it.
1 parent fa00d36 commit 921ba76

7 files changed

Lines changed: 108 additions & 120 deletions

File tree

src/gen_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ def add_type(name, sz):
164164
* List of all built-in types.
165165
*/
166166
enum GPUARRAY_TYPES {
167+
GA_POINTER = -2,
167168
GA_BUFFER = -1,
168169
% for i, v in sorted(TYPEMAP.items()):
169170
GA_${v[1].upper()} = ${i},

src/gpuarray/buffer.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,17 @@ typedef struct _gpuarray_buffer_ops {
324324
* Call a kernel.
325325
*
326326
* \param k kernel
327-
* \param bs block size for this call (also known as local size)
328-
* \param gs grid size for this call (also known as global size)
327+
* \param n number of dimensions of grid/block
328+
* \param bs block sizes for this call (also known as local size)
329+
* \param gs grid sizes for this call (also known as global size)
330+
* \param shared amount of dynamic shared memory to reserve
331+
* \param args table of pointers to each argument.
329332
*
330333
* \returns GA_NO_ERROR or an error code if an error occurred.
331334
*/
332-
int (*kernel_call)(gpukernel *k, size_t bs[2], size_t gs[2], void **args);
335+
int (*kernel_call)(gpukernel *k, unsigned int n,
336+
const size_t *bs, const size_t *gs,
337+
size_t shared, void **args);
333338

334339
/**
335340
* Get the kernel binary.

src/gpuarray/kernel.h

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,30 +77,37 @@ GPUARRAY_PUBLIC void GpuKernel_clear(GpuKernel *k);
7777
GPUARRAY_PUBLIC void *GpuKernel_context(GpuKernel *k);
7878

7979
/**
80-
* Launch the execution of a kernel.
80+
* Do a scheduling of local and global size for a kernel.
8181
*
82-
* You either specify the block and grid sizes (`ls` and `gs`) or the
83-
* total size (`n`). Set a value to `0` to indicate it is
84-
* unspecified. You can also specify the total size (`n`) and one of
85-
* the block (`ls`) or grid (`gs`) size.
82+
* This function will find an optimal grid and block size for the
83+
* number of elements specified in n when running kernel k. The
84+
* parameters may run a bit more instances than n for efficiency
85+
* reasons, so your kernel must be ready to deal with that.
8686
*
87-
* If you leave one or both of `ls` or `gs`, it will be filled
88-
* according to a heuristic to get a good performance out of your
89-
* hardware. However the number of kernel instances that will be run
90-
* can be slightly higher than the total size you specified in order
91-
* to avoid performance degradation. Your kernel should be ready to
92-
* handle this.
87+
* If either gs or ls is not 0 on entry its value will not be altered
88+
* and will be taken into account when choosing the other value.
9389
*
94-
* \param k the kernel to launch
95-
* \param n number of instances to launch
96-
* \param ls size of launch blocks
97-
* \param gs size of launch grid
90+
* \param k the kernel to schedule for
91+
* \param n number of elements to handle
92+
* \param ls local size (in/out)
93+
* \param gs grid size (in/out)
9894
*/
99-
GPUARRAY_PUBLIC int GpuKernel_call2(GpuKernel *k, size_t n[2],
100-
size_t ls[2], size_t gs[2], void **args);
95+
GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
96+
size_t *ls, size_t *gs);
10197

102-
GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, size_t n,
103-
size_t ls, size_t gs, void **args);
98+
/**
99+
* Launch the execution of a kernel.
100+
*
101+
* \param k the kernel to launch
102+
* \param n dimensionality of the grid/blocks
103+
* \param ls sizes of launch blocks
104+
* \param gs sizes of launch grid
105+
* \param shared amount of dynamic shared memory to allocate
106+
* \param args table of pointers to arguments
107+
*/
108+
GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
109+
const size_t *ls, const size_t *gs,
110+
size_t shared, void **args);
104111

105112
GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
106113
void **obj);

src/gpuarray/types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ typedef struct _gpuarray_type {
4343
* List of all built-in types.
4444
*/
4545
enum GPUARRAY_TYPES {
46+
GA_POINTER = -2,
4647
GA_BUFFER = -1,
4748
GA_BOOL = 0,
4849
GA_BYTE = 1,

src/gpuarray_buffer_cuda.c

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -892,33 +892,41 @@ static void cuda_freekernel(gpukernel *k) {
892892
}
893893
}
894894

895-
static int cuda_callkernel(gpukernel *k, size_t bs[2], size_t gs[2],
896-
void **args) {
895+
static int cuda_callkernel(gpukernel *k, unsigned int n,
896+
const size_t *bs, const size_t *gs,
897+
size_t shared, void **args) {
897898
cuda_context *ctx = k->ctx;
898899
unsigned int i;
900+
int res = GA_NO_ERROR;
899901

900902
ASSERT_KER(k);
901903
cuda_enter(ctx);
902904
if (ctx->err != CUDA_SUCCESS)
903905
return GA_IMPL_ERROR;
904906

905-
for (i = 0; i < k->argcount; i++) {
906-
if (k->types[i] == GA_BUFFER) {
907-
k->args[i] = &((gpudata *)args[i])->ptr;
908-
} else {
909-
k->args[i] = args[i];
910-
}
907+
switch (n) {
908+
case 1:
909+
ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
910+
ctx->s, args, NULL);
911+
break;
912+
case 2:
913+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
914+
ctx->s, args, NULL);
915+
break;
916+
case 3:
917+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
918+
shared, ctx->s, args, NULL);
919+
break;
920+
default:
921+
cuda_exit(ctx);
922+
return GA_VALUE_ERROR;
911923
}
912-
913-
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, 0,
914-
ctx->s, k->args, NULL);
915924
if (ctx->err != CUDA_SUCCESS) {
916-
cuda_exit(ctx);
917-
return GA_IMPL_ERROR;
925+
res = GA_IMPL_ERROR;
918926
}
919927

920928
cuda_exit(ctx);
921-
return GA_NO_ERROR;
929+
return res;
922930
}
923931

924932
static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) {
@@ -1177,7 +1185,7 @@ static int cuda_extcopy(gpudata *input, size_t ioff, gpudata *output,
11771185
int res = GA_SYS_ERROR;
11781186
int in_cache = 1;
11791187
unsigned int i;
1180-
size_t nEls = 1, ls[2], gs[2];
1188+
size_t nEls = 1, ls, gs;
11811189
gpukernel *k;
11821190
cache_val_t *v;
11831191
cache_key_t a;
@@ -1230,14 +1238,13 @@ static int cuda_extcopy(gpudata *input, size_t ioff, gpudata *output,
12301238
}
12311239

12321240
/* Cheap kernel scheduling */
1233-
res = cuda_property(NULL, NULL, *v, GA_KERNEL_PROP_MAXLSIZE, ls);
1241+
res = cuda_property(NULL, NULL, *v, GA_KERNEL_PROP_MAXLSIZE, &ls);
12341242
if (res != GA_NO_ERROR) goto fail;
12351243

1236-
gs[0] = ((nEls-1) / ls[0]) + 1;
1237-
gs[1] = ls[1] = 1;
1244+
gs = ((nEls-1) / ls) + 1;
12381245
args[0] = input;
12391246
args[1] = output;
1240-
res = cuda_callkernel(*v, ls, gs, args);
1247+
res = cuda_callkernel(*v, 1, &ls, &gs, 0, args);
12411248

12421249
fail:
12431250
if (!in_cache)

src/gpuarray_buffer_opencl.c

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,12 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
165165
static gpukernel *cl_newkernel(void *ctx, unsigned int count,
166166
const char **strings, const size_t *lengths,
167167
const char *fname, unsigned int argcount,
168-
const int *types, int flags, int *ret, char **err_str);
168+
const int *types, int flags, int *ret,
169+
char **err_str);
169170
static void cl_releasekernel(gpukernel *k);
170-
static int cl_callkernel(gpukernel *k, size_t bs[2], size_t gs[2], void **args);
171+
static int cl_callkernel(gpukernel *k, unsigned int n,
172+
const size_t *bs, const size_t *gs,
173+
size_t shared, void **args);
171174

172175
static const char CL_PREAMBLE[] =
173176
"#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)\n"
@@ -552,7 +555,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
552555
cl_ctx *ctx = dst->ctx;
553556
const char *rlk[1];
554557
void *args[1];
555-
size_t sz, bytes, n, ls[2], gs[2];
558+
size_t sz, bytes, n, ls, gs;
556559
gpukernel *m;
557560
cl_mem_flags fl;
558561
int type;
@@ -627,12 +630,11 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
627630
if (m == NULL) return res;
628631

629632
/* Cheap kernel scheduling */
630-
res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls[0]);
633+
res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls);
631634
if (res != GA_NO_ERROR) goto fail;
632-
gs[0] = ((n-1) / ls[0]) + 1;
633-
gs[1] = ls[1] = 1;
635+
gs = ((n-1) / ls) + 1;
634636
args[0] = dst;
635-
res = cl_callkernel(m, ls, gs, args);
637+
res = cl_callkernel(m, 1, &ls, &gs, 0, args);
636638

637639
fail:
638640
cl_releasekernel(m);
@@ -833,10 +835,11 @@ static void cl_releasekernel(gpukernel *k) {
833835
}
834836
}
835837

836-
static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
837-
void **args) {
838+
static int cl_callkernel(gpukernel *k, unsigned int n,
839+
const size_t *ls, const size_t *gs,
840+
size_t shared, void **args) {
838841
cl_ctx *ctx = k->ctx;
839-
size_t _gs[2];
842+
size_t _gs[3];
840843
cl_event ev;
841844
cl_event *evw;
842845
gpudata *btmp;
@@ -849,6 +852,12 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
849852
ASSERT_KER(k);
850853
ASSERT_CTX(ctx);
851854

855+
if (n > 3)
856+
return GA_VALUE_ERROR;
857+
858+
if (shared != 0)
859+
return GA_UNSUPPORTED_ERROR;
860+
852861
dev = get_dev(ctx->ctx, &res);
853862
if (dev == NULL) return res;
854863

@@ -859,16 +868,22 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
859868
}
860869

861870
for (i = 0; i < k->argcount; i++) {
862-
if (k->types[i] == GA_BUFFER) {
871+
switch (k->types[i]) {
872+
case GA_POINTER:
873+
free(evw);
874+
return GA_DEVSUP_ERROR;
875+
case GA_BUFFER:
863876
btmp = (gpudata *)args[i];
864877
if (btmp->ev != NULL)
865878
evw[num_ev++] = btmp->ev;
866879
ctx->err = clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf);
867-
} else if (k->types[i] == GA_SIZE) {
880+
break;
881+
case GA_SIZE:
868882
temp = *((size_t *)args[i]);
869883
ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]),
870884
&temp);
871-
} else {
885+
break;
886+
default:
872887
ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]),
873888
args[i]);
874889
}
@@ -883,9 +898,15 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
883898
evw = NULL;
884899
}
885900

886-
_gs[0] = gs[0] * ls[0];
887-
_gs[1] = gs[1] * ls[1];
888-
ctx->err = clEnqueueNDRangeKernel(ctx->q, k->k, 2, NULL, _gs, ls,
901+
switch (n) {
902+
case 3:
903+
_gs[2] = gs[2] * ls[2];
904+
case 2:
905+
_gs[1] = gs[1] * ls[1];
906+
case 1:
907+
_gs[0] = gs[0] * ls[0];
908+
}
909+
ctx->err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls,
889910
num_ev, evw, &ev);
890911
free(evw);
891912
if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
@@ -989,7 +1010,7 @@ static int cl_extcopy(gpudata *input, size_t ioff, gpudata *output,
9891010
const ssize_t *b_str) {
9901011
cl_ctx *ctx = input->ctx;
9911012
strb sb = STRB_STATIC_INIT;
992-
size_t nEls, ls[2], gs[2];
1013+
size_t nEls, ls, gs;
9931014
gpukernel *k;
9941015
void *args[2];
9951016
cl_mem_flags fl;
@@ -1058,14 +1079,13 @@ static int cl_extcopy(gpudata *input, size_t ioff, gpudata *output,
10581079
2, types, flags, &res, NULL);
10591080
if (k == NULL) goto fail;
10601081
/* Cheap kernel scheduling */
1061-
res = cl_property(NULL, NULL, k, GA_KERNEL_PROP_MAXLSIZE, &ls[0]);
1082+
res = cl_property(NULL, NULL, k, GA_KERNEL_PROP_MAXLSIZE, &ls);
10621083
if (res != GA_NO_ERROR) goto kfail;
10631084

1064-
gs[0] = ((nEls-1) / ls[0]) + 1;
1065-
gs[1] = ls[1] = 1;
1085+
gs = ((nEls-1) / ls) + 1;
10661086
args[0] = input;
10671087
args[1] = output;
1068-
res = cl_callkernel(k, ls, gs, args);
1088+
res = cl_callkernel(k, 1, &ls, &gs, 0, args);
10691089

10701090
kfail:
10711091
cl_releasekernel(k);

src/gpuarray_kernel.c

Lines changed: 5 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ void *GpuKernel_context(GpuKernel *k) {
3636
return res;
3737
}
3838

39-
static int do_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
39+
int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
4040
size_t min_l;
4141
size_t max_l;
4242
size_t max_g;
@@ -70,63 +70,10 @@ static int do_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
7070
return GA_NO_ERROR;
7171
}
7272

73-
int GpuKernel_call(GpuKernel *k, size_t n, size_t bs, size_t gs, void **args) {
74-
size_t _n[2], _bs[2], _gs[2];
75-
_n[1] = _bs[1] = _gs[1] = 1;
76-
_n[0] = n;
77-
_bs[0] = bs;
78-
_gs[0] = gs;
79-
return GpuKernel_call2(k, _n, _bs, _gs, args);
80-
}
81-
82-
int GpuKernel_call2(GpuKernel *k, size_t n[2], size_t _bs[2], size_t _gs[2],
83-
void **args) {
84-
size_t bs[2] = {0, 0}, gs[2] = {0, 0};
85-
int *types;
86-
unsigned int argcount;
87-
unsigned int i;
88-
int err;
89-
90-
if (_bs != NULL) bs[0] = _bs[0], bs[1] = _bs[1];
91-
if (_gs != NULL) gs[0] = _gs[0], gs[1] = _gs[1];
92-
if (n == NULL) {
93-
if (_bs == NULL || _gs == NULL ||
94-
bs[0] == 0 || bs[1] == 0 ||
95-
gs[0] == 0 || gs[1] == 0)
96-
return GA_INVALID_ERROR;
97-
} else {
98-
if (bs[0] == 0 || gs[0] == 0) {
99-
if (n[0] == 0)
100-
return GA_INVALID_ERROR;
101-
err = do_sched(k, n[0], &bs[0], &gs[0]);
102-
if (err != GA_NO_ERROR)
103-
return err;
104-
}
105-
106-
if (bs[1] == 0 || gs[1] == 0) {
107-
if (n[1] == 0)
108-
return GA_INVALID_ERROR;
109-
if (n[1] == 1) {
110-
bs[1] = 1;
111-
gs[1] = 1;
112-
} else {
113-
err = do_sched(k, n[1], &bs[1], &gs[1]);
114-
if (err != GA_NO_ERROR)
115-
return err;
116-
}
117-
}
118-
}
119-
err = k->ops->property(NULL, NULL, k->k, GA_KERNEL_PROP_NUMARGS, &argcount);
120-
if (err != GA_NO_ERROR) return err;
121-
err = k->ops->property(NULL, NULL, k->k, GA_KERNEL_PROP_TYPES, &types);
122-
if (err != GA_NO_ERROR) return err;
123-
124-
for (i = 0; i < argcount; i++)
125-
if (types[i] == GA_BUFFER)
126-
k->args[i] = ((GpuArray *)args[i])->data;
127-
else
128-
k->args[i] = args[i];
129-
return k->ops->kernel_call(k->k, bs, gs, k->args);
73+
int GpuKernel_call(GpuKernel *k, unsigned int n,
74+
const size_t *bs, const size_t *gs,
75+
size_t shared, void **args) {
76+
return k->ops->kernel_call(k->k, n, bs, gs, shared, args);
13077
}
13178

13279
int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {

0 commit comments

Comments
 (0)