Skip to content

Commit 921ba76

Browse files
committed
Rework of the call protocol for kernel to lower overhead and allow
more possibilities. You can now allocate dynamic shared memory and use any number of dimensions in the call (subject to backend limits). Also, this removes the scheduling feature from GpuKernel_call() itself and provides it as GpuKernel_sched() that you can call if you need it.
1 parent fa00d36 commit 921ba76

7 files changed

Lines changed: 108 additions & 120 deletions

File tree

src/gen_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ def add_type(name, sz):
164164
* List of all built-in types.
165165
*/
166166
enum GPUARRAY_TYPES {
167+
GA_POINTER = -2,
167168
GA_BUFFER = -1,
168169
% for i, v in sorted(TYPEMAP.items()):
169170
GA_${v[1].upper()} = ${i},

src/gpuarray/buffer.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,17 @@ typedef struct _gpuarray_buffer_ops {
324324
* Call a kernel.
325325
*
326326
* \param k kernel
327-
* \param bs block size for this call (also known as local size)
328-
* \param gs grid size for this call (also known as global size)
327+
* \param n number of dimensions of grid/block
328+
* \param bs block sizes for this call (also known as local size)
329+
* \param gs grid sizes for this call (also known as global size)
330+
* \param shared amount of dynamic shared memory to reserve
331+
* \param args table of pointers to each argument.
329332
*
330333
* \returns GA_NO_ERROR or an error code if an error occurred.
331334
*/
332-
int (*kernel_call)(gpukernel *k, size_t bs[2], size_t gs[2], void **args);
335+
int (*kernel_call)(gpukernel *k, unsigned int n,
336+
const size_t *bs, const size_t *gs,
337+
size_t shared, void **args);
333338

334339
/**
335340
* Get the kernel binary.

src/gpuarray/kernel.h

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,30 +77,37 @@ GPUARRAY_PUBLIC void GpuKernel_clear(GpuKernel *k);
7777
GPUARRAY_PUBLIC void *GpuKernel_context(GpuKernel *k);
7878

7979
/**
80-
* Launch the execution of a kernel.
80+
* Do a scheduling of local and global size for a kernel.
8181
*
82-
* You either specify the block and grid sizes (`ls` and `gs`) or the
83-
* total size (`n`). Set a value to `0` to indicate it is
84-
* unspecified. You can also specify the total size (`n`) and one of
85-
* the block (`ls`) or grid (`gs`) size.
82+
* This function will find an optimal grid and block size for the
83+
* number of elements specified in n when running kernel k. The
84+
* parameters may run a bit more instances than n for efficiency
85+
* reasons, so your kernel must be ready to deal with that.
8686
*
87-
* If you leave one or both of `ls` or `gs`, it will be filled
88-
* according to a heuristic to get a good performance out of your
89-
* hardware. However the number of kernel instances that will be run
90-
* can be slightly higher than the total size you specified in order
91-
* to avoid performance degradation. Your kernel should be ready to
92-
* handle this.
87+
* If either gs or ls is not 0 on entry its value will not be altered
88+
* and will be taken into account when choosing the other value.
9389
*
94-
* \param k the kernel to launch
95-
* \param n number of instances to launch
96-
* \param ls size of launch blocks
97-
* \param gs size of launch grid
90+
* \param k the kernel to schedule for
91+
* \param n number of elements to handle
92+
* \param ls local size (in/out)
93+
* \param gs grid size (in/out)
9894
*/
99-
GPUARRAY_PUBLIC int GpuKernel_call2(GpuKernel *k, size_t n[2],
100-
size_t ls[2], size_t gs[2], void **args);
95+
GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
96+
size_t *ls, size_t *gs);
10197

102-
GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, size_t n,
103-
size_t ls, size_t gs, void **args);
98+
/**
99+
* Launch the execution of a kernel.
100+
*
101+
* \param k the kernel to launch
102+
* \param n dimensionality of the grid/blocks
103+
* \param ls sizes of launch blocks
104+
* \param gs sizes of launch grid
105+
* \param shared amount of dynamic shared memory to allocate
106+
* \param args table of pointers to arguments
107+
*/
108+
GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
109+
const size_t *ls, const size_t *gs,
110+
size_t shared, void **args);
104111

105112
GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
106113
void **obj);

src/gpuarray/types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ typedef struct _gpuarray_type {
4343
* List of all built-in types.
4444
*/
4545
enum GPUARRAY_TYPES {
46+
GA_POINTER = -2,
4647
GA_BUFFER = -1,
4748
GA_BOOL = 0,
4849
GA_BYTE = 1,

src/gpuarray_buffer_cuda.c

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -892,33 +892,41 @@ static void cuda_freekernel(gpukernel *k) {
892892
}
893893
}
894894

895-
static int cuda_callkernel(gpukernel *k, size_t bs[2], size_t gs[2],
896-
void **args) {
895+
static int cuda_callkernel(gpukernel *k, unsigned int n,
896+
const size_t *bs, const size_t *gs,
897+
size_t shared, void **args) {
897898
cuda_context *ctx = k->ctx;
898899
unsigned int i;
900+
int res = GA_NO_ERROR;
899901

900902
ASSERT_KER(k);
901903
cuda_enter(ctx);
902904
if (ctx->err != CUDA_SUCCESS)
903905
return GA_IMPL_ERROR;
904906

905-
for (i = 0; i < k->argcount; i++) {
906-
if (k->types[i] == GA_BUFFER) {
907-
k->args[i] = &((gpudata *)args[i])->ptr;
908-
} else {
909-
k->args[i] = args[i];
910-
}
907+
switch (n) {
908+
case 1:
909+
ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
910+
ctx->s, args, NULL);
911+
break;
912+
case 2:
913+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
914+
ctx->s, args, NULL);
915+
break;
916+
case 3:
917+
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
918+
shared, ctx->s, args, NULL);
919+
break;
920+
default:
921+
cuda_exit(ctx);
922+
return GA_VALUE_ERROR;
911923
}
912-
913-
ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, 0,
914-
ctx->s, k->args, NULL);
915924
if (ctx->err != CUDA_SUCCESS) {
916-
cuda_exit(ctx);
917-
return GA_IMPL_ERROR;
925+
res = GA_IMPL_ERROR;
918926
}
919927

920928
cuda_exit(ctx);
921-
return GA_NO_ERROR;
929+
return res;
922930
}
923931

924932
static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) {
@@ -1177,7 +1185,7 @@ static int cuda_extcopy(gpudata *input, size_t ioff, gpudata *output,
11771185
int res = GA_SYS_ERROR;
11781186
int in_cache = 1;
11791187
unsigned int i;
1180-
size_t nEls = 1, ls[2], gs[2];
1188+
size_t nEls = 1, ls, gs;
11811189
gpukernel *k;
11821190
cache_val_t *v;
11831191
cache_key_t a;
@@ -1230,14 +1238,13 @@ static int cuda_extcopy(gpudata *input, size_t ioff, gpudata *output,
12301238
}
12311239

12321240
/* Cheap kernel scheduling */
1233-
res = cuda_property(NULL, NULL, *v, GA_KERNEL_PROP_MAXLSIZE, ls);
1241+
res = cuda_property(NULL, NULL, *v, GA_KERNEL_PROP_MAXLSIZE, &ls);
12341242
if (res != GA_NO_ERROR) goto fail;
12351243

1236-
gs[0] = ((nEls-1) / ls[0]) + 1;
1237-
gs[1] = ls[1] = 1;
1244+
gs = ((nEls-1) / ls) + 1;
12381245
args[0] = input;
12391246
args[1] = output;
1240-
res = cuda_callkernel(*v, ls, gs, args);
1247+
res = cuda_callkernel(*v, 1, &ls, &gs, 0, args);
12411248

12421249
fail:
12431250
if (!in_cache)

src/gpuarray_buffer_opencl.c

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,12 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
165165
static gpukernel *cl_newkernel(void *ctx, unsigned int count,
166166
const char **strings, const size_t *lengths,
167167
const char *fname, unsigned int argcount,
168-
const int *types, int flags, int *ret, char **err_str);
168+
const int *types, int flags, int *ret,
169+
char **err_str);
169170
static void cl_releasekernel(gpukernel *k);
170-
static int cl_callkernel(gpukernel *k, size_t bs[2], size_t gs[2], void **args);
171+
static int cl_callkernel(gpukernel *k, unsigned int n,
172+
const size_t *bs, const size_t *gs,
173+
size_t shared, void **args);
171174

172175
static const char CL_PREAMBLE[] =
173176
"#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)\n"
@@ -552,7 +555,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
552555
cl_ctx *ctx = dst->ctx;
553556
const char *rlk[1];
554557
void *args[1];
555-
size_t sz, bytes, n, ls[2], gs[2];
558+
size_t sz, bytes, n, ls, gs;
556559
gpukernel *m;
557560
cl_mem_flags fl;
558561
int type;
@@ -627,12 +630,11 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
627630
if (m == NULL) return res;
628631

629632
/* Cheap kernel scheduling */
630-
res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls[0]);
633+
res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls);
631634
if (res != GA_NO_ERROR) goto fail;
632-
gs[0] = ((n-1) / ls[0]) + 1;
633-
gs[1] = ls[1] = 1;
635+
gs = ((n-1) / ls) + 1;
634636
args[0] = dst;
635-
res = cl_callkernel(m, ls, gs, args);
637+
res = cl_callkernel(m, 1, &ls, &gs, 0, args);
636638

637639
fail:
638640
cl_releasekernel(m);
@@ -833,10 +835,11 @@ static void cl_releasekernel(gpukernel *k) {
833835
}
834836
}
835837

836-
static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
837-
void **args) {
838+
static int cl_callkernel(gpukernel *k, unsigned int n,
839+
const size_t *ls, const size_t *gs,
840+
size_t shared, void **args) {
838841
cl_ctx *ctx = k->ctx;
839-
size_t _gs[2];
842+
size_t _gs[3];
840843
cl_event ev;
841844
cl_event *evw;
842845
gpudata *btmp;
@@ -849,6 +852,12 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
849852
ASSERT_KER(k);
850853
ASSERT_CTX(ctx);
851854

855+
if (n > 3)
856+
return GA_VALUE_ERROR;
857+
858+
if (shared != 0)
859+
return GA_UNSUPPORTED_ERROR;
860+
852861
dev = get_dev(ctx->ctx, &res);
853862
if (dev == NULL) return res;
854863

@@ -859,16 +868,22 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
859868
}
860869

861870
for (i = 0; i < k->argcount; i++) {
862-
if (k->types[i] == GA_BUFFER) {
871+
switch (k->types[i]) {
872+
case GA_POINTER:
873+
free(evw);
874+
return GA_DEVSUP_ERROR;
875+
case GA_BUFFER:
863876
btmp = (gpudata *)args[i];
864877
if (btmp->ev != NULL)
865878
evw[num_ev++] = btmp->ev;
866879
ctx->err = clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf);
867-
} else if (k->types[i] == GA_SIZE) {
880+
break;
881+
case GA_SIZE:
868882
temp = *((size_t *)args[i]);
869883
ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]),
870884
&temp);
871-
} else {
885+
break;
886+
default:
872887
ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]),
873888
args[i]);
874889
}
@@ -883,9 +898,15 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
883898
evw = NULL;
884899
}
885900

886-
_gs[0] = gs[0] * ls[0];
887-
_gs[1] = gs[1] * ls[1];
888-
ctx->err = clEnqueueNDRangeKernel(ctx->q, k->k, 2, NULL, _gs, ls,
901+
switch (n) {
902+
case 3:
903+
_gs[2] = gs[2] * ls[2];
904+
case 2:
905+
_gs[1] = gs[1] * ls[1];
906+
case 1:
907+
_gs[0] = gs[0] * ls[0];
908+
}
909+
ctx->err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls,
889910
num_ev, evw, &ev);
890911
free(evw);
891912
if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
@@ -989,7 +1010,7 @@ static int cl_extcopy(gpudata *input, size_t ioff, gpudata *output,
9891010
const ssize_t *b_str) {
9901011
cl_ctx *ctx = input->ctx;
9911012
strb sb = STRB_STATIC_INIT;
992-
size_t nEls, ls[2], gs[2];
1013+
size_t nEls, ls, gs;
9931014
gpukernel *k;
9941015
void *args[2];
9951016
cl_mem_flags fl;
@@ -1058,14 +1079,13 @@ static int cl_extcopy(gpudata *input, size_t ioff, gpudata *output,
10581079
2, types, flags, &res, NULL);
10591080
if (k == NULL) goto fail;
10601081
/* Cheap kernel scheduling */
1061-
res = cl_property(NULL, NULL, k, GA_KERNEL_PROP_MAXLSIZE, &ls[0]);
1082+
res = cl_property(NULL, NULL, k, GA_KERNEL_PROP_MAXLSIZE, &ls);
10621083
if (res != GA_NO_ERROR) goto kfail;
10631084

1064-
gs[0] = ((nEls-1) / ls[0]) + 1;
1065-
gs[1] = ls[1] = 1;
1085+
gs = ((nEls-1) / ls) + 1;
10661086
args[0] = input;
10671087
args[1] = output;
1068-
res = cl_callkernel(k, ls, gs, args);
1088+
res = cl_callkernel(k, 1, &ls, &gs, 0, args);
10691089

10701090
kfail:
10711091
cl_releasekernel(k);

src/gpuarray_kernel.c

Lines changed: 5 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ void *GpuKernel_context(GpuKernel *k) {
3636
return res;
3737
}
3838

39-
static int do_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
39+
int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
4040
size_t min_l;
4141
size_t max_l;
4242
size_t max_g;
@@ -70,63 +70,10 @@ static int do_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
7070
return GA_NO_ERROR;
7171
}
7272

73-
int GpuKernel_call(GpuKernel *k, size_t n, size_t bs, size_t gs, void **args) {
74-
size_t _n[2], _bs[2], _gs[2];
75-
_n[1] = _bs[1] = _gs[1] = 1;
76-
_n[0] = n;
77-
_bs[0] = bs;
78-
_gs[0] = gs;
79-
return GpuKernel_call2(k, _n, _bs, _gs, args);
80-
}
81-
82-
int GpuKernel_call2(GpuKernel *k, size_t n[2], size_t _bs[2], size_t _gs[2],
83-
void **args) {
84-
size_t bs[2] = {0, 0}, gs[2] = {0, 0};
85-
int *types;
86-
unsigned int argcount;
87-
unsigned int i;
88-
int err;
89-
90-
if (_bs != NULL) bs[0] = _bs[0], bs[1] = _bs[1];
91-
if (_gs != NULL) gs[0] = _gs[0], gs[1] = _gs[1];
92-
if (n == NULL) {
93-
if (_bs == NULL || _gs == NULL ||
94-
bs[0] == 0 || bs[1] == 0 ||
95-
gs[0] == 0 || gs[1] == 0)
96-
return GA_INVALID_ERROR;
97-
} else {
98-
if (bs[0] == 0 || gs[0] == 0) {
99-
if (n[0] == 0)
100-
return GA_INVALID_ERROR;
101-
err = do_sched(k, n[0], &bs[0], &gs[0]);
102-
if (err != GA_NO_ERROR)
103-
return err;
104-
}
105-
106-
if (bs[1] == 0 || gs[1] == 0) {
107-
if (n[1] == 0)
108-
return GA_INVALID_ERROR;
109-
if (n[1] == 1) {
110-
bs[1] = 1;
111-
gs[1] = 1;
112-
} else {
113-
err = do_sched(k, n[1], &bs[1], &gs[1]);
114-
if (err != GA_NO_ERROR)
115-
return err;
116-
}
117-
}
118-
}
119-
err = k->ops->property(NULL, NULL, k->k, GA_KERNEL_PROP_NUMARGS, &argcount);
120-
if (err != GA_NO_ERROR) return err;
121-
err = k->ops->property(NULL, NULL, k->k, GA_KERNEL_PROP_TYPES, &types);
122-
if (err != GA_NO_ERROR) return err;
123-
124-
for (i = 0; i < argcount; i++)
125-
if (types[i] == GA_BUFFER)
126-
k->args[i] = ((GpuArray *)args[i])->data;
127-
else
128-
k->args[i] = args[i];
129-
return k->ops->kernel_call(k->k, bs, gs, k->args);
73+
int GpuKernel_call(GpuKernel *k, unsigned int n,
74+
const size_t *bs, const size_t *gs,
75+
size_t shared, void **args) {
76+
return k->ops->kernel_call(k->k, n, bs, gs, shared, args);
13077
}
13178

13279
int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {

0 commit comments

Comments
 (0)