@@ -165,9 +165,12 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
165165static gpukernel * cl_newkernel (void * ctx , unsigned int count ,
166166 const char * * strings , const size_t * lengths ,
167167 const char * fname , unsigned int argcount ,
168- const int * types , int flags , int * ret , char * * err_str );
168+ const int * types , int flags , int * ret ,
169+ char * * err_str );
169170static void cl_releasekernel (gpukernel * k );
170- static int cl_callkernel (gpukernel * k , size_t bs [2 ], size_t gs [2 ], void * * args );
171+ static int cl_callkernel (gpukernel * k , unsigned int n ,
172+ const size_t * bs , const size_t * gs ,
173+ size_t shared , void * * args );
171174
172175static const char CL_PREAMBLE [] =
173176 "#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)\n"
@@ -552,7 +555,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
552555 cl_ctx * ctx = dst -> ctx ;
553556 const char * rlk [1 ];
554557 void * args [1 ];
555- size_t sz , bytes , n , ls [ 2 ] , gs [ 2 ] ;
558+ size_t sz , bytes , n , ls , gs ;
556559 gpukernel * m ;
557560 cl_mem_flags fl ;
558561 int type ;
@@ -627,12 +630,11 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
627630 if (m == NULL ) return res ;
628631
629632 /* Cheap kernel scheduling */
630- res = cl_property (NULL , NULL , m , GA_KERNEL_PROP_MAXLSIZE , & ls [ 0 ] );
633+ res = cl_property (NULL , NULL , m , GA_KERNEL_PROP_MAXLSIZE , & ls );
631634 if (res != GA_NO_ERROR ) goto fail ;
632- gs [0 ] = ((n - 1 ) / ls [0 ]) + 1 ;
633- gs [1 ] = ls [1 ] = 1 ;
635+ gs = ((n - 1 ) / ls ) + 1 ;
634636 args [0 ] = dst ;
635- res = cl_callkernel (m , ls , gs , args );
637+ res = cl_callkernel (m , 1 , & ls , & gs , 0 , args );
636638
637639 fail :
638640 cl_releasekernel (m );
@@ -833,10 +835,11 @@ static void cl_releasekernel(gpukernel *k) {
833835 }
834836}
835837
836- static int cl_callkernel (gpukernel * k , size_t ls [2 ], size_t gs [2 ],
837- void * * args ) {
838+ static int cl_callkernel (gpukernel * k , unsigned int n ,
839+ const size_t * ls , const size_t * gs ,
840+ size_t shared , void * * args ) {
838841 cl_ctx * ctx = k -> ctx ;
839- size_t _gs [2 ];
842+ size_t _gs [3 ];
840843 cl_event ev ;
841844 cl_event * evw ;
842845 gpudata * btmp ;
@@ -849,6 +852,12 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
849852 ASSERT_KER (k );
850853 ASSERT_CTX (ctx );
851854
855+ if (n > 3 )
856+ return GA_VALUE_ERROR ;
857+
858+ if (shared != 0 )
859+ return GA_UNSUPPORTED_ERROR ;
860+
852861 dev = get_dev (ctx -> ctx , & res );
853862 if (dev == NULL ) return res ;
854863
@@ -859,16 +868,22 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
859868 }
860869
861870 for (i = 0 ; i < k -> argcount ; i ++ ) {
862- if (k -> types [i ] == GA_BUFFER ) {
871+ switch (k -> types [i ]) {
872+ case GA_POINTER :
873+ free (evw );
874+ return GA_DEVSUP_ERROR ;
875+ case GA_BUFFER :
863876 btmp = (gpudata * )args [i ];
864877 if (btmp -> ev != NULL )
865878 evw [num_ev ++ ] = btmp -> ev ;
866879 ctx -> err = clSetKernelArg (k -> k , i , sizeof (cl_mem ), & btmp -> buf );
867- } else if (k -> types [i ] == GA_SIZE ) {
880+ break ;
881+ case GA_SIZE :
868882 temp = * ((size_t * )args [i ]);
869883 ctx -> err = clSetKernelArg (k -> k , i , gpuarray_get_elsize (k -> types [i ]),
870884 & temp );
871- } else {
885+ break ;
886+ default :
872887 ctx -> err = clSetKernelArg (k -> k , i , gpuarray_get_elsize (k -> types [i ]),
873888 args [i ]);
874889 }
@@ -883,9 +898,15 @@ static int cl_callkernel(gpukernel *k, size_t ls[2], size_t gs[2],
883898 evw = NULL ;
884899 }
885900
886- _gs [0 ] = gs [0 ] * ls [0 ];
887- _gs [1 ] = gs [1 ] * ls [1 ];
888- ctx -> err = clEnqueueNDRangeKernel (ctx -> q , k -> k , 2 , NULL , _gs , ls ,
901+ switch (n ) {
902+ case 3 :
903+ _gs [2 ] = gs [2 ] * ls [2 ];
904+ case 2 :
905+ _gs [1 ] = gs [1 ] * ls [1 ];
906+ case 1 :
907+ _gs [0 ] = gs [0 ] * ls [0 ];
908+ }
909+ ctx -> err = clEnqueueNDRangeKernel (ctx -> q , k -> k , n , NULL , _gs , ls ,
889910 num_ev , evw , & ev );
890911 free (evw );
891912 if (ctx -> err != CL_SUCCESS ) return GA_IMPL_ERROR ;
@@ -989,7 +1010,7 @@ static int cl_extcopy(gpudata *input, size_t ioff, gpudata *output,
9891010 const ssize_t * b_str ) {
9901011 cl_ctx * ctx = input -> ctx ;
9911012 strb sb = STRB_STATIC_INIT ;
992- size_t nEls , ls [ 2 ] , gs [ 2 ] ;
1013+ size_t nEls , ls , gs ;
9931014 gpukernel * k ;
9941015 void * args [2 ];
9951016 cl_mem_flags fl ;
@@ -1058,14 +1079,13 @@ static int cl_extcopy(gpudata *input, size_t ioff, gpudata *output,
10581079 2 , types , flags , & res , NULL );
10591080 if (k == NULL ) goto fail ;
10601081 /* Cheap kernel scheduling */
1061- res = cl_property (NULL , NULL , k , GA_KERNEL_PROP_MAXLSIZE , & ls [ 0 ] );
1082+ res = cl_property (NULL , NULL , k , GA_KERNEL_PROP_MAXLSIZE , & ls );
10621083 if (res != GA_NO_ERROR ) goto kfail ;
10631084
1064- gs [0 ] = ((nEls - 1 ) / ls [0 ]) + 1 ;
1065- gs [1 ] = ls [1 ] = 1 ;
1085+ gs = ((nEls - 1 ) / ls ) + 1 ;
10661086 args [0 ] = input ;
10671087 args [1 ] = output ;
1068- res = cl_callkernel (k , ls , gs , args );
1088+ res = cl_callkernel (k , 1 , & ls , & gs , 0 , args );
10691089
10701090 kfail :
10711091 cl_releasekernel (k );
0 commit comments