/** \file buffer.h
* \brief This file contains the interface definition for the backends.
*
* For normal use you should not call the functions defined in this
* file directly.
*
* \see array.h For managing buffers
* \see kernel.h For using kernels
*/
#ifndef GPUARRAY_BUFFER_H
#define GPUARRAY_BUFFER_H
#include
#include
#include
#include
#ifdef __cplusplus
extern "C" {
#endif
#ifdef CONFUSE_EMACS
}
#endif
/**
* Opaque handle for buffer data.
*
* Only pointers to this type are exchanged through the API.
*/
typedef struct _gpudata gpudata;
/**
* Opaque handle for context data.
*
* Only pointers to this type are exchanged through the API.
*/
typedef struct _gpucontext gpucontext;
/**
* Opaque handle for kernel data.
*
* Only pointers to this type are exchanged through the API.
*/
typedef struct _gpukernel gpukernel;
/**
* Query how many platforms are available on this host for the backend
* named `name`.
*
* \param name the backend name
* \param platcount output location; receives the number of compatible
* platforms on the host
*
* \returns GA_NO_ERROR on success.
*/
GPUARRAY_PUBLIC int gpu_get_platform_count(const char *name,
                                           unsigned int *platcount);
/**
* Query how many compatible devices exist on one platform of this host
* for the backend named `name`.
*
* \param name the backend name
* \param platform number of the platform on the host
* \param devcount output location; receives the number of compatible
* devices on `platform`
*
* \returns GA_NO_ERROR on success.
*/
GPUARRAY_PUBLIC int gpu_get_device_count(const char *name,
                                         unsigned int platform,
                                         unsigned int *devcount);
/**
* Create a context on the specified device.
*
* \warning This function is not thread-safe.
*
* \param name the backend name.
* \param dev the device number. The precise meaning of the device
* number is backend-dependent
* \param flags see \ref context_flags "Context flags"
* \param ret error return location. Will be ignored if set to NULL.
*
* \returns An opaque pointer to the created context or NULL if an
* error occurred. To describe an error that happened during creation,
* call gpucontext_error() with a NULL context.
*/
GPUARRAY_PUBLIC gpucontext *gpucontext_init(const char *name, int dev,
int flags, int *ret);
/**
* \defgroup context_flags Context flags
* @{
*/
/**
* No preference: the backend picks its parameters from its own
* heuristics and defaults.
*
* This is the default (0) value.
*/
#define GA_CTX_DEFAULT 0x00
/**
* Tune parameters for multi-thread performance.
*
* Single-thread scenarios may see lower overall performance.
*/
#define GA_CTX_MULTI_THREAD 0x01
/**
* Tune parameters for single-thread performance.
*
* Multithread scenarios may see lower overall performance.
*/
#define GA_CTX_SINGLE_THREAD 0x02
/**
* Use a single stream for the whole context, so that all operations
* are performed in order.
*
* No attempt is made to exploit parallelism in the underlying device
* by running unrelated operations concurrently and/or out of order.
*
* Performance can improve because the small per-operation cost of
* keeping everything coherent in the face of parallelism goes away.
* Performance can also suffer because concurrency is not exploited.
*/
#define GA_CTX_SINGLE_STREAM 0x04
/**
* Turn off the allocations cache (if any).
*
* Performance will usually drop by quite a bit, but kernels that
* perform out of bounds access become easier to debug.
*/
#define GA_CTX_DISABLE_ALLOCATION_CACHE 0x10
/**
* @}
*/
/**
* Dereference a context.
*
* This removes a reference to the context and as soon as the
* reference count drops to zero the context is destroyed. The
* context can stay alive after you call this function because some
* objects keep a reference to their context.
*
* \param ctx a valid context pointer.
*/
GPUARRAY_PUBLIC void gpucontext_deref(gpucontext *ctx);
/**
* Fetch a context property.
*
* The property must be a context property. The currently defined
* properties and their type are defined in \ref props "Properties".
*
* \param ctx context
* \param prop_id property id (from \ref props "Properties")
* \param res pointer to the return space of the appropriate type, i.e.
* the type listed as "Type:" in the documentation of the requested
* property
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpucontext_property(gpucontext *ctx, int prop_id,
void *res);
/**
* Get a string describing `err`.
*
* If you need to get a description of an error that occurred during
* context creation, call this function using NULL as the context.
* This version of the call is not thread-safe.
*
* \param ctx the context in which the error occurred, or NULL for an
* error that occurred during context creation
* \param err error code
*
* \returns string description of error
*/
GPUARRAY_PUBLIC const char *gpucontext_error(gpucontext *ctx, int err);
/**
* Allocates a buffer of size `sz` in context `ctx`.
*
* Buffers are reference counted internally and start with a
* reference count of 1.
*
* \param ctx a context pointer
* \param sz the requested size
* \param data optional pointer to host buffer; it supplies the initial
* contents when #GA_BUFFER_INIT is set in `flags` and must then be at
* least `sz` large
* \param flags see \ref alloc_flags "Allocation flags"
* \param ret error return pointer
*
* \returns A non-NULL pointer to a gpudata structure. This
* structure is intentionally opaque as its content may change
* according to the backend used.
*
* NOTE(review): the value returned on failure is not stated here;
* presumably NULL with the error code stored through `ret` -- confirm
* against the backends.
*/
GPUARRAY_PUBLIC gpudata *gpudata_alloc(gpucontext *ctx, size_t sz, void *data,
int flags, int *ret);
/**
* \defgroup alloc_flags Allocation flags
* @{
*/
/**
* The buffer is available for reading and writing from kernels.
*
* This is the default (0) value.
*/
#define GA_BUFFER_READ_WRITE 0x00
/**
* Allocate the buffer in device-only memory.
*
* This is the default (0) value.
*/
#define GA_BUFFER_DEV 0x00
/**
* Signal that the memory in this buffer will only be read by kernels.
*
* You can use gpudata_write() to set the contents.
*
* You may not call gpudata_memset() with the resulting buffer as the
* destination.
*/
#define GA_BUFFER_READ_ONLY 0x01
/**
* Signal that the memory in this buffer will only be written by
* kernels (i.e. it is an output buffer).
*
* You can read the contents with gpudata_read().
*/
#define GA_BUFFER_WRITE_ONLY 0x02
/**
* Initialize the contents of the buffer with the user-supplied host
* buffer (the `data` argument of gpudata_alloc()). This buffer must be
* at least `sz` large.
*/
#define GA_BUFFER_INIT 0x04
/**
* Allocate the buffer in host-reachable memory enabling you to
* retrieve a pointer to the contents as the
* `GA_BUFFER_PROP_HOSTPOINTER` property.
*
* NOTE(review): `GA_BUFFER_PROP_HOSTPOINTER` is not among the buffer
* properties defined in this header -- confirm the property name.
*/
#define GA_BUFFER_HOST 0x08
/*#define GA_BUFFER_USE_DATA 0x10*/
/* The upper 16 bits are private flags */
/**
* Mask covering the low 16 bits of an allocation flags value, i.e. the
* bits that are not reserved for private flags.
*/
#define GA_BUFFER_MASK 0xffff
/**
* @}
*/
/**
* Increase the reference count to the passed buffer by 1.
*
* Each call should be balanced by a call to gpudata_release(), which
* decrements the count again.
*
* \param b a buffer
*/
GPUARRAY_PUBLIC void gpudata_retain(gpudata *b);
/**
* Release a buffer.
*
* This will decrement the reference count of the buffer by 1. If
* that count reaches 0 all associated resources will be released.
*
* Even if your application does not have any references left to a
* buffer it may still hang around if it is in use by internal
* mechanisms (kernel call, ...)
*
* \param b a buffer
*/
GPUARRAY_PUBLIC void gpudata_release(gpudata *b);
/**
* Check if two buffers may overlap.
*
* Both buffers must have been created with the same backend.
*
* \param a first buffer
* \param b second buffer
* \param ret error return pointer
*
* \retval 1 The buffers may overlap
* \retval 0 The buffers do not overlap.
* \retval -1 An error was encountered, `ret` contains a detailed
* error code if not NULL.
*/
GPUARRAY_PUBLIC int gpudata_share(gpudata *a, gpudata *b, int *ret);
/**
* Copy the content of a buffer to another.
*
* Both buffers must be in the same context and contiguous.
* Additionally the buffers must not overlap otherwise the content of
* the destination buffer is not defined. Use gpudata_share() to check
* whether two buffers may overlap, and gpudata_transfer() to move data
* across contexts.
*
* \param dst destination buffer
* \param dstoff offset inside the destination buffer
* \param src source buffer
* \param srcoff offset inside the source buffer
* \param sz size of data to copy (in bytes)
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpudata_move(gpudata *dst, size_t dstoff,
gpudata *src, size_t srcoff,
size_t sz);
/**
* Transfer the content of buffer across contexts.
*
* If possible it will try to do the transfer in an efficient way
* using backend-specific tricks. If those fail or can't be used, it
* will fallback to a copy through the host.
*
* \param dst buffer to transfer to
* \param dstoff offset in the destination buffer
* \param src buffer to transfer from
* \param srcoff offset in the source buffer
* \param sz size of the region to transfer
*
* \returns an `int` status code. NOTE(review): presumably GA_NO_ERROR
* or an error code, as for gpudata_move() -- confirm. Earlier text
* here described returning a new buffer or NULL, which does not match
* this signature.
*/
GPUARRAY_LOCAL int gpudata_transfer(gpudata *dst, size_t dstoff,
gpudata *src, size_t srcoff,
size_t sz);
/**
* Transfer data from a buffer to memory.
*
* The buffer and the memory region must be contiguous.
*
* \param dst destination in memory; must have room for at least `sz`
* bytes
* \param src source buffer
* \param srcoff offset inside the source buffer
* \param sz size of data to copy (in bytes)
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpudata_read(void *dst,
gpudata *src, size_t srcoff,
size_t sz);
/**
* Transfer data from memory to a buffer.
*
* The buffer and the memory region must be contiguous.
*
* \param dst destination buffer
* \param dstoff offset inside the destination buffer
* \param src source in memory; at least `sz` bytes are read from it
* \param sz size of data to copy (in bytes)
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpudata_write(gpudata *dst, size_t dstoff,
const void *src, size_t sz);
/**
* Set a buffer to a byte pattern.
*
* This function acts like the C function memset() for device buffers.
*
* Per the documentation of #GA_BUFFER_READ_ONLY, a buffer allocated
* with that flag may not be used as the destination.
*
* NOTE(review): there is no size parameter; presumably the fill runs
* from `dstoff` to the end of the buffer -- confirm against the
* backends.
*
* \param dst destination buffer
* \param dstoff offset into the destination buffer
* \param data byte value to write into the destination.
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpudata_memset(gpudata *dst, size_t dstoff, int data);
/**
* Synchronize a buffer.
*
* Waits for all previous read, writes, copies and kernel calls
* involving this buffer to be finished.
*
* This call is not required for normal use of the library as all
* exposed operations will properly synchronize amongst themselves.
* This call may be useful in a performance timing context to ensure
* that the work is really done, or before interaction with another
* library to wait for pending operations.
*
* \param b the buffer to synchronize
*
* \returns an `int` status code. NOTE(review): presumably GA_NO_ERROR
* or an error code, like the other buffer operations -- confirm.
*/
GPUARRAY_PUBLIC int gpudata_sync(gpudata *b);
/**
* Fetch a buffer property.
*
* Can be used for buffer properties and context properties. Context
* properties will fetch the value for the context associated with the
* buffer. The currently defined properties and their type are
* defined in \ref props "Properties".
*
* \param buf buffer
* \param prop_id property id (from \ref props "Properties")
* \param res pointer to the return space of the appropriate type, i.e.
* the type listed as "Type:" in the documentation of the requested
* property
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpudata_property(gpudata *buf, int prop_id, void *res);
/**
* Get the context of a buffer.
*
* \param b a buffer
*
* \returns a context pointer. NOTE(review): presumably the same
* context that #GA_BUFFER_PROP_CTX reports; whether a new reference is
* taken is not stated in this header -- confirm.
*/
GPUARRAY_PUBLIC gpucontext *gpudata_context(gpudata *b);
/**
* Compile a kernel.
*
* Compile the kernel composed of the concatenated strings in
* `strings` and return a callable kernel. If lengths is NULL then
* all the strings must be NUL-terminated. Otherwise, it doesn't
* matter (but the lengths must not include the final NUL byte if
* provided).
*
* \param ctx context to work in
* \param count number of input strings
* \param strings table of string pointers
* \param lengths (optional) length for each string in the table
* \param fname name of the kernel function (as defined in the code)
* \param numargs NOTE(review): presumably the number of arguments of
* the kernel (cf. #GA_KERNEL_PROP_NUMARGS) -- confirm
* \param typecodes NOTE(review): presumably a table of `numargs` type
* codes, one per kernel argument (cf. #GA_KERNEL_PROP_TYPES) -- confirm
* \param flags flags for compilation (see #ga_usefl)
* \param ret error return pointer
* \param err_str returns pointer to debug message from GPU backend
* (if provided a non-NULL err_str)
*
* If `*err_str` is not NULL on return, the caller must call
* `free(*err_str)` after use.
*
* \returns Allocated kernel structure or NULL if an error occurred.
* `ret` will be updated with the error code if not NULL.
*/
GPUARRAY_PUBLIC gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count,
const char **strings, const size_t *lengths,
const char *fname, unsigned int numargs,
const int *typecodes, int flags, int *ret,
char **err_str);
/**
* Retain a kernel.
*
* Increase the reference count of the passed kernel by 1. Each call
* should be balanced by a call to gpukernel_release(), which decreases
* the count again.
*
* \param k a kernel
*/
GPUARRAY_PUBLIC void gpukernel_retain(gpukernel *k);
/**
* Release a kernel.
*
* Decrease the reference count of a kernel. If it reaches 0, all
* resources associated with `k` will be released.
*
* If the reference count of a kernel reaches 0 while it is running,
* this call will block until completion.
*
* \param k a kernel
*/
GPUARRAY_PUBLIC void gpukernel_release(gpukernel *k);
/**
* Set kernel argument.
*
* Buffer arguments will not be retained and it is the
* responsibility of the caller to ensure that the value is still
* valid whenever a call is made.
*
* \param k kernel
* \param i argument index (starting at 0)
* \param a pointer to argument
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
/**
* Call a kernel.
*
* If args is NULL, it will be assumed that the arguments have
* previously been set with gpukernel_setarg().
*
* \param k kernel
* \param n number of dimensions of grid/block
* \param ls block sizes for this call (also known as local size)
* \param gs grid sizes for this call (also known as global size)
* \param shared amount of dynamic shared memory to reserve
* \param args table of pointers to each argument (optional).
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
const size_t *ls, const size_t *gs,
size_t shared, void **args);
/**
* Get the kernel binary.
*
* This can be used to cache kernel binaries after compilation for a
* specific device. The kernel can be recreated by passing the binary
* and its size as the source to gpukernel_init() with `GA_USE_BINARY`
* set in the flags.
*
* The returned pointer is allocated and must be freed by the caller.
*
* \param k kernel
* \param sz size of the returned binary
* \param obj pointer to the binary for the kernel.
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj);
/**
* Fetch a kernel property.
*
* Can be used for kernel and context properties. The context
* properties will fetch the value for the context associated with the
* kernel. The currently defined properties and their type are
* defined in \ref props "Properties".
*
* \param k kernel
* \param prop_id property id (from \ref props "Properties")
* \param res pointer to the return space of the appropriate type, i.e.
* the type listed as "Type:" in the documentation of the requested
* property
*
* \returns GA_NO_ERROR or an error code if an error occurred.
*/
GPUARRAY_PUBLIC int gpukernel_property(gpukernel *k, int prop_id, void *res);
/**
* Get the context of a kernel.
*
* \param k a kernel
*
* \returns a context pointer. NOTE(review): presumably the same
* context that #GA_KERNEL_PROP_CTX reports; whether a new reference is
* taken is not stated in this header -- confirm.
*/
GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k);
/**
* \defgroup props Properties
* @{
*/
/* Start at 1 for GA_CTX_PROP_ */
/**
* Get the device name for the context.
*
* \note The returned string is allocated and must be freed by the caller.
*
* Type: `char *`
*/
#define GA_CTX_PROP_DEVNAME 1
/**
* Get the maximum block size (also known as local size) for a kernel
* call in the context.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXLSIZE 2
/**
* Get the local memory size available for a call in the context.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_LMEMSIZE 3
/**
* Number of compute units in this context.
*
* Compute units times local size is more or less the expected
* parallelism available on the device, but this is a very rough
* estimate.
*
* Type: `unsigned int`
*/
#define GA_CTX_PROP_NUMPROCS 4
/**
* Get the maximum group size for a kernel call in this context.
*
* NOTE(review): "group size" presumably means the grid size (also
* known as global size), by analogy with #GA_CTX_PROP_MAXLSIZE and the
* `gs` argument of gpukernel_call() -- confirm.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXGSIZE 5
/**
* Get the vector of blas ops for the context.
*
* This may differ from one context to the other in the same backend
* depending on the availability and performance of various BLAS
* libraries.
*
* Type: `const gpuarray_blas_ops *`
*/
#define GA_CTX_PROP_BLAS_OPS 6
/**
* Get the compatibility ID for the binaries generated with this context.
*
* Those binaries should work with any context which has the same ID.
*
* Type: `const char *`
*/
#define GA_CTX_PROP_BIN_ID 7
/**
* Get a pre-allocated 8 byte buffer for kernel ops.
*
* This buffer is initialized to 0 on allocation and must always be
* returned to that state after using it.
*
* This is only to avoid the overhead of an allocation when calling a
* kernel that may error out. It does not preclude the need for
* synchronization and transfers.
*
* Type: `gpudata *`
*/
#define GA_CTX_PROP_ERRBUF 8
/**
* Get the total size of global memory on the device.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_TOTAL_GMEM 9
/**
* Get the size of free global memory on the device.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_FREE_GMEM 10
/**
* Get the status of native float16 support on the device.
*
* Type: `int`
*/
#define GA_CTX_PROP_NATIVE_FLOAT16 11
/**
* Get the maximum global size for dimension 0.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXGSIZE0 12
/**
* Get the maximum global size for dimension 1.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXGSIZE1 13
/**
* Get the maximum global size for dimension 2.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXGSIZE2 14
/**
* Get the maximum local size for dimension 0.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXLSIZE0 15
/**
* Get the maximum local size for dimension 1.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXLSIZE1 16
/**
* Get the maximum local size for dimension 2.
*
* Type: `size_t`
*/
#define GA_CTX_PROP_MAXLSIZE2 17
/**
* Get the vector of collective ops for the context.
*
* Type: `const gpuarray_comm_ops *`
*/
#define GA_CTX_PROP_COMM_OPS 18
/* Start at 512 for GA_BUFFER_PROP_ */
/**
* First id of the buffer property range (same value as
* #GA_BUFFER_PROP_CTX).
*/
#define GA_BUFFER_PROP_START 512
/**
* Get the context in which this buffer was allocated.
*
* Type: `gpucontext *`
*/
#define GA_BUFFER_PROP_CTX 512
/**
* The reference count of the buffer. Use only for debugging purposes.
*
* Type: `unsigned int`
*/
#define GA_BUFFER_PROP_REFCNT 513
/**
* Size of the buffer on the device.
*
* This may be larger than the requested allocation size due to a
* number of factors.
*
* Type: `size_t`
*/
#define GA_BUFFER_PROP_SIZE 514
/* Start at 1024 for GA_KERNEL_PROP_ */
/**
* First id of the kernel property range (same value as
* #GA_KERNEL_PROP_CTX).
*/
#define GA_KERNEL_PROP_START 1024
/**
* Get the context for which this kernel was compiled.
*
* Type: `gpucontext *`
*/
#define GA_KERNEL_PROP_CTX 1024
/**
* Get the maximum block size (also known as local size) for a call of
* this kernel.
*
* Type: `size_t`
*/
#define GA_KERNEL_PROP_MAXLSIZE 1025
/**
* Get the preferred multiple of the block size for a call to this
* kernel.
*
* Type: `size_t`
*/
#define GA_KERNEL_PROP_PREFLSIZE 1026
/**
* Get the number of kernel arguments.
*
* Type: `unsigned int`
*/
#define GA_KERNEL_PROP_NUMARGS 1027
/**
* Get the list of argument types for a kernel.
*
* This list is the same length as the number of arguments to the
* kernel. Do not modify the returned list.
*
* Type: `const int *`
*/
#define GA_KERNEL_PROP_TYPES 1028
/**
* @}
*/
/**
* Flags for gpukernel_init().
*
* Each flag is a distinct bit, so several flags can be combined with
* bitwise OR.
*
* It is important to specify these properly as the compilation
* machinery will ensure that the proper configuration is made to
* support the requested features or error out if the demands cannot
* be met.
*
* \warning Failure to properly specify the feature flags will in most
* cases result in silent data corruption (especially on ATI cards).
*/
typedef enum _ga_usefl {
  /**
   * The kernel source uses CLUDA unified language.
   */
  GA_USE_CLUDA = 0x01,
  /**
   * The kernel makes use of small (size is smaller than 4 bytes) types.
   */
  GA_USE_SMALL = 0x02,
  /**
   * The kernel makes use of double or complex doubles.
   */
  GA_USE_DOUBLE = 0x04,
  /**
   * The kernel makes use of complex or complex doubles.
   */
  GA_USE_COMPLEX = 0x08,
  /**
   * The kernel makes use of half-floats (also known as float16)
   */
  GA_USE_HALF = 0x10,
  /**
   * The source code passed is actually a kernel binary.
   *
   * For the cuda backend this can also be a PTX module.
   */
  GA_USE_BINARY = 0x20,
  /* If you add a new flag, don't forget to update both
     gpuarray_buffer_{cuda,opencl}.c with the implementation of your flag */
  /**
   * The kernel is made of CUDA code.
   */
  GA_USE_CUDA = 0x2000,
  /**
   * The kernel is made of OpenCL code.
   */
  /* No trailing comma after the last enumerator: C89 and C++98 do not
     allow one, and this header is meant to be included from both. */
  GA_USE_OPENCL = 0x4000
} ga_usefl;
#ifdef __cplusplus
}
#endif
#endif