source: CIVL/include/headers/cuda.h@ 1aaefd4

main test-branch
Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 10.6 KB
RevLine 
[aad342c]1
/* Functions in this file are meant to serve as drop-in CIVL replacements
 * for the CUDA functions of the same name. Because of this, much of the
 * documentation of these functions is identical to the documentation
 * for their CUDA counterparts.
 */
7
8#ifndef _CUDA
9#define _CUDA
10
11#include <civl-cuda.cvh>
12
/* Returns in *count the number of devices with compute capability
 * greater than or equal to 1.0 that are available for execution.
 */
cudaError_t cudaGetDeviceCount(int *count);

/* Returns in *device the current device for the calling host thread.
 */
cudaError_t cudaGetDevice(int * device);

/* Returns in *prop the properties of device dev.
 */
cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp * prop, int dev);
25
/* Creates an event object.
 */
cudaError_t cudaEventCreate(cudaEvent_t *event);

/* Records an event. If stream is non-zero, the event is recorded
 * after all preceding operations in stream have been completed;
 * otherwise, it is recorded after all preceding operations in the
 * CUDA context have been completed. Since the operation is asynchronous,
 * cudaEventQuery() and/or cudaEventSynchronize() must be used to
 * determine when the event has actually been recorded.
 */
cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t s);

/* Query the status of all device work preceding the most recent call
 * to cudaEventRecord() (in the appropriate compute streams, as
 * specified by the arguments to cudaEventRecord()).
 *
 * If this work has successfully been completed by the device, or if
 * cudaEventRecord() has not been called on event, then cudaSuccess
 * is returned. If this work has not yet been completed by the device
 * then cudaErrorNotReady is returned.
 */
cudaError_t cudaEventQuery(cudaEvent_t event);


/* Wait until the completion of all device work preceding the most
 * recent call to cudaEventRecord() (in the appropriate compute streams,
 * as specified by the arguments to cudaEventRecord()).
 *
 * If cudaEventRecord() has not been called on event, cudaSuccess
 * is returned immediately.
 */
cudaError_t cudaEventSynchronize(cudaEvent_t event);

/* Since "timing" does not really make sense during verification, the
 * intended semantics of this function are unclear.
 * NOTE(review): original author questioned whether this should exist at
 * all -- confirm what the CIVL model actually does for elapsed time.
 */
cudaError_t cudaEventElapsedTime(float *t, cudaEvent_t from, cudaEvent_t to);

/* Destroys the event specified by event.
 */
cudaError_t cudaEventDestroy(cudaEvent_t event);
68
/* Creates a new asynchronous stream.
 */
cudaError_t cudaStreamCreate(cudaStream_t *pStream);


/* Blocks until stream has completed all operations.
 */
cudaError_t cudaStreamSynchronize(cudaStream_t stream);


/* Destroys and cleans up the asynchronous stream specified by stream.
 */
cudaError_t cudaStreamDestroy(cudaStream_t pStream);
82
/* Explicitly destroys and cleans up all resources associated with the
 * current device in the current process. Any subsequent API call to
 * this device will reinitialize the device.
 */
cudaError_t cudaDeviceReset( void );

/* Blocks until the device has completed all preceding operations.
 */
cudaError_t cudaDeviceSynchronize( void );
92
/* Copies count bytes from the memory area pointed to by src to the
 * memory area pointed to by dst, where kind is one of
 * cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
 * or cudaMemcpyDeviceToDevice, and specifies the direction of the
 * copy. The memory areas may not overlap.
 */
cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind );

/* Not implemented. Prototype provided for compiling purposes.
 * NOTE(review): the real CUDA runtime declares this parameter as
 * void **devPtr; a void* parameter still accepts the &ptr argument
 * callers pass in C -- confirm this difference is intentional.
 */
cudaError_t cudaMalloc( void *ptr, size_t size);

/* Fills the first count bytes of the memory area pointed to by devPtr
 * with the constant byte value value.
 */
cudaError_t cudaMemset(void * devPtr, int value, size_t count);

/* Frees the memory space pointed to by devPtr. Similar semantics to free/$free.
 */
cudaError_t cudaFree(void *devPtr);
113
/* Sets device as the current device for the calling host thread. Currently,
 * only a single device is supported, so this call always succeeds with a no-op.
 */
cudaError_t cudaSetDevice(int device_id);

/* Returns the message string from an error code.
 */
const char* cudaGetErrorString(cudaError_t error);

/* Returns the last error that has been produced by any of the runtime calls
 * in the same host thread and resets it to cudaSuccess.
 */
cudaError_t cudaGetLastError(void);

/* DEPRECATED. DO NOT USE.
 */
cudaError_t cudaThreadExit(void);

/* Not implemented. Prototype provided for compatibility purposes.
 */
void __syncthreads( void );
135
/* Models of the CUDA built-in variables: a thread's position within its
 * block and grid, the launch dimensions, and the warp size. */
uint3 threadIdx;
uint3 blockIdx;
dim3 gridDim;
dim3 blockDim;
int warpSize;

/* Warp shuffle intrinsics (prototypes only).
 * NOTE(review): the trailing "..." is not part of the real CUDA
 * signatures, which instead take an optional int width parameter --
 * presumably the variadic stands in for that; confirm against the
 * CIVL implementation. */
int __shfl_sync(unsigned mask, int var, int srcLane, ...);
int __shfl_up_sync(unsigned mask, int var, unsigned int delta, ...);
int __shfl_down_sync(unsigned mask, int var, unsigned int delta, ...);
int __shfl_xor_sync(unsigned mask, int var, int laneMask, ...);
146
147
/** C++ Language Extensions **/

/* atomicAdd()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old + val), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
int cudaAtomicAdd_int(int* address, int val);
unsigned int cudaAtomicAdd_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicAdd_ullint(unsigned long long int* address,
 unsigned long long int val);
float cudaAtomicAdd_float(float* address, float val);
double cudaAtomicAdd_double(double* address, double val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic). */
#define atomicAdd(X,Y) _Generic(X, \
 default : cudaAtomicAdd_int, \
 unsigned int* : cudaAtomicAdd_uint, \
 unsigned long long int* : cudaAtomicAdd_ullint, \
 float* : cudaAtomicAdd_float, \
 double* : cudaAtomicAdd_double) (X,Y)
168
/* atomicSub()
 * Reads the 32-bit word old located at the address address in global or shared
 * memory, computes (old - val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction. The
 * function returns old.
 */
int cudaAtomicSub_int(int* address, int val);
unsigned int cudaAtomicSub_uint(unsigned int* address, unsigned int val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic).
 * Fix: the controlling expression must be parenthesized -- the original
 * "_Generic X," was a syntax error at every expansion of atomicSub. */
#define atomicSub(X,Y) _Generic(X, \
 default : cudaAtomicSub_int, \
 unsigned int* : cudaAtomicSub_uint) (X,Y)
180
/* atomicExch()
 * Reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory and stores val back to memory at the same address. These two
 * operations are performed in one atomic transaction. The function returns old.
 */
int cudaAtomicExch_int(int* address, int val);
unsigned int cudaAtomicExch_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicExch_ullint(unsigned long long int* address,
 unsigned long long int val);
float cudaAtomicExch_float(float* address, float val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic).
 * Fix: a comma was missing after the "unsigned long long int*" association,
 * which made every expansion of atomicExch a syntax error. */
#define atomicExch(X,Y) _Generic(X, \
 default : cudaAtomicExch_int, \
 unsigned int* : cudaAtomicExch_uint, \
 unsigned long long int* : cudaAtomicExch_ullint, \
 float* : cudaAtomicExch_float) (X,Y)
196
/* atomicMin()
 * Reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the minimum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
int cudaAtomicMin_int(int* address, int val);
unsigned int cudaAtomicMin_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicMin_ullint(unsigned long long int* address,
 unsigned long long int val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic). */
#define atomicMin(X,Y) _Generic(X, \
 default : cudaAtomicMin_int, \
 unsigned int* : cudaAtomicMin_uint, \
 unsigned long long int* : cudaAtomicMin_ullint) (X,Y)
211
/* atomicMax()
 * Reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the maximum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
int cudaAtomicMax_int(int* address, int val);
unsigned int cudaAtomicMax_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicMax_ullint(unsigned long long int* address,
 unsigned long long int val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic). */
#define atomicMax(X,Y) _Generic(X, \
 default : cudaAtomicMax_int, \
 unsigned int* : cudaAtomicMax_uint, \
 unsigned long long int* : cudaAtomicMax_ullint) (X,Y)
226
/* atomicInc()
 * Reads the 32-bit word old located at the address address in global or shared
 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
unsigned int atomicInc(unsigned int* address, unsigned int val);

/* atomicDec()
 * Reads the 32-bit word old located at the address address in global or shared
 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
 * the result back to memory at the same address. These three operations are
 * performed in one atomic transaction. The function returns old.
 */
unsigned int atomicDec(unsigned int* address, unsigned int val);
242
/* atomicCAS()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old == compare ? val : old), and stores the
 * result back to memory at the same address. These three operations are performed
 * in one atomic transaction. The function returns old (Compare And Swap).
 */
int cudaAtomicCAS_int(int* address, int compare, int val);
unsigned int cudaAtomicCAS_uint(unsigned int* address,
 unsigned int compare,
 unsigned int val);
unsigned long long int cudaAtomicCAS_ullint(unsigned long long int* address,
 unsigned long long int compare,
 unsigned long long int val);
unsigned short int cudaAtomicCAS_usint(unsigned short int* address,
 unsigned short int compare,
 unsigned short int val);
/* Dispatch to the overload matching the pointer type of address (C11 _Generic). */
#define atomicCAS(address, compare, val) _Generic(address, \
 default : cudaAtomicCAS_int, \
 unsigned int* : cudaAtomicCAS_uint, \
 unsigned long long int* : cudaAtomicCAS_ullint, \
 unsigned short int* : cudaAtomicCAS_usint) (address, compare, val)
264
265#endif
Note: See TracBrowser for help on using the repository browser.