Context Navigation

cuda.h@ 1aaefd4

main test-branch

Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 10.6 KB

Line
1
2	/* Functions in this file are meant to serve as drop-in CIVL replacements
3	* for the Cuda function of the same name. Because of this, much of the
4	* documentation of these functions is identical to the documentation
5	* for its Cuda counterpart.
6	*/
7
8	#ifndef _CUDA
9	#define _CUDA
10
11	#include <civl-cuda.cvh>
12
13	/* Returns in *count the number of devices with compute capability
14	* greater or equal to 1.0 that are available for execution.
15	*/
16	cudaError_t cudaGetDeviceCount(int *count);
17
18	/* Returns in *device the current device for the calling host thread
19	*/
20	cudaError_t cudaGetDevice(int * device);
21
22	/* Returns in *prop the properties of device dev
23	*/
24	cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp * prop, int dev);
25
26	/* Creates an event object
27	*/
28	cudaError_t cudaEventCreate(cudaEvent_t *event);
29
30	/* Records an event. If stream is non-zero, the event is recorded
31	* after all preceding operations in stream have been completed;
32	* otherwise, it is recorded after all preceding operations in the
33	* CUDA context have been completed. Since the operation is asynchronous,
34	* cudaEventQuery() and/or cudaEventSynchronize() must be used to
35	* determine when the event has actually been recorded.
36	*/
37	cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t s);
38
39	/* Query the status of all device work preceding the most recent call
40	* to cudaEventRecord() (in the appropriate compute streams, as
41	* specified by the arguments to cudaEventRecord()).
42	*
43	* If this work has successfully been completed by the device, or if
44	* cudaEventRecord() has not been called on event, then cudaSuccess
45	* is returned. If this work has not yet been completed by the device
46	* then cudaErrorNotReady is returned.
47	*/
48	cudaError_t cudaEventQuery(cudaEvent_t event);
49
50
51	/* Wait until the completion of all device work preceding the most
52	* recent call to cudaEventRecord() (in the appropriate compute streams,
53	* as specified by the arguments to cudaEventRecord()).
54	*
55	* If cudaEventRecord() has not been called on event, cudaSuccess
56	* is returned immediately.
57	*/
58	cudaError_t cudaEventSynchronize(cudaEvent_t event);
59
60	/* Since "timing" doesn't really make sense in the verification process,
61	* it is unclear what this should do; maybe it shouldn't exist.
62	*/
63	cudaError_t cudaEventElapsedTime(float *t, cudaEvent_t from, cudaEvent_t to);
64
65	/* Destroys the event specified by event.
66	*/
67	cudaError_t cudaEventDestroy(cudaEvent_t event);
68
69	/* Creates a new asynchronous stream.
70	*/
71	cudaError_t cudaStreamCreate(cudaStream_t *pStream);
72
73
74	/* Blocks until stream has completed all operations.
75	*/
76	cudaError_t cudaStreamSynchronize(cudaStream_t stream);
77
78
79	/* Destroys and cleans up the asynchronous stream specified by stream.
80	*/
81	cudaError_t cudaStreamDestroy(cudaStream_t pStream);
82
83	/* Explicitly destroys and cleans up all resources associated with the
84	* current device in the current process. Any subsequent API call to
85	* this device will reinitialize the device.
86	*/
87	cudaError_t cudaDeviceReset( void );
88
89	/* Blocks until the device has completed all preceding operations.
90	*/
91	cudaError_t cudaDeviceSynchronize( void );
92
93	/* Copies count bytes from the memory area pointed to by src to the
94	* memory area pointed to by dst, where kind is one of
95	* cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
96	* or cudaMemcpyDeviceToDevice, and specifies the direction of the
97	* copy. The memory areas may not overlap.
98	*/
99	cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind );
100
101	/* Not implemented. Prototype provided for compiling purposes.
102	* NOTE(review): CUDA declares this as cudaMalloc(void **devPtr, size_t size); confirm that void * is intended here. */
103	cudaError_t cudaMalloc( void *ptr, size_t size);
104
105	/* Fills the first count bytes of the memory area pointed to by devPtr
106	* with the constant byte value value
107	*/
108	cudaError_t cudaMemset(void * devPtr, int value, size_t count);
109
110	/* Frees the memory space pointed to by devPtr. Similar semantics to free/$free.
111	*/
112	cudaError_t cudaFree(void *devPtr);
113
114	/* Sets device as the current device for the calling host thread. Currently,
115	* only a single device is supported, so this call always succeeds with a noop.
116	*/
117	cudaError_t cudaSetDevice(int device_id);
118
119	/* Returns the message string from an error code
120	*/
121	const char* cudaGetErrorString(cudaError_t error);
122
123	/* Returns the last error that has been produced by any of the runtime calls
124	* in the same host thread and resets it to cudaSuccess
125	*/
126	cudaError_t cudaGetLastError(void);
127
128	/* DEPRECATED. DO NOT USE
129	*/
130	cudaError_t cudaThreadExit(void);
131
132	/* Not implemented. Prototype provided for compatibility purposes
133	*/
134	void __syncthreads( void );
135
136	uint3 threadIdx;
137	uint3 blockIdx;
138	dim3 gridDim;
139	dim3 blockDim;
140	int warpSize;
141
142	int __shfl_sync(unsigned mask, int var, int srcLane, ...);
143	int __shfl_up_sync(unsigned mask, int var, unsigned int delta, ...);
144	int __shfl_down_sync(unsigned mask, int var, unsigned int delta, ...);
145	int __shfl_xor_sync(unsigned mask, int var, int laneMask, ...);
146
147
148	/* C++ Language Extensions */
149
150	/* atomicAdd()
151	* Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
152	* global or shared memory, computes (old + val), and stores the result back to
153	* memory at the same address. These three operations are performed in one atomic
154	* transaction. The function returns old. The atomicAdd macro selects a variant from the type of X via _Generic; types not listed use the int variant.
155	*/
156	int cudaAtomicAdd_int(int* address, int val);
157	unsigned int cudaAtomicAdd_uint(unsigned int* address, unsigned int val);
158	unsigned long long int cudaAtomicAdd_ullint(unsigned long long int* address,
159	unsigned long long int val);
160	float cudaAtomicAdd_float(float* address, float val);
161	double cudaAtomicAdd_double(double* address, double val);
162	#define atomicAdd(X,Y) _Generic(X, \
163	default : cudaAtomicAdd_int, \
164	unsigned int* : cudaAtomicAdd_uint, \
165	unsigned long long int* : cudaAtomicAdd_ullint, \
166	float* : cudaAtomicAdd_float, \
167	double* : cudaAtomicAdd_double) (X,Y)
168
169	/* atomicSub()
170	* reads the 32-bit word old located at the address address in global or shared
171	* memory, computes (old - val), and stores the result back to memory at the same
172	* address. These three operations are performed in one atomic transaction. The
173	* function returns old. The atomicSub macro selects a variant from the type of X via _Generic; types other than unsigned int* use the int variant.
174	*/
175	int cudaAtomicSub_int(int* address, int val);
176	unsigned int cudaAtomicSub_uint(unsigned int* address, unsigned int val);
177	#define atomicSub(X,Y) _Generic(X, \
178	default : cudaAtomicSub_int, \
179	unsigned int* : cudaAtomicSub_uint) (X,Y)
180
181	/* atomicExch()
182	* reads the 32-bit or 64-bit word old located at the address address in global
183	* or shared memory and stores val back to memory at the same address. These two
184	* operations are performed in one atomic transaction. The function returns old. The atomicExch macro selects a variant from the type of X via _Generic; types not listed use the int variant.
185	*/
186	int cudaAtomicExch_int(int* address, int val);
187	unsigned int cudaAtomicExch_uint(unsigned int* address, unsigned int val);
188	unsigned long long int cudaAtomicExch_ullint(unsigned long long int* address,
189	unsigned long long int val);
190	float cudaAtomicExch_float(float* address, float val);
191	#define atomicExch(X,Y) _Generic(X, \
192	default : cudaAtomicExch_int, \
193	unsigned int* : cudaAtomicExch_uint, \
194	unsigned long long int* : cudaAtomicExch_ullint, \
195	float* : cudaAtomicExch_float) (X,Y)
196
197	/* atomicMin()
198	* reads the 32-bit or 64-bit word old located at the address address in global
199	* or shared memory, computes the minimum of old and val, and stores the result
200	* back to memory at the same address. These three operations are performed in one
201	* atomic transaction. The function returns old. The atomicMin macro selects a variant from the type of X via _Generic; types not listed use the int variant.
202	*/
203	int cudaAtomicMin_int(int* address, int val);
204	unsigned int cudaAtomicMin_uint(unsigned int* address, unsigned int val);
205	unsigned long long int cudaAtomicMin_ullint(unsigned long long int* address,
206	unsigned long long int val);
207	#define atomicMin(X,Y) _Generic(X, \
208	default : cudaAtomicMin_int, \
209	unsigned int* : cudaAtomicMin_uint, \
210	unsigned long long int* : cudaAtomicMin_ullint) (X,Y)
211
212	/* atomicMax()
213	* reads the 32-bit or 64-bit word old located at the address address in global
214	* or shared memory, computes the maximum of old and val, and stores the result
215	* back to memory at the same address. These three operations are performed in one
216	* atomic transaction. The function returns old. The atomicMax macro selects a variant from the type of X via _Generic; types not listed use the int variant.
217	*/
218	int cudaAtomicMax_int(int* address, int val);
219	unsigned int cudaAtomicMax_uint(unsigned int* address, unsigned int val);
220	unsigned long long int cudaAtomicMax_ullint(unsigned long long int* address,
221	unsigned long long int val);
222	#define atomicMax(X,Y) _Generic(X, \
223	default : cudaAtomicMax_int, \
224	unsigned int* : cudaAtomicMax_uint, \
225	unsigned long long int* : cudaAtomicMax_ullint) (X,Y)
226
227	/* atomicInc()
228	* reads the 32-bit word old located at the address address in global or shared
229	* memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
230	* memory at the same address. These three operations are performed in one atomic
231	* transaction. The function returns old.
232	*/
233	unsigned int atomicInc(unsigned int* address, unsigned int val);
234
235	/* atomicDec()
236	* reads the 32-bit word old located at the address address in global or shared
237	* memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
238	* the result back to memory at the same address. These three operations are
239	* performed in one atomic transaction. The function returns old.
240	*/
241	unsigned int atomicDec(unsigned int* address, unsigned int val);
242
243	/* atomicCAS()
244	* reads the 16-bit, 32-bit or 64-bit word old located at the address address in
245	* global or shared memory, computes (old == compare ? val : old) , and stores the
246	* result back to memory at the same address. These three operations are performed
247	* in one atomic transaction. The function returns old (Compare And Swap). The atomicCAS macro selects a variant from the type of address via _Generic; types not listed use the int variant.
248	*/
249	int cudaAtomicCAS_int(int* address, int compare, int val);
250	unsigned int cudaAtomicCAS_uint(unsigned int* address,
251	unsigned int compare,
252	unsigned int val);
253	unsigned long long int cudaAtomicCAS_ullint(unsigned long long int* address,
254	unsigned long long int compare,
255	unsigned long long int val);
256	unsigned short int cudaAtomicCAS_usint(unsigned short int* address,
257	unsigned short int compare,
258	unsigned short int val);
259	#define atomicCAS(address, compare, val) _Generic(address, \
260	default : cudaAtomicCAS_int, \
261	unsigned int* : cudaAtomicCAS_uint, \
262	unsigned long long int* : cudaAtomicCAS_ullint, \
263	unsigned short int* : cudaAtomicCAS_usint) (address, compare, val)
264
265	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/include/headers/cuda.h@ 1aaefd4

Download in other formats: