source: CIVL/include/headers/cuda.h@ 1aaefd4

main test-branch
Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 10.6 KB
RevLine 
[aad342c]1
/* Functions in this file are meant to serve as drop-in CIVL replacements
 * for the CUDA functions of the same name. Because of this, much of the
 * documentation of these functions is identical to the documentation
 * for their CUDA counterparts.
 */
7
8#ifndef _CUDA
9#define _CUDA
10
11#include <civl-cuda.cvh>
12
/* Returns in *count the number of devices with compute capability
 * greater than or equal to 1.0 that are available for execution.
 */
cudaError_t cudaGetDeviceCount(int *count);

/* Returns in *device the current device for the calling host thread.
 */
cudaError_t cudaGetDevice(int * device);

/* Returns in *prop the properties of device dev.
 */
cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp * prop, int dev);
25
/* Creates an event object.
 */
cudaError_t cudaEventCreate(cudaEvent_t *event);

/* Records an event. If stream is non-zero, the event is recorded
 * after all preceding operations in stream have been completed;
 * otherwise, it is recorded after all preceding operations in the
 * CUDA context have been completed. Since the operation is asynchronous,
 * cudaEventQuery() and/or cudaEventSynchronize() must be used to
 * determine when the event has actually been recorded.
 */
cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t s);

/* Query the status of all device work preceding the most recent call
 * to cudaEventRecord() (in the appropriate compute streams, as
 * specified by the arguments to cudaEventRecord()).
 *
 * If this work has successfully been completed by the device, or if
 * cudaEventRecord() has not been called on event, then cudaSuccess
 * is returned. If this work has not yet been completed by the device
 * then cudaErrorNotReady is returned.
 */
cudaError_t cudaEventQuery(cudaEvent_t event);


/* Wait until the completion of all device work preceding the most
 * recent call to cudaEventRecord() (in the appropriate compute streams,
 * as specified by the arguments to cudaEventRecord()).
 *
 * If cudaEventRecord() has not been called on event, cudaSuccess
 * is returned immediately.
 */
cudaError_t cudaEventSynchronize(cudaEvent_t event);

/* Since "timing" does not really make sense during verification, the
 * intended semantics of this function are unclear.
 * NOTE(review): original author questioned whether this should exist at
 * all -- confirm what the CIVL model actually does for elapsed time.
 */
cudaError_t cudaEventElapsedTime(float *t, cudaEvent_t from, cudaEvent_t to);

/* Destroys the event specified by event.
 */
cudaError_t cudaEventDestroy(cudaEvent_t event);
68
/* Creates a new asynchronous stream.
 */
cudaError_t cudaStreamCreate(cudaStream_t *pStream);


/* Blocks until stream has completed all operations.
 */
cudaError_t cudaStreamSynchronize(cudaStream_t stream);


/* Destroys and cleans up the asynchronous stream specified by stream.
 */
cudaError_t cudaStreamDestroy(cudaStream_t pStream);
82
/* Explicitly destroys and cleans up all resources associated with the
 * current device in the current process. Any subsequent API call to
 * this device will reinitialize the device.
 */
cudaError_t cudaDeviceReset( void );

/* Blocks until the device has completed all preceding operations.
 */
cudaError_t cudaDeviceSynchronize( void );
92
/* Copies count bytes from the memory area pointed to by src to the
 * memory area pointed to by dst, where kind is one of
 * cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
 * or cudaMemcpyDeviceToDevice, and specifies the direction of the
 * copy. The memory areas may not overlap.
 */
cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind );

/* Not implemented. Prototype provided for compiling purposes.
 * NOTE(review): the real CUDA runtime declares this parameter as
 * void **devPtr; a void* parameter still accepts the &ptr argument
 * callers pass in C -- confirm this difference is intentional.
 */
cudaError_t cudaMalloc( void *ptr, size_t size);

/* Fills the first count bytes of the memory area pointed to by devPtr
 * with the constant byte value value.
 */
cudaError_t cudaMemset(void * devPtr, int value, size_t count);

/* Frees the memory space pointed to by devPtr. Similar semantics to free/$free.
 */
cudaError_t cudaFree(void *devPtr);
113
/* Sets device as the current device for the calling host thread. Currently,
 * only a single device is supported, so this call always succeeds with a no-op.
 */
cudaError_t cudaSetDevice(int device_id);

/* Returns the message string from an error code.
 */
const char* cudaGetErrorString(cudaError_t error);

/* Returns the last error that has been produced by any of the runtime calls
 * in the same host thread and resets it to cudaSuccess.
 */
cudaError_t cudaGetLastError(void);

/* DEPRECATED. DO NOT USE.
 */
cudaError_t cudaThreadExit(void);

/* Not implemented. Prototype provided for compatibility purposes.
 */
void __syncthreads( void );
135
/* Models of the CUDA built-in variables: a thread's position within its
 * block and grid, the launch dimensions, and the warp size. */
uint3 threadIdx;
uint3 blockIdx;
dim3 gridDim;
dim3 blockDim;
int warpSize;

/* Warp shuffle intrinsics (prototypes only).
 * NOTE(review): the trailing "..." is not part of the real CUDA
 * signatures, which instead take an optional int width parameter --
 * presumably the variadic stands in for that; confirm against the
 * CIVL implementation. */
int __shfl_sync(unsigned mask, int var, int srcLane, ...);
int __shfl_up_sync(unsigned mask, int var, unsigned int delta, ...);
int __shfl_down_sync(unsigned mask, int var, unsigned int delta, ...);
int __shfl_xor_sync(unsigned mask, int var, int laneMask, ...);
146
147
/** C++ Language Extensions **/

/* atomicAdd()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old + val), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
int cudaAtomicAdd_int(int* address, int val);
unsigned int cudaAtomicAdd_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicAdd_ullint(unsigned long long int* address,
 unsigned long long int val);
float cudaAtomicAdd_float(float* address, float val);
double cudaAtomicAdd_double(double* address, double val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic). */
#define atomicAdd(X,Y) _Generic(X, \
 default : cudaAtomicAdd_int, \
 unsigned int* : cudaAtomicAdd_uint, \
 unsigned long long int* : cudaAtomicAdd_ullint, \
 float* : cudaAtomicAdd_float, \
 double* : cudaAtomicAdd_double) (X,Y)
168
/* atomicSub()
 * Reads the 32-bit word old located at the address address in global or shared
 * memory, computes (old - val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction. The
 * function returns old.
 */
int cudaAtomicSub_int(int* address, int val);
unsigned int cudaAtomicSub_uint(unsigned int* address, unsigned int val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic).
 * Fix: the controlling expression must be parenthesized -- the original
 * "_Generic X," was a syntax error at every expansion of atomicSub. */
#define atomicSub(X,Y) _Generic(X, \
 default : cudaAtomicSub_int, \
 unsigned int* : cudaAtomicSub_uint) (X,Y)
180
/* atomicExch()
 * Reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory and stores val back to memory at the same address. These two
 * operations are performed in one atomic transaction. The function returns old.
 */
int cudaAtomicExch_int(int* address, int val);
unsigned int cudaAtomicExch_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicExch_ullint(unsigned long long int* address,
 unsigned long long int val);
float cudaAtomicExch_float(float* address, float val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic).
 * Fix: a comma was missing after the "unsigned long long int*" association,
 * which made every expansion of atomicExch a syntax error. */
#define atomicExch(X,Y) _Generic(X, \
 default : cudaAtomicExch_int, \
 unsigned int* : cudaAtomicExch_uint, \
 unsigned long long int* : cudaAtomicExch_ullint, \
 float* : cudaAtomicExch_float) (X,Y)
196
/* atomicMin()
 * Reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the minimum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
int cudaAtomicMin_int(int* address, int val);
unsigned int cudaAtomicMin_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicMin_ullint(unsigned long long int* address,
 unsigned long long int val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic). */
#define atomicMin(X,Y) _Generic(X, \
 default : cudaAtomicMin_int, \
 unsigned int* : cudaAtomicMin_uint, \
 unsigned long long int* : cudaAtomicMin_ullint) (X,Y)
211
/* atomicMax()
 * Reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the maximum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
int cudaAtomicMax_int(int* address, int val);
unsigned int cudaAtomicMax_uint(unsigned int* address, unsigned int val);
unsigned long long int cudaAtomicMax_ullint(unsigned long long int* address,
 unsigned long long int val);
/* Dispatch to the overload matching the pointer type of X (C11 _Generic). */
#define atomicMax(X,Y) _Generic(X, \
 default : cudaAtomicMax_int, \
 unsigned int* : cudaAtomicMax_uint, \
 unsigned long long int* : cudaAtomicMax_ullint) (X,Y)
226
/* atomicInc()
 * Reads the 32-bit word old located at the address address in global or shared
 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
unsigned int atomicInc(unsigned int* address, unsigned int val);

/* atomicDec()
 * Reads the 32-bit word old located at the address address in global or shared
 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
 * the result back to memory at the same address. These three operations are
 * performed in one atomic transaction. The function returns old.
 */
unsigned int atomicDec(unsigned int* address, unsigned int val);
242
/* atomicCAS()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old == compare ? val : old), and stores the
 * result back to memory at the same address. These three operations are performed
 * in one atomic transaction. The function returns old (Compare And Swap).
 */
int cudaAtomicCAS_int(int* address, int compare, int val);
unsigned int cudaAtomicCAS_uint(unsigned int* address,
 unsigned int compare,
 unsigned int val);
unsigned long long int cudaAtomicCAS_ullint(unsigned long long int* address,
 unsigned long long int compare,
 unsigned long long int val);
unsigned short int cudaAtomicCAS_usint(unsigned short int* address,
 unsigned short int compare,
 unsigned short int val);
/* Dispatch to the overload matching the pointer type of address (C11 _Generic). */
#define atomicCAS(address, compare, val) _Generic(address, \
 default : cudaAtomicCAS_int, \
 unsigned int* : cudaAtomicCAS_uint, \
 unsigned long long int* : cudaAtomicCAS_ullint, \
 unsigned short int* : cudaAtomicCAS_usint) (address, compare, val)
264
265#endif
Note: See TracBrowser for help on using the repository browser.