source: CIVL/include/impls/cuda.cvl@ 1aaefd4

main test-branch
Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 12.9 KB
Line 
1#include <civlc.cvh>
2
3/* Functions in this file are meant to serve as drop-in CIVL replacements
4 * for the Cuda function of the same name. Because of this, much of the
5 * documentation of these functions is identical to the documentation
6 * for its Cuda counterpart.
7 */
8
9#include <string.h>
10#include <stdlib.h>
11#include <civl-cuda.cvh>
12#include <concurrency.cvh>
13#include <cuda.h>
14
15/* Returns in *count the number of devices with compute capability
16 * greater or equal to 1.0 that are available for execution.
17 */
18cudaError_t cudaGetDeviceCount(int *count) {
19 // possibly this should return an value specified as $input?
20 *count = 1;
21 return cudaSuccess;
22}
23
24/* Returns in *device the current device for the calling host thread
25 */
26cudaError_t cudaGetDevice(int * device) {
27 *device = 0;
28 return cudaSuccess;
29}
30
31/* Returns in *prop the properties of device dev
32 */
33cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp * prop, int dev) {
34 if (dev > 1)
35 return cudaErrorInvalidDevice;
36
37 strcpy(prop->name, "CIVL_CudaDevice0");
38
39 return cudaSuccess;
40}
41
42/* Creates and event object
43 */
44cudaError_t cudaEventCreate(cudaEvent_t *event) {
45 *event = $cuda_event_create();
46 return cudaSuccess;
47}
48
49/* Records an event. If stream is non-zero, the event is recorded
50 * after all preceding operations in stream have been completed;
51 * otherwise, it is recorded after all preceding operations in the
52 * CUDA context have been completed. Since operation is asynchronous,
53 * cudaEventQuery() and/or cudaEventSynchronize() must be used to
54 * determine when the event has actually been recorded.
55 */
56cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t s) {
57 if ($cuda_get_instances(event) != NULL) {
58 //printf("freeing instance list b\n");
59 $free($cuda_get_instances(event));
60 }
61 if (s == NULL) {
62 $cuda_set_instances(event, $cuda_all_most_recent_kernels(), $cuda_get_num_streams(&$cuda_current_context) + 1);
63 } else {
64 //printf("mallocing instance list c\n");
65 $cuda_kernel_instance_t **instanceList = ($cuda_kernel_instance_t**)malloc(sizeof($cuda_kernel_instance_t*));
66 instanceList[0] = $cuda_get_instance($cuda_get_most_recent(s));
67 $cuda_set_instances(event, instanceList, 1);
68 }
69}
70
71/* Query the status of all device work preceding the most recent call
72 * to cudaEventRecord() (in the appropriate compute streams, as
73 * specified by the arguments to cudaEventRecord()).
74 *
75 * If this work has successfully been completed by the device, or if
76 * cudaEventRecord() has not been called on event, then cudaSuccess
77 * is returned. If this work has not yet been completed by the device
78 * then cudaErrorNotReady is returned.
79 */
80cudaError_t cudaEventQuery(cudaEvent_t event) {
81 _Bool allKernelsFinished = $true;
82
83 for (int i = 0; i < $cuda_get_num_instances(event); i++) {
84 if ($cuda_get_status($cuda_get_instances(event)[i]) != $cuda_kernel_status_finished) {
85 allKernelsFinished = $false;
86 break;
87 }
88 }
89 return allKernelsFinished ? cudaSuccess : cudaErrorNotReady;
90}
91
92/* Wait until the completion of all device work preceding the most
93 * recent call to cudaEventRecord() (in the appropriate compute streams,
94 * as specified by the arguments to cudaEventRecord()).
95 *
96 * If cudaEventRecord() has not been called on event, cudaSuccess
97 * is returned immediately.
98 */
99cudaError_t cudaEventSynchronize(cudaEvent_t event) {
100 $cuda_event_wait(event);
101 return cudaSuccess;
102}
103
104/* since "timing" doesn't really make sense in the verification process
105 * I'm not sure what this should do. maybe it shouldn't exist.
106 */
107cudaError_t cudaEventElapsedTime(float *t, cudaEvent_t from, cudaEvent_t to) {
108 *t = 1.0;
109 return cudaSuccess;
110}
111
112/* Destroys the event specified by event.
113 */
114cudaError_t cudaEventDestroy(cudaEvent_t event) {
115 $cuda_event_destroy(event);
116 return cudaSuccess;
117}
118
119/* Creates a new asynchronous stream.
120 */
121cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
122 $cuda_stream_node_t *newNode = $cuda_stream_node_create();
123
124 *pStream = $cuda_stream_create();
125 $cuda_set_stream(newNode, *pStream);
126 $cuda_set_next(newNode, $cuda_get_head_node(&$cuda_current_context));
127 $cuda_add_new_stream(&$cuda_current_context, newNode);
128
129 return cudaSuccess;
130}
131
132/* Blocks until stream has completed all operations.
133 */
134cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
135 cudaStream_t s;
136
137 if (stream == NULL)
138 s = $cuda_get_null_stream(&$cuda_current_context);
139 else
140 s = stream;
141 $cuda_stream_wait(s);
142 return cudaSuccess;
143}
144
145/* Destroys and cleans up the asynchronous stream specified by stream.
146 */
147cudaError_t cudaStreamDestroy(cudaStream_t pStream) {
148 $assert($cuda_is_usable(pStream));
149 $cuda_set_usable(pStream, $false);
150 return cudaSuccess;
151}
152
153/* Explicitly destroys and cleans up all resources associated with the
154 * current device in the current process. Any subsequent API call to
155 * this device will reinitialize the device.
156 */
157cudaError_t cudaDeviceReset( void ) {
158 // TODO: Figure out if _cudaContext must be destroyed here
159 return cudaSuccess;
160}
161
162/* locks until stream has completed all operations.
163 */
164cudaError_t cudaDeviceSynchronize() {
165 $cuda_stream_wait_all();
166 $cuda_stream_wait($cuda_get_null_stream(&$cuda_current_context));
167 return cudaSuccess;
168}
169
170/* Copies count bytes from the memory area pointed to by src to the
171 * memory area pointed to by dst, where kind is one of
172 * cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
173 * or cudaMemcpyDeviceToDevice, and specifies the direction of the
174 * copy. The memory areas may not overlap.
175 */
176cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind ) {
177 cudaDeviceSynchronize();
178 memcpy(dst, src, count);
179 cudaDeviceSynchronize();
180 return cudaSuccess;
181}
182
183/* Fills the first count bytes of the memory area pointed to by devPtr
184 * with the constant byte value value
185 */
186cudaError_t cudaMemset(void * devPtr, int value, size_t count) {
187 memset(devPtr, value, count);
188 return cudaSuccess;
189}
190
191/* Frees the memory space pointed to by devPtr. Similar semantics to free/$free.
192 */
193cudaError_t cudaFree(void *devPtr) {
194 $free(devPtr);
195 return cudaSuccess;
196}
197
198/* Sets device as the current device for the calling host thread. Currently,
199 * only a single device is supported, so this call always succeeds with a noop.
200 */
201cudaError_t cudaSetDevice(int device_id) {
202 return cudaSuccess;
203}
204
205/* Returns the message string from an error code
206 */
207char _cudaErrorString[10];
208
209const char* cudaGetErrorString(cudaError_t error) {
210 strcpy(_cudaErrorString, "test");
211 return _cudaErrorString;
212}
213
214/* Returns the last error that has been produced by any of the runtime calls
215 * in the same host thread and resets it to cudaSuccess
216 */
217cudaError_t cudaGetLastError() {
218 return cudaSuccess;
219}
220
221/* DEPRECATED. DO NOT USE
222 */
223cudaError_t cudaThreadExit() {
224 return cudaSuccess;
225}
226
227/* C++ Language Extensions */
228
229/* Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
230 * global or shared memory, computes (old + val), and stores the result back to
231 * memory at the same address. These three operations are performed in one atomic
232 * transaction. The function returns old.
233 */
234int cudaAtomicAdd_int(int* address, int val) {
235 int old = *address;
236 *address += val;
237 // Add call to $check_data_race??
238 return old;
239}
240unsigned int cudaAtomicAdd_uint(unsigned int* address, unsigned int val) {
241 unsigned int old = *address;
242 *address += val;
243 return old;
244}
245unsigned long long int cudaAtomicAdd_ullint(unsigned long long int* address,
246 unsigned long long int val) {
247 unsigned long long int old = *address;
248 *address += val;
249 return old;
250}
251float cudaAtomicAdd_float(float* address, float val) {
252 float old = *address;
253 *address += val;
254 return old;
255}
256double cudaAtomicAdd_double(double* address, double val) {
257 double old = *address;
258 *address += val;
259 return old;
260}
261
262/* reads the 32-bit word old located at the address address in global or shared
263 * memory, computes (old - val), and stores the result back to memory at the same
264 * address. These three operations are performed in one atomic transaction. The
265 * function returns old.
266 */
267int cudaAtomicSub_int(int* address, int val) {
268 int old = *address;
269 *address -= val;
270 return old;
271}
272unsigned int cudaAtomicSub_uint(unsigned int* address, unsigned int val) {
273 unsigned int old = *address;
274 *address -= val;
275 return old;
276}
277
278/* reads the 32-bit or 64-bit word old located at the address address in global
279 * or shared memory and stores val back to memory at the same address. These two
280 * operations are performed in one atomic transaction. The function returns old.
281 */
282int cudaAtomicExch_int(int* address, int val) {
283 int old = *address;
284 *address = val;
285 return old;
286}
287unsigned int cudaAtomicExch_uint(unsigned int* address, unsigned int val) {
288 unsigned int old = *address;
289 *address = val;
290 return old;
291}
292unsigned long long int cudaAtomicExch_ullint(unsigned long long int* address,
293 unsigned long long int val) {
294 unsigned long long int old = *address;
295 *address = val;
296 return old;
297}
298float cudaAtomicExch_float(float* address, float val) {
299 float old = *address;
300 *address = val;
301 return old;
302}
303
304/* reads the 32-bit or 64-bit word old located at the address address in global
305 * or shared memory, computes the minimum of old and val, and stores the result
306 * back to memory at the same address. These three operations are performed in one
307 * atomic transaction. The function returns old.
308 */
309int cudaAtomicMin_int(int* address, int val) {
310 int old = *address;
311 *address = (old <= val) ? old : val;
312 return old;
313}
314unsigned int cudaAtomicMin_uint(unsigned int* address, unsigned int val) {
315 unsigned int old = *address;
316 *address = (old <= val) ? old : val;
317 return old;
318}
319unsigned long long int cudaAtomicMin_ullint(unsigned long long int* address,
320 unsigned long long int val) {
321 unsigned long long int old = *address;
322 *address = (old <= val) ? old : val;
323 return old;
324}
325
326/* reads the 32-bit or 64-bit word old located at the address address in global
327 * or shared memory, computes the maximum of old and val, and stores the result
328 * back to memory at the same address. These three operations are performed in one
329 * atomic transaction. The function returns old.
330 */
331int cudaAtomicMax_int(int* address, int val) {
332 int old = *address;
333 *address = (old >= val) ? old : val;
334 return old;
335}
336unsigned int cudaAtomicMax_uint(unsigned int* address, unsigned int val) {
337 unsigned int old = *address;
338 *address = (old >= val) ? old : val;
339 return old;
340}
341unsigned long long int cudaAtomicMax_ullint(unsigned long long int* address,
342 unsigned long long int val) {
343 unsigned long long int old = *address;
344 *address = (old >= val) ? old : val;
345 return old;
346}
347
348/* reads the 32-bit word old located at the address address in global or shared
349 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
350 * memory at the same address. These three operations are performed in one atomic
351 * transaction. The function returns old.
352 */
353unsigned int atomicInc(unsigned int* address, unsigned int val) {
354 unsigned int old = *address;
355 *address = (old >= val) ? 0 : old + 1;
356 return old;
357}
358
359/* reads the 32-bit word old located at the address address in global or shared
360 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
361 * the result back to memory at the same address. These three operations are
362 * performed in one atomic transaction. The function returns old.
363 */
364unsigned int atomicDec(unsigned int* address, unsigned int val) {
365 unsigned int old = *address;
366 *address = ((old == 0) || (old > val)) ? val : old-1;
367 return old;
368}
369
370/* reads the 16-bit, 32-bit or 64-bit word old located at the address address in
371 * global or shared memory, computes (old == compare ? val : old) , and stores the
372 * result back to memory at the same address. These three operations are performed
373 * in one atomic transaction. The function returns old (Compare And Swap).
374 */
375int cudaAtomicCAS_int(int* address, int compare, int val) {
376 int old = *address;
377 *address = old == compare ? val : old;
378 return old;
379}
380unsigned int cudaAtomicCAS_uint(unsigned int* address,
381 unsigned int compare,
382 unsigned int val) {
383 unsigned int old = *address;
384 *address = old == compare ? val : old;
385 return old;
386}
387unsigned long long int cudaAtomicCAS_ullint(unsigned long long int* address,
388 unsigned long long int compare,
389 unsigned long long int val) {
390 unsigned long long int old = *address;
391 *address = old == compare ? val : old;
392 return old;
393}
394unsigned short int cudaAtomicCAS_usint(unsigned short int* address,
395 unsigned short int compare,
396 unsigned short int val) {
397 unsigned short int old = *address;
398 *address = old == compare ? val : old;
399 return old;
400}
401
Note: See TracBrowser for help on using the repository browser.