| [3ff27cf] | 1 | #include <civlc.cvh>
|
|---|
| [ef14ce6] | 2 |
|
|---|
| 3 | /* Functions in this file are meant to serve as drop-in CIVL replacements
|
|---|
| 4 | * for the Cuda function of the same name. Because of this, much of the
|
|---|
| 5 | * documentation of these functions is identical to the documentation
|
|---|
| 6 | * for its Cuda counterpart.
|
|---|
| 7 | */
|
|---|
| 8 |
|
|---|
| 9 | #include <string.h>
|
|---|
| [d66b03b] | 10 | #include <stdlib.h>
|
|---|
| [42ab01f] | 11 | #include <civl-cuda.cvh>
|
|---|
| [ef14ce6] | 12 | #include <concurrency.cvh>
|
|---|
| 13 | #include <cuda.h>
|
|---|
| 14 |
|
|---|
| 15 | /* Returns in *count the number of devices with compute capability
|
|---|
| 16 | * greater or equal to 1.0 that are available for execution.
|
|---|
| 17 | */
|
|---|
| 18 | cudaError_t cudaGetDeviceCount(int *count) {
|
|---|
| 19 | // possibly this should return an value specified as $input?
|
|---|
| 20 | *count = 1;
|
|---|
| 21 | return cudaSuccess;
|
|---|
| 22 | }
|
|---|
| 23 |
|
|---|
| [082072f] | 24 | /* Returns in *device the current device for the calling host thread
|
|---|
| 25 | */
|
|---|
| 26 | cudaError_t cudaGetDevice(int * device) {
|
|---|
| 27 | *device = 0;
|
|---|
| 28 | return cudaSuccess;
|
|---|
| 29 | }
|
|---|
| 30 |
|
|---|
| [bc366d2] | 31 | /* Returns in *prop the properties of device dev
|
|---|
| 32 | */
|
|---|
| 33 | cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp * prop, int dev) {
|
|---|
| 34 | if (dev > 1)
|
|---|
| 35 | return cudaErrorInvalidDevice;
|
|---|
| 36 |
|
|---|
| 37 | strcpy(prop->name, "CIVL_CudaDevice0");
|
|---|
| 38 |
|
|---|
| 39 | return cudaSuccess;
|
|---|
| 40 | }
|
|---|
| 41 |
|
|---|
| [ef14ce6] | 42 | /* Creates and event object
|
|---|
| 43 | */
|
|---|
| 44 | cudaError_t cudaEventCreate(cudaEvent_t *event) {
|
|---|
| [42ab01f] | 45 | *event = $cuda_event_create();
|
|---|
| [ef14ce6] | 46 | return cudaSuccess;
|
|---|
| 47 | }
|
|---|
| 48 |
|
|---|
| 49 | /* Records an event. If stream is non-zero, the event is recorded
|
|---|
| 50 | * after all preceding operations in stream have been completed;
|
|---|
| 51 | * otherwise, it is recorded after all preceding operations in the
|
|---|
| 52 | * CUDA context have been completed. Since operation is asynchronous,
|
|---|
| 53 | * cudaEventQuery() and/or cudaEventSynchronize() must be used to
|
|---|
| 54 | * determine when the event has actually been recorded.
|
|---|
| 55 | */
|
|---|
| 56 | cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t s) {
|
|---|
| [42ab01f] | 57 | if ($cuda_get_instances(event) != NULL) {
|
|---|
| [ef14ce6] | 58 | //printf("freeing instance list b\n");
|
|---|
| [42ab01f] | 59 | $free($cuda_get_instances(event));
|
|---|
| [ef14ce6] | 60 | }
|
|---|
| 61 | if (s == NULL) {
|
|---|
| [42ab01f] | 62 | $cuda_set_instances(event, $cuda_all_most_recent_kernels(), $cuda_get_num_streams(&$cuda_current_context) + 1);
|
|---|
| [ef14ce6] | 63 | } else {
|
|---|
| 64 | //printf("mallocing instance list c\n");
|
|---|
| [d66b03b] | 65 | $cuda_kernel_instance_t **instanceList = ($cuda_kernel_instance_t**)malloc(sizeof($cuda_kernel_instance_t*));
|
|---|
| [42ab01f] | 66 | instanceList[0] = $cuda_get_instance($cuda_get_most_recent(s));
|
|---|
| 67 | $cuda_set_instances(event, instanceList, 1);
|
|---|
| [ef14ce6] | 68 | }
|
|---|
| 69 | }
|
|---|
| 70 |
|
|---|
| 71 | /* Query the status of all device work preceding the most recent call
|
|---|
| 72 | * to cudaEventRecord() (in the appropriate compute streams, as
|
|---|
| 73 | * specified by the arguments to cudaEventRecord()).
|
|---|
| 74 | *
|
|---|
| 75 | * If this work has successfully been completed by the device, or if
|
|---|
| 76 | * cudaEventRecord() has not been called on event, then cudaSuccess
|
|---|
| 77 | * is returned. If this work has not yet been completed by the device
|
|---|
| 78 | * then cudaErrorNotReady is returned.
|
|---|
| 79 | */
|
|---|
| 80 | cudaError_t cudaEventQuery(cudaEvent_t event) {
|
|---|
| 81 | _Bool allKernelsFinished = $true;
|
|---|
| 82 |
|
|---|
| [42ab01f] | 83 | for (int i = 0; i < $cuda_get_num_instances(event); i++) {
|
|---|
| 84 | if ($cuda_get_status($cuda_get_instances(event)[i]) != $cuda_kernel_status_finished) {
|
|---|
| [ef14ce6] | 85 | allKernelsFinished = $false;
|
|---|
| 86 | break;
|
|---|
| 87 | }
|
|---|
| 88 | }
|
|---|
| 89 | return allKernelsFinished ? cudaSuccess : cudaErrorNotReady;
|
|---|
| 90 | }
|
|---|
| 91 |
|
|---|
| 92 | /* Wait until the completion of all device work preceding the most
|
|---|
| 93 | * recent call to cudaEventRecord() (in the appropriate compute streams,
|
|---|
| 94 | * as specified by the arguments to cudaEventRecord()).
|
|---|
| 95 | *
|
|---|
| 96 | * If cudaEventRecord() has not been called on event, cudaSuccess
|
|---|
| 97 | * is returned immediately.
|
|---|
| 98 | */
|
|---|
| 99 | cudaError_t cudaEventSynchronize(cudaEvent_t event) {
|
|---|
| [42ab01f] | 100 | $cuda_event_wait(event);
|
|---|
| [ef14ce6] | 101 | return cudaSuccess;
|
|---|
| 102 | }
|
|---|
| 103 |
|
|---|
| 104 | /* since "timing" doesn't really make sense in the verification process
|
|---|
| 105 | * I'm not sure what this should do. maybe it shouldn't exist.
|
|---|
| 106 | */
|
|---|
| 107 | cudaError_t cudaEventElapsedTime(float *t, cudaEvent_t from, cudaEvent_t to) {
|
|---|
| 108 | *t = 1.0;
|
|---|
| 109 | return cudaSuccess;
|
|---|
| 110 | }
|
|---|
| 111 |
|
|---|
| 112 | /* Destroys the event specified by event.
|
|---|
| 113 | */
|
|---|
| 114 | cudaError_t cudaEventDestroy(cudaEvent_t event) {
|
|---|
| [42ab01f] | 115 | $cuda_event_destroy(event);
|
|---|
| [ef14ce6] | 116 | return cudaSuccess;
|
|---|
| 117 | }
|
|---|
| 118 |
|
|---|
| 119 | /* Creates a new asynchronous stream.
|
|---|
| 120 | */
|
|---|
| 121 | cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
|---|
| [42ab01f] | 122 | $cuda_stream_node_t *newNode = $cuda_stream_node_create();
|
|---|
| [ef14ce6] | 123 |
|
|---|
| [42ab01f] | 124 | *pStream = $cuda_stream_create();
|
|---|
| 125 | $cuda_set_stream(newNode, *pStream);
|
|---|
| 126 | $cuda_set_next(newNode, $cuda_get_head_node(&$cuda_current_context));
|
|---|
| 127 | $cuda_add_new_stream(&$cuda_current_context, newNode);
|
|---|
| [ef14ce6] | 128 |
|
|---|
| 129 | return cudaSuccess;
|
|---|
| 130 | }
|
|---|
| 131 |
|
|---|
| 132 | /* Blocks until stream has completed all operations.
|
|---|
| 133 | */
|
|---|
| 134 | cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
|---|
| 135 | cudaStream_t s;
|
|---|
| 136 |
|
|---|
| 137 | if (stream == NULL)
|
|---|
| [42ab01f] | 138 | s = $cuda_get_null_stream(&$cuda_current_context);
|
|---|
| [ef14ce6] | 139 | else
|
|---|
| 140 | s = stream;
|
|---|
| [42ab01f] | 141 | $cuda_stream_wait(s);
|
|---|
| [ef14ce6] | 142 | return cudaSuccess;
|
|---|
| 143 | }
|
|---|
| 144 |
|
|---|
| 145 | /* Destroys and cleans up the asynchronous stream specified by stream.
|
|---|
| 146 | */
|
|---|
| 147 | cudaError_t cudaStreamDestroy(cudaStream_t pStream) {
|
|---|
| [42ab01f] | 148 | $assert($cuda_is_usable(pStream));
|
|---|
| 149 | $cuda_set_usable(pStream, $false);
|
|---|
| [ef14ce6] | 150 | return cudaSuccess;
|
|---|
| 151 | }
|
|---|
| 152 |
|
|---|
| [30215b0] | 153 | /* Explicitly destroys and cleans up all resources associated with the
|
|---|
| 154 | * current device in the current process. Any subsequent API call to
|
|---|
| 155 | * this device will reinitialize the device.
|
|---|
| 156 | */
|
|---|
| 157 | cudaError_t cudaDeviceReset( void ) {
|
|---|
| 158 | // TODO: Figure out if _cudaContext must be destroyed here
|
|---|
| 159 | return cudaSuccess;
|
|---|
| 160 | }
|
|---|
| 161 |
|
|---|
| [ef14ce6] | 162 | /* locks until stream has completed all operations.
|
|---|
| 163 | */
|
|---|
| 164 | cudaError_t cudaDeviceSynchronize() {
|
|---|
| [42ab01f] | 165 | $cuda_stream_wait_all();
|
|---|
| 166 | $cuda_stream_wait($cuda_get_null_stream(&$cuda_current_context));
|
|---|
| [ef14ce6] | 167 | return cudaSuccess;
|
|---|
| 168 | }
|
|---|
| 169 |
|
|---|
| 170 | /* Copies count bytes from the memory area pointed to by src to the
|
|---|
| 171 | * memory area pointed to by dst, where kind is one of
|
|---|
| 172 | * cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
|
|---|
| 173 | * or cudaMemcpyDeviceToDevice, and specifies the direction of the
|
|---|
| 174 | * copy. The memory areas may not overlap.
|
|---|
| 175 | */
|
|---|
| 176 | cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind ) {
|
|---|
| 177 | cudaDeviceSynchronize();
|
|---|
| 178 | memcpy(dst, src, count);
|
|---|
| 179 | cudaDeviceSynchronize();
|
|---|
| 180 | return cudaSuccess;
|
|---|
| 181 | }
|
|---|
| 182 |
|
|---|
| [082072f] | 183 | /* Fills the first count bytes of the memory area pointed to by devPtr
|
|---|
| 184 | * with the constant byte value value
|
|---|
| 185 | */
|
|---|
| 186 | cudaError_t cudaMemset(void * devPtr, int value, size_t count) {
|
|---|
| 187 | memset(devPtr, value, count);
|
|---|
| 188 | return cudaSuccess;
|
|---|
| 189 | }
|
|---|
| 190 |
|
|---|
| [ef14ce6] | 191 | /* Frees the memory space pointed to by devPtr. Similar semantics to free/$free.
|
|---|
| 192 | */
|
|---|
| 193 | cudaError_t cudaFree(void *devPtr) {
|
|---|
| 194 | $free(devPtr);
|
|---|
| 195 | return cudaSuccess;
|
|---|
| 196 | }
|
|---|
| 197 |
|
|---|
| 198 | /* Sets device as the current device for the calling host thread. Currently,
|
|---|
| 199 | * only a single device is supported, so this call always succeeds with a noop.
|
|---|
| 200 | */
|
|---|
| 201 | cudaError_t cudaSetDevice(int device_id) {
|
|---|
| 202 | return cudaSuccess;
|
|---|
| 203 | }
|
|---|
| [bb72a10] | 204 |
|
|---|
| [46e063eb] | 205 | /* Returns the message string from an error code
|
|---|
| 206 | */
|
|---|
| [11194f80] | 207 | char _cudaErrorString[10];
|
|---|
| 208 |
|
|---|
| [46e063eb] | 209 | const char* cudaGetErrorString(cudaError_t error) {
|
|---|
| [11194f80] | 210 | strcpy(_cudaErrorString, "test");
|
|---|
| 211 | return _cudaErrorString;
|
|---|
| [46e063eb] | 212 | }
|
|---|
| 213 |
|
|---|
| [bb72a10] | 214 | /* Returns the last error that has been produced by any of the runtime calls
|
|---|
| 215 | * in the same host thread and resets it to cudaSuccess
|
|---|
| 216 | */
|
|---|
| 217 | cudaError_t cudaGetLastError() {
|
|---|
| 218 | return cudaSuccess;
|
|---|
| 219 | }
|
|---|
| [46e063eb] | 220 |
|
|---|
| 221 | /* DEPRECATED. DO NOT USE
|
|---|
| 222 | */
|
|---|
| 223 | cudaError_t cudaThreadExit() {
|
|---|
| 224 | return cudaSuccess;
|
|---|
| 225 | }
|
|---|
| 226 |
|
|---|
| [a688ff1] | 227 | /* C++ Language Extensions */
|
|---|
| 228 |
|
|---|
| 229 | /* Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
|
|---|
| 230 | * global or shared memory, computes (old + val), and stores the result back to
|
|---|
| 231 | * memory at the same address. These three operations are performed in one atomic
|
|---|
| 232 | * transaction. The function returns old.
|
|---|
| 233 | */
|
|---|
| 234 | int cudaAtomicAdd_int(int* address, int val) {
|
|---|
| 235 | int old = *address;
|
|---|
| 236 | *address += val;
|
|---|
| [d811c20] | 237 | // Add call to $check_data_race??
|
|---|
| [a688ff1] | 238 | return old;
|
|---|
| 239 | }
|
|---|
| 240 | unsigned int cudaAtomicAdd_uint(unsigned int* address, unsigned int val) {
|
|---|
| 241 | unsigned int old = *address;
|
|---|
| 242 | *address += val;
|
|---|
| 243 | return old;
|
|---|
| 244 | }
|
|---|
| 245 | unsigned long long int cudaAtomicAdd_ullint(unsigned long long int* address,
|
|---|
| 246 | unsigned long long int val) {
|
|---|
| 247 | unsigned long long int old = *address;
|
|---|
| 248 | *address += val;
|
|---|
| 249 | return old;
|
|---|
| 250 | }
|
|---|
| 251 | float cudaAtomicAdd_float(float* address, float val) {
|
|---|
| 252 | float old = *address;
|
|---|
| 253 | *address += val;
|
|---|
| 254 | return old;
|
|---|
| 255 | }
|
|---|
| 256 | double cudaAtomicAdd_double(double* address, double val) {
|
|---|
| 257 | double old = *address;
|
|---|
| 258 | *address += val;
|
|---|
| 259 | return old;
|
|---|
| 260 | }
|
|---|
| 261 |
|
|---|
| 262 | /* reads the 32-bit word old located at the address address in global or shared
|
|---|
| 263 | * memory, computes (old - val), and stores the result back to memory at the same
|
|---|
| 264 | * address. These three operations are performed in one atomic transaction. The
|
|---|
| 265 | * function returns old.
|
|---|
| 266 | */
|
|---|
| 267 | int cudaAtomicSub_int(int* address, int val) {
|
|---|
| 268 | int old = *address;
|
|---|
| 269 | *address -= val;
|
|---|
| 270 | return old;
|
|---|
| 271 | }
|
|---|
| 272 | unsigned int cudaAtomicSub_uint(unsigned int* address, unsigned int val) {
|
|---|
| 273 | unsigned int old = *address;
|
|---|
| 274 | *address -= val;
|
|---|
| 275 | return old;
|
|---|
| 276 | }
|
|---|
| 277 |
|
|---|
| 278 | /* reads the 32-bit or 64-bit word old located at the address address in global
|
|---|
| 279 | * or shared memory and stores val back to memory at the same address. These two
|
|---|
| 280 | * operations are performed in one atomic transaction. The function returns old.
|
|---|
| 281 | */
|
|---|
| 282 | int cudaAtomicExch_int(int* address, int val) {
|
|---|
| 283 | int old = *address;
|
|---|
| 284 | *address = val;
|
|---|
| 285 | return old;
|
|---|
| 286 | }
|
|---|
| 287 | unsigned int cudaAtomicExch_uint(unsigned int* address, unsigned int val) {
|
|---|
| 288 | unsigned int old = *address;
|
|---|
| 289 | *address = val;
|
|---|
| 290 | return old;
|
|---|
| 291 | }
|
|---|
| 292 | unsigned long long int cudaAtomicExch_ullint(unsigned long long int* address,
|
|---|
| 293 | unsigned long long int val) {
|
|---|
| 294 | unsigned long long int old = *address;
|
|---|
| 295 | *address = val;
|
|---|
| 296 | return old;
|
|---|
| 297 | }
|
|---|
| 298 | float cudaAtomicExch_float(float* address, float val) {
|
|---|
| 299 | float old = *address;
|
|---|
| 300 | *address = val;
|
|---|
| 301 | return old;
|
|---|
| 302 | }
|
|---|
| 303 |
|
|---|
| 304 | /* reads the 32-bit or 64-bit word old located at the address address in global
|
|---|
| 305 | * or shared memory, computes the minimum of old and val, and stores the result
|
|---|
| 306 | * back to memory at the same address. These three operations are performed in one
|
|---|
| 307 | * atomic transaction. The function returns old.
|
|---|
| 308 | */
|
|---|
| 309 | int cudaAtomicMin_int(int* address, int val) {
|
|---|
| 310 | int old = *address;
|
|---|
| 311 | *address = (old <= val) ? old : val;
|
|---|
| 312 | return old;
|
|---|
| 313 | }
|
|---|
| 314 | unsigned int cudaAtomicMin_uint(unsigned int* address, unsigned int val) {
|
|---|
| 315 | unsigned int old = *address;
|
|---|
| 316 | *address = (old <= val) ? old : val;
|
|---|
| 317 | return old;
|
|---|
| 318 | }
|
|---|
| 319 | unsigned long long int cudaAtomicMin_ullint(unsigned long long int* address,
|
|---|
| 320 | unsigned long long int val) {
|
|---|
| 321 | unsigned long long int old = *address;
|
|---|
| 322 | *address = (old <= val) ? old : val;
|
|---|
| 323 | return old;
|
|---|
| 324 | }
|
|---|
| 325 |
|
|---|
| 326 | /* reads the 32-bit or 64-bit word old located at the address address in global
|
|---|
| 327 | * or shared memory, computes the maximum of old and val, and stores the result
|
|---|
| 328 | * back to memory at the same address. These three operations are performed in one
|
|---|
| 329 | * atomic transaction. The function returns old.
|
|---|
| 330 | */
|
|---|
| 331 | int cudaAtomicMax_int(int* address, int val) {
|
|---|
| 332 | int old = *address;
|
|---|
| 333 | *address = (old >= val) ? old : val;
|
|---|
| 334 | return old;
|
|---|
| 335 | }
|
|---|
| 336 | unsigned int cudaAtomicMax_uint(unsigned int* address, unsigned int val) {
|
|---|
| 337 | unsigned int old = *address;
|
|---|
| 338 | *address = (old >= val) ? old : val;
|
|---|
| 339 | return old;
|
|---|
| 340 | }
|
|---|
| 341 | unsigned long long int cudaAtomicMax_ullint(unsigned long long int* address,
|
|---|
| 342 | unsigned long long int val) {
|
|---|
| 343 | unsigned long long int old = *address;
|
|---|
| 344 | *address = (old >= val) ? old : val;
|
|---|
| 345 | return old;
|
|---|
| 346 | }
|
|---|
| 347 |
|
|---|
| 348 | /* reads the 32-bit word old located at the address address in global or shared
|
|---|
| 349 | * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
|
|---|
| 350 | * memory at the same address. These three operations are performed in one atomic
|
|---|
| 351 | * transaction. The function returns old.
|
|---|
| 352 | */
|
|---|
| 353 | unsigned int atomicInc(unsigned int* address, unsigned int val) {
|
|---|
| 354 | unsigned int old = *address;
|
|---|
| 355 | *address = (old >= val) ? 0 : old + 1;
|
|---|
| 356 | return old;
|
|---|
| 357 | }
|
|---|
| 358 |
|
|---|
| 359 | /* reads the 32-bit word old located at the address address in global or shared
|
|---|
| 360 | * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
|
|---|
| 361 | * the result back to memory at the same address. These three operations are
|
|---|
| 362 | * performed in one atomic transaction. The function returns old.
|
|---|
| 363 | */
|
|---|
| 364 | unsigned int atomicDec(unsigned int* address, unsigned int val) {
|
|---|
| 365 | unsigned int old = *address;
|
|---|
| 366 | *address = ((old == 0) || (old > val)) ? val : old-1;
|
|---|
| 367 | return old;
|
|---|
| 368 | }
|
|---|
| 369 |
|
|---|
| 370 | /* reads the 16-bit, 32-bit or 64-bit word old located at the address address in
|
|---|
| 371 | * global or shared memory, computes (old == compare ? val : old) , and stores the
|
|---|
| 372 | * result back to memory at the same address. These three operations are performed
|
|---|
| 373 | * in one atomic transaction. The function returns old (Compare And Swap).
|
|---|
| 374 | */
|
|---|
| 375 | int cudaAtomicCAS_int(int* address, int compare, int val) {
|
|---|
| 376 | int old = *address;
|
|---|
| 377 | *address = old == compare ? val : old;
|
|---|
| 378 | return old;
|
|---|
| 379 | }
|
|---|
| 380 | unsigned int cudaAtomicCAS_uint(unsigned int* address,
|
|---|
| 381 | unsigned int compare,
|
|---|
| 382 | unsigned int val) {
|
|---|
| 383 | unsigned int old = *address;
|
|---|
| 384 | *address = old == compare ? val : old;
|
|---|
| 385 | return old;
|
|---|
| 386 | }
|
|---|
| 387 | unsigned long long int cudaAtomicCAS_ullint(unsigned long long int* address,
|
|---|
| 388 | unsigned long long int compare,
|
|---|
| 389 | unsigned long long int val) {
|
|---|
| 390 | unsigned long long int old = *address;
|
|---|
| 391 | *address = old == compare ? val : old;
|
|---|
| 392 | return old;
|
|---|
| 393 | }
|
|---|
| 394 | unsigned short int cudaAtomicCAS_usint(unsigned short int* address,
|
|---|
| 395 | unsigned short int compare,
|
|---|
| 396 | unsigned short int val) {
|
|---|
| 397 | unsigned short int old = *address;
|
|---|
| 398 | *address = old == compare ? val : old;
|
|---|
| 399 | return old;
|
|---|
| 400 | }
|
|---|
| 401 |
|
|---|