source: CIVL/examples/cuda/newCudaMockup.cvl@ 7ffcb1b

main test-branch
Last change on this file since 7ffcb1b was 9cabba4, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5867 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 40.6 KB
Line 
1#include <concurrency.cvh>
2#include <comm.cvh>
3#include <stdlib.h>
4#include <stdio.h>
5#include <stdbool.h>
6#include <string.h>
7#include <mem.cvh>
8#pragma CIVL ACSL
9
10///////////
11// Types //
12///////////
13
14enum cudaError {
15 cudaSuccess
16};
17typedef enum cudaError cudaError_t;
18
19//typedef $cuda_memcpy_kind cudaMemcpyKind;
20
21typedef struct {
22 unsigned int x, y, z;
23} dim3;
24
25/* used to represent a location in a three dimensional grid
26 */
27typedef struct {
28 unsigned int x, y, z;
29} uint3;
30
31typedef enum {
32 cudaMemcpyHostToHost,
33 cudaMemcpyHostToDevice,
34 cudaMemcpyDeviceToHost,
35 cudaMemcpyDeviceToDevice,
36 cudaMemcpyDefault
37} cudaMemcpyKind;
38
39typedef struct $cuda_op_state* $cuda_op_state_t;
40typedef struct $cuda_op_state {
41 _Bool start;
42 $proc op;
43} $cuda_op_state;
44
45typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
46typedef struct $cuda_op_state_node {
47 $cuda_op_state_t opState;
48 $cuda_op_state_node_t next;
49} $cuda_op_state_node;
50
51typedef struct $cuda_stream_node* $cuda_stream_node_t;
52typedef struct $cuda_stream* $cuda_stream_t;
53typedef $cuda_stream_t cudaStream_t;
54typedef struct $cuda_stream {
55 $cuda_op_state_node_t head;
56 $cuda_op_state_node_t tail;
57 int numOps;
58 $cuda_stream_node_t containingNode;
59 _Bool alive;
60} $cuda_stream;
61
62typedef struct $cuda_stream_node{
63 cudaStream_t stream;
64 $cuda_stream_node_t prev;
65 $cuda_stream_node_t next;
66} $cuda_stream_node;
67
68typedef struct $cuda_context* $cuda_context_t;
69typedef struct $cuda_context {
70 $cuda_stream_node_t head;
71 int numStreams;
72} $cuda_context;
73
74typedef struct $cuda_kernel_instance* $cuda_kernel_instance_t;
75typedef struct $cuda_kernel_instance {
76 $mem* readSets;
77 $mem* writeSets;
78 int size;
79} $cuda_kernel_instance;
80
81typedef struct $cuda_memcpy_data {
82 void* dst;
83 const void* src;
84 size_t count;
85 cudaMemcpyKind kind;
86} $cuda_memcpy_data;
87
88//////////////////////
89// Global Variables //
90//////////////////////
91
92$gcomm $cuda_gcomm = $gcomm_create($here, 2);
93const int $CUDA_PLACE_HOST = 0;
94const int $CUDA_PLACE_DEVICE = 1;
95$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
96
97/**
98 * Tags used for message-passing between host and device
99 */
100enum $cuda_tag {
101 // Predefined tags
102 $CUDA_TAG_TEARDOWN,
103 $CUDA_TAG_SCOPE_REQUEST,
104 $CUDA_TAG_cudaFree,
105 $CUDA_TAG_cudaMemcpy,
106 $CUDA_TAG_cudaMemcpyAsync,
107 // Generated tags (by transformer)
108 $CUDA_TAG_LAUNCH_kernel_1
109};
110
111///////////////////
112// CIVL-CUDA API //
113///////////////////
114
115$scope $cuda_host_request_device_scope() {
116 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
117 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
118 $scope result;
119 $message_unpack(response, &result, sizeof($scope));
120
121 return result;
122}
123
124void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
125 if (kind == cudaMemcpyHostToHost) {
126 memcpy(dst, src, count);
127 } else {
128 $cuda_memcpy_data args;
129 args.dst = dst;
130 args.src = src;
131 args.count = count;
132 args.kind = kind;
133
134 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
135
136 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
137 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
138 }
139}
140
141$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
142 cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream));
143 newStream->head = NULL;
144 newStream->tail = NULL;
145 newStream->numOps = 0;
146 newStream->alive = true;
147
148 $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node));
149 newHead->stream = newStream;
150 newStream->containingNode = newHead;
151 newHead->prev = NULL;
152 newHead->next = NULL;
153
154 return newHead;
155}
156
157/*@ depends_on \nothing;
158 @ assigns \nothing;
159 @ reads \nothing;
160 @*/
161$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
162 $proc lastOpProc = $proc_null;
163 cudaStream_t stream = node->stream;
164
165 if (node->prev != NULL) {
166 node->prev->next = node->next;
167 }
168 if (node->next != NULL) {
169 node->next->prev = node->prev;
170 }
171 free(node);
172
173 stream->alive = false;
174 if(stream->tail != NULL)
175 lastOpProc = stream->tail->opState->op;
176
177 void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
178 $wait(lastOpProc);
179 free(stream);
180 }
181
182 return $spawn destroyStreamWhenComplete(lastOpProc, stream);
183}
184
185/*@ depends_on \access(stream);
186 @ assigns stream;
187 @ reads \nothing;
188 @*/
189$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
190 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
191
192 $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state));
193 newOpState->start = false;
194 newOpState->op = $spawn opProc(opParams, newOpState, stream);
195
196 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node));
197 newOpStateNode->opState = newOpState;
198 newOpStateNode->next = NULL;
199
200 if (stream->tail == NULL) {
201 stream->head = newOpStateNode;
202 stream->tail = newOpStateNode;
203 newOpState->start = true;
204 } else {
205 stream->tail->next = newOpStateNode;
206 stream->tail = newOpStateNode;
207 }
208 stream->numOps++;
209
210 return newOpState->op;
211}
212
213/*@ depends_on \nothing;
214 @ assigns \nothing;
215 @ reads \nothing;
216 @*/
217$atomic_f void $stream_dequeue(cudaStream_t stream) {
218 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
219
220 if (stream->head == stream->tail) {
221 stream->tail = NULL;
222 }
223
224 $cuda_op_state_node_t oldHead = stream->head;
225 stream->head = oldHead->next;
226 if (stream->head != NULL) {
227 stream->head->opState->start = true;
228 }
229
230 stream->numOps--;
231 free(oldHead->opState);
232 free(oldHead);
233}
234
235// Helper function
236int $dim3_index(dim3 size, uint3 location) {
237 return location.x + size.x * (location.y + size.y * location.z);
238}
239
240// Helper function
241int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
242 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
243}
244
245void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
246 //TODO: calculate length and index, replace this function in the kernel
247 $local_start();
248 int length = dim.x * dim.y * dim.z;
249 $proc procArray[length];
250 $range rx = 0 .. dim.x - 1;
251 $range ry = 0 .. dim.y - 1;
252 $range rz = 0 .. dim.z - 1;
253 $domain(3) dom = ($domain(3)){rx, ry, rz};
254 $for(int x,y,z : dom){
255 uint3 id = { x, y, z };
256 int index = $dim3_index(dim, id);
257 procArray[index] = $spawn spawningFunction(id);
258 }
259 $local_end();
260 $waitall(procArray,length);
261}
262
263
264// CUDA Ops //
265
266void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
267
268 $when(opState->start);
269 $cuda_memcpy_data args;
270 $message_unpack(m, &args, sizeof($cuda_memcpy_data));
271
272 if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
273 args.dst = $reveal(args.dst);
274 }
275 if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
276 args.src = $reveal(args.src);
277 }
278 memcpy(args.dst, args.src, args.count);
279
280 $stream_dequeue(stream);
281}
282
283$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
284 $cuda_memcpy_data args;
285 $message_unpack(request, &args, sizeof($cuda_memcpy_data));
286
287 $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);
288
289 if (!async && args.kind != cudaMemcpyDeviceToDevice) {
290 $wait(memcpyProc);
291 }
292 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
293
294 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
295}
296
297$message $cuda_free($message request) {
298 void* devPtr;
299 $message_unpack(request, &devPtr, sizeof(void*));
300 free($reveal(devPtr));
301
302 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
303}
304
305$cuda_kernel_instance_t $create_kernel_instance($scope cudaScope, dim3 gridDim, dim3 blockDim){
306 int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z;
307 int numBlocks = (gridDim.x * gridDim.y) * gridDim.z;
308 int numThreads = threadsPerBlock * numBlocks;
309
310 $cuda_kernel_instance_t $kernel = ($cuda_kernel_instance_t)$malloc(cudaScope, sizeof($cuda_kernel_instance));
311 $mem* readSets = ($mem*)$malloc(cudaScope, sizeof($mem) * numThreads);
312 $mem* writeSets = ($mem*)$malloc(cudaScope, sizeof($mem) * numThreads);
313 //Is this meant to be the number of threads in the grid?
314 int size = numThreads;
315
316 $kernel->readSets = readSets;
317 $kernel->writeSets = writeSets;
318 $kernel->size = size;
319
320 return $kernel;
321}
322
323void $destroy_kernel_instance($cuda_kernel_instance_t $kernel){
324 free($kernel->readSets);
325 free($kernel->writeSets);
326 free($kernel);
327 return;
328}
329
330void $clear_mem_sets($cuda_kernel_instance_t k, int cur_tid) {
331 k->writeSets[cur_tid] = $mem_empty();
332 k->readSets[cur_tid] = $mem_empty();
333}
334
335void $clear_all_mem_sets($cuda_kernel_instance_t k){
336 for(int i = 0; i < k->size; i++)
337 $clear_mem_sets(k, i);
338}
339
340$atomic_f void $check_data_race($cuda_kernel_instance_t k, int cur_tid) {
341 //printf("Current id: %d\n", cur_tid);
342 $mem out_s0 = $mem_empty();
343 $mem out_s1 = $mem_empty();
344 $mem cur_mw = $write_set_pop();
345 $mem cur_mr = $read_set_pop();
346
347 // Update current R/W sets
348 k->writeSets[cur_tid] = cur_mw;
349 k->readSets[cur_tid] = cur_mr;
350
351 /*
352 printf("CHECKING DATA RACE %d [\n", cur_tid);
353 for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
354 printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
355 printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
356 }
357 printf("]\n");
358 */
359
360 // Check data race
361 for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
362 if (tmp_tid == cur_tid) continue;
363
364 $mem tmp_mr = k->readSets[tmp_tid];
365 $mem tmp_mw = k->writeSets[tmp_tid];
366
367 $assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1),
368 "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
369 out_s0, cur_tid, out_s1, tmp_tid);
370 $assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1),
371 "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
372 out_s0, cur_tid, out_s1, tmp_tid);
373 $assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1),
374 "Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
375 out_s0, cur_tid, out_s1, tmp_tid);
376 }
377 // Update current R/W sets
378 //k->writeSets[cur_tid] = $mem_empty();
379 //k->readSets[cur_tid] = $mem_empty();
380 $read_set_push();
381 $write_set_push();
382}
383
384void $cuda_barrier($cuda_kernel_instance_t k, int kernel_id, $barrier g) {
385 /*$check_data_race(k, kernel_id);
386 We have to push a new read and write set before the barrier call to ignore it's reads and writes
387 $read_set_push();
388 $write_set_push();
389 void captured_clear_mems(){
390 $clear_all_mem_sets(k);
391 }
392 */
393 //$barrier_call_execute(g, captured_clear_mems);
394 $local_end();
395 $barrier_call(g);
396 $local_start();
397 //$read_set_pop();
398 //$write_set_pop();
399}
400
401
402
403////////////////////////////////////////////
404// CUDA API Functions (For Host-use Only) //
405////////////////////////////////////////////
406
407cudaError_t cudaFree(void* devPtr) {
408 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
409 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
410
411 return cudaSuccess;
412}
413
414cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
415 $cuda_host_memcpy(dst, src, count, kind, false);
416 return cudaSuccess;
417}
418
419cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
420 cudaMemcpyKind kind, cudaStream_t stream) {
421 $cuda_host_memcpy(dst, src, count, kind, true);
422 return cudaSuccess;
423}
424
425/*
426cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
427 // Create new stream node in linked list
428 $cuda_stream_node_t newHead = $create_new_stream_node();
429 newHead->next = $cuda_global_context.head;
430 $cuda_global_context.head->prev = newHead;
431
432 // Update cuda context's head to be the new node we created
433 $cuda_global_context.head = newHead;
434 $cuda_global_context.numStreams++;
435
436 return cudaSuccess;
437}
438*/
439
440/*
441cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
442 stream = $default_stream_if_null(stream);
443 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
444 $when(stream->head == NULL) return cudaSuccess;
445}
446*/
447
448/*
449cudaError_t cudaStreamDestroy(cudaStream_t stream) {
450 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
451 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
452 $destroy_stream_node(stream->containingNode);
453 return cudaSuccess;
454}
455*/
456
457/*
458cudaError_t cudaDeviceSynchronize() {
459 $proc* opsToWaitOn;
460 int numOps = 0;
461
462 $atomic {
463 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
464
465 for ($cuda_stream_node_t node = $cuda_global_context.head;
466 node != NULL;
467 node = node->next) {
468 if (node->stream->tail != NULL) {
469 opsToWaitOn[numOps] = node->stream->tail->opState->op;
470 numOps++;
471 }
472 }
473 }
474 $waitall(opsToWaitOn, numOps);
475
476 return cudaSuccess;
477}
478*/
479
480////////////////
481// Warp stuff //
482////////////////
483
484// TODO: Handle thread termination!
485
486$input int warpSize = 32;
487
488typedef struct $cuda_warp* $cuda_warp_t;
489typedef struct $cuda_warp {
490 int size;
491 int num_alive;
492 _Bool alive[];
493 int num_in_barrier;
494 _Bool in_barrier[];
495 int reductionLane;
496 $gcomm warp_gcomm;
497} $cuda_warp;
498
499$cuda_warp_t $create_cuda_warp($scope warpScope, int size) {
500 $cuda_warp_t newWarp = ($cuda_warp_t) malloc(sizeof($cuda_warp));
501 newWarp->size = size;
502 newWarp->num_alive = size;
503 newWarp->alive = (_Bool[size])$lambda(int i) $true;
504 newWarp->num_in_barrier = 0;
505 newWarp->in_barrier = (_Bool[size])$lambda(int i) $false;
506 newWarp->reductionLane = -1;
507 newWarp->warp_gcomm = $gcomm_create(warpScope, size);
508
509 return newWarp;
510}
511
512void $destroy_cuda_warp($cuda_warp_t warp) {
513 $assert(warp != NULL, "Attempt to destroy a NULL warp");
514
515 $gcomm_destroy(warp->warp_gcomm, NULL);
516 free(warp);
517}
518
519/*@ depends_on \access(warp);
520 @ executes_when \true;
521 @ */
522$atomic_f void $cuda_warp_barrier_update($cuda_warp_t warp) {
523 if (warp->num_in_barrier == warp->num_alive) {
524 warp->num_in_barrier = 0;
525 for (int i = 0; i < warp->size; i++) {
526 warp->in_barrier[i] = $false;
527 }
528 }
529}
530
531typedef struct $cuda_lane* $cuda_lane_t;
532typedef struct $cuda_lane {
533 int laneID;
534 $cuda_warp_t warp;
535 $comm lane_comm;
536} $cuda_lane;
537
538$cuda_lane_t $create_cuda_lane($scope laneScope, $cuda_warp_t warp, int laneID) {
539 $assert(warp != NULL, "Attempt to create lane from NULL warp");
540
541 $cuda_lane_t newLane = ($cuda_lane_t) malloc(sizeof($cuda_lane));
542 newLane->laneID = laneID;
543 newLane->warp = warp;
544 newLane->lane_comm = $comm_create(laneScope, warp->warp_gcomm, laneID);
545
546 return newLane;
547};
548
549/*@ depends_on \access(lane);
550 @ executes_when \true;
551 @ */
552$atomic_f void $destroy_cuda_lane($cuda_lane_t lane) {
553 $assert(lane != NULL, "Attempt to destroy NULL lane");
554
555 lane->warp->alive[lane->laneID] = $false;
556 lane->warp->num_alive--;
557 $cuda_warp_barrier_update(lane->warp);
558 $comm_destroy(lane->lane_comm);
559 free(lane);
560}
561
562/*@ depends_on \access(lane);
563 @ executes_when \true;
564 @ */
565$atomic_f void $cuda_warp_barrier_enter($cuda_lane_t lane) {
566 $cuda_warp_t warp = lane->warp;
567 int place = lane->laneID;
568 $assert(!warp->in_barrier[place]);
569
570 warp->in_barrier[place] = $true;
571 warp->num_in_barrier++;
572 $cuda_warp_barrier_update(warp);
573}
574
575/*@ depends_on \access(lane);
576 @ */
577$atomic_f void $cuda_warp_barrier_exit($cuda_lane_t lane) {
578 $when(!lane->warp->in_barrier[lane->laneID]);
579}
580
581void $cuda_warp_barrier_call($cuda_lane_t lane) {
582 $cuda_warp_barrier_enter(lane);
583 $cuda_warp_barrier_exit(lane);
584}
585
586typedef enum {
587 $CUDA_WARP_TAG_shfl_sync,
588 $CUDA_WARP_TAG_shfl_up_sync,
589 $CUDA_WARP_TAG_shfl_down_sync,
590 $CUDA_WARP_TAG_shfl_xor_sync,
591 $CUDA_WARP_TAG_all_sync,
592 $CUDA_WARP_TAG_any_sync,
593 $CUDA_WARP_TAG_ballot_sync
594} $cuda_warp_tag;
595
596#define $GET_ARG_1(_1, ...) _1
597#define $GET_ARG_2(_1, _2, ...) _2
598#define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0)
599
600#define __shfl_sync(mask, var, ...) \
601 _Generic(var, \
602 default: $cuda__shfl_sync_int, \
603 unsigned int: $cuda__shfl_sync_uint, \
604 long: $cuda__shfl_sync_long, \
605 unsigned long: $cuda__shfl_sync_ulong, \
606 long long: $cuda__shfl_sync_ll, \
607 unsigned long long: $cuda__shfl_sync_ull, \
608 float: $cuda__shfl_sync_float,\
609 double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
610
611#define __shfl_up_sync(mask, var, ...) \
612 _Generic(var, \
613 default: $cuda__shfl_up_sync_int, \
614 unsigned int: $cuda__shfl_up_sync_uint, \
615 long: $cuda__shfl_up_sync_long, \
616 unsigned long: $cuda__shfl_up_sync_ulong, \
617 long long: $cuda__shfl_up_sync_ll, \
618 unsigned long long: $cuda__shfl_up_sync_ull, \
619 float: $cuda__shfl_up_sync_float, \
620 double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
621
622#define __shfl_down_sync(mask, var, ...) \
623 _Generic(var, \
624 default: $cuda__shfl_down_sync_int, \
625 unsigned int: $cuda__shfl_down_sync_uint, \
626 long: $cuda__shfl_down_sync_long, \
627 unsigned long: $cuda__shfl_down_sync_ulong, \
628 long long: $cuda__shfl_down_sync_ll, \
629 unsigned long long: $cuda__shfl_down_sync_ull, \
630 float: $cuda__shfl_down_sync_float, \
631 double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
632
633#define __shfl_xor_sync(mask, var, ...) \
634 _Generic(var, \
635 default: $cuda__shfl_xor_sync_int, \
636 unsigned int: $cuda__shfl_xor_sync_uint, \
637 long: $cuda__shfl_xor_sync_long, \
638 unsigned long: $cuda__shfl_xor_sync_ulong, \
639 long long: $cuda__shfl_xor_sync_ll, \
640 unsigned long long: $cuda__shfl_xor_sync_ull, \
641 float: $cuda__shfl_xor_sync_float, \
642 double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
643
644#define $CUDA_GENERIC_SHFL_BODY() \
645 $assert (width <= warpSize); \
646 for (int v = width; v > 1; v /= 2) { \
647 $assert(v % 2 == 0); \
648 } \
649 \
650 int requestLane; \
651 switch(tag) { \
652 case $CUDA_WARP_TAG_shfl_sync: \
653 requestLane = lane->laneID/width + laneParam % width; \
654 break; \
655 case $CUDA_WARP_TAG_shfl_up_sync: \
656 requestLane = lane->laneID - laneParam; \
657 break; \
658 case $CUDA_WARP_TAG_shfl_down_sync: \
659 requestLane = lane->laneID + laneParam; \
660 break; \
661 case $CUDA_WARP_TAG_shfl_xor_sync: \
662 requestLane = lane->laneID ^ laneParam; \
663 break; \
664 } \
665 _Bool validSrcLane = requestLane >= 0 && requestLane < lane->warp->size; \
666 if (validSrcLane) { \
667 $comm_enqueue(lane->lane_comm, $message_pack(lane->laneID, requestLane, tag, NULL, 0)); \
668 } \
669 \
670 $local_end(); \
671 $cuda_warp_barrier_call(lane); \
672 $local_start(); \
673 \
674 while ($comm_probe(lane->lane_comm, $COMM_ANY_SOURCE, tag)) { \
675 $message request = $comm_dequeue(lane->lane_comm, $COMM_ANY_SOURCE, tag); \
676 \
677 $comm_enqueue(lane->lane_comm, $message_pack(lane->laneID, $message_source(request), tag, &var, typeSize)); \
678 } \
679 \
680 $local_end(); \
681 $cuda_warp_barrier_call(lane); \
682 $local_start(); \
683 \
684 if (validSrcLane) { \
685 $message result = $comm_dequeue(lane->lane_comm, requestLane, tag); \
686 $message_unpack(result, &resultVal, typeSize); \
687 } else { \
688 $havoc(&resultVal); \
689 }
690
691#define $CUDA_DEFINE_SHFL(NAME, T, TAG) \
692 T NAME(unsigned mask, T var, int laneParam, int width, $cuda_lane_t lane) { \
693 T resultVal; \
694 int typeSize = sizeof(T); \
695 $cuda_warp_tag tag = TAG; \
696 \
697 $CUDA_GENERIC_SHFL_BODY(); \
698 \
699 return resultVal; \
700 }
701
702$CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync)
703$CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync)
704$CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync)
705$CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync)
706$CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync)
707$CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync)
708$CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync)
709$CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync)
710
711$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync)
712$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_up_sync)
713$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync)
714$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync)
715$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync)
716$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync)
717$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync)
718$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync)
719
720$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync)
721$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync)
722$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync)
723$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync)
724$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync)
725$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync)
726$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync)
727$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync)
728
729$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync)
730$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync)
731$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync)
732$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync)
733$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync)
734$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync)
735$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync)
736$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync)
737
738#define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, $lane)
739#define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $lane)
740#define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $lane)
741
742#define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \
743 $cuda_warp_t warp = lane->warp; \
744 int laneID = lane->laneID; \
745 $comm comm = lane->lane_comm; \
746 if (warp->reductionLane == -1) { \
747 warp->reductionLane = laneID; \
748 \
749 result = initialValue; \
750 for (int i = 0; i < warp->size; i++) { \
751 if (i == laneID) { \
752 operand = value; \
753 } else { \
754 $local_end(); \
755 $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start(); \
756 \
757 if (!warp->alive[i]) { \
758 operand = initialValue; \
759 } else { \
760 $local_end(); \
761 $message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \
762 $local_start(); \
763 } \
764 } \
765 \
766 if (COND) { \
767 result = T_REDUCTION; \
768 } else { \
769 result = F_REDUCTION; \
770 } \
771 } \
772 \
773 warp->reductionLane = -1; \
774 \
775 for (int i = 0; i< warp->size; i++) { \
776 if (i != laneID && warp->alive[i]) { \
777 $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \
778 } \
779 } \
780 } else { \
781 int reductionLane = warp->reductionLane; \
782 $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \
783 $local_end(); \
784 $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \
785 $local_start(); \
786 }
787
788#define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result)
789
790int $cuda__all_sync(unsigned mask, int value, $cuda_lane_t lane) {
791 $cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync;
792 int typeSize = sizeof(int);
793 int initialValue = 1;
794 int result, operand;
795
796 $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0);
797
798 return result;
799}
800
801int $cuda__any_sync(unsigned mask, int value, $cuda_lane_t lane) {
802 $cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync;
803 int typeSize = sizeof(int);
804 int initialValue = 0;
805 int result, operand;
806
807 $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 || operand != 0, 1, 0);
808
809 return result;
810}
811
812unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_lane_t lane) {
813 $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
814 int initialValue = 0;
815 int typeSize = sizeof(int);
816 unsigned result;
817 int operand;
818
819 $CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1);
820
821 return result;
822}
823
824/*
825int $cuda__ballot_sync(unsigned mask, int value, $cuda_lane_t lane) {
826 $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
827 int initialValue = 0;
828 int result;
829
830 if (warp->reductionLane == -1) {
831 warp->reductionLane = laneID;
832
833 result = initialValue;
834 for (int i = 0; i < warp->size; i++) {
835 int operand;
836 if (i == laneID) {
837 operand = value;
838 } else {
839 $local_end();
840 $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start();
841
842 if (!warp->alive[i]) {
843 operand = initialValue;
844 } else {
845 $local_end();
846 $message_unpack($comm_dequeue(comm, i, tag), &operand, sizeof(int));
847 $local_start();
848 }
849 }
850
851 result = 2 * result + (operand == 0 ? 0 : 1);
852 }
853
854 warp->reductionLane = -1;
855
856 for (int i = 0; i< warp->size; i++) {
857 if (i != laneID && warp->alive[i]) {
858 $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, sizeof(int)));
859 }
860 }
861 } else {
862 int reductionLane = warp->reductionLane;
863 $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, sizeof(int)));
864 $local_end();
865 $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, sizeof(int));
866 $local_start();
867 }
868
869 return result;
870}
871*/
872
873//////////////////////////////////
874// Generated code from kernel_1 //
875//////////////////////////////////
876
877typedef struct {
878 dim3 gridDim;
879 dim3 blockDim;
880 size_t $cudaMemSize;
881 cudaStream_t $cudaStream;
882 float* A;
883 const float* B;
884 float* C;
885 int numElements;
886} $cuda_kernel_1_data;
887
888void $cuda_reveal_kernel_1_args($cuda_kernel_1_data* args) {
889 args->A = $reveal(args->A);
890 args->B = $reveal(args->B);
891 args->C = $reveal(args->C);
892}
893
894void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
895 size_t $cudaMemSize, cudaStream_t $cudaStream,
896 float* A, const float* B, float* C, int numElements) {
897 $cuda_kernel_1_data args;
898 args.gridDim = gridDim;
899 args.blockDim = blockDim;
900 args.$cudaMemSize = $cudaMemSize;
901 args.$cudaStream = $cudaStream;
902 args.A = A;
903 args.B = B;
904 args.C = C;
905 args.numElements = numElements;
906
907 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
908 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
909}
910
911void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
912 float *A, const float *B, float *C, int numElements) {
913 $cuda_kernel_instance_t $kernel = $create_kernel_instance($here, gridDim, blockDim);
914 void $cuda_block(uint3 blockIdx) {
915 int $numThreads = (blockDim.x * blockDim.y) * blockDim.z;
916 int $numWarps = ($numThreads - 1)/warpSize + 1;
917 $scope $block_root = $here;
918 $gbarrier $cuda_block_barrier = $gbarrier_create($block_root, $numThreads);
919
920 $cuda_warp_t $warps[$numWarps];
921 for (int i = 0; i < $numWarps - 1; i++) {
922 $warps[i] = $create_cuda_warp($block_root, warpSize);
923 }
924 $warps[$numWarps-1] = $create_cuda_warp($block_root, (($numThreads - 1) % warpSize) + 1);
925
926 void $cuda_thread(uint3 threadIdx) {
927 $local_start();
928 int _cuda_tid = $dim3_index(blockDim, threadIdx);
929 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
930 //$clear_mem_sets($kernel, _cuda_kid);
931 $barrier $cuda_thread_barrier = $barrier_create($here, $cuda_block_barrier, _cuda_tid);
932 $cuda_lane_t $lane = $create_cuda_lane($here, $warps[_cuda_tid / warpSize], _cuda_tid % warpSize);
933
934 //$read_set_push();
935 //$write_set_push();
936
937 // Kernel REDUCTION start
938 /*
939 int lane = threadIdx.x % warpSize;
940 int thisWarpSize = warpSize;
941 if (threadIdx.x - lane + warpSize > blockDim.x) {
942 thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
943 }
944
945 int i = blockDim.x * blockIdx.x + threadIdx.x;
946 int warpStart = i - lane;
947 printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
948 int remainingElements = numElements;
949
950 while (remainingElements > 1) {
951 //printf("%d,%d - remainingElements: %d\n", blockIdx.x, threadIdx.x, remainingElements);
952 if (remainingElements < numElements) {
953 // __syncThreads()
954 //printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x);
955
956 $cuda_barrier($kernel, _cuda_kid, $cuda_thread_barrier);
957 //printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x);
958 }
959
960 if (warpStart + 1 < remainingElements) {
961 float val = i < numElements ? A[i] : 0;
962
963 for (int offset = warpSize/2; offset > 0; offset /= 2) {
964 float tmp = __shfl_down_sync(0, val, offset);
965 if (lane + offset < thisWarpSize) {
966 val += tmp;
967 }
968 }
969
970 if (i < numElements) {
971 A[i] = val;
972 }
973 }
974
975 i *= warpSize;
976 //warpStart *= warpSize;
977 remainingElements = ((remainingElements - 1) / warpSize) + 1;
978 }
979
980 if (i == 0) {
981 *C = A[0];
982 }
983 // Kernel REDUCTION end
984 */
985 // Kernel BALLOT TEST start
986
987 int i = threadIdx.x;
988 if (i < numElements) {
989 int result = __ballot_sync(~0, A[i] > 0);
990 if (i == 0) {
991 printf("Result: %d\n", result);
992 *C = 0;
993 while(result > 0) {
994 if (result % 2)
995 *C += 1;
996 result /= 2;
997 }
998 printf("done calculating result\n");
999 }
1000 }
1001 // Kernel BALLOT TEST end
1002 //$check_data_race($kernel, _cuda_kid);
1003 //$read_set_pop();
1004 //$write_set_pop();
1005 $barrier_destroy($cuda_thread_barrier);
1006 $destroy_cuda_lane($lane);
1007 $local_end();
1008 }
1009 $cuda_run_and_wait_on_procs(blockDim, $cuda_thread);
1010 $gbarrier_destroy($cuda_block_barrier);
1011
1012 for (int i = 0; i < $numWarps; i++) {
1013 $destroy_cuda_warp($warps[i]);
1014 }
1015 }
1016 $cuda_run_and_wait_on_procs(gridDim, $cuda_block);
1017 $destroy_kernel_instance($kernel);
1018}
1019
1020void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
1021 $when(opState->start);
1022
1023 $cuda_kernel_1_data args;
1024 $message_unpack(request, &args, sizeof($cuda_kernel_1_data));
1025 $cuda_reveal_kernel_1_args(&args);
1026
1027 $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
1028 $stream_dequeue(cudaStream);
1029}
1030
1031/////////////////
1032// CUDA "file" //
1033/////////////////
1034
1035void $cuda_main() {
1036
1037 // Device Variables
1038
1039 $scope $cuda_scope = $here;
1040
1041 $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
1042 $cuda_context $cuda_global_context;
1043 cudaStream_t $cuda_default_stream;
1044
1045 // Helper function to get the default stream if passed NULL, and just returns stream otherwise
1046 // Currently unused until we support streams other than the default one.
1047 cudaStream_t $default_stream_if_null(cudaStream_t stream) {
1048 return stream == NULL ? $cuda_default_stream : stream;
1049 }
1050
1051 // Device Logic
1052
1053 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
1054 $cuda_default_stream = defaultStreamNode->stream;
1055
1056 $cuda_global_context.head = defaultStreamNode;
1057 $cuda_global_context.numStreams = 1;
1058
1059 while (true) {
1060 $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
1061 $message response;
1062 const int tag = $message_tag(request);
1063
1064 switch(tag) {
1065 case $CUDA_TAG_SCOPE_REQUEST :
1066 response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
1067 break;
1068 case $CUDA_TAG_cudaFree :
1069 response = $cuda_free(request);
1070 break;
1071 case $CUDA_TAG_cudaMemcpy :
1072 response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
1073 break;
1074 case $CUDA_TAG_cudaMemcpyAsync :
1075 response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
1076 break;
1077 case $CUDA_TAG_LAUNCH_kernel_1 :
1078 $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);
1079
1080 response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
1081 break;
1082 case $CUDA_TAG_TEARDOWN : {
1083 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
1084 $wait(destructor);
1085 $comm_destroy($cuda_device_comm);
1086 return;
1087 }
1088 default :
1089 $assert(false, "Unknown CUDA request");
1090 }
1091
1092 $comm_enqueue($cuda_device_comm, response);
1093 }
1094}
1095
1096///////////////
1097// Host file //
1098///////////////
1099
1100$input int N;
1101$assume (N > 0);
1102$input float A[N];
1103// Currently unused but left in to save time
1104$input float B[N];
1105
1106void $host_main() {
1107 int size = N * sizeof(float);
1108 int numBlocks = 1;
1109 //int numThreads = N%2 == 0? N/2 : (N+1)/2;
1110 int numThreads = warpSize;
1111
1112 float* cuda_A;
1113 // cudaMalloc((void **)&cuda_A, size);
1114 {
1115 $scope deviceScope = $cuda_host_request_device_scope();
1116 cuda_A = $hide((float*)$malloc(deviceScope, size));
1117 }
1118 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
1119
1120 float* cuda_B;
1121 // cudaMalloc((void **)&cuda_B, size);
1122 {
1123 $scope deviceScope = $cuda_host_request_device_scope();
1124 cuda_B = $hide((float*)$malloc(deviceScope, size));
1125 }
1126 cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
1127
1128 float* cuda_C;
1129 // cudaMalloc((void **)&cuda_C, sizeof(float));
1130 {
1131 $scope deviceScope = $cuda_host_request_device_scope();
1132 cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float)));
1133 }
1134
1135 dim3 gridDim = {numBlocks, 1, 1};
1136 dim3 blockDim = {numThreads, 1, 1};
1137 // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
1138 $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
1139
1140 // Checking correctness
1141 float* C = (float *)malloc(size);
1142
1143 cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
1144
1145 // REDUCTION ASSERTION
1146 /*
1147 float sum = 0;
1148 for(int i = 0; i < N; i++)
1149 sum += A[i];
1150
1151 $assert(*C == sum);
1152 */
1153 // BALLOT ASSERTION
1154 float count = 0;
1155 for (int i = 0; i < N; i++) {
1156 if (A[i] > 0)
1157 count++;
1158 }
1159 $assert(*C == count);
1160
1161 free(C);
1162
1163 cudaFree(cuda_A);
1164 cudaFree(cuda_B);
1165 cudaFree(cuda_C);
1166
1167}
1168
1169int main() {
1170 $proc host = $spawn $host_main();
1171 $proc cuda = $spawn $cuda_main();
1172 $wait(host);
1173 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
1174 $comm_destroy($cuda_host_comm);
1175 $wait(cuda);
1176 $gcomm_destroy($cuda_gcomm, NULL);
1177}
Note: See TracBrowser for help on using the repository browser.