source: CIVL/examples/cuda/newCudaMockup.cvl@ cb46bb6

1.23 2.0 main test-branch
Last change on this file since cb46bb6 was cb46bb6, checked in by Alex Wilton <awilton@…>, 3 years ago

Renamed and to the more succinct names and .

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5746 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 18.6 KB
Line 
1/**
2 * TODO:
3 * - implement cudaMemset and cudaMemsetAsync
4 * - flesh out basic structure of cuda kernel:
5 * - spawn gridDim blocks
6 * - spawn blockDim threads
7 * Alternatively, spawn blockDim warps and then spawn warp's threads
8 * - wait for blocks to finish
9 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10 * - Add in data race checking support and implement atomicAdd for integers
11 * - Handle dependencies at atomic blocks (replace some with local blocks if possible)
12 */
13
14#include <concurrency.cvh>
15#include <comm.cvh>
16#include <stdlib.h>
17#include <stdio.h>
18#include <stdbool.h>
19#include <string.h>
20
21///////////
22// Types //
23///////////
24
25enum cudaError {
26 cudaSuccess
27};
28typedef enum cudaError cudaError_t;
29
30typedef enum cudaMemcpyKind {
31 cudaMemcpyHostToHost,
32 cudaMemcpyHostToDevice,
33 cudaMemcpyDeviceToHost,
34 cudaMemcpyDeviceToDevice,
35 cudaMemcpyDefault
36} cudaMemcpyKind;
37
38typedef struct {
39 unsigned int x, y, z;
40} dim3;
41
42/* used to represent a location in a three dimensional grid
43 */
44typedef struct {
45 unsigned int x, y, z;
46} uint3;
47
48typedef struct $cuda_op_state* $cuda_op_state_t;
49struct $cuda_op_state {
50 _Bool start;
51 $proc op;
52};
53
54typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
55struct $cuda_op_state_node {
56 $cuda_op_state_t opState;
57 $cuda_op_state_node_t next;
58};
59
60typedef struct cudaStream* cudaStream_t;
61typedef struct $cuda_stream_node* $cuda_stream_node_t;
62struct cudaStream {
63 $cuda_op_state_node_t head;
64 $cuda_op_state_node_t tail;
65 int numOps;
66 $cuda_stream_node_t containingNode;
67 _Bool alive;
68};
69
70struct $cuda_stream_node {
71 cudaStream_t stream;
72 $cuda_stream_node_t prev;
73 $cuda_stream_node_t next;
74};
75
76typedef struct $cuda_context {
77 $cuda_stream_node_t head; //list of streams
78 int numStreams;
79} $cuda_context;
80
81typedef struct $cuda_memcpy_data {
82 void* dst;
83 const void* src;
84 size_t count;
85 cudaMemcpyKind kind;
86} $cuda_memcpy_data;
87
88typedef struct $cuda_kernel_1_data {
89 dim3 gridDim;
90 dim3 blockDim;
91 size_t $cudaMemSize;
92 cudaStream_t $cudaStream;
93 const float* A;
94 const float* B;
95 float* C;
96 int numElements;
97} $cuda_kernel_1_data;
98
99//////////////////////
100// Global Variables //
101//////////////////////
102
103$gcomm $cuda_gcomm = $gcomm_create($here, 2);
104const int $CUDA_PLACE_HOST = 0;
105const int $CUDA_PLACE_DEVICE = 1;
106$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
107
108/**
109 * Tags used for message-passing between host and device
110 */
111
112enum $cuda_tag {
113 // Predefined tags
114 $CUDA_TAG_TEARDOWN,
115 $CUDA_TAG_SCOPE_REQUEST,
116 $CUDA_TAG_cudaFree,
117 $CUDA_TAG_cudaMemcpy,
118 $CUDA_TAG_cudaMemcpyAsync,
119 // Generated tags (by transformer)
120 $CUDA_TAG_LAUNCH_kernel_1
121};
122
123////////////////////////////////////////////
124// CUDA API Functions (For Host-use Only) //
125////////////////////////////////////////////
126
127/*
128cudaError_t cudaMalloc(void** devPtr, size_t size) {
129 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
130 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
131 $message_unpack(response, devPtr, sizeof(void*));
132
133 return cudaSuccess;
134}
135*/
136
137$scope $cuda_host_request_device_scope() {
138 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
139 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
140 $scope result;
141 $message_unpack(response, &result, sizeof($scope));
142
143 return result;
144}
145
146cudaError_t cudaFree(void* devPtr) {
147 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
148 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
149
150 return cudaSuccess;
151}
152
153void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
154 if (kind == cudaMemcpyHostToHost) {
155 memcpy(dst, src, count);
156 } else {
157 $cuda_memcpy_data args;
158 args.dst = dst;
159 args.src = src;
160 args.count = count;
161 args.kind = kind;
162
163 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
164
165 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
166 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
167 }
168}
169
170cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
171 $cuda_helper_host_memcpy(dst, src, count, kind, false);
172 return cudaSuccess;
173}
174
175cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
176 cudaMemcpyKind kind, cudaStream_t stream) {
177 $cuda_helper_host_memcpy(dst, src, count, kind, true);
178 return cudaSuccess;
179}
180
181/**
182 * TODO:
183 * - test
184 * - atomic?
185 */
186/*
187cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
188 // Create new stream node in linked list
189 $cuda_stream_node_t newHead = $create_new_stream_node();
190 newHead->next = $cuda_global_context.head;
191 $cuda_global_context.head->prev = newHead;
192
193 // Update cuda context's head to be the new node we created
194 $cuda_global_context.head = newHead;
195 $cuda_global_context.numStreams++;
196
197 return cudaSuccess;
198}
199*/
200
201/**
202 * TODO:
203 * - test
204 * - atomic?
205 */
206/*
207cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
208 stream = $default_stream_if_null(stream);
209 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
210 $when(stream->head == NULL) return cudaSuccess;
211}
212*/
213
214// TODO: atomic
215/*
216cudaError_t cudaStreamDestroy(cudaStream_t stream) {
217 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
218 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
219 $destroy_stream_node(stream->containingNode);
220 return cudaSuccess;
221}
222*/
223
224/*
225cudaError_t cudaDeviceSynchronize() {
226 $proc* opsToWaitOn;
227 int numOps = 0;
228
229 $atomic {
230 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
231
232 for ($cuda_stream_node_t node = $cuda_global_context.head;
233 node != NULL;
234 node = node->next) {
235 if (node->stream->tail != NULL) {
236 opsToWaitOn[numOps] = node->stream->tail->opState->op;
237 numOps++;
238 }
239 }
240 }
241 $waitall(opsToWaitOn, numOps);
242
243 return cudaSuccess;
244}
245*/
246
247void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream,
248 const float* A, const float* B, float* C, int numElements) {
249 $cuda_kernel_1_data args;
250 args.gridDim = gridDim;
251 args.blockDim = blockDim;
252 args.$cudaMemSize = $cudaMemSize;
253 args.$cudaStream = $cudaStream;
254 args.A = A;
255 args.B = B;
256 args.C = C;
257 args.numElements = numElements;
258
259 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
260 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
261}
262
263/////////////////
264// CUDA "file" //
265/////////////////
266
267void _cuda_main() {
268
269 //////////////////////
270 // Device Variables //
271 //////////////////////
272
273 $scope $cuda_scope = $here;
274
275 $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
276 $cuda_context $cuda_global_context;
277 cudaStream_t $cuda_default_stream;
278
279 /////////////////////////////////
280 // Context & Stream Management //
281 /////////////////////////////////
282
283 // Helper function to get the default stream if passed NULL, and just returns stream otherwise
284 cudaStream_t $default_stream_if_null(cudaStream_t stream) {
285 return stream == NULL ? $cuda_default_stream : stream;
286 }
287
288 $cuda_stream_node_t $create_new_stream_node() {
289 cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
290 newStream->head = NULL;
291 newStream->tail = NULL;
292 newStream->numOps = 0;
293 newStream->alive = true;
294
295 $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
296 newHead->stream = newStream;
297 newStream->containingNode = newHead;
298 newHead->prev = NULL;
299 newHead->next = NULL;
300
301 return newHead;
302 }
303
304 //@ depends_on \nothing;
305 $atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
306 $proc lastOpProc = $proc_null;
307 cudaStream_t stream = node->stream;
308
309 if (node->prev != NULL) {
310 node->prev->next = node->next;
311 }
312 if (node->next != NULL) {
313 node->next->prev = node->prev;
314 }
315 free(node);
316
317 stream->alive = false;
318 if(stream->tail != NULL)
319 lastOpProc = stream->tail->opState->op;
320
321 void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
322 $wait(lastOpProc);
323 free(stream);
324 }
325
326 return $spawn $destroy_stream_when_complete(lastOpProc, stream);
327 }
328
329 /**
330 * Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
331 *
332 * Reasoning behind using enqueuedFlag:
333 * + Enforces in the interface more explicitly that device proc shouldn't continue until new op state is created and properly filled out (including the $proc field)
334 * + Keeps symmetry since this method means the op does both enqueueing and dequeueing. The alternative technique would have device enqueuing and op dequeuing.
335 * + Reduces dependencies since device proc will not be manipulating the streams nor have direct access to the newly created op state.
336 * + Keeps responsibility of device proc strictly to interpreting messages, spawning appropriate ops and sending message.
337 */
338 //@ depends_on \nothing;
339 $atomic_f $cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
340 $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state));
341 newOpState->start = false;
342 newOpState->op = $self;
343
344 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state_node));
345 newOpStateNode->opState = newOpState;
346 newOpStateNode->next = NULL;
347
348 stream = $default_stream_if_null(stream);
349 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
350
351 if (stream->tail == NULL) {
352 stream->head = newOpStateNode;
353 stream->tail = newOpStateNode;
354 newOpState->start = true;
355 } else {
356 stream->tail->next = newOpStateNode;
357 stream->tail = newOpStateNode;
358 }
359 stream->numOps++;
360 *enqueuedFlag = true;
361
362 return newOpState;
363 }
364
365 //@ depends_on \nothing;
366 $atomic_f void $stream_dequeue(cudaStream_t stream) {
367 stream = $default_stream_if_null(stream);
368 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
369
370 if (stream->head == stream->tail) {
371 stream->tail = NULL;
372 }
373
374 $cuda_op_state_node_t oldHead = stream->head;
375 stream->head = oldHead->next;
376 if (stream->head != NULL) {
377 stream->head->opState->start = true;
378 }
379
380 stream->numOps--;
381 free(oldHead->opState);
382 free(oldHead);
383 }
384
385 ///////////////////////////////
386 // CUDA Function Definitions //
387 ///////////////////////////////
388
389 /**
390 * Only called at start of program
391 */
392 void $cuda_setup() {
393 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
394 $cuda_default_stream = defaultStreamNode->stream;
395
396 $cuda_global_context.head = defaultStreamNode;
397 $cuda_global_context.numStreams = 1;
398 }
399
400 /**
401 * Only called at end of program
402 */
403 void $cuda_teardown() {
404 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
405 $wait(destructor);
406 $comm_destroy($cuda_device_comm);
407 }
408
409 $message $cuda_free($message request) {
410 void* devPtr;
411 $message_unpack(request, &devPtr, sizeof(void*));
412 free($reveal(devPtr));
413 //free(devPtr);
414
415 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
416 }
417
418 void $cuda_memcpy_proc(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool* enqueuedFlag, cudaStream_t stream) {
419 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
420 $when(opState->start);
421
422 if (kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
423 dst = $reveal(dst);
424 }
425 if (kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
426 src = $reveal(src);
427 }
428 memcpy(dst, src, count);
429
430 $stream_dequeue(stream);
431 }
432
433 $message $cuda_memcpy($message request, _Bool async) {
434 $cuda_memcpy_data args;
435 $message_unpack(request, &args, sizeof($cuda_memcpy_data));
436
437 _Bool enqueuedFlag = false;
438 $proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, args.kind, &enqueuedFlag, $cuda_default_stream);
439 $when(enqueuedFlag);
440
441 if (!async && args.kind != cudaMemcpyDeviceToDevice) {
442 $wait(memcpyProc);
443 }
444 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
445
446 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
447 }
448
449 ////////////////////////
450 // Kernel Definitions //
451 ////////////////////////
452
453 // Helper function
454 int $dim3_index(dim3 size, uint3 location) {
455 return location.x + size.x * (location.y + size.y * location.z);
456 }
457
458 // Helper function
459 int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
460 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
461 }
462
463 void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
464 //TODO: calculate length and index, replace this function in the kernel
465 $local_start();
466 int length = dim.x * dim.y * dim.z;
467 $proc proc_array[length];
468 $range rx = 0 .. dim.x - 1;
469 $range ry = 0 .. dim.y - 1;
470 $range rz = 0 .. dim.z - 1;
471 $domain(3) dom = ($domain(3)){rx, ry, rz};
472 $for(int x,y,z : dom){
473 uint3 id = { x, y, z };
474 int index = $dim3_index(dim, id);
475 proc_array[index] = $spawn spawningFunction(id);
476 }
477 $local_end();
478 $waitall(proc_array,length);
479 }
480
481 // Generated from kernel_1 definition
482 void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
483 const float *A, const float *B, float *C, int numElements) {
484 void _cuda_block(uint3 blockIdx) {
485 int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
486 $scope _block_root = $here;
487 $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
488 void _cuda_thread(uint3 threadIdx) {
489 int _cuda_tid = $dim3_index(blockDim, threadIdx);
490 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
491 $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
492 $local_start();
493 // Kernel definition start
494
495 int i = blockDim.x * blockIdx.x + threadIdx.x;
496
497 if (i < numElements)
498 {
499 C[i] = A[i] + B[i];
500 }
501
502 // Kernel definition end
503 $local_end();
504 $barrier_destroy(_cuda_thread_barrier);
505 }
506 $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
507 $gbarrier_destroy(_cuda_block_barrier);
508 }
509 $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
510 }
511
512 void $cuda_kernel_1_proc (_Bool* enqueuedFlag, dim3 gridDim, dim3 blockDim,
513 size_t $cudaMemSize, cudaStream_t $cudaStream,
514 const float *A, const float *B, float *C, int numElements) {
515 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, $cudaStream);
516 $when(opState->start);
517 $cuda_kernel_1(gridDim, blockDim, $cudaMemSize, A, B, C, numElements);
518 $stream_dequeue($cudaStream);
519 }
520
521 $message $cuda_device_launch_kernel_1($message request) {
522 $cuda_kernel_1_data args;
523 $message_unpack(request, &args, sizeof($cuda_kernel_1_data));
524
525 _Bool enqueuedFlag = false;
526 $spawn $cuda_kernel_1_proc(&enqueuedFlag, args.gridDim, args.blockDim, args.$cudaMemSize, args.$cudaStream, args.A, args.B, args.C, args.numElements);
527 $when(enqueuedFlag);
528
529 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_LAUNCH_kernel_1, NULL, 0);
530 }
531
532 /////////////////
533 // Device main //
534 /////////////////
535
536 $cuda_setup();
537
538 while (true) {
539 $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
540 $message response;
541 const int tag = $message_tag(request);
542
543 switch(tag) {
544 case $CUDA_TAG_SCOPE_REQUEST :
545 response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
546 break;
547 case $CUDA_TAG_cudaFree :
548 response = $cuda_free(request);
549 break;
550 case $CUDA_TAG_cudaMemcpy :
551 response = $cuda_memcpy(request, false);
552 break;
553 case $CUDA_TAG_cudaMemcpyAsync :
554 response = $cuda_memcpy(request, true);
555 break;
556 case $CUDA_TAG_LAUNCH_kernel_1 :
557 response = $cuda_device_launch_kernel_1(request);
558 break;
559 case $CUDA_TAG_TEARDOWN :
560 $cuda_teardown();
561 return;
562 default :
563 $assert(false, "Unknown CUDA request");
564 }
565
566 $comm_enqueue($cuda_device_comm, response);
567 }
568}
569
570///////////////
571// Host file //
572///////////////
573
574$input int N;
575$assume (N > 0);
576$input float A[N];
577$input float B[N];
578
579void _host_main() {
580 int size = N * sizeof(float);
581 int numBlocks = 2;
582 int numThreads = N%2 == 0? N/2 : (N+1)/2;
583
584 float* cuda_A;
585 // cudaMalloc((void **)&cuda_A, size);
586 {
587 $scope deviceScope = $cuda_host_request_device_scope();
588 cuda_A = $hide((float*)$malloc(deviceScope, size));
589 //cuda_A = (float*)$malloc(deviceScope, size);
590 }
591 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
592
593 float* cuda_B;
594 // cudaMalloc((void **)&cuda_B, size);
595 {
596 $scope deviceScope = $cuda_host_request_device_scope();
597 cuda_B = $hide((float*)$malloc(deviceScope, size));
598 //cuda_B = (float*)$malloc(deviceScope, size);
599 }
600 cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
601
602 float* cuda_C;
603 // cudaMalloc((void **)&cuda_C, size);
604 {
605 $scope deviceScope = $cuda_host_request_device_scope();
606 cuda_C = $hide((float*)$malloc(deviceScope, size));
607 //cuda_C = (float*)$malloc(deviceScope, size);
608 }
609
610 dim3 gridDim = {numBlocks, 1, 1};
611 dim3 blockDim = {numThreads, 1, 1};
612 // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
613 $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
614
615 //Checking correctness
616 float* C = (float *)malloc(size);
617
618 cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
619
620 for(int i = 0; i < N; i++)
621 $assert(C[i] == A[i] + B[i]);
622
623 free(C);
624
625 cudaFree(cuda_A);
626 cudaFree(cuda_B);
627 cudaFree(cuda_C);
628
629 // inserted by transformer
630 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
631 $comm_destroy($cuda_host_comm);
632}
633
634int main() {
635 $proc host = $spawn _host_main();
636 $proc cuda = $spawn _cuda_main();
637 $wait(host);
638 $wait(cuda);
639 $gcomm_destroy($cuda_gcomm, NULL);
640}
Note: See TracBrowser for help on using the repository browser.