source: CIVL/examples/cuda/newCudaMockup.cvl@ 1aaefd4

main test-branch
Last change on this file since 1aaefd4 was 7e3e7af, checked in by Alex Wilton <awilton@…>, 3 years ago

Fixed a small POR issue in cuda mockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5749 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 18.7 KB
Line 
1/**
2 * TODO:
3 * - implement cudaMemset and cudaMemsetAsync
4 * - flesh out basic structure of cuda kernel:
5 * - spawn gridDim blocks
6 * - spawn blockDim threads
7 * Alternatively, spawn blockDim warps and then spawn warp's threads
8 * - wait for blocks to finish
9 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10 * - Add in data race checking support and implement atomicAdd for integers
11 * - Handle dependencies at atomic blocks (replace some with local blocks if possible)
12 */
13
14#include <concurrency.cvh>
15#include <comm.cvh>
16#include <stdlib.h>
17#include <stdio.h>
18#include <stdbool.h>
19#include <string.h>
20
21///////////
22// Types //
23///////////
24
// Mockup of CUDA's error codes; only the success code is modeled.
enum cudaError {
 cudaSuccess
};
typedef enum cudaError cudaError_t;

// Direction of a memory copy, mirroring the CUDA runtime API.
typedef enum cudaMemcpyKind {
 cudaMemcpyHostToHost,
 cudaMemcpyHostToDevice,
 cudaMemcpyDeviceToHost,
 cudaMemcpyDeviceToDevice,
 cudaMemcpyDefault
} cudaMemcpyKind;

// Grid/block dimensions (CUDA's dim3).
typedef struct {
 unsigned int x, y, z;
} dim3;

/* used to represent a location in a three dimensional grid
 */
typedef struct {
 unsigned int x, y, z;
} uint3;

// State of one enqueued CUDA operation: the $proc executing it and whether it
// has been allowed to start (i.e. has reached the head of its stream).
typedef struct $cuda_op_state* $cuda_op_state_t;
struct $cuda_op_state {
 _Bool start;
 $proc op;
};

// Singly-linked FIFO node holding one op state; a stream is a queue of these.
typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
struct $cuda_op_state_node {
 $cuda_op_state_t opState;
 $cuda_op_state_node_t next;
};

// A CUDA stream: a FIFO queue of operations (head runs, tail is newest),
// a liveness flag, and a back-pointer to the stream-list node that owns it.
typedef struct cudaStream* cudaStream_t;
typedef struct $cuda_stream_node* $cuda_stream_node_t;
struct cudaStream {
 $cuda_op_state_node_t head;
 $cuda_op_state_node_t tail;
 int numOps;
 $cuda_stream_node_t containingNode;
 _Bool alive;
};

// Doubly-linked node in the device context's list of streams.
struct $cuda_stream_node {
 cudaStream_t stream;
 $cuda_stream_node_t prev;
 $cuda_stream_node_t next;
};

// Device-wide context: the list of all streams and its length.
typedef struct $cuda_context {
 $cuda_stream_node_t head; //list of streams
 int numStreams;
} $cuda_context;

// Payload of a cudaMemcpy/cudaMemcpyAsync request message.
typedef struct $cuda_memcpy_data {
 void* dst;
 const void* src;
 size_t count;
 cudaMemcpyKind kind;
} $cuda_memcpy_data;

// Payload of a kernel_1 launch request message (generated by the transformer).
typedef struct $cuda_kernel_1_data {
 dim3 gridDim;
 dim3 blockDim;
 size_t $cudaMemSize;
 cudaStream_t $cudaStream;
 const float* A;
 const float* B;
 float* C;
 int numElements;
} $cuda_kernel_1_data;
98
99//////////////////////
100// Global Variables //
101//////////////////////
102
// Global communicator with two places: host (place 0) and device (place 1).
$gcomm $cuda_gcomm = $gcomm_create($here, 2);
const int $CUDA_PLACE_HOST = 0;
const int $CUDA_PLACE_DEVICE = 1;
// Host-side endpoint of the host<->device communicator.
$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
107
/**
 * Tags used for message-passing between host and device
 */

enum $cuda_tag {
 // Predefined tags
 $CUDA_TAG_TEARDOWN,       // host tells the device process to exit
 $CUDA_TAG_SCOPE_REQUEST,  // host asks for the device scope (for $malloc)
 $CUDA_TAG_cudaFree,       // free a device allocation
 $CUDA_TAG_cudaMemcpy,     // synchronous copy request
 $CUDA_TAG_cudaMemcpyAsync, // asynchronous copy request
 // Generated tags (by transformer)
 $CUDA_TAG_LAUNCH_kernel_1 // launch request for kernel_1
};
122
123////////////////////////////////////////////
124// CUDA API Functions (For Host-use Only) //
125////////////////////////////////////////////
126
127/*
128cudaError_t cudaMalloc(void** devPtr, size_t size) {
129 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
130 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
131 $message_unpack(response, devPtr, sizeof(void*));
132
133 return cudaSuccess;
134}
135*/
136
/* Asks the device process for its root scope and blocks until it replies.
 * The returned $scope is where device-side allocations are made. */
$scope $cuda_host_request_device_scope() {
 $scope deviceScope;
 // Send an empty scope-request message to the device...
 $message query = $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0);
 $comm_enqueue($cuda_host_comm, query);
 // ...then wait for the matching reply carrying the scope value.
 $message reply = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
 $message_unpack(reply, &deviceScope, sizeof($scope));
 return deviceScope;
}
145
/* Host-side cudaFree: forwards the (hidden) device pointer to the device
 * process, which performs the actual free, then waits for the acknowledgement. */
cudaError_t cudaFree(void* devPtr) {
 $message request = $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*));
 $comm_enqueue($cuda_host_comm, request);
 // Block until the device confirms the free has completed.
 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
 return cudaSuccess;
}
152
/* Shared implementation of cudaMemcpy and cudaMemcpyAsync on the host side.
 * Host-to-host copies are done locally; anything touching device memory is
 * delegated to the device process via a tagged request/acknowledge exchange. */
void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
 // Purely host-side copies never involve the device process.
 if (kind == cudaMemcpyHostToHost) {
  memcpy(dst, src, count);
  return;
 }

 $cuda_memcpy_data args;
 args.dst = dst;
 args.src = src;
 args.count = count;
 args.kind = kind;

 // The tag distinguishes sync from async requests on the device side.
 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
 // Wait for the device's acknowledgement under the same tag.
 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
}
169
/* Host-side cudaMemcpy: blocking copy between host/device buffers. */
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
 $cuda_helper_host_memcpy(dst, src, count, kind, false);
 return cudaSuccess;
}

/* Host-side cudaMemcpyAsync.
 * NOTE(review): the stream argument is currently ignored — the device-side
 * handler enqueues every copy on the default stream; confirm this is the
 * intended mockup behavior before relying on per-stream ordering. */
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
 cudaMemcpyKind kind, cudaStream_t stream) {
 $cuda_helper_host_memcpy(dst, src, count, kind, true);
 return cudaSuccess;
}
180
181/**
182 * TODO:
183 * - test
184 * - atomic?
185 */
186/*
187cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
188 // Create new stream node in linked list
189 $cuda_stream_node_t newHead = $create_new_stream_node();
190 newHead->next = $cuda_global_context.head;
191 $cuda_global_context.head->prev = newHead;
192
193 // Update cuda context's head to be the new node we created
194 $cuda_global_context.head = newHead;
195 $cuda_global_context.numStreams++;
196
197 return cudaSuccess;
198}
199*/
200
201/**
202 * TODO:
203 * - test
204 * - atomic?
205 */
206/*
207cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
208 stream = $default_stream_if_null(stream);
209 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
210 $when(stream->head == NULL) return cudaSuccess;
211}
212*/
213
214// TODO: atomic
215/*
216cudaError_t cudaStreamDestroy(cudaStream_t stream) {
217 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
218 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
219 $destroy_stream_node(stream->containingNode);
220 return cudaSuccess;
221}
222*/
223
224/*
225cudaError_t cudaDeviceSynchronize() {
226 $proc* opsToWaitOn;
227 int numOps = 0;
228
229 $atomic {
230 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
231
232 for ($cuda_stream_node_t node = $cuda_global_context.head;
233 node != NULL;
234 node = node->next) {
235 if (node->stream->tail != NULL) {
236 opsToWaitOn[numOps] = node->stream->tail->opState->op;
237 numOps++;
238 }
239 }
240 }
241 $waitall(opsToWaitOn, numOps);
242
243 return cudaSuccess;
244}
245*/
246
/* Host-side launch stub for kernel_1 (generated counterpart of the
 * <<<grid, block>>> syntax). Packs the launch configuration and kernel
 * arguments into one message, sends it to the device, and waits only for
 * the launch acknowledgement (not for kernel completion). */
void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream,
 const float* A, const float* B, float* C, int numElements) {
 $cuda_kernel_1_data launchArgs;
 launchArgs.gridDim = gridDim;
 launchArgs.blockDim = blockDim;
 launchArgs.$cudaMemSize = $cudaMemSize;
 launchArgs.$cudaStream = $cudaStream;
 launchArgs.A = A;
 launchArgs.B = B;
 launchArgs.C = C;
 launchArgs.numElements = numElements;

 $message launchMsg = $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &launchArgs, sizeof($cuda_kernel_1_data));
 $comm_enqueue($cuda_host_comm, launchMsg);
 // The device replies as soon as the kernel op is enqueued on its stream.
 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
}
262
263/////////////////
264// CUDA "file" //
265/////////////////
266
/* Device process main: owns all device-side state (scope, streams, context)
 * and services host requests via message-passing until teardown. */
void _cuda_main() {

 //////////////////////
 // Device Variables //
 //////////////////////

 // Scope in which all device-side allocations are made.
 $scope $cuda_scope = $here;

 // Device-side endpoint of the host<->device communicator (place 1).
 $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
 $cuda_context $cuda_global_context;
 cudaStream_t $cuda_default_stream;

 /////////////////////////////////
 // Context & Stream Management //
 /////////////////////////////////

 // Helper function to get the default stream if passed NULL, and just returns stream otherwise
 cudaStream_t $default_stream_if_null(cudaStream_t stream) {
 return stream == NULL ? $cuda_default_stream : stream;
 }

 // Allocates a fresh, empty, alive stream wrapped in an unlinked list node.
 // The caller is responsible for linking the node into the context's list.
 $cuda_stream_node_t $create_new_stream_node() {
 cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
 newStream->head = NULL;
 newStream->tail = NULL;
 newStream->numOps = 0;
 newStream->alive = true;

 $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
 newHead->stream = newStream;
 newStream->containingNode = newHead;
 newHead->prev = NULL;
 newHead->next = NULL;

 return newHead;
 }
303
 /* Atomically unlinks node from the stream list and marks its stream dead,
  * then spawns a reaper proc that frees the stream storage once the stream's
  * last enqueued op (if any) has finished. Returns the reaper $proc so the
  * caller can $wait for reclamation.
  * NOTE(review): $cuda_global_context.head and numStreams are not updated
  * here; harmless for the teardown path (the only live caller) but would
  * need fixing if cudaStreamDestroy is re-enabled. */
 //@ depends_on \nothing;
 $atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
 $proc lastOpProc = $proc_null;
 cudaStream_t stream = node->stream;

 // Unlink node from the doubly-linked stream list.
 if (node->prev != NULL) {
 node->prev->next = node->next;
 }
 if (node->next != NULL) {
 node->next->prev = node->prev;
 }
 free(node);

 stream->alive = false;
 if(stream->tail != NULL)
 lastOpProc = stream->tail->opState->op;

 // Reaper: waits for the stream's final op before freeing the stream itself.
 void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
 $wait(lastOpProc);
 free(stream);
 }

 return $spawn $destroy_stream_when_complete(lastOpProc, stream);
 }
328
 /**
 * Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
 *
 * Reasoning behind using enqueuedFlag:
 * + Enforces in the interface more explicitly that device proc shouldn't continue until new op state is created and properly filled out (including the $proc field)
 * + Keeps symmetry since this method means the op does both enqueueing and dequeueing. The alternative technique would have device enqueuing and op dequeuing.
 * + Reduces dependencies since device proc will not be manipulating the streams nor have direct access to the newly created op state.
 * + Keeps responsibility of device proc strictly to interpreting messages, spawning appropriate ops and sending message.
 */
 //@ depends_on \nothing;
 $atomic_f $cuda_op_state_t $stream_enqueue(_Bool** enqueuedFlag, cudaStream_t stream) {
 // Op state is allocated in the device scope so it outlives the calling proc.
 $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state));
 newOpState->start = false;
 newOpState->op = $self;

 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state_node));
 newOpStateNode->opState = newOpState;
 newOpStateNode->next = NULL;

 stream = $default_stream_if_null(stream);
 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");

 if (stream->tail == NULL) {
 // Empty stream: the new op is the head and may start immediately.
 stream->head = newOpStateNode;
 stream->tail = newOpStateNode;
 newOpState->start = true;
 } else {
 // Non-empty stream: append; the op starts when its predecessor dequeues.
 stream->tail->next = newOpStateNode;
 stream->tail = newOpStateNode;
 }
 stream->numOps++;
 // Signal the spawner that enqueueing completed, then drop the pointer so
 // this proc no longer references the spawner's stack.
 **enqueuedFlag = true;
 *enqueuedFlag = NULL;

 return newOpState;
 }
365
 /* Atomically removes the head (just-finished) op from stream, starts the
  * next op in FIFO order if one exists, and frees the removed op's state. */
 //@ depends_on \nothing;
 $atomic_f void $stream_dequeue(cudaStream_t stream) {
 stream = $default_stream_if_null(stream);
 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");

 // Removing the only element empties the stream.
 if (stream->head == stream->tail) {
 stream->tail = NULL;
 }

 $cuda_op_state_node_t oldHead = stream->head;
 stream->head = oldHead->next;
 if (stream->head != NULL) {
 // Allow the successor op to begin executing.
 stream->head->opState->start = true;
 }

 stream->numOps--;
 free(oldHead->opState);
 free(oldHead);
 }
385
386 ///////////////////////////////
387 // CUDA Function Definitions //
388 ///////////////////////////////
389
 /**
 * Only called at start of program
 */
 void $cuda_setup() {
 // Create the default stream and make it the sole entry in the context.
 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
 $cuda_default_stream = defaultStreamNode->stream;

 $cuda_global_context.head = defaultStreamNode;
 $cuda_global_context.numStreams = 1;
 }
400
 /**
 * Only called at end of program
 */
 void $cuda_teardown() {
 // Destroy the default stream and wait until its storage is reclaimed,
 // then release the device's communicator endpoint.
 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
 $wait(destructor);
 $comm_destroy($cuda_device_comm);
 }
409
 /* Handles a host cudaFree request: unpacks the hidden device pointer,
  * $reveal-s it back to a real pointer, frees it, and returns the ack. */
 $message $cuda_free($message request) {
 void* devPtr;
 $message_unpack(request, &devPtr, sizeof(void*));
 free($reveal(devPtr));
 //free(devPtr);

 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
 }
418
419 void $cuda_memcpy_proc(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool* enqueuedFlag, cudaStream_t stream) {
420 $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, stream);
421 $when(opState->start);
422
423 if (kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
424 dst = $reveal(dst);
425 }
426 if (kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
427 src = $reveal(src);
428 }
429 memcpy(dst, src, count);
430
431 $stream_dequeue(stream);
432 }
433
 /* Handles a host cudaMemcpy/cudaMemcpyAsync request: spawns the copy as an
  * op on the default stream. For a synchronous copy involving host memory the
  * acknowledgement is delayed until the copy finishes; a D2D copy stays
  * entirely in device memory, so the host need not wait for it. */
 $message $cuda_memcpy($message request, _Bool async) {
 $cuda_memcpy_data args;
 $message_unpack(request, &args, sizeof($cuda_memcpy_data));

 _Bool enqueuedFlag = false;
 $proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, args.kind, &enqueuedFlag, $cuda_default_stream);
 // Do not reply before the op is safely enqueued on the stream.
 $when(enqueuedFlag);

 if (!async && args.kind != cudaMemcpyDeviceToDevice) {
 $wait(memcpyProc);
 }
 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
 }
449
450 ////////////////////////
451 // Kernel Definitions //
452 ////////////////////////
453
 // Helper function: flattens a 3D coordinate into a linear index
 // (x fastest-varying, then y, then z).
 int $dim3_index(dim3 size, uint3 location) {
 return location.x + size.x * (location.y + size.y * location.z);
 }

 // Helper function: global thread index =
 // (linear block index) * (threads per block) + (linear thread index).
 int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
 }
463
 /* Spawns spawningFunction once per coordinate of the dim.x*dim.y*dim.z grid
  * and waits for every spawned proc to finish. The $local_start/$local_end
  * pair brackets the spawning section for partial-order reduction. */
 void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
 //TODO: calculate length and index, replace this function in the kernel
 $local_start();
 int length = dim.x * dim.y * dim.z;
 $proc proc_array[length];
 $range rx = 0 .. dim.x - 1;
 $range ry = 0 .. dim.y - 1;
 $range rz = 0 .. dim.z - 1;
 $domain(3) dom = ($domain(3)){rx, ry, rz};
 $for(int x,y,z : dom){
 uint3 id = { x, y, z };
 // Linearize (x,y,z) so each proc gets a unique slot in proc_array.
 int index = $dim3_index(dim, id);
 proc_array[index] = $spawn spawningFunction(id);
 }
 $local_end();
 $waitall(proc_array,length);
 }
481
 // Generated from kernel_1 definition
 /* Device-side execution of kernel_1 (vector add: C[i] = A[i] + B[i]).
  * Spawns one proc per block; each block spawns one proc per thread and
  * gives its threads a block-level barrier for __syncthreads-style use. */
 void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
 const float *A, const float *B, float *C, int numElements) {
 void _cuda_block(uint3 blockIdx) {
 int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
 $scope _block_root = $here;
 // One barrier slot per thread in this block.
 $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
 void _cuda_thread(uint3 threadIdx) {
 int _cuda_tid = $dim3_index(blockDim, threadIdx);
 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
 $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
 $local_start();
 // Kernel definition start

 int i = blockDim.x * blockIdx.x + threadIdx.x;

 // Guard against the grid over-covering the data.
 if (i < numElements)
 {
 C[i] = A[i] + B[i];
 }

 // Kernel definition end
 $local_end();
 $barrier_destroy(_cuda_thread_barrier);
 }
 $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
 $gbarrier_destroy(_cuda_block_barrier);
 }
 $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
 }
512
 /* Runs kernel_1 as a stream operation: enqueue on $cudaStream, wait for
  * this op's turn, execute the kernel, then dequeue (starting the next op). */
 void $cuda_kernel_1_proc (_Bool* enqueuedFlag, dim3 gridDim, dim3 blockDim,
 size_t $cudaMemSize, cudaStream_t $cudaStream,
 const float *A, const float *B, float *C, int numElements) {
 $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, $cudaStream);
 $when(opState->start);
 $cuda_kernel_1(gridDim, blockDim, $cudaMemSize, A, B, C, numElements);
 $stream_dequeue($cudaStream);
 }
521
 /* Handles a host kernel-launch request: spawns the kernel as a stream op
  * ($reveal-ing the hidden device pointers) and acknowledges as soon as the
  * op is enqueued — kernel launches are asynchronous from the host's view. */
 $message $cuda_device_launch_kernel_1($message request) {
 $cuda_kernel_1_data args;
 $message_unpack(request, &args, sizeof($cuda_kernel_1_data));

 _Bool enqueuedFlag = false;
 $spawn $cuda_kernel_1_proc(&enqueuedFlag, args.gridDim, args.blockDim, args.$cudaMemSize, args.$cudaStream, $reveal(args.A), $reveal(args.B), $reveal(args.C), args.numElements);
 $when(enqueuedFlag);

 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_LAUNCH_kernel_1, NULL, 0);
 }
532
533 /////////////////
534 // Device main //
535 /////////////////
536
 $cuda_setup();

 // Request-dispatch loop: receive one host message at a time, service it,
 // and send the response. Exits (after teardown) on $CUDA_TAG_TEARDOWN.
 while (true) {
 $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
 $message response;
 const int tag = $message_tag(request);

 switch(tag) {
 case $CUDA_TAG_SCOPE_REQUEST :
 // Hand the host the device scope so it can $malloc device memory.
 response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
 break;
 case $CUDA_TAG_cudaFree :
 response = $cuda_free(request);
 break;
 case $CUDA_TAG_cudaMemcpy :
 response = $cuda_memcpy(request, false);
 break;
 case $CUDA_TAG_cudaMemcpyAsync :
 response = $cuda_memcpy(request, true);
 break;
 case $CUDA_TAG_LAUNCH_kernel_1 :
 response = $cuda_device_launch_kernel_1(request);
 break;
 case $CUDA_TAG_TEARDOWN :
 // Teardown sends no reply; the device process simply returns.
 $cuda_teardown();
 return;
 default :
 $assert(false, "Unknown CUDA request");
 }

 $comm_enqueue($cuda_device_comm, response);
 }
}
570
571///////////////
572// Host file //
573///////////////
574
// Symbolic verification inputs: vector length and the two input vectors.
$input int N;
$assume (N > 0);
$input float A[N];
$input float B[N];
579
/* Host program: allocates three device buffers, copies inputs A and B to the
 * device, launches the vector-add kernel, copies the result back, asserts
 * C[i] == A[i] + B[i], frees device memory, and signals device teardown. */
void _host_main() {
 int size = N * sizeof(float);
 int numBlocks = 2;
 // ceil(N/2): enough threads so numBlocks * numThreads >= N
 int numThreads = N%2 == 0? N/2 : (N+1)/2;

 float* cuda_A;
 // cudaMalloc((void **)&cuda_A, size);
 {
 $scope deviceScope = $cuda_host_request_device_scope();
 // Allocate in the device scope; $hide marks the pointer as device memory
 // so only $reveal-ing code (device side) may dereference it.
 cuda_A = $hide((float*)$malloc(deviceScope, size));
 //cuda_A = (float*)$malloc(deviceScope, size);
 }
 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);

 float* cuda_B;
 // cudaMalloc((void **)&cuda_B, size);
 {
 $scope deviceScope = $cuda_host_request_device_scope();
 cuda_B = $hide((float*)$malloc(deviceScope, size));
 //cuda_B = (float*)$malloc(deviceScope, size);
 }
 cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);

 float* cuda_C;
 // cudaMalloc((void **)&cuda_C, size);
 {
 $scope deviceScope = $cuda_host_request_device_scope();
 cuda_C = $hide((float*)$malloc(deviceScope, size));
 //cuda_C = (float*)$malloc(deviceScope, size);
 }

 dim3 gridDim = {numBlocks, 1, 1};
 dim3 blockDim = {numThreads, 1, 1};
 // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
 $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);

 //Checking correctness
 float* C = (float *)malloc(size);

 // Blocking D2H copy is enqueued on the default stream behind the kernel
 // op, so it observes the kernel's completed result.
 cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);

 for(int i = 0; i < N; i++)
 $assert(C[i] == A[i] + B[i]);

 free(C);

 cudaFree(cuda_A);
 cudaFree(cuda_B);
 cudaFree(cuda_C);

 // inserted by transformer
 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
 $comm_destroy($cuda_host_comm);
}
634
/* Entry point: runs the host program and the device process concurrently,
 * waits for both to finish, then destroys the global communicator. */
int main() {
 $proc host = $spawn _host_main();
 $proc cuda = $spawn _cuda_main();
 $wait(host);
 $wait(cuda);
 $gcomm_destroy($cuda_gcomm, NULL);
}
Note: See TracBrowser for help on using the repository browser.