/**
 * TODO:
 * - implement cudaMemset and cudaMemsetAsync
 * - flesh out basic structure of cuda kernel:
 *   - spawn gridDim blocks
 *   - spawn blockDim threads
 *     (Alternatively, spawn blockDim warps and then spawn each warp's threads)
 *   - wait for blocks to finish
 * - Add block-level barriers and a __syncthreads() nested function that uses the barrier.
 * - Add data-race checking support and implement atomicAdd for integers
 * - Handle dependencies at atomic blocks (replace some with local blocks if possible)
 */
|
|---|
| 13 |
|
|---|
| 14 | #include <concurrency.cvh>
|
|---|
| 15 | #include <comm.cvh>
|
|---|
| 16 | #include <stdlib.h>
|
|---|
| 17 | #include <stdio.h>
|
|---|
| 18 | #include <stdbool.h>
|
|---|
| 19 | #include <string.h>
|
|---|
| 20 |
|
|---|
| 21 | ///////////
|
|---|
| 22 | // Types //
|
|---|
| 23 | ///////////
|
|---|
| 24 |
|
|---|
// Subset of CUDA's cudaError codes; only success is modeled so far.
enum cudaError {
    cudaSuccess
};
typedef enum cudaError cudaError_t;
|
|---|
| 29 |
|
|---|
// Direction of a cudaMemcpy, mirroring the CUDA runtime enum. The device
// side uses this to decide which pointers must be $reveal'ed.
typedef enum cudaMemcpyKind {
    cudaMemcpyHostToHost,
    cudaMemcpyHostToDevice,
    cudaMemcpyDeviceToHost,
    cudaMemcpyDeviceToDevice,
    cudaMemcpyDefault
} cudaMemcpyKind;
|
|---|
| 37 |
|
|---|
// Dimensions of a grid or block (CUDA's dim3).
typedef struct {
    unsigned int x, y, z;
} dim3;

/* used to represent a location in a three dimensional grid
 * (CUDA's uint3, e.g. blockIdx / threadIdx) */
typedef struct {
    unsigned int x, y, z;
} uint3;
|
|---|
| 47 |
|
|---|
// State of one enqueued CUDA operation: `start` is set (inside an atomic
// stream operation) when the op reaches the head of its stream, releasing
// the op proc's $when(opState->start). `op` is the $proc executing the op,
// which others may $wait on.
typedef struct $cuda_op_state* $cuda_op_state_t;
struct $cuda_op_state {
    _Bool start;
    $proc op;
};

// Singly-linked FIFO node in a stream's pending-operation queue.
typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
struct $cuda_op_state_node {
    $cuda_op_state_t opState;
    $cuda_op_state_node_t next;
};
|
|---|
| 59 |
|
|---|
// Forward typedefs: streams and their containing list nodes reference
// each other.
typedef struct cudaStream* cudaStream_t;
typedef struct $cuda_stream_node* $cuda_stream_node_t;
// A stream is a FIFO (head..tail) of pending operations plus a
// back-pointer to the context list node that owns it. `alive` is cleared
// on destruction so later use can be caught by $assert.
struct cudaStream {
    $cuda_op_state_node_t head;
    $cuda_op_state_node_t tail;
    int numOps;
    $cuda_stream_node_t containingNode;
    _Bool alive;
};

// Doubly-linked node in the context's list of streams.
struct $cuda_stream_node {
    cudaStream_t stream;
    $cuda_stream_node_t prev;
    $cuda_stream_node_t next;
};
|
|---|
| 75 |
|
|---|
// Device-wide state: the list of all live streams and its length.
typedef struct $cuda_context {
    $cuda_stream_node_t head; //list of streams
    int numStreams;
} $cuda_context;
|
|---|
| 80 |
|
|---|
// Payload of a cudaMemcpy / cudaMemcpyAsync request message from host to
// device.
typedef struct $cuda_memcpy_data {
    void* dst;
    const void* src;
    size_t count;       // number of bytes to copy
    cudaMemcpyKind kind;
} $cuda_memcpy_data;
|
|---|
| 87 |
|
|---|
// Payload of a kernel_1 launch request: the launch configuration plus the
// kernel's actual arguments (struct generated by the transformer).
typedef struct $cuda_kernel_1_data {
    dim3 gridDim;
    dim3 blockDim;
    // presumably the <<<...>>> dynamic shared-memory size — TODO confirm;
    // currently passed through unused by $cuda_kernel_1.
    size_t $cudaMemSize;
    cudaStream_t $cudaStream; // launch stream (NULL means default stream)
    const float* A;
    const float* B;
    float* C;
    int numElements;
} $cuda_kernel_1_data;
|
|---|
| 98 |
|
|---|
//////////////////////
// Global Variables //
//////////////////////

// Two places share one global communicator: host (rank 0) and device
// (rank 1). The host endpoint is created here; the device creates its own
// endpoint inside _cuda_main.
$gcomm $cuda_gcomm = $gcomm_create($here, 2);
const int $CUDA_PLACE_HOST = 0;
const int $CUDA_PLACE_DEVICE = 1;
$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
|
|---|
| 107 |
|
|---|
/**
 * Tags used for message-passing between host and device. Each host-side
 * API call sends a request with one of these tags and (except TEARDOWN)
 * blocks on a reply carrying the same tag.
 */
enum $cuda_tag {
    // Predefined tags
    $CUDA_TAG_TEARDOWN,      // host tells the device interpreter loop to exit
    $CUDA_TAG_SCOPE_REQUEST, // host asks for the device's root $scope
    $CUDA_TAG_cudaFree,
    $CUDA_TAG_cudaMemcpy,
    $CUDA_TAG_cudaMemcpyAsync,
    // Generated tags (by transformer)
    $CUDA_TAG_LAUNCH_kernel_1
};
|
|---|
| 122 |
|
|---|
| 123 | ////////////////////////////////////////////
|
|---|
| 124 | // CUDA API Functions (For Host-use Only) //
|
|---|
| 125 | ////////////////////////////////////////////
|
|---|
| 126 |
|
|---|
| 127 | /*
|
|---|
| 128 | cudaError_t cudaMalloc(void** devPtr, size_t size) {
|
|---|
| 129 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
|
|---|
| 130 | $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
|
|---|
| 131 | $message_unpack(response, devPtr, sizeof(void*));
|
|---|
| 132 |
|
|---|
| 133 | return cudaSuccess;
|
|---|
| 134 | }
|
|---|
| 135 | */
|
|---|
| 136 |
|
|---|
| 137 | $scope $cuda_host_request_device_scope() {
|
|---|
| 138 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
|
|---|
| 139 | $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
|
|---|
| 140 | $scope result;
|
|---|
| 141 | $message_unpack(response, &result, sizeof($scope));
|
|---|
| 142 |
|
|---|
| 143 | return result;
|
|---|
| 144 | }
|
|---|
| 145 |
|
|---|
| 146 | cudaError_t cudaFree(void* devPtr) {
|
|---|
| 147 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
|
|---|
| 148 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
|
|---|
| 149 |
|
|---|
| 150 | return cudaSuccess;
|
|---|
| 151 | }
|
|---|
| 152 |
|
|---|
| 153 | void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
|
|---|
| 154 | if (kind == cudaMemcpyHostToHost) {
|
|---|
| 155 | memcpy(dst, src, count);
|
|---|
| 156 | } else {
|
|---|
| 157 | $cuda_memcpy_data args;
|
|---|
| 158 | args.dst = dst;
|
|---|
| 159 | args.src = src;
|
|---|
| 160 | args.count = count;
|
|---|
| 161 | args.kind = kind;
|
|---|
| 162 |
|
|---|
| 163 | int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
|
|---|
| 164 |
|
|---|
| 165 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
|
|---|
| 166 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
|
|---|
| 167 | }
|
|---|
| 168 | }
|
|---|
| 169 |
|
|---|
// Synchronous copy: delegates to the shared helper with async = false,
// which blocks until the copy has completed (see $cuda_memcpy for the
// device-to-device exception).
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
    $cuda_helper_host_memcpy(dst, src, count, kind, false);
    return cudaSuccess;
}
|
|---|
| 174 |
|
|---|
// Asynchronous copy: returns once the device has enqueued the operation.
// NOTE(review): `stream` is currently ignored — the device side always
// enqueues onto the default stream ($cuda_memcpy passes
// $cuda_default_stream). Confirm before relying on per-stream ordering.
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
                            cudaMemcpyKind kind, cudaStream_t stream) {
    $cuda_helper_host_memcpy(dst, src, count, kind, true);
    return cudaSuccess;
}
|
|---|
| 180 |
|
|---|
| 181 | /**
|
|---|
| 182 | * TODO:
|
|---|
| 183 | * - test
|
|---|
| 184 | * - atomic?
|
|---|
| 185 | */
|
|---|
| 186 | /*
|
|---|
| 187 | cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
|
|---|
| 188 | // Create new stream node in linked list
|
|---|
| 189 | $cuda_stream_node_t newHead = $create_new_stream_node();
|
|---|
| 190 | newHead->next = $cuda_global_context.head;
|
|---|
| 191 | $cuda_global_context.head->prev = newHead;
|
|---|
| 192 |
|
|---|
| 193 | // Update cuda context's head to be the new node we created
|
|---|
| 194 | $cuda_global_context.head = newHead;
|
|---|
| 195 | $cuda_global_context.numStreams++;
|
|---|
| 196 |
|
|---|
| 197 | return cudaSuccess;
|
|---|
| 198 | }
|
|---|
| 199 | */
|
|---|
| 200 |
|
|---|
| 201 | /**
|
|---|
| 202 | * TODO:
|
|---|
| 203 | * - test
|
|---|
| 204 | * - atomic?
|
|---|
| 205 | */
|
|---|
| 206 | /*
|
|---|
| 207 | cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
|---|
| 208 | stream = $default_stream_if_null(stream);
|
|---|
| 209 | $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
|
|---|
| 210 | $when(stream->head == NULL) return cudaSuccess;
|
|---|
| 211 | }
|
|---|
| 212 | */
|
|---|
| 213 |
|
|---|
| 214 | // TODO: atomic
|
|---|
| 215 | /*
|
|---|
| 216 | cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
|---|
| 217 | $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
|
|---|
| 218 | $assert(stream->alive, "Attempt to destroy an already destroyed stream");
|
|---|
| 219 | $destroy_stream_node(stream->containingNode);
|
|---|
| 220 | return cudaSuccess;
|
|---|
| 221 | }
|
|---|
| 222 | */
|
|---|
| 223 |
|
|---|
| 224 | /*
|
|---|
| 225 | cudaError_t cudaDeviceSynchronize() {
|
|---|
| 226 | $proc* opsToWaitOn;
|
|---|
| 227 | int numOps = 0;
|
|---|
| 228 |
|
|---|
| 229 | $atomic {
|
|---|
| 230 | opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
|
|---|
| 231 |
|
|---|
| 232 | for ($cuda_stream_node_t node = $cuda_global_context.head;
|
|---|
| 233 | node != NULL;
|
|---|
| 234 | node = node->next) {
|
|---|
| 235 | if (node->stream->tail != NULL) {
|
|---|
| 236 | opsToWaitOn[numOps] = node->stream->tail->opState->op;
|
|---|
| 237 | numOps++;
|
|---|
| 238 | }
|
|---|
| 239 | }
|
|---|
| 240 | }
|
|---|
| 241 | $waitall(opsToWaitOn, numOps);
|
|---|
| 242 |
|
|---|
| 243 | return cudaSuccess;
|
|---|
| 244 | }
|
|---|
| 245 | */
|
|---|
| 246 |
|
|---|
| 247 | void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream,
|
|---|
| 248 | const float* A, const float* B, float* C, int numElements) {
|
|---|
| 249 | $cuda_kernel_1_data args;
|
|---|
| 250 | args.gridDim = gridDim;
|
|---|
| 251 | args.blockDim = blockDim;
|
|---|
| 252 | args.$cudaMemSize = $cudaMemSize;
|
|---|
| 253 | args.$cudaStream = $cudaStream;
|
|---|
| 254 | args.A = A;
|
|---|
| 255 | args.B = B;
|
|---|
| 256 | args.C = C;
|
|---|
| 257 | args.numElements = numElements;
|
|---|
| 258 |
|
|---|
| 259 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
|
|---|
| 260 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
|
|---|
| 261 | }
|
|---|
| 262 |
|
|---|
/////////////////
// CUDA "file" //
/////////////////

// Device-side main: owns all device state (context, streams, default
// stream) and interprets request messages from the host. The stream
// management helpers below are nested functions so they share this state.
void _cuda_main() {

//////////////////////
// Device Variables //
//////////////////////

// Root scope of the device proc; device allocations live here, and the
// host $mallocs into it after a $CUDA_TAG_SCOPE_REQUEST.
$scope $cuda_scope = $here;

// Device endpoint of the global communicator (place 1 == $CUDA_PLACE_DEVICE).
$comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
$cuda_context $cuda_global_context;  // list of all live streams
cudaStream_t $cuda_default_stream;   // stream used when callers pass NULL
|
|---|
| 278 |
|
|---|
| 279 | /////////////////////////////////
|
|---|
| 280 | // Context & Stream Management //
|
|---|
| 281 | /////////////////////////////////
|
|---|
| 282 |
|
|---|
| 283 | // Helper function to get the default stream if passed NULL, and just returns stream otherwise
|
|---|
| 284 | cudaStream_t $default_stream_if_null(cudaStream_t stream) {
|
|---|
| 285 | return stream == NULL ? $cuda_default_stream : stream;
|
|---|
| 286 | }
|
|---|
| 287 |
|
|---|
| 288 | $cuda_stream_node_t $create_new_stream_node() {
|
|---|
| 289 | cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
|
|---|
| 290 | newStream->head = NULL;
|
|---|
| 291 | newStream->tail = NULL;
|
|---|
| 292 | newStream->numOps = 0;
|
|---|
| 293 | newStream->alive = true;
|
|---|
| 294 |
|
|---|
| 295 | $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
|
|---|
| 296 | newHead->stream = newStream;
|
|---|
| 297 | newStream->containingNode = newHead;
|
|---|
| 298 | newHead->prev = NULL;
|
|---|
| 299 | newHead->next = NULL;
|
|---|
| 300 |
|
|---|
| 301 | return newHead;
|
|---|
| 302 | }
|
|---|
| 303 |
|
|---|
//@ depends_on \nothing;
// Atomically unlinks `node` from the context's stream list and marks its
// stream dead. The stream struct itself cannot be freed until its last
// enqueued op finishes, so freeing is delegated to a spawned proc that
// waits on the tail op; that destructor proc is returned so callers can
// $wait on it.
// NOTE(review): if `node` is the context's head, $cuda_global_context.head
// is left pointing at freed memory — confirm callers either only destroy
// non-head nodes or never touch the context afterward (teardown does the
// latter).
$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
    $proc lastOpProc = $proc_null;
    cudaStream_t stream = node->stream;

    // Unlink from the doubly-linked stream list.
    if (node->prev != NULL) {
        node->prev->next = node->next;
    }
    if (node->next != NULL) {
        node->next->prev = node->prev;
    }
    free(node);

    stream->alive = false;
    if (stream->tail != NULL)
        lastOpProc = stream->tail->opState->op; // last pending op, if any

    // Frees the stream once its final op has completed. When the stream
    // was idle this waits on $proc_null — presumably a no-op; TODO confirm.
    void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
        $wait(lastOpProc);
        free(stream);
    }

    return $spawn $destroy_stream_when_complete(lastOpProc, stream);
}
|
|---|
| 328 |
|
|---|
/**
 * Enqueues the calling $proc as a new cuda operation onto stream. The
 * caller should then block ($when) on the returned state's `start` flag
 * until the operation is allowed to execute.
 *
 * Reasoning behind using enqueuedFlag:
 * + Enforces in the interface more explicitly that the device proc
 *   shouldn't continue until the new op state is created and properly
 *   filled out (including the $proc field)
 * + Keeps symmetry since this method means the op does both enqueueing and
 *   dequeueing. The alternative technique would have the device enqueuing
 *   and the op dequeuing.
 * + Reduces dependencies since the device proc will not be manipulating
 *   the streams nor have direct access to the newly created op state.
 * + Keeps responsibility of the device proc strictly to interpreting
 *   messages, spawning appropriate ops and sending messages.
 */
//@ depends_on \nothing;
$atomic_f $cuda_op_state_t $stream_enqueue(_Bool** enqueuedFlag, cudaStream_t stream) {
    // Allocate in the device's root scope so the state outlives this proc.
    $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state));
    newOpState->start = false;
    newOpState->op = $self; // the op proc itself; others $wait on this

    $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state_node));
    newOpStateNode->opState = newOpState;
    newOpStateNode->next = NULL;

    stream = $default_stream_if_null(stream);
    $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");

    if (stream->tail == NULL) {
        // Queue was empty: this op is the head and may start immediately.
        stream->head = newOpStateNode;
        stream->tail = newOpStateNode;
        newOpState->start = true;
    } else {
        stream->tail->next = newOpStateNode;
        stream->tail = newOpStateNode;
    }
    stream->numOps++;
    // Signal the spawner (through the double pointer) that enqueueing is
    // complete, then null the caller's pointer copy so the flag cannot be
    // written again.
    **enqueuedFlag = true;
    *enqueuedFlag = NULL;

    return newOpState;
}
|
|---|
| 365 |
|
|---|
//@ depends_on \nothing;
// Atomically pops the head operation off `stream`, wakes the next op (if
// any), and frees the popped node and its state. Intended to be called by
// the op currently at the head, once it has finished executing.
$atomic_f void $stream_dequeue(cudaStream_t stream) {
    stream = $default_stream_if_null(stream);
    $assert(stream->head != NULL, "Attempt to dequeue an empty stream");

    // Clear the tail first when removing the only element — must happen
    // before head is advanced, since the comparison uses the old head.
    if (stream->head == stream->tail) {
        stream->tail = NULL;
    }

    $cuda_op_state_node_t oldHead = stream->head;
    stream->head = oldHead->next;
    if (stream->head != NULL) {
        stream->head->opState->start = true; // release the next op's $when
    }

    stream->numOps--;
    // NOTE(review): opState/node were allocated with $malloc($cuda_scope,..)
    // but released with free() — confirm CIVL treats these as compatible.
    free(oldHead->opState);
    free(oldHead);
}
|
|---|
| 385 |
|
|---|
| 386 | ///////////////////////////////
|
|---|
| 387 | // CUDA Function Definitions //
|
|---|
| 388 | ///////////////////////////////
|
|---|
| 389 |
|
|---|
| 390 | /**
|
|---|
| 391 | * Only called at start of program
|
|---|
| 392 | */
|
|---|
| 393 | void $cuda_setup() {
|
|---|
| 394 | $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
|
|---|
| 395 | $cuda_default_stream = defaultStreamNode->stream;
|
|---|
| 396 |
|
|---|
| 397 | $cuda_global_context.head = defaultStreamNode;
|
|---|
| 398 | $cuda_global_context.numStreams = 1;
|
|---|
| 399 | }
|
|---|
| 400 |
|
|---|
| 401 | /**
|
|---|
| 402 | * Only called at end of program
|
|---|
| 403 | */
|
|---|
| 404 | void $cuda_teardown() {
|
|---|
| 405 | $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
|
|---|
| 406 | $wait(destructor);
|
|---|
| 407 | $comm_destroy($cuda_device_comm);
|
|---|
| 408 | }
|
|---|
| 409 |
|
|---|
| 410 | $message $cuda_free($message request) {
|
|---|
| 411 | void* devPtr;
|
|---|
| 412 | $message_unpack(request, &devPtr, sizeof(void*));
|
|---|
| 413 | free($reveal(devPtr));
|
|---|
| 414 | //free(devPtr);
|
|---|
| 415 |
|
|---|
| 416 | return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
|
|---|
| 417 | }
|
|---|
| 418 |
|
|---|
| 419 | void $cuda_memcpy_proc(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool* enqueuedFlag, cudaStream_t stream) {
|
|---|
| 420 | $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, stream);
|
|---|
| 421 | $when(opState->start);
|
|---|
| 422 |
|
|---|
| 423 | if (kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
|
|---|
| 424 | dst = $reveal(dst);
|
|---|
| 425 | }
|
|---|
| 426 | if (kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
|
|---|
| 427 | src = $reveal(src);
|
|---|
| 428 | }
|
|---|
| 429 | memcpy(dst, src, count);
|
|---|
| 430 |
|
|---|
| 431 | $stream_dequeue(stream);
|
|---|
| 432 | }
|
|---|
| 433 |
|
|---|
// Handles a cudaMemcpy/cudaMemcpyAsync request on the device side: spawns
// the copy as a stream operation and replies to the host.
$message $cuda_memcpy($message request, _Bool async) {
    $cuda_memcpy_data args;
    $message_unpack(request, &args, sizeof($cuda_memcpy_data));

    _Bool enqueuedFlag = false;
    // The op proc enqueues itself; wait until it has done so, to guarantee
    // that ops from later host requests are ordered after this one.
    // NOTE(review): always targets the default stream, even for async.
    $proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, args.kind, &enqueuedFlag, $cuda_default_stream);
    $when(enqueuedFlag);

    // Synchronous copies block until the copy completes, except
    // device-to-device copies which are allowed to finish asynchronously.
    if (!async && args.kind != cudaMemcpyDeviceToDevice) {
        $wait(memcpyProc);
    }
    int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

    return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
}
|
|---|
| 449 |
|
|---|
| 450 | ////////////////////////
|
|---|
| 451 | // Kernel Definitions //
|
|---|
| 452 | ////////////////////////
|
|---|
| 453 |
|
|---|
| 454 | // Helper function
|
|---|
| 455 | int $dim3_index(dim3 size, uint3 location) {
|
|---|
| 456 | return location.x + size.x * (location.y + size.y * location.z);
|
|---|
| 457 | }
|
|---|
| 458 |
|
|---|
| 459 | // Helper function
|
|---|
| 460 | int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
|
|---|
| 461 | return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
|
|---|
| 462 | }
|
|---|
| 463 |
|
|---|
// Spawns one proc per point of the 3-D `dim` space, passing each its
// coordinate, then waits for all of them to finish. Used both for the
// grid of blocks and for the threads within one block.
void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
    //TODO: calculate length and index, replace this function in the kernel
    $local_start();
    int length = dim.x * dim.y * dim.z;
    $proc proc_array[length];
    $range rx = 0 .. dim.x - 1;
    $range ry = 0 .. dim.y - 1;
    $range rz = 0 .. dim.z - 1;
    $domain(3) dom = ($domain(3)){rx, ry, rz};
    $for(int x,y,z : dom){
        uint3 id = { x, y, z };
        int index = $dim3_index(dim, id);
        proc_array[index] = $spawn spawningFunction(id);
    }
    $local_end();
    $waitall(proc_array,length);
}
|
|---|
| 481 |
|
|---|
// Generated from kernel_1 definition
// Model of the vector-add kernel: one proc per block, and within each
// block one proc per thread. The original kernel body is spliced in
// between the "Kernel definition start/end" markers.
void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
                    const float *A, const float *B, float *C, int numElements) {
    void _cuda_block(uint3 blockIdx) {
        int numThreads = (blockDim.x * blockDim.y) * blockDim.z; // NOTE(review): currently unused
        $scope _block_root = $here;                              // NOTE(review): currently unused
        // One gbarrier per block, sized to the block's thread count; this
        // will back __syncthreads() (not yet generated — see file TODO).
        $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
        void _cuda_thread(uint3 threadIdx) {
            int _cuda_tid = $dim3_index(blockDim, threadIdx); // linear id within block
            int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx); // global id (unused by this kernel)
            $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
            $local_start();
            // Kernel definition start

            int i = blockDim.x * blockIdx.x + threadIdx.x;

            if (i < numElements)
            {
                C[i] = A[i] + B[i];
            }

            // Kernel definition end
            $local_end();
            $barrier_destroy(_cuda_thread_barrier);
        }
        $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
        $gbarrier_destroy(_cuda_block_barrier);
    }
    $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
}
|
|---|
| 512 |
|
|---|
// Wraps the kernel as a stream operation: register on the stream, wait for
// our turn, run the kernel to completion, then release the stream slot.
void $cuda_kernel_1_proc (_Bool* enqueuedFlag, dim3 gridDim, dim3 blockDim,
                          size_t $cudaMemSize, cudaStream_t $cudaStream,
                          const float *A, const float *B, float *C, int numElements) {
    $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, $cudaStream);
    $when(opState->start); // block until this op reaches the stream head
    $cuda_kernel_1(gridDim, blockDim, $cudaMemSize, A, B, C, numElements);
    $stream_dequeue($cudaStream);
}
|
|---|
| 521 |
|
|---|
// Handles a kernel_1 launch request. Kernel launches are asynchronous: the
// op is spawned, we wait only until it has enqueued itself on its stream
// (to preserve ordering with later requests), then acknowledge the host
// immediately. The hidden device pointers are $reveal'ed here, device-side.
$message $cuda_device_launch_kernel_1($message request) {
    $cuda_kernel_1_data args;
    $message_unpack(request, &args, sizeof($cuda_kernel_1_data));

    _Bool enqueuedFlag = false;
    $spawn $cuda_kernel_1_proc(&enqueuedFlag, args.gridDim, args.blockDim, args.$cudaMemSize, args.$cudaStream, $reveal(args.A), $reveal(args.B), $reveal(args.C), args.numElements);
    $when(enqueuedFlag);

    return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_LAUNCH_kernel_1, NULL, 0);
}
|
|---|
| 532 |
|
|---|
/////////////////
// Device main //
/////////////////

$cuda_setup();

// Request-interpreter loop: dequeue one host request at a time, dispatch
// on its tag, send back the response. TEARDOWN exits without replying
// (the host does not dequeue a reply for it).
while (true) {
    $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
    $message response;
    const int tag = $message_tag(request);

    switch(tag) {
    case $CUDA_TAG_SCOPE_REQUEST :
        // Hand the host our root scope so it can allocate device memory.
        response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
        break;
    case $CUDA_TAG_cudaFree :
        response = $cuda_free(request);
        break;
    case $CUDA_TAG_cudaMemcpy :
        response = $cuda_memcpy(request, false);
        break;
    case $CUDA_TAG_cudaMemcpyAsync :
        response = $cuda_memcpy(request, true);
        break;
    case $CUDA_TAG_LAUNCH_kernel_1 :
        response = $cuda_device_launch_kernel_1(request);
        break;
    case $CUDA_TAG_TEARDOWN :
        $cuda_teardown();
        return;
    default :
        $assert(false, "Unknown CUDA request");
    }

    $comm_enqueue($cuda_device_comm, response);
}
} // end _cuda_main
|
|---|
| 570 |
|
|---|
///////////////
// Host file //
///////////////

// Symbolic program inputs: vector length and the two input vectors.
$input int N;
$assume (N > 0);
$input float A[N];
$input float B[N];
|
|---|
| 579 |
|
|---|
// Host program: vector addition C = A + B run through the CUDA model,
// with the result checked against direct host arithmetic.
void _host_main() {
    int size = N * sizeof(float); // NOTE(review): int rather than size_t — fine for bounded symbolic N
    int numBlocks = 2;
    int numThreads = N%2 == 0? N/2 : (N+1)/2; // ceil(N/2) threads per block, so 2 blocks cover all N elements

    float* cuda_A;
    // cudaMalloc((void **)&cuda_A, size);
    {
        // Model of cudaMalloc: allocate in the device's scope and $hide the
        // pointer so accidental host-side dereferences are caught.
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_A = $hide((float*)$malloc(deviceScope, size));
        //cuda_A = (float*)$malloc(deviceScope, size);
    }
    cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);

    float* cuda_B;
    // cudaMalloc((void **)&cuda_B, size);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_B = $hide((float*)$malloc(deviceScope, size));
        //cuda_B = (float*)$malloc(deviceScope, size);
    }
    cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);

    float* cuda_C;
    // cudaMalloc((void **)&cuda_C, size);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_C = $hide((float*)$malloc(deviceScope, size));
        //cuda_C = (float*)$malloc(deviceScope, size);
    }

    dim3 gridDim = {numBlocks, 1, 1};
    dim3 blockDim = {numThreads, 1, 1};
    // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
    $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);

    //Checking correctness
    float* C = (float *)malloc(size);

    // Synchronous D2H copy: completes only after the kernel's stream op
    // has finished, so C holds the final result.
    cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);

    for(int i = 0; i < N; i++)
        $assert(C[i] == A[i] + B[i]);

    free(C);

    cudaFree(cuda_A);
    cudaFree(cuda_B);
    cudaFree(cuda_C);

    // inserted by transformer
    // Tell the device interpreter to exit; no reply is expected.
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
    $comm_destroy($cuda_host_comm);
}
|
|---|
| 634 |
|
|---|
| 635 | int main() {
|
|---|
| 636 | $proc host = $spawn _host_main();
|
|---|
| 637 | $proc cuda = $spawn _cuda_main();
|
|---|
| 638 | $wait(host);
|
|---|
| 639 | $wait(cuda);
|
|---|
| 640 | $gcomm_destroy($cuda_gcomm, NULL);
|
|---|
| 641 | }
|
|---|