Context Navigation

newCudaMockup.cvl@ 7ffcb1b

main test-branch

Last change on this file since 7ffcb1b was 9cabba4, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5867 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 40.6 KB

Line
1	#include <concurrency.cvh>
2	#include <comm.cvh>
3	#include <stdlib.h>
4	#include <stdio.h>
5	#include <stdbool.h>
6	#include <string.h>
7	#include <mem.cvh>
8	#pragma CIVL ACSL
9
10	///////////
11	// Types //
12	///////////
13
14	enum cudaError {
15	cudaSuccess
16	};
17	typedef enum cudaError cudaError_t;
18
19	//typedef $cuda_memcpy_kind cudaMemcpyKind;
20
21	typedef struct {
22	unsigned int x, y, z;
23	} dim3;
24
25	/* used to represent a location in a three dimensional grid
26	*/
27	typedef struct {
28	unsigned int x, y, z;
29	} uint3;
30
31	typedef enum {
32	cudaMemcpyHostToHost,
33	cudaMemcpyHostToDevice,
34	cudaMemcpyDeviceToHost,
35	cudaMemcpyDeviceToDevice,
36	cudaMemcpyDefault
37	} cudaMemcpyKind;
38
39	typedef struct $cuda_op_state* $cuda_op_state_t;
40	typedef struct $cuda_op_state {
41	_Bool start;
42	$proc op;
43	} $cuda_op_state;
44
45	typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
46	typedef struct $cuda_op_state_node {
47	$cuda_op_state_t opState;
48	$cuda_op_state_node_t next;
49	} $cuda_op_state_node;
50
51	typedef struct $cuda_stream_node* $cuda_stream_node_t;
52	typedef struct $cuda_stream* $cuda_stream_t;
53	typedef $cuda_stream_t cudaStream_t;
54	typedef struct $cuda_stream {
55	$cuda_op_state_node_t head;
56	$cuda_op_state_node_t tail;
57	int numOps;
58	$cuda_stream_node_t containingNode;
59	_Bool alive;
60	} $cuda_stream;
61
62	typedef struct $cuda_stream_node{
63	cudaStream_t stream;
64	$cuda_stream_node_t prev;
65	$cuda_stream_node_t next;
66	} $cuda_stream_node;
67
68	typedef struct $cuda_context* $cuda_context_t;
69	typedef struct $cuda_context {
70	$cuda_stream_node_t head;
71	int numStreams;
72	} $cuda_context;
73
74	typedef struct $cuda_kernel_instance* $cuda_kernel_instance_t;
75	typedef struct $cuda_kernel_instance {
76	$mem* readSets;
77	$mem* writeSets;
78	int size;
79	} $cuda_kernel_instance;
80
81	typedef struct $cuda_memcpy_data {
82	void* dst;
83	const void* src;
84	size_t count;
85	cudaMemcpyKind kind;
86	} $cuda_memcpy_data;
87
88	//////////////////////
89	// Global Variables //
90	//////////////////////
91
92	$gcomm $cuda_gcomm = $gcomm_create($here, 2);
93	const int $CUDA_PLACE_HOST = 0;
94	const int $CUDA_PLACE_DEVICE = 1;
95	$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
96
97	/**
98	* Tags used for message-passing between host and device
99	*/
100	enum $cuda_tag {
101	// Predefined tags
102	$CUDA_TAG_TEARDOWN,
103	$CUDA_TAG_SCOPE_REQUEST,
104	$CUDA_TAG_cudaFree,
105	$CUDA_TAG_cudaMemcpy,
106	$CUDA_TAG_cudaMemcpyAsync,
107	// Generated tags (by transformer)
108	$CUDA_TAG_LAUNCH_kernel_1
109	};
110
111	///////////////////
112	// CIVL-CUDA API //
113	///////////////////
114
115	$scope $cuda_host_request_device_scope() {
116	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
117	$message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
118	$scope result;
119	$message_unpack(response, &result, sizeof($scope));
120
121	return result;
122	}
123
124	void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
125	if (kind == cudaMemcpyHostToHost) {
126	memcpy(dst, src, count);
127	} else {
128	$cuda_memcpy_data args;
129	args.dst = dst;
130	args.src = src;
131	args.count = count;
132	args.kind = kind;
133
134	int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
135
136	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
137	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
138	}
139	}
140
141	$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
142	cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream));
143	newStream->head = NULL;
144	newStream->tail = NULL;
145	newStream->numOps = 0;
146	newStream->alive = true;
147
148	$cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node));
149	newHead->stream = newStream;
150	newStream->containingNode = newHead;
151	newHead->prev = NULL;
152	newHead->next = NULL;
153
154	return newHead;
155	}
156
157	/*@ depends_on \nothing;
158	@ assigns \nothing;
159	@ reads \nothing;
160	@*/
161	$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
162	$proc lastOpProc = $proc_null;
163	cudaStream_t stream = node->stream;
164
165	if (node->prev != NULL) {
166	node->prev->next = node->next;
167	}
168	if (node->next != NULL) {
169	node->next->prev = node->prev;
170	}
171	free(node);
172
173	stream->alive = false;
174	if(stream->tail != NULL)
175	lastOpProc = stream->tail->opState->op;
176
177	void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
178	$wait(lastOpProc);
179	free(stream);
180	}
181
182	return $spawn destroyStreamWhenComplete(lastOpProc, stream);
183	}
184
185	/*@ depends_on \access(stream);
186	@ assigns stream;
187	@ reads \nothing;
188	@*/
189	$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
190	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
191
192	$cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state));
193	newOpState->start = false;
194	newOpState->op = $spawn opProc(opParams, newOpState, stream);
195
196	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node));
197	newOpStateNode->opState = newOpState;
198	newOpStateNode->next = NULL;
199
200	if (stream->tail == NULL) {
201	stream->head = newOpStateNode;
202	stream->tail = newOpStateNode;
203	newOpState->start = true;
204	} else {
205	stream->tail->next = newOpStateNode;
206	stream->tail = newOpStateNode;
207	}
208	stream->numOps++;
209
210	return newOpState->op;
211	}
212
213	/*@ depends_on \nothing;
214	@ assigns \nothing;
215	@ reads \nothing;
216	@*/
217	$atomic_f void $stream_dequeue(cudaStream_t stream) {
218	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
219
220	if (stream->head == stream->tail) {
221	stream->tail = NULL;
222	}
223
224	$cuda_op_state_node_t oldHead = stream->head;
225	stream->head = oldHead->next;
226	if (stream->head != NULL) {
227	stream->head->opState->start = true;
228	}
229
230	stream->numOps--;
231	free(oldHead->opState);
232	free(oldHead);
233	}
234
235	// Helper function
236	int $dim3_index(dim3 size, uint3 location) {
237	return location.x + size.x * (location.y + size.y * location.z);
238	}
239
240	// Helper function
241	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
242	return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
243	}
244
245	void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
246	//TODO: calculate length and index, replace this function in the kernel
247	$local_start();
248	int length = dim.x * dim.y * dim.z;
249	$proc procArray[length];
250	$range rx = 0 .. dim.x - 1;
251	$range ry = 0 .. dim.y - 1;
252	$range rz = 0 .. dim.z - 1;
253	$domain(3) dom = ($domain(3)){rx, ry, rz};
254	$for(int x,y,z : dom){
255	uint3 id = { x, y, z };
256	int index = $dim3_index(dim, id);
257	procArray[index] = $spawn spawningFunction(id);
258	}
259	$local_end();
260	$waitall(procArray,length);
261	}
262
263
264	// CUDA Ops //
265
266	void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
267
268	$when(opState->start);
269	$cuda_memcpy_data args;
270	$message_unpack(m, &args, sizeof($cuda_memcpy_data));
271
272	if (args.kind == cudaMemcpyHostToDevice \|\| cudaMemcpyDeviceToDevice) {
273	args.dst = $reveal(args.dst);
274	}
275	if (args.kind == cudaMemcpyDeviceToHost \|\| cudaMemcpyDeviceToDevice) {
276	args.src = $reveal(args.src);
277	}
278	memcpy(args.dst, args.src, args.count);
279
280	$stream_dequeue(stream);
281	}
282
283	$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
284	$cuda_memcpy_data args;
285	$message_unpack(request, &args, sizeof($cuda_memcpy_data));
286
287	$proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);
288
289	if (!async && args.kind != cudaMemcpyDeviceToDevice) {
290	$wait(memcpyProc);
291	}
292	int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
293
294	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
295	}
296
297	$message $cuda_free($message request) {
298	void* devPtr;
299	$message_unpack(request, &devPtr, sizeof(void*));
300	free($reveal(devPtr));
301
302	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
303	}
304
305	$cuda_kernel_instance_t $create_kernel_instance($scope cudaScope, dim3 gridDim, dim3 blockDim){
306	int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z;
307	int numBlocks = (gridDim.x * gridDim.y) * gridDim.z;
308	int numThreads = threadsPerBlock * numBlocks;
309
310	$cuda_kernel_instance_t $kernel = ($cuda_kernel_instance_t)$malloc(cudaScope, sizeof($cuda_kernel_instance));
311	$mem* readSets = ($mem)$malloc(cudaScope, sizeof($mem) numThreads);
312	$mem* writeSets = ($mem)$malloc(cudaScope, sizeof($mem) numThreads);
313	//Is this meant to be the number of threads in the grid?
314	int size = numThreads;
315
316	$kernel->readSets = readSets;
317	$kernel->writeSets = writeSets;
318	$kernel->size = size;
319
320	return $kernel;
321	}
322
323	void $destroy_kernel_instance($cuda_kernel_instance_t $kernel){
324	free($kernel->readSets);
325	free($kernel->writeSets);
326	free($kernel);
327	return;
328	}
329
330	void $clear_mem_sets($cuda_kernel_instance_t k, int cur_tid) {
331	k->writeSets[cur_tid] = $mem_empty();
332	k->readSets[cur_tid] = $mem_empty();
333	}
334
335	void $clear_all_mem_sets($cuda_kernel_instance_t k){
336	for(int i = 0; i < k->size; i++)
337	$clear_mem_sets(k, i);
338	}
339
340	$atomic_f void $check_data_race($cuda_kernel_instance_t k, int cur_tid) {
341	//printf("Current id: %d\n", cur_tid);
342	$mem out_s0 = $mem_empty();
343	$mem out_s1 = $mem_empty();
344	$mem cur_mw = $write_set_pop();
345	$mem cur_mr = $read_set_pop();
346
347	// Update current R/W sets
348	k->writeSets[cur_tid] = cur_mw;
349	k->readSets[cur_tid] = cur_mr;
350
351	/*
352	printf("CHECKING DATA RACE %d [\n", cur_tid);
353	for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
354	printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
355	printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
356	}
357	printf("]\n");
358	*/
359
360	// Check data race
361	for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
362	if (tmp_tid == cur_tid) continue;
363
364	$mem tmp_mr = k->readSets[tmp_tid];
365	$mem tmp_mw = k->writeSets[tmp_tid];
366
367	$assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1),
368	"Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
369	out_s0, cur_tid, out_s1, tmp_tid);
370	$assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1),
371	"Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
372	out_s0, cur_tid, out_s1, tmp_tid);
373	$assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1),
374	"Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
375	out_s0, cur_tid, out_s1, tmp_tid);
376	}
377	// Update current R/W sets
378	//k->writeSets[cur_tid] = $mem_empty();
379	//k->readSets[cur_tid] = $mem_empty();
380	$read_set_push();
381	$write_set_push();
382	}
383
384	void $cuda_barrier($cuda_kernel_instance_t k, int kernel_id, $barrier g) {
385	/*$check_data_race(k, kernel_id);
386	We have to push a new read and write set before the barrier call to ignore it's reads and writes
387	$read_set_push();
388	$write_set_push();
389	void captured_clear_mems(){
390	$clear_all_mem_sets(k);
391	}
392	*/
393	//$barrier_call_execute(g, captured_clear_mems);
394	$local_end();
395	$barrier_call(g);
396	$local_start();
397	//$read_set_pop();
398	//$write_set_pop();
399	}
400
401
402
403	////////////////////////////////////////////
404	// CUDA API Functions (For Host-use Only) //
405	////////////////////////////////////////////
406
407	cudaError_t cudaFree(void* devPtr) {
408	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
409	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
410
411	return cudaSuccess;
412	}
413
414	cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
415	$cuda_host_memcpy(dst, src, count, kind, false);
416	return cudaSuccess;
417	}
418
419	cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
420	cudaMemcpyKind kind, cudaStream_t stream) {
421	$cuda_host_memcpy(dst, src, count, kind, true);
422	return cudaSuccess;
423	}
424
425	/*
426	cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
427	// Create new stream node in linked list
428	$cuda_stream_node_t newHead = $create_new_stream_node();
429	newHead->next = $cuda_global_context.head;
430	$cuda_global_context.head->prev = newHead;
431
432	// Update cuda context's head to be the new node we created
433	$cuda_global_context.head = newHead;
434	$cuda_global_context.numStreams++;
435
436	return cudaSuccess;
437	}
438	*/
439
440	/*
441	cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
442	stream = $default_stream_if_null(stream);
443	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
444	$when(stream->head == NULL) return cudaSuccess;
445	}
446	*/
447
448	/*
449	cudaError_t cudaStreamDestroy(cudaStream_t stream) {
450	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
451	$assert(stream->alive, "Attempt to destroy an already destroyed stream");
452	$destroy_stream_node(stream->containingNode);
453	return cudaSuccess;
454	}
455	*/
456
457	/*
458	cudaError_t cudaDeviceSynchronize() {
459	$proc* opsToWaitOn;
460	int numOps = 0;
461
462	$atomic {
463	opsToWaitOn = ($proc) malloc(sizeof($proc) $cuda_global_context.numStreams);
464
465	for ($cuda_stream_node_t node = $cuda_global_context.head;
466	node != NULL;
467	node = node->next) {
468	if (node->stream->tail != NULL) {
469	opsToWaitOn[numOps] = node->stream->tail->opState->op;
470	numOps++;
471	}
472	}
473	}
474	$waitall(opsToWaitOn, numOps);
475
476	return cudaSuccess;
477	}
478	*/
479
480	////////////////
481	// Warp stuff //
482	////////////////
483
484	// TODO: Handle thread termination!
485
486	$input int warpSize = 32;
487
488	typedef struct $cuda_warp* $cuda_warp_t;
489	typedef struct $cuda_warp {
490	int size;
491	int num_alive;
492	_Bool alive[];
493	int num_in_barrier;
494	_Bool in_barrier[];
495	int reductionLane;
496	$gcomm warp_gcomm;
497	} $cuda_warp;
498
499	$cuda_warp_t $create_cuda_warp($scope warpScope, int size) {
500	$cuda_warp_t newWarp = ($cuda_warp_t) malloc(sizeof($cuda_warp));
501	newWarp->size = size;
502	newWarp->num_alive = size;
503	newWarp->alive = (_Bool[size])$lambda(int i) $true;
504	newWarp->num_in_barrier = 0;
505	newWarp->in_barrier = (_Bool[size])$lambda(int i) $false;
506	newWarp->reductionLane = -1;
507	newWarp->warp_gcomm = $gcomm_create(warpScope, size);
508
509	return newWarp;
510	}
511
512	void $destroy_cuda_warp($cuda_warp_t warp) {
513	$assert(warp != NULL, "Attempt to destroy a NULL warp");
514
515	$gcomm_destroy(warp->warp_gcomm, NULL);
516	free(warp);
517	}
518
519	/*@ depends_on \access(warp);
520	@ executes_when \true;
521	@ */
522	$atomic_f void $cuda_warp_barrier_update($cuda_warp_t warp) {
523	if (warp->num_in_barrier == warp->num_alive) {
524	warp->num_in_barrier = 0;
525	for (int i = 0; i < warp->size; i++) {
526	warp->in_barrier[i] = $false;
527	}
528	}
529	}
530
531	typedef struct $cuda_lane* $cuda_lane_t;
532	typedef struct $cuda_lane {
533	int laneID;
534	$cuda_warp_t warp;
535	$comm lane_comm;
536	} $cuda_lane;
537
538	$cuda_lane_t $create_cuda_lane($scope laneScope, $cuda_warp_t warp, int laneID) {
539	$assert(warp != NULL, "Attempt to create lane from NULL warp");
540
541	$cuda_lane_t newLane = ($cuda_lane_t) malloc(sizeof($cuda_lane));
542	newLane->laneID = laneID;
543	newLane->warp = warp;
544	newLane->lane_comm = $comm_create(laneScope, warp->warp_gcomm, laneID);
545
546	return newLane;
547	};
548
549	/*@ depends_on \access(lane);
550	@ executes_when \true;
551	@ */
552	$atomic_f void $destroy_cuda_lane($cuda_lane_t lane) {
553	$assert(lane != NULL, "Attempt to destroy NULL lane");
554
555	lane->warp->alive[lane->laneID] = $false;
556	lane->warp->num_alive--;
557	$cuda_warp_barrier_update(lane->warp);
558	$comm_destroy(lane->lane_comm);
559	free(lane);
560	}
561
562	/*@ depends_on \access(lane);
563	@ executes_when \true;
564	@ */
565	$atomic_f void $cuda_warp_barrier_enter($cuda_lane_t lane) {
566	$cuda_warp_t warp = lane->warp;
567	int place = lane->laneID;
568	$assert(!warp->in_barrier[place]);
569
570	warp->in_barrier[place] = $true;
571	warp->num_in_barrier++;
572	$cuda_warp_barrier_update(warp);
573	}
574
575	/*@ depends_on \access(lane);
576	@ */
577	$atomic_f void $cuda_warp_barrier_exit($cuda_lane_t lane) {
578	$when(!lane->warp->in_barrier[lane->laneID]);
579	}
580
581	void $cuda_warp_barrier_call($cuda_lane_t lane) {
582	$cuda_warp_barrier_enter(lane);
583	$cuda_warp_barrier_exit(lane);
584	}
585
586	typedef enum {
587	$CUDA_WARP_TAG_shfl_sync,
588	$CUDA_WARP_TAG_shfl_up_sync,
589	$CUDA_WARP_TAG_shfl_down_sync,
590	$CUDA_WARP_TAG_shfl_xor_sync,
591	$CUDA_WARP_TAG_all_sync,
592	$CUDA_WARP_TAG_any_sync,
593	$CUDA_WARP_TAG_ballot_sync
594	} $cuda_warp_tag;
595
596	#define $GET_ARG_1(_1, ...) _1
597	#define $GET_ARG_2(_1, _2, ...) _2
598	#define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0)
599
600	#define __shfl_sync(mask, var, ...) \
601	_Generic(var, \
602	default: $cuda__shfl_sync_int, \
603	unsigned int: $cuda__shfl_sync_uint, \
604	long: $cuda__shfl_sync_long, \
605	unsigned long: $cuda__shfl_sync_ulong, \
606	long long: $cuda__shfl_sync_ll, \
607	unsigned long long: $cuda__shfl_sync_ull, \
608	float: $cuda__shfl_sync_float,\
609	double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
610
611	#define __shfl_up_sync(mask, var, ...) \
612	_Generic(var, \
613	default: $cuda__shfl_up_sync_int, \
614	unsigned int: $cuda__shfl_up_sync_uint, \
615	long: $cuda__shfl_up_sync_long, \
616	unsigned long: $cuda__shfl_up_sync_ulong, \
617	long long: $cuda__shfl_up_sync_ll, \
618	unsigned long long: $cuda__shfl_up_sync_ull, \
619	float: $cuda__shfl_up_sync_float, \
620	double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
621
622	#define __shfl_down_sync(mask, var, ...) \
623	_Generic(var, \
624	default: $cuda__shfl_down_sync_int, \
625	unsigned int: $cuda__shfl_down_sync_uint, \
626	long: $cuda__shfl_down_sync_long, \
627	unsigned long: $cuda__shfl_down_sync_ulong, \
628	long long: $cuda__shfl_down_sync_ll, \
629	unsigned long long: $cuda__shfl_down_sync_ull, \
630	float: $cuda__shfl_down_sync_float, \
631	double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
632
633	#define __shfl_xor_sync(mask, var, ...) \
634	_Generic(var, \
635	default: $cuda__shfl_xor_sync_int, \
636	unsigned int: $cuda__shfl_xor_sync_uint, \
637	long: $cuda__shfl_xor_sync_long, \
638	unsigned long: $cuda__shfl_xor_sync_ulong, \
639	long long: $cuda__shfl_xor_sync_ll, \
640	unsigned long long: $cuda__shfl_xor_sync_ull, \
641	float: $cuda__shfl_xor_sync_float, \
642	double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
643
644	#define $CUDA_GENERIC_SHFL_BODY() \
645	$assert (width <= warpSize); \
646	for (int v = width; v > 1; v /= 2) { \
647	$assert(v % 2 == 0); \
648	} \
649	\
650	int requestLane; \
651	switch(tag) { \
652	case $CUDA_WARP_TAG_shfl_sync: \
653	requestLane = lane->laneID/width + laneParam % width; \
654	break; \
655	case $CUDA_WARP_TAG_shfl_up_sync: \
656	requestLane = lane->laneID - laneParam; \
657	break; \
658	case $CUDA_WARP_TAG_shfl_down_sync: \
659	requestLane = lane->laneID + laneParam; \
660	break; \
661	case $CUDA_WARP_TAG_shfl_xor_sync: \
662	requestLane = lane->laneID ^ laneParam; \
663	break; \
664	} \
665	_Bool validSrcLane = requestLane >= 0 && requestLane < lane->warp->size; \
666	if (validSrcLane) { \
667	$comm_enqueue(lane->lane_comm, $message_pack(lane->laneID, requestLane, tag, NULL, 0)); \
668	} \
669	\
670	$local_end(); \
671	$cuda_warp_barrier_call(lane); \
672	$local_start(); \
673	\
674	while ($comm_probe(lane->lane_comm, $COMM_ANY_SOURCE, tag)) { \
675	$message request = $comm_dequeue(lane->lane_comm, $COMM_ANY_SOURCE, tag); \
676	\
677	$comm_enqueue(lane->lane_comm, $message_pack(lane->laneID, $message_source(request), tag, &var, typeSize)); \
678	} \
679	\
680	$local_end(); \
681	$cuda_warp_barrier_call(lane); \
682	$local_start(); \
683	\
684	if (validSrcLane) { \
685	$message result = $comm_dequeue(lane->lane_comm, requestLane, tag); \
686	$message_unpack(result, &resultVal, typeSize); \
687	} else { \
688	$havoc(&resultVal); \
689	}
690
691	#define $CUDA_DEFINE_SHFL(NAME, T, TAG) \
692	T NAME(unsigned mask, T var, int laneParam, int width, $cuda_lane_t lane) { \
693	T resultVal; \
694	int typeSize = sizeof(T); \
695	$cuda_warp_tag tag = TAG; \
696	\
697	$CUDA_GENERIC_SHFL_BODY(); \
698	\
699	return resultVal; \
700	}
701
702	$CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync)
703	$CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync)
704	$CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync)
705	$CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync)
706	$CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync)
707	$CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync)
708	$CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync)
709	$CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync)
710
711	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync)
712	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_up_sync)
713	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync)
714	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync)
715	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync)
716	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync)
717	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync)
718	$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync)
719
720	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync)
721	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync)
722	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync)
723	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync)
724	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync)
725	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync)
726	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync)
727	$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync)
728
729	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync)
730	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync)
731	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync)
732	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync)
733	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync)
734	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync)
735	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync)
736	$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync)
737
738	#define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, $lane)
739	#define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $lane)
740	#define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $lane)
741
742	#define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \
743	$cuda_warp_t warp = lane->warp; \
744	int laneID = lane->laneID; \
745	$comm comm = lane->lane_comm; \
746	if (warp->reductionLane == -1) { \
747	warp->reductionLane = laneID; \
748	\
749	result = initialValue; \
750	for (int i = 0; i < warp->size; i++) { \
751	if (i == laneID) { \
752	operand = value; \
753	} else { \
754	$local_end(); \
755	$when(!warp->alive[i] \|\| $comm_probe(comm, i, tag)) $local_start(); \
756	\
757	if (!warp->alive[i]) { \
758	operand = initialValue; \
759	} else { \
760	$local_end(); \
761	$message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \
762	$local_start(); \
763	} \
764	} \
765	\
766	if (COND) { \
767	result = T_REDUCTION; \
768	} else { \
769	result = F_REDUCTION; \
770	} \
771	} \
772	\
773	warp->reductionLane = -1; \
774	\
775	for (int i = 0; i< warp->size; i++) { \
776	if (i != laneID && warp->alive[i]) { \
777	$comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \
778	} \
779	} \
780	} else { \
781	int reductionLane = warp->reductionLane; \
782	$comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \
783	$local_end(); \
784	$message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \
785	$local_start(); \
786	}
787
788	#define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result)
789
790	int $cuda__all_sync(unsigned mask, int value, $cuda_lane_t lane) {
791	$cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync;
792	int typeSize = sizeof(int);
793	int initialValue = 1;
794	int result, operand;
795
796	$CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0);
797
798	return result;
799	}
800
801	int $cuda__any_sync(unsigned mask, int value, $cuda_lane_t lane) {
802	$cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync;
803	int typeSize = sizeof(int);
804	int initialValue = 0;
805	int result, operand;
806
807	$CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 \|\| operand != 0, 1, 0);
808
809	return result;
810	}
811
812	unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_lane_t lane) {
813	$cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
814	int initialValue = 0;
815	int typeSize = sizeof(int);
816	unsigned result;
817	int operand;
818
819	$CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1);
820
821	return result;
822	}
823
824	/*
825	int $cuda__ballot_sync(unsigned mask, int value, $cuda_lane_t lane) {
826	$cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
827	int initialValue = 0;
828	int result;
829
830	if (warp->reductionLane == -1) {
831	warp->reductionLane = laneID;
832
833	result = initialValue;
834	for (int i = 0; i < warp->size; i++) {
835	int operand;
836	if (i == laneID) {
837	operand = value;
838	} else {
839	$local_end();
840	$when(!warp->alive[i] \|\| $comm_probe(comm, i, tag)) $local_start();
841
842	if (!warp->alive[i]) {
843	operand = initialValue;
844	} else {
845	$local_end();
846	$message_unpack($comm_dequeue(comm, i, tag), &operand, sizeof(int));
847	$local_start();
848	}
849	}
850
851	result = 2 * result + (operand == 0 ? 0 : 1);
852	}
853
854	warp->reductionLane = -1;
855
856	for (int i = 0; i< warp->size; i++) {
857	if (i != laneID && warp->alive[i]) {
858	$comm_enqueue(comm, $message_pack(laneID, i, tag, &result, sizeof(int)));
859	}
860	}
861	} else {
862	int reductionLane = warp->reductionLane;
863	$comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, sizeof(int)));
864	$local_end();
865	$message_unpack($comm_dequeue(comm, reductionLane, tag), &result, sizeof(int));
866	$local_start();
867	}
868
869	return result;
870	}
871	*/
872
873	//////////////////////////////////
874	// Generated code from kernel_1 //
875	//////////////////////////////////
876
877	typedef struct {
878	dim3 gridDim;
879	dim3 blockDim;
880	size_t $cudaMemSize;
881	cudaStream_t $cudaStream;
882	float* A;
883	const float* B;
884	float* C;
885	int numElements;
886	} $cuda_kernel_1_data;
887
888	void $cuda_reveal_kernel_1_args($cuda_kernel_1_data* args) {
889	args->A = $reveal(args->A);
890	args->B = $reveal(args->B);
891	args->C = $reveal(args->C);
892	}
893
894	void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
895	size_t $cudaMemSize, cudaStream_t $cudaStream,
896	float* A, const float* B, float* C, int numElements) {
897	$cuda_kernel_1_data args;
898	args.gridDim = gridDim;
899	args.blockDim = blockDim;
900	args.$cudaMemSize = $cudaMemSize;
901	args.$cudaStream = $cudaStream;
902	args.A = A;
903	args.B = B;
904	args.C = C;
905	args.numElements = numElements;
906
907	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
908	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
909	}
910
911	void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
912	float A, const float B, float *C, int numElements) {
913	$cuda_kernel_instance_t $kernel = $create_kernel_instance($here, gridDim, blockDim);
914	void $cuda_block(uint3 blockIdx) {
915	int $numThreads = (blockDim.x * blockDim.y) * blockDim.z;
916	int $numWarps = ($numThreads - 1)/warpSize + 1;
917	$scope $block_root = $here;
918	$gbarrier $cuda_block_barrier = $gbarrier_create($block_root, $numThreads);
919
920	$cuda_warp_t $warps[$numWarps];
921	for (int i = 0; i < $numWarps - 1; i++) {
922	$warps[i] = $create_cuda_warp($block_root, warpSize);
923	}
924	$warps[$numWarps-1] = $create_cuda_warp($block_root, (($numThreads - 1) % warpSize) + 1);
925
926	void $cuda_thread(uint3 threadIdx) {
927	$local_start();
928	int _cuda_tid = $dim3_index(blockDim, threadIdx);
929	int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
930	//$clear_mem_sets($kernel, _cuda_kid);
931	$barrier $cuda_thread_barrier = $barrier_create($here, $cuda_block_barrier, _cuda_tid);
932	$cuda_lane_t $lane = $create_cuda_lane($here, $warps[_cuda_tid / warpSize], _cuda_tid % warpSize);
933
934	//$read_set_push();
935	//$write_set_push();
936
937	// Kernel REDUCTION start
938	/*
939	int lane = threadIdx.x % warpSize;
940	int thisWarpSize = warpSize;
941	if (threadIdx.x - lane + warpSize > blockDim.x) {
942	thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
943	}
944
945	int i = blockDim.x * blockIdx.x + threadIdx.x;
946	int warpStart = i - lane;
947	printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
948	int remainingElements = numElements;
949
950	while (remainingElements > 1) {
951	//printf("%d,%d - remainingElements: %d\n", blockIdx.x, threadIdx.x, remainingElements);
952	if (remainingElements < numElements) {
953	// __syncThreads()
954	//printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x);
955
956	$cuda_barrier($kernel, _cuda_kid, $cuda_thread_barrier);
957	//printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x);
958	}
959
960	if (warpStart + 1 < remainingElements) {
961	float val = i < numElements ? A[i] : 0;
962
963	for (int offset = warpSize/2; offset > 0; offset /= 2) {
964	float tmp = __shfl_down_sync(0, val, offset);
965	if (lane + offset < thisWarpSize) {
966	val += tmp;
967	}
968	}
969
970	if (i < numElements) {
971	A[i] = val;
972	}
973	}
974
975	i *= warpSize;
976	//warpStart *= warpSize;
977	remainingElements = ((remainingElements - 1) / warpSize) + 1;
978	}
979
980	if (i == 0) {
981	*C = A[0];
982	}
983	// Kernel REDUCTION end
984	*/
985	// Kernel BALLOT TEST start
986
987	int i = threadIdx.x;
988	if (i < numElements) {
989	int result = __ballot_sync(~0, A[i] > 0);
990	if (i == 0) {
991	printf("Result: %d\n", result);
992	*C = 0;
993	while(result > 0) {
994	if (result % 2)
995	*C += 1;
996	result /= 2;
997	}
998	printf("done calculating result\n");
999	}
1000	}
1001	// Kernel BALLOT TEST end
1002	//$check_data_race($kernel, _cuda_kid);
1003	//$read_set_pop();
1004	//$write_set_pop();
1005	$barrier_destroy($cuda_thread_barrier);
1006	$destroy_cuda_lane($lane);
1007	$local_end();
1008	}
1009	$cuda_run_and_wait_on_procs(blockDim, $cuda_thread);
1010	$gbarrier_destroy($cuda_block_barrier);
1011
1012	for (int i = 0; i < $numWarps; i++) {
1013	$destroy_cuda_warp($warps[i]);
1014	}
1015	}
1016	$cuda_run_and_wait_on_procs(gridDim, $cuda_block);
1017	$destroy_kernel_instance($kernel);
1018	}
1019
1020	void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
1021	$when(opState->start);
1022
1023	$cuda_kernel_1_data args;
1024	$message_unpack(request, &args, sizeof($cuda_kernel_1_data));
1025	$cuda_reveal_kernel_1_args(&args);
1026
1027	$cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
1028	$stream_dequeue(cudaStream);
1029	}
1030
1031	/////////////////
1032	// CUDA "file" //
1033	/////////////////
1034
1035	void $cuda_main() {
1036
1037	// Device Variables
1038
1039	$scope $cuda_scope = $here;
1040
1041	$comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
1042	$cuda_context $cuda_global_context;
1043	cudaStream_t $cuda_default_stream;
1044
1045	// Helper function to get the default stream if passed NULL, and just returns stream otherwise
1046	// Currently unused until we support streams other than the default one.
1047	cudaStream_t $default_stream_if_null(cudaStream_t stream) {
1048	return stream == NULL ? $cuda_default_stream : stream;
1049	}
1050
1051	// Device Logic
1052
1053	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
1054	$cuda_default_stream = defaultStreamNode->stream;
1055
1056	$cuda_global_context.head = defaultStreamNode;
1057	$cuda_global_context.numStreams = 1;
1058
1059	while (true) {
1060	$message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
1061	$message response;
1062	const int tag = $message_tag(request);
1063
1064	switch(tag) {
1065	case $CUDA_TAG_SCOPE_REQUEST :
1066	response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
1067	break;
1068	case $CUDA_TAG_cudaFree :
1069	response = $cuda_free(request);
1070	break;
1071	case $CUDA_TAG_cudaMemcpy :
1072	response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
1073	break;
1074	case $CUDA_TAG_cudaMemcpyAsync :
1075	response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
1076	break;
1077	case $CUDA_TAG_LAUNCH_kernel_1 :
1078	$stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);
1079
1080	response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
1081	break;
1082	case $CUDA_TAG_TEARDOWN : {
1083	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
1084	$wait(destructor);
1085	$comm_destroy($cuda_device_comm);
1086	return;
1087	}
1088	default :
1089	$assert(false, "Unknown CUDA request");
1090	}
1091
1092	$comm_enqueue($cuda_device_comm, response);
1093	}
1094	}
1095
1096	///////////////
1097	// Host file //
1098	///////////////
1099
1100	$input int N;
1101	$assume (N > 0);
1102	$input float A[N];
1103	// Currently unused but left in to save time
1104	$input float B[N];
1105
1106	void $host_main() {
1107	int size = N * sizeof(float);
1108	int numBlocks = 1;
1109	//int numThreads = N%2 == 0? N/2 : (N+1)/2;
1110	int numThreads = warpSize;
1111
1112	float* cuda_A;
1113	// cudaMalloc((void **)&cuda_A, size);
1114	{
1115	$scope deviceScope = $cuda_host_request_device_scope();
1116	cuda_A = $hide((float*)$malloc(deviceScope, size));
1117	}
1118	cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
1119
1120	float* cuda_B;
1121	// cudaMalloc((void **)&cuda_B, size);
1122	{
1123	$scope deviceScope = $cuda_host_request_device_scope();
1124	cuda_B = $hide((float*)$malloc(deviceScope, size));
1125	}
1126	cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
1127
1128	float* cuda_C;
1129	// cudaMalloc((void **)&cuda_C, sizeof(float));
1130	{
1131	$scope deviceScope = $cuda_host_request_device_scope();
1132	cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float)));
1133	}
1134
1135	dim3 gridDim = {numBlocks, 1, 1};
1136	dim3 blockDim = {numThreads, 1, 1};
1137	// kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
1138	$cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
1139
1140	// Checking correctness
1141	float* C = (float *)malloc(size);
1142
1143	cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
1144
1145	// REDUCTION ASSERTION
1146	/*
1147	float sum = 0;
1148	for(int i = 0; i < N; i++)
1149	sum += A[i];
1150
1151	$assert(*C == sum);
1152	*/
1153	// BALLOT ASSERTION
1154	float count = 0;
1155	for (int i = 0; i < N; i++) {
1156	if (A[i] > 0)
1157	count++;
1158	}
1159	$assert(*C == count);
1160
1161	free(C);
1162
1163	cudaFree(cuda_A);
1164	cudaFree(cuda_B);
1165	cudaFree(cuda_C);
1166
1167	}
1168
1169	int main() {
1170	$proc host = $spawn $host_main();
1171	$proc cuda = $spawn $cuda_main();
1172	$wait(host);
1173	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
1174	$comm_destroy($cuda_host_comm);
1175	$wait(cuda);
1176	$gcomm_destroy($cuda_gcomm, NULL);
1177	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format