| 1 | #include <concurrency.cvh>
|
|---|
| 2 | #include <comm.cvh>
|
|---|
| 3 | #include <stdlib.h>
|
|---|
| 4 | #include <stdio.h>
|
|---|
| 5 | #include <stdbool.h>
|
|---|
| 6 | #include <string.h>
|
|---|
| 7 | #include <mem.cvh>
|
|---|
| 8 | #pragma CIVL ACSL
|
|---|
| 9 |
|
|---|
| 10 | ///////////
|
|---|
| 11 | // Types //
|
|---|
| 12 | ///////////
|
|---|
| 13 |
|
|---|
/* CUDA runtime error codes.  Only cudaSuccess is modeled here; the
 * verification model assumes all API calls succeed. */
enum cudaError {
  cudaSuccess
};
typedef enum cudaError cudaError_t;

//typedef $cuda_memcpy_kind cudaMemcpyKind;

/* Three unsigned components; used for grid and block dimensions
 * (gridDim / blockDim). */
typedef struct {
  unsigned int x, y, z;
} dim3;

/* Used to represent a location in a three dimensional grid
 * (threadIdx / blockIdx). */
typedef struct {
  unsigned int x, y, z;
} uint3;

/* Direction of a cudaMemcpy transfer, mirroring the CUDA runtime
 * enumeration of the same name. */
typedef enum {
  cudaMemcpyHostToHost,
  cudaMemcpyHostToDevice,
  cudaMemcpyDeviceToHost,
  cudaMemcpyDeviceToDevice,
  cudaMemcpyDefault
} cudaMemcpyKind;
|
|---|
| 38 |
|
|---|
/* State of one asynchronous CUDA operation: `start` records whether
 * the operation has begun executing and `op` is the CIVL process
 * running it. */
typedef struct $cuda_op_state* $cuda_op_state_t;
typedef struct $cuda_op_state {
  _Bool start;
  $proc op;
} $cuda_op_state;

/* Singly-linked list node holding one operation state; streams queue
 * their operations with these nodes. */
typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
typedef struct $cuda_op_state_node {
  $cuda_op_state_t opState;
  $cuda_op_state_node_t next;
} $cuda_op_state_node;

/* A CUDA stream: a FIFO queue of operations from `head` to `tail`,
 * a count of queued operations, a back-pointer to the list node that
 * owns this stream inside its context, and a liveness flag. */
typedef struct $cuda_stream_node* $cuda_stream_node_t;
typedef struct $cuda_stream* $cuda_stream_t;
typedef $cuda_stream_t cudaStream_t;
typedef struct $cuda_stream {
  $cuda_op_state_node_t head;
  $cuda_op_state_node_t tail;
  int numOps;
  $cuda_stream_node_t containingNode;
  _Bool alive;
} $cuda_stream;

/* Doubly-linked list node in a context's list of streams. */
typedef struct $cuda_stream_node{
  cudaStream_t stream;
  $cuda_stream_node_t prev;
  $cuda_stream_node_t next;
} $cuda_stream_node;

/* A CUDA context: the list of streams it owns plus their count. */
typedef struct $cuda_context* $cuda_context_t;
typedef struct $cuda_context {
  $cuda_stream_node_t head;
  int numStreams;
} $cuda_context;

/* The sets of memory locations read and written by one thread during
 * the current synchronization region; the basic unit of the data-race
 * checks below. */
typedef struct $cuda_mem_set* $cuda_mem_set_t;
typedef struct $cuda_mem_set {
  $mem reads;
  $mem writes;
} $cuda_mem_set;
|
|---|
| 79 |
|
|---|
| 80 | void $cuda_mem_set_clear($cuda_mem_set_t memSet) {
|
|---|
| 81 | $mem emptyset = $mem_empty();
|
|---|
| 82 | memSet->reads = emptyset;
|
|---|
| 83 | memSet->writes = emptyset;
|
|---|
| 84 | }
|
|---|
| 85 |
|
|---|
| 86 | $cuda_mem_set_t $cuda_mem_set_create($scope scope) {
|
|---|
| 87 | $cuda_mem_set_t newMemSet = ($cuda_mem_set_t) $malloc(scope, sizeof($cuda_mem_set));
|
|---|
| 88 |
|
|---|
| 89 | $cuda_mem_set_clear(newMemSet);
|
|---|
| 90 |
|
|---|
| 91 | return newMemSet;
|
|---|
| 92 | }
|
|---|
| 93 |
|
|---|
/* Deallocates a mem set previously created by $cuda_mem_set_create. */
void $cuda_mem_set_destroy($cuda_mem_set_t memSet) {
  free(memSet);
}
|
|---|
| 97 |
|
|---|
| 98 | void $cuda_mem_set_add($cuda_mem_set_t memSet, $mem reads, $mem writes) {
|
|---|
| 99 | memSet->reads = $mem_union(memSet->reads, reads);
|
|---|
| 100 | memSet->writes = $mem_union(memSet->writes, writes);
|
|---|
| 101 | }
|
|---|
| 102 |
|
|---|
/* Asserts that the memory footprints of two threads are race-free:
 * m1's reads must not intersect m2's writes, m1's writes must not
 * intersect m2's reads, and the two write sets must be disjoint.
 * Read/read overlap is permitted.  The (b,w,l) triples identify each
 * thread as <block, warp, lane> in the diagnostic; on a violation,
 * $mem_no_intersect stores a witness location from each set into
 * out1/out2 for the message. */
void $cuda_check_mems_disjoint($cuda_mem_set_t m1, int b1, int w1, int l1,
                               $cuda_mem_set_t m2, int b2, int w2, int l2) {
  $mem out1 = $mem_empty();
  $mem out2 = $mem_empty();

  // read/write conflict
  $assert($mem_no_intersect(m1->reads, m2->writes, &out1, &out2),
      "Data-race detected: %p read by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
      out1, b1, w1, l1, out2, b2, w2, l2);
  // write/read conflict
  $assert($mem_no_intersect(m1->writes, m2->reads, &out1, &out2),
      "Data-race detected: %p written by thread <%d, %d, %d> intersects %p read by thread <%d, %d, %d>\n",
      out1, b1, w1, l1, out2, b2, w2, l2);
  // write/write conflict
  $assert($mem_no_intersect(m1->writes, m2->writes, &out1, &out2),
      "Data-race detected: %p written by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
      out1, b1, w1, l1, out2, b2, w2, l2);
}
|
|---|
| 118 |
|
|---|
/* Identifies which warp-level collective operation the warp is
 * currently executing; $CUDA_WARP_TAG_EMPTY means none.  Used to
 * detect divergent collectives within one warp. */
typedef enum {
  $CUDA_WARP_TAG_EMPTY,
  $CUDA_WARP_TAG_warpsync,
  $CUDA_WARP_TAG_shfl_sync,
  $CUDA_WARP_TAG_shfl_up_sync,
  $CUDA_WARP_TAG_shfl_down_sync,
  $CUDA_WARP_TAG_shfl_xor_sync,
  $CUDA_WARP_TAG_all_sync,
  $CUDA_WARP_TAG_any_sync,
  $CUDA_WARP_TAG_ballot_sync
} $cuda_warp_tag;

// Symbolic model parameter: number of lanes per warp (default 32).
$input int warpSize = 32;
|
|---|
| 132 |
|
|---|
/* Shared bookkeeping for one warp of `size` lanes.
 *
 * memSets is a size x size matrix: memSets[i][i] is lane i's own
 * current read/write footprint; memSets[i][j] (i != j) accumulates
 * lane j's footprint on behalf of lane i while lane i is "patching"
 * (see $cuda_update_patches / $cuda_end_mem_patching).
 *
 * NOTE(review): the incomplete array fields (e.g. `_Bool alive[]`)
 * appear to be a CIVL-C extension, assigned whole-array values with
 * $lambda in $create_cuda_warp_data — not legal ISO C; confirm
 * against the CIVL language manual. */
typedef struct $cuda_warp_data* $cuda_warp_data_t;
typedef struct $cuda_warp_data {
  int size;
  $cuda_mem_set_t** memSets;
  _Bool patching[];          // per-lane: lane is collecting patches
  int num_alive;             // lanes not yet terminated
  _Bool alive[];
  int num_in_barrier;        // lanes currently inside the warp barrier
  _Bool in_barrier[];
  $cuda_warp_tag currOp;     // collective in progress, or EMPTY
  int reductionLane;
  $gcomm gcomm;              // global comm used by warp shuffles
} $cuda_warp_data;
|
|---|
| 146 |
|
|---|
/*@ depends_on \access(warp);
  @ executes_when \true;
  @ */
/* Releases the warp barrier once every live lane has entered it:
 * resets the barrier counters, clears the current collective tag, and
 * empties each lane's own (diagonal) mem set.  A no-op until
 * num_in_barrier reaches num_alive, so dead lanes never block the
 * warp.  Runs atomically. */
$atomic_f void $cuda_warp_barrier_update($cuda_warp_data_t warp) {
  if (warp->num_in_barrier == warp->num_alive) {
    warp->num_in_barrier = 0;
    warp->currOp = $CUDA_WARP_TAG_EMPTY;
    for (int i = 0; i < warp->size; i++) {
      warp->in_barrier[i] = $false;
      $cuda_mem_set_clear(warp->memSets[i][i]);
    }
  }
}
|
|---|
| 160 |
|
|---|
| 161 | $cuda_warp_data_t $create_cuda_warp_data($scope warpScope, int size) {
|
|---|
| 162 | $cuda_warp_data_t newWarp = ($cuda_warp_data_t) malloc(sizeof($cuda_warp_data));
|
|---|
| 163 | newWarp->size = size;
|
|---|
| 164 | newWarp->memSets = ($cuda_mem_set_t**) $malloc(warpScope, size * sizeof($cuda_mem_set_t*));
|
|---|
| 165 | for (int i = 0; i < size; i++) {
|
|---|
| 166 | newWarp->memSets[i] = ($cuda_mem_set_t*) $malloc(warpScope, size * sizeof($cuda_mem_set_t));
|
|---|
| 167 | for (int j = 0; j < size; j++) {
|
|---|
| 168 | newWarp->memSets[i][j] = $cuda_mem_set_create(warpScope);
|
|---|
| 169 | }
|
|---|
| 170 | }
|
|---|
| 171 | newWarp->patching = (_Bool[size])$lambda(int i) $false;
|
|---|
| 172 | newWarp->num_alive = size;
|
|---|
| 173 | newWarp->alive = (_Bool[size])$lambda(int i) $true;
|
|---|
| 174 | newWarp->num_in_barrier = 0;
|
|---|
| 175 | newWarp->in_barrier = (_Bool[size])$lambda(int i) $false;
|
|---|
| 176 | newWarp->currOp = $CUDA_WARP_TAG_EMPTY;
|
|---|
| 177 | newWarp->reductionLane = -1;
|
|---|
| 178 | newWarp->gcomm = $gcomm_create(warpScope, size);
|
|---|
| 179 |
|
|---|
| 180 | return newWarp;
|
|---|
| 181 | }
|
|---|
| 182 |
|
|---|
/* Destroys a warp created by $create_cuda_warp_data: frees every
 * entry of the memSets matrix, its rows, the matrix itself, the
 * warp's gcomm, and finally the warp struct. */
void $destroy_cuda_warp_data($cuda_warp_data_t warp) {
  $assert(warp != NULL, "Attempt to destroy a NULL warp");

  for (int i = 0; i < warp->size; i++) {
    for (int j = 0; j < warp->size; j++) {
      $cuda_mem_set_destroy(warp->memSets[i][j]);
    }
    free(warp->memSets[i]);
  }
  free(warp->memSets);
  // NULL: no junk-message handler needed on destruction
  $gcomm_destroy(warp->gcomm, NULL);
  free(warp);
}
|
|---|
| 196 |
|
|---|
/* Shared bookkeeping for one thread block of `size` threads.
 *
 * memSets[t] is thread t's footprint at block scope (t = warpID *
 * warpSize + laneID).  predArray holds per-thread predicate values
 * for the __syncthreads_{count,or,and} variants; slot 0 also carries
 * the combined result back.  currBarrierID is the call-site index of
 * the barrier in progress (-1 when none), used to detect divergent
 * __syncthreads calls. */
typedef struct $cuda_block_data* $cuda_block_data_t;
typedef struct $cuda_block_data {
  int size;
  $cuda_mem_set_t* memSets;
  int predArray[];
  $gbarrier gbarrier;        // global barrier shared by all block threads
  int currBarrierID;
  int numWarps;
  $cuda_warp_data_t* warps;
} $cuda_block_data;
|
|---|
| 207 |
|
|---|
/* Creates the bookkeeping data for one block of `size` threads,
 * allocated in `scope`: a per-thread mem set, the block barrier, and
 * one warp-data object per warp.  All warps have warpSize lanes
 * except the last, which holds the remaining ((size-1) % warpSize)+1
 * threads. */
$cuda_block_data_t $create_cuda_block_data($scope scope, int size) {
  $cuda_block_data_t newBlock = ($cuda_block_data_t) $malloc(scope, sizeof($cuda_block_data));

  newBlock->size = size;
  // ceiling division: number of warps needed to cover `size` threads
  newBlock->numWarps = (size - 1)/warpSize + 1;

  newBlock->predArray = (int[size]) $lambda(int i) 0;

  newBlock->gbarrier = $gbarrier_create(scope, size);
  newBlock->currBarrierID = -1;    // no barrier in progress

  newBlock->memSets = ($cuda_mem_set_t*) $malloc(scope, size * sizeof($cuda_mem_set_t));
  for (int i = 0; i < size; i++) {
    newBlock->memSets[i] = $cuda_mem_set_create(scope);
  }
  newBlock->warps = ($cuda_warp_data_t*) $malloc(scope, newBlock->numWarps * sizeof($cuda_warp_data_t));
  for (int i = 0; i < newBlock->numWarps - 1; i++) {
    newBlock->warps[i] = $create_cuda_warp_data(scope, warpSize);
  }
  // the final warp may be partial
  int lastIndex = newBlock->numWarps - 1;
  newBlock->warps[lastIndex] = $create_cuda_warp_data(scope, ((size - 1) % warpSize) + 1);

  return newBlock;
}
|
|---|
| 232 |
|
|---|
| 233 | void $destroy_cuda_block_data($cuda_block_data_t block) {
|
|---|
| 234 | for (int i = 0; i < block->size; i++) {
|
|---|
| 235 | $cuda_mem_set_destroy(block->memSets[i]);
|
|---|
| 236 | }
|
|---|
| 237 | free(block->memSets);
|
|---|
| 238 |
|
|---|
| 239 | for (int i = 0; i < block->numWarps; i++) {
|
|---|
| 240 | $destroy_cuda_warp_data(block->warps[i]);
|
|---|
| 241 | }
|
|---|
| 242 | free(block->warps);
|
|---|
| 243 |
|
|---|
| 244 | $gbarrier_destroy(block->gbarrier);
|
|---|
| 245 |
|
|---|
| 246 | free(block);
|
|---|
| 247 | }
|
|---|
| 248 |
|
|---|
/* Shared bookkeeping for one kernel launch: `size` is the total
 * thread count over all blocks, memSets[t] is thread t's footprint at
 * kernel (grid) scope, and blocks holds one data object per block. */
typedef struct $cuda_kernel_data* $cuda_kernel_data_t;
typedef struct $cuda_kernel_data {
  int size;
  $cuda_mem_set_t* memSets;
  int numBlocks;
  $cuda_block_data_t* blocks;
} $cuda_kernel_data;
|
|---|
| 256 |
|
|---|
| 257 | $cuda_kernel_data_t $create_cuda_kernel_data($scope scope, dim3 gridDim, dim3 blockDim){
|
|---|
| 258 | int numBlocks = (gridDim.x * gridDim.y) * gridDim.z;
|
|---|
| 259 | int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z;
|
|---|
| 260 |
|
|---|
| 261 | $cuda_kernel_data_t newKernel = ($cuda_kernel_data_t)$malloc(scope, sizeof($cuda_kernel_data));
|
|---|
| 262 |
|
|---|
| 263 | newKernel->size = threadsPerBlock * numBlocks;
|
|---|
| 264 | newKernel->numBlocks = numBlocks;
|
|---|
| 265 |
|
|---|
| 266 | newKernel->memSets = ($cuda_mem_set_t*) $malloc(scope, newKernel->size * sizeof($cuda_mem_set_t));
|
|---|
| 267 | for (int i = 0; i < newKernel->size; i++) {
|
|---|
| 268 | newKernel->memSets[i] = $cuda_mem_set_create(scope);
|
|---|
| 269 | }
|
|---|
| 270 |
|
|---|
| 271 | newKernel->blocks = ($cuda_block_data_t*) $malloc(scope, newKernel->numBlocks * sizeof($cuda_block_data_t));
|
|---|
| 272 | for (int i = 0; i < newKernel->numBlocks; i++) {
|
|---|
| 273 | newKernel->blocks[i] = $create_cuda_block_data(scope, threadsPerBlock);
|
|---|
| 274 | }
|
|---|
| 275 |
|
|---|
| 276 | return newKernel;
|
|---|
| 277 | }
|
|---|
| 278 |
|
|---|
| 279 | void $destroy_cuda_kernel_data($cuda_kernel_data_t kernel){
|
|---|
| 280 | for (int i = 0; i < kernel->size; i++) {
|
|---|
| 281 | $cuda_mem_set_destroy(kernel->memSets[i]);
|
|---|
| 282 | }
|
|---|
| 283 | free(kernel->memSets);
|
|---|
| 284 |
|
|---|
| 285 | for (int i = 0; i < kernel->numBlocks; i++) {
|
|---|
| 286 | $destroy_cuda_block_data(kernel->blocks[i]);
|
|---|
| 287 | }
|
|---|
| 288 | free(kernel->blocks);
|
|---|
| 289 | free(kernel);
|
|---|
| 290 | }
|
|---|
| 291 |
|
|---|
/* Per-thread handle: back-pointers to the enclosing kernel, block and
 * warp data, the thread's <blockID, warpID, laneID> coordinates, the
 * comm endpoint used for warp shuffles, and the thread's endpoint of
 * the block barrier. */
typedef struct $cuda_thread_data* $cuda_thread_data_t;
typedef struct $cuda_thread_data {
  $cuda_kernel_data_t kernel;
  $cuda_block_data_t block;
  $cuda_warp_data_t warp;
  int blockID;
  int warpID;
  int laneID;
  $comm lane_comm;
  $barrier block_barrier;
} $cuda_thread_data;
|
|---|
| 303 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Checks this lane's footprint (the diagonal entry of the warp's
 * memSets matrix) for conflicts against every other lane's diagonal
 * entry.  Runs atomically. */
$atomic_f void $cuda_check_warp_data_race($cuda_thread_data_t thread) {
  int blockID = thread->blockID;
  int warpID = thread->warpID;
  int laneID = thread->laneID;
  $cuda_mem_set_t** warpMems = thread->warp->memSets;

  for (int i = 0; i < thread->warp->size; i++) {
    if (i != laneID) {
      $cuda_check_mems_disjoint(warpMems[laneID][laneID], blockID, warpID, laneID,
                                warpMems[i][i], blockID, warpID, i);
    }
  }
}
|
|---|
| 320 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Checks this thread's block-level footprint against every thread in
 * every OTHER warp of the same block (same-warp conflicts are handled
 * by $cuda_check_warp_data_race).  Runs atomically. */
$atomic_f void $cuda_check_block_data_race($cuda_thread_data_t thread) {
  int blockID = thread->blockID;
  int warpID = thread->warpID;
  int laneID = thread->laneID;
  int indexInBlock = warpID * warpSize + laneID;
  $cuda_mem_set_t* blockMems = thread->block->memSets;

  for (int i = 0; i < thread->block->numWarps; i++) {
    if (i != warpID) {
      // the last warp may be partial, so use its actual size
      int currWarpSize = thread->block->warps[i]->size;
      for (int j = 0; j < currWarpSize; j++) {
        $cuda_check_mems_disjoint(blockMems[indexInBlock], blockID, warpID, laneID,
                                  blockMems[i * warpSize + j], blockID, i, j);
      }
    }
  }
}
|
|---|
| 341 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Checks this thread's kernel-level footprint against every thread in
 * every OTHER block of the grid (same-block conflicts are handled by
 * $cuda_check_block_data_race).  Runs atomically. */
$atomic_f void $cuda_check_kernel_data_race($cuda_thread_data_t thread) {
  int blockID = thread->blockID;
  int warpID = thread->warpID;
  int laneID = thread->laneID;
  int threadsPerBlock = thread->block->size;
  // flat index of this thread within the whole grid
  int indexInKernel = blockID * threadsPerBlock + warpID * warpSize + laneID;
  $cuda_mem_set_t* kernelMems = thread->kernel->memSets;
  //printf("%d, %d, %d - Checking kernel data race. km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, indexInKernel);

  for (int i = 0; i < thread->kernel->numBlocks; i++) {
    if (i != blockID) {
      $cuda_block_data_t block = thread->kernel->blocks[i];
      for (int j = 0; j < block->numWarps; j++) {
        int currWarpSize = block->warps[j]->size;
        for (int k = 0; k < currWarpSize; k++) {
          //printf("%d, %d, %d - Checking kdr against <%d,%d,%d> with km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, i,j,k, i * threadsPerBlock + j * warpSize + k);
          $cuda_check_mems_disjoint(kernelMems[indexInKernel], blockID, warpID, laneID,
                                    kernelMems[i * threadsPerBlock + j * warpSize + k], i, j, k);
        }
      }
    }
  }
}
|
|---|
| 368 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Marks this lane as "patching": until $cuda_end_mem_patching, other
 * lanes will mirror their footprints into this lane's row of the
 * warp's memSets matrix (see $cuda_update_patches).  Runs atomically. */
$atomic_f void $cuda_start_mem_patching($cuda_thread_data_t thread) {
  thread->warp->patching[thread->laneID] = $true;
}
|
|---|
| 375 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* For each lane i currently patching, copies this lane's own
 * footprint (memSets[lane][lane]) into the patch slot memSets[i][lane]
 * that lane i will later check in $cuda_end_mem_patching.  Runs
 * atomically. */
$atomic_f void $cuda_update_patches($cuda_thread_data_t thread) {
  $cuda_warp_data_t warp = thread->warp;
  int lane = thread->laneID;

  for (int i = 0; i < warp->size; i++) {
    if (warp->patching[i]) {
      $cuda_mem_set_add(warp->memSets[i][lane], warp->memSets[lane][lane]->reads, warp->memSets[lane][lane]->writes);
    }
  }
}
|
|---|
| 389 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Ends this lane's patching phase (no-op if not patching): clears the
 * patching flag, then checks the lane's own footprint against each
 * patched footprint collected from the other lanes in this lane's row
 * of the memSets matrix, clearing each patch slot after the check.
 * Runs atomically. */
$atomic_f void $cuda_end_mem_patching($cuda_thread_data_t thread) {
  if (thread->warp->patching[thread->laneID]) {
    int size = thread->warp->size;
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    // this lane's row: [laneID] is its own set, [i] is lane i's patch
    $cuda_mem_set_t* threadMems = thread->warp->memSets[laneID];

    thread->warp->patching[laneID] = $false;

    for (int i = 0; i < size; i++) {
      if (i != laneID) {
        $cuda_check_mems_disjoint(threadMems[laneID], blockID, warpID, laneID,
                                  threadMems[i], blockID, warpID, i);
        $cuda_mem_set_clear(threadMems[i]);
      }
    }
  }
}
|
|---|
| 412 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Begins a new synchronization region for this thread: first donates
 * the just-finished footprint to any patching lanes, then clears the
 * thread's warp-, block- and kernel-level mem sets and pushes fresh
 * read/write tracking sets.  Runs atomically. */
$atomic_f void $cuda_start_kernel_region($cuda_thread_data_t thread) {
  int lane = thread->laneID;
  //printf("%d, %d, %d - Starting region.\n\n", thread->blockID, thread->warpID, thread->laneID);
  $cuda_update_patches(thread);
  $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
  $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + thread->laneID]);
  $cuda_mem_set_clear(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID]);
  //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
  //printf("%d, %d, %d - kernel mem set after clear:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
  $read_set_push();
  $write_set_push();
}
|
|---|
| 428 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Completes a warp-level region: pops the read/write sets tracked
 * since the region began, folds them into this lane's warp-, block-
 * and kernel-level mem sets, finishes any pending patching phase, and
 * checks for intra-warp races.  Runs atomically. */
$atomic_f void $cuda_complete_warp_region($cuda_thread_data_t thread) {
  $mem reads = $read_set_pop();
  $mem writes = $write_set_pop();
  int lane = thread->laneID;
  $cuda_mem_set_t laneMem = thread->warp->memSets[lane][lane];
  //printf("%d, %d, %d - completing warp region.\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, reads, writes);

  $cuda_mem_set_add(laneMem, reads, writes);
  $cuda_mem_set_add(thread->block->memSets[thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
  $cuda_mem_set_add(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
  //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
  //printf("%d, %d, %d - kernel mem set after add:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
  $cuda_end_mem_patching(thread);
  $cuda_check_warp_data_race(thread);
}
|
|---|
| 447 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Completes a block-level region: finishes the warp-level region,
 * then checks for races across the other warps of the block.  Runs
 * atomically. */
$atomic_f void $cuda_complete_block_region($cuda_thread_data_t thread) {
  //printf("%d, %d, %d - completing block region\n\n", thread->blockID, thread->warpID, thread->laneID);
  $cuda_complete_warp_region(thread);
  $cuda_check_block_data_race(thread);
}
|
|---|
| 456 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Completes a kernel-level region: finishes the block-level region,
 * then checks for races against every other block of the grid.  Runs
 * atomically. */
$atomic_f void $cuda_complete_kernel_region($cuda_thread_data_t thread) {
  //printf("%d, %d, %d - completing kernel region\n\n", thread->blockID, thread->warpID, thread->laneID);
  $cuda_complete_block_region(thread);
  $cuda_check_kernel_data_race(thread);
}
|
|---|
| 465 |
|
|---|
/* Creates the per-thread handle for the thread at
 * <blockID, warpID, laneID>, allocated in `scope`: wires up the
 * back-pointers into the kernel's data, creates the thread's warp
 * comm endpoint and block barrier endpoint, and pushes the initial
 * read/write tracking sets so the first region starts recording
 * immediately. */
$cuda_thread_data_t $create_cuda_thread_data($scope scope, $cuda_kernel_data_t kernel, int blockID, int warpID, int laneID) {
  $cuda_thread_data_t newThread = ($cuda_thread_data_t) $malloc(scope, sizeof($cuda_thread_data));

  newThread->kernel = kernel;
  newThread->block = kernel->blocks[blockID];
  newThread->warp = newThread->block->warps[warpID];
  newThread->blockID = blockID;
  newThread->warpID = warpID;
  newThread->laneID = laneID;
  newThread->lane_comm = $comm_create(scope, kernel->blocks[blockID]->warps[warpID]->gcomm, laneID);
  newThread->block_barrier = $barrier_create(scope, kernel->blocks[blockID]->gbarrier, warpID * warpSize + laneID);

  $read_set_push();
  $write_set_push();

  return newThread;
}
|
|---|
| 483 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @ */
/* Tears down a thread at kernel exit: completes its final region
 * (performing the last race checks), marks its lane dead and runs the
 * warp barrier update so surviving lanes are not blocked waiting for
 * it, then destroys its barrier and comm endpoints and frees the
 * handle.  Runs atomically. */
$atomic_f void $destroy_cuda_thread_data($cuda_thread_data_t thread) {
  $assert(thread != NULL, "Attempt to destroy NULL cuda thread");
  $cuda_complete_kernel_region(thread);

  $cuda_warp_data_t warp = thread->warp;
  warp->alive[thread->laneID] = $false;
  warp->num_alive--;
  // release lanes already waiting in the warp barrier, if any
  $cuda_warp_barrier_update(warp);

  $barrier_destroy(thread->block_barrier);

  $comm_destroy(thread->lane_comm);
  free(thread);
}
|
|---|
| 501 |
|
|---|
| 502 | /*@ depends_on \access(thread);
|
|---|
| 503 | @ executes_when \true;
|
|---|
| 504 | @ */
|
|---|
| 505 | $atomic_f void $cuda_warp_barrier_enter($cuda_thread_data_t thread, $cuda_warp_tag tag) {
|
|---|
| 506 | $cuda_warp_data_t warp = thread->warp;
|
|---|
| 507 | $assert(!warp->in_barrier[thread->laneID]);
|
|---|
| 508 | $assert(warp->currOp == tag || warp->currOp == $CUDA_WARP_TAG_EMPTY);
|
|---|
| 509 |
|
|---|
| 510 | warp->in_barrier[thread->laneID] = $true;
|
|---|
| 511 | warp->currOp = tag;
|
|---|
| 512 | warp->num_in_barrier++;
|
|---|
| 513 | $cuda_warp_barrier_update(warp);
|
|---|
| 514 | }
|
|---|
| 515 |
|
|---|
// Seems this needs to be atomic to work. Why?
/*@ depends_on \access(thread);
  @*/
/* Blocks until this lane's in_barrier flag is cleared, which happens
 * in $cuda_warp_barrier_update once every live lane has entered. */
$atomic_f void $cuda_warp_barrier_exit($cuda_thread_data_t thread) {
  $when(!thread->warp->in_barrier[thread->laneID]);
}
|
|---|
| 522 |
|
|---|
/* Full warp barrier for collective `tag`: enter, then wait until all
 * live lanes of the warp have entered. */
void $cuda_warp_barrier_call($cuda_thread_data_t thread, $cuda_warp_tag tag) {
  $cuda_warp_barrier_enter(thread, tag);
  $cuda_warp_barrier_exit(thread);
}
|
|---|
| 527 |
|
|---|
/* Model of CUDA __syncthreads for the thread described by `thread`.
 * barrierIndex identifies the call site; all threads of the block
 * must hit the same call site, otherwise the divergence assertion
 * fires.  Completes the current block region (performing race
 * checks), synchronizes on the block barrier, clears the warp- and
 * block-level mem sets for the next region, and pushes fresh
 * read/write tracking sets.  The second barrier ensures the
 * currBarrierID reset by thread 0 is not observed early. */
void $cuda__syncthreads($cuda_thread_data_t thread, int barrierIndex) {
  $cuda_complete_block_region(thread);
  int indexInBlock = thread->warpID * warpSize + thread->laneID;

  // first arrival claims the barrier slot; later arrivals must match
  if (thread->block->currBarrierID == -1) {
    thread->block->currBarrierID = barrierIndex;
  }
  $assert(thread->block->currBarrierID == barrierIndex,
      "Divergent calls to __syncthreads.");

  $local_end();
  $barrier_call(thread->block_barrier);
  $local_start();
  int lane = thread->laneID;

  $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
  $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + lane]);

  // thread 0 resets the barrier slot for the next __syncthreads
  if (indexInBlock == 0) {
    thread->block->currBarrierID = -1;
  }

  $local_end();
  $barrier_call(thread->block_barrier);
  $local_start();

  $read_set_push();
  $write_set_push();
}
|
|---|
| 557 |
|
|---|
/* Generates a __syncthreads variant NAME(predicate, thread,
 * barrierIndex) that synchronizes the block like $cuda__syncthreads
 * and additionally reduces the per-thread predicate values: each
 * thread stores its predicate in predArray, thread 0 folds the array
 * into `result` starting from INIT using the REDUCTION statement
 * (with loop index i), publishes it through predArray[0], and every
 * thread returns the combined value.  Three barriers separate the
 * publish / combine / read phases.  No comments inside the macro
 * body: its lines are backslash-continued. */
#define $CUDA_DEFINE_SYNCTHREADS_VARIANT(NAME, INIT, REDUCTION) \
int NAME(int predicate, $cuda_thread_data_t thread, int barrierIndex) { \
  $cuda_complete_block_region(thread); \
  int indexInBlock = thread->warpID * warpSize + thread->laneID; \
  \
  if (thread->block->currBarrierID == -1) { \
    thread->block->currBarrierID = barrierIndex; \
  } \
  $assert(thread->block->currBarrierID == barrierIndex, \
      "Divergent calls to NAME."); \
  \
  if (predicate) { \
    thread->block->predArray[indexInBlock] = 1; \
  } else { \
    thread->block->predArray[indexInBlock] = 0; \
  } \
  \
  $local_end(); \
  $barrier_call(thread->block_barrier); \
  $local_start(); \
  int lane = thread->laneID; \
  \
  $cuda_mem_set_clear(thread->warp->memSets[lane][lane]); \
  $cuda_mem_set_clear(thread->block->memSets[indexInBlock]); \
  \
  if (indexInBlock == 0) { \
    int result = INIT; \
    for (int i = 0; i < thread->block->size; i++) { \
      REDUCTION; \
    } \
    thread->block->predArray[0] = result; \
    thread->block->currBarrierID = -1; \
  } \
  \
  $local_end(); \
  $barrier_call(thread->block_barrier); \
  $local_start(); \
  int result = thread->block->predArray[0]; \
  \
  $local_end(); \
  $barrier_call(thread->block_barrier); \
  $local_start(); \
  \
  $read_set_push(); \
  $write_set_push(); \
  return result; \
}
|
|---|
| 605 |
|
|---|
/* Instantiate the three predicate-reducing __syncthreads variants:
 * _count sums the predicates; _or is 1 if any predicate is nonzero;
 * _and is 0 if any predicate is zero.  (In _or/_and the `continue`
 * merely skips to the next iteration once the result is decided.) */
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_count, 0, result += thread->block->predArray[i])
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_or, 0, if (thread->block->predArray[i]) {
    result = 1;
    continue;
  })
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_and, 1, if (!thread->block->predArray[i]) {
    result = 0;
    continue;
  })
|
|---|
| 615 |
|
|---|
/* Variadic-argument selectors used to supply default parameters to
 * the shuffle macros below. */
#define $GET_ARG_1(_1, ...) _1
#define $GET_ARG_2(_1, _2, ...) _2

/* Model of CUDA __syncwarp: completes the warp-level region (with
 * race checks), runs the warp barrier tagged as warpsync, then starts
 * fresh read/write tracking sets. */
#define __syncwarp() $cuda__syncwarp($thread)
void $cuda__syncwarp($cuda_thread_data_t thread) {
  $cuda_complete_warp_region(thread);
  $local_end();
  $cuda_warp_barrier_call(thread, $CUDA_WARP_TAG_warpsync);
  $local_start();
  $read_set_push();
  $write_set_push();
}
|
|---|
| 628 |
|
|---|
/* Expands the trailing shuffle arguments to exactly two values: the
 * lane parameter (srcLane/delta/laneMask) and the subwarp width,
 * defaulting the width to warpSize when omitted. */
#define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0)

/* The four warp shuffle intrinsics.  Each dispatches on the static
 * type of `var` to a type-specific model function, appends the
 * (laneParam, width) pair with defaults applied, and passes the
 * implicit $thread handle. */
#define __shfl_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_sync_int, \
    unsigned int: $cuda__shfl_sync_uint, \
    long: $cuda__shfl_sync_long, \
    unsigned long: $cuda__shfl_sync_ulong, \
    long long: $cuda__shfl_sync_ll, \
    unsigned long long: $cuda__shfl_sync_ull, \
    float: $cuda__shfl_sync_float, \
    double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_up_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_up_sync_int, \
    unsigned int: $cuda__shfl_up_sync_uint, \
    long: $cuda__shfl_up_sync_long, \
    unsigned long: $cuda__shfl_up_sync_ulong, \
    long long: $cuda__shfl_up_sync_ll, \
    unsigned long long: $cuda__shfl_up_sync_ull, \
    float: $cuda__shfl_up_sync_float, \
    double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_down_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_down_sync_int, \
    unsigned int: $cuda__shfl_down_sync_uint, \
    long: $cuda__shfl_down_sync_long, \
    unsigned long: $cuda__shfl_down_sync_ulong, \
    long long: $cuda__shfl_down_sync_ll, \
    unsigned long long: $cuda__shfl_down_sync_ull, \
    float: $cuda__shfl_down_sync_float, \
    double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_xor_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_xor_sync_int, \
    unsigned int: $cuda__shfl_xor_sync_uint, \
    long: $cuda__shfl_xor_sync_long, \
    unsigned long: $cuda__shfl_xor_sync_ulong, \
    long long: $cuda__shfl_xor_sync_ll, \
    unsigned long long: $cuda__shfl_xor_sync_ull, \
    float: $cuda__shfl_xor_sync_float, \
    double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)
|
|---|
| 674 |
|
|---|
/* Bracketing protocol for warp intrinsics: PRE pops the read/write
 * sets tracked so far, folds them into this lane's own mem set, and
 * marks the lane as patching so other lanes mirror their footprints
 * to it; POST starts fresh tracking sets for the code that follows. */
#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE() \
  $mem writes = $write_set_pop(); \
  $mem reads = $read_set_pop(); \
  $cuda_mem_set_add(thread->warp->memSets[thread->laneID][thread->laneID], \
      reads, writes); \
  thread->warp->patching[thread->laneID] = $true;

#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST() \
  $read_set_push(); \
  $write_set_push();
|
|---|
| 685 |
|
|---|
/* Shared body of all __shfl_*_sync model implementations.  Textually
 * expanded inside $CUDA_DEFINE_SHFL, so it relies on names in the caller's
 * scope: mask, var, laneParam, width, thread (parameters) and resultVal,
 * typeSize, tag (locals).  Protocol over the warp's lane communicator:
 *   1. each lane computes its source lane and posts a request to it;
 *   2. warp barrier; each lane records which lanes requested its value;
 *   3. warp barrier; each lane replies to its requesters with `var`;
 *   4. warp barrier; each lane unpacks its reply into resultVal.
 * A lane whose computed source lane is out of range receives a havoced
 * (arbitrary) value, modeling CUDA's unspecified result in that case. */
#define $CUDA_GENERIC_SHFL_BODY() \
    $assert (width <= warpSize); \
    /* width must be a power of two */ \
    for (int v = width; v > 1; v /= 2) { \
        $assert(v % 2 == 0); \
    } \
    \
    /* map (tag, laneParam) to the requested source lane */ \
    int requestLane; \
    switch(tag) { \
    case $CUDA_WARP_TAG_shfl_sync: \
        /* NOTE(review): CUDA's subsection start is (laneID/width)*width; */ \
        /* here the un-scaled quotient is added — confirm intended for    */ \
        /* width < warpSize (it coincides when width == warpSize).        */ \
        requestLane = thread->laneID/width + laneParam % width; \
        break; \
    case $CUDA_WARP_TAG_shfl_up_sync: \
        requestLane = thread->laneID - laneParam; \
        break; \
    case $CUDA_WARP_TAG_shfl_down_sync: \
        requestLane = thread->laneID + laneParam; \
        break; \
    case $CUDA_WARP_TAG_shfl_xor_sync: \
        requestLane = thread->laneID ^ laneParam; \
        break; \
    } \
    $cuda_warp_data_t warp = thread->warp; \
    _Bool validSrcLane = requestLane >= 0 && requestLane < warp->size; \
    if (validSrcLane) { \
        /* ask the source lane for its value (empty payload) */ \
        $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, requestLane, tag, NULL, 0)); \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    \
    /* record which lanes asked this lane for its value */ \
    _Bool requested[width] = (_Bool[width])$lambda(int i) $false; \
    /* NOTE(review): like requestLane above, subWarpStart = laneID/width  */ \
    /* is a subsection index, not its first lane — consistent with the   */ \
    /* request computation but likewise suspect for width < warpSize.    */ \
    int subWarpStart = thread->laneID/width; \
    while ($comm_probe(thread->lane_comm, $COMM_ANY_SOURCE, tag)) { \
        $message request = $comm_dequeue(thread->lane_comm, $COMM_ANY_SOURCE, tag); \
        requested[$message_source(request) - subWarpStart] = $true; \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    /* reply to every requester with this lane's var */ \
    for(int i = 0; i < width; i++) { \
        if (requested[i]) { \
            $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, i + subWarpStart, tag, &var, typeSize)); \
        } \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    if (validSrcLane) { \
        $message result = $comm_dequeue(thread->lane_comm, requestLane, tag); \
        $message_unpack(result, &resultVal, typeSize); \
    } else { \
        /* out-of-range source: result is unspecified */ \
        $havoc(&resultVal); \
    }
|
|---|
| 745 |
|
|---|
/* Defines one monomorphic shuffle implementation named NAME for element
 * type T and warp-operation tag TAG: the generic shuffle body wrapped in
 * the race-check PRE/POST protocol.  Returns the shuffled value. */
#define $CUDA_DEFINE_SHFL(NAME, T, TAG) \
    T NAME(unsigned mask, T var, int laneParam, int width, $cuda_thread_data_t thread) { \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); \
        T resultVal; \
        int typeSize = sizeof(T); \
        $cuda_warp_tag tag = TAG; \
        \
        $CUDA_GENERIC_SHFL_BODY(); \
        \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); \
        return resultVal; \
    }
|
|---|
| 758 |
|
|---|
/* Monomorphic instantiations of the shuffle implementations — one per
 * (element type, shuffle variant) pair targeted by the _Generic dispatch
 * macros above. */
$CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 794 |
|
|---|
/* Warp-vote intrinsics: forward to the model implementations below,
 * supplying the implicit per-thread handle $thread. */
#define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, $thread)
#define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $thread)
#define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $thread)
|
|---|
| 798 |
|
|---|
/* Shared body of the warp-vote reductions (__all/__any/__ballot models).
 * Textually expanded, so it relies on the caller's scope: value, thread
 * (parameters) and tag, typeSize, initialValue, result, operand (locals).
 *
 * The first lane to arrive (warp->reductionLane == -1) elects itself the
 * reducer.  It folds one contribution per lane into `result` — dead lanes
 * contribute initialValue — each step evaluating
 * COND ? T_REDUCTION : F_REDUCTION with the lane's value in `operand`,
 * then re-opens the election and broadcasts `result` to every other live
 * lane.  Every other lane sends `value` to the reducer and blocks until
 * the broadcast arrives. */
#define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \
    $cuda_warp_data_t warp = thread->warp; \
    int laneID = thread->laneID; \
    $comm comm = thread->lane_comm; \
    if (warp->reductionLane == -1) { \
        /* this lane is the reducer */ \
        warp->reductionLane = laneID; \
        \
        result = initialValue; \
        for (int i = 0; i < warp->size; i++) { \
            if (i == laneID) { \
                operand = value; \
            } else { \
                /* wait until lane i has either died or sent its value */ \
                $local_end(); \
                $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start(); \
                \
                if (!warp->alive[i]) { \
                    operand = initialValue; \
                } else { \
                    $local_end(); \
                    $message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \
                    $local_start(); \
                } \
            } \
            \
            if (COND) { \
                result = T_REDUCTION; \
            } else { \
                result = F_REDUCTION; \
            } \
        } \
        \
        /* re-open the election for the next reduction */ \
        warp->reductionLane = -1; \
        \
        /* broadcast the folded result to all other live lanes */ \
        for (int i = 0; i< warp->size; i++) { \
            if (i != laneID && warp->alive[i]) { \
                $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \
            } \
        } \
    } else { \
        /* another lane is reducing: contribute, then await the result */ \
        int reductionLane = warp->reductionLane; \
        $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \
        $local_end(); \
        $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \
        $local_start(); \
    }

/* Unconditional variant: always applies REDUCTION. */
#define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result)
|
|---|
| 846 |
|
|---|
/* Model of __all_sync(): returns 1 iff `value` is non-zero on every live
 * lane of the warp.  NOTE(review): the participation mask is accepted but
 * not consulted — all live lanes take part regardless; confirm this
 * over-approximation is intended. */
int $cuda__all_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync;
    int typeSize = sizeof(int);
    int initialValue = 1;   // identity of logical AND
    int result, operand;

    // fold: result := (result && operand) over all lanes
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
|
|---|
| 861 |
|
|---|
/* Model of __any_sync(): returns 1 iff `value` is non-zero on at least one
 * live lane of the warp.  NOTE(review): as with $cuda__all_sync, the mask
 * argument is not consulted — confirm intended. */
int $cuda__any_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync;
    int typeSize = sizeof(int);
    int initialValue = 0;   // identity of logical OR
    int result, operand;

    // fold: result := (result || operand) over all lanes
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 || operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
|
|---|
| 876 |
|
|---|
/* Model of __ballot_sync(): builds a bitmask from the lanes' predicates.
 * Each fold step shifts `result` left by one bit and appends 1 when the
 * lane's value is non-zero, 0 otherwise.
 * NOTE(review): lanes are folded in ascending order, so lane 0 ends up in
 * the MOST significant position of the accumulated word, whereas CUDA's
 * __ballot_sync places lane i at bit i — confirm intended. */
unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
    int initialValue = 0;
    int typeSize = sizeof(int);
    unsigned result;
    int operand;

    // shift-and-append: 0 bit when operand == 0, else 1 bit
    $CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
|
|---|
| 892 |
|
|---|
/////////////////////////////////////////////////////////////////////////
// Atomic operations: _Generic dispatch on the pointer argument to the //
// typed model functions below; each receives the implicit $thread.    //
// `default` selects the int* overload.                                //
/////////////////////////////////////////////////////////////////////////

/* atomicAdd()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old + val), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicAdd(X,Y) _Generic(X, \
    default : $cuda_atomicAdd_int, \
    unsigned int* : $cuda_atomicAdd_uint, \
    unsigned long long int* : $cuda_atomicAdd_ullint, \
    float* : $cuda_atomicAdd_float, \
    double* : $cuda_atomicAdd_double) (X, Y, $thread)

/* atomicSub()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (old - val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction. The
 * function returns old.
 */
#define atomicSub(X,Y) _Generic(X, \
    default : $cuda_atomicSub_int, \
    unsigned int* : $cuda_atomicSub_uint) (X, Y, $thread)

/* atomicExch()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory and stores val back to memory at the same address. These two
 * operations are performed in one atomic transaction. The function returns old.
 */
#define atomicExch(X,Y) _Generic(X, \
    default : $cuda_atomicExch_int, \
    unsigned int* : $cuda_atomicExch_uint, \
    unsigned long long int* : $cuda_atomicExch_ullint, \
    float* : $cuda_atomicExch_float) (X, Y, $thread)

/* atomicMin()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the minimum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMin(X,Y) _Generic(X, \
    default : $cuda_atomicMin_int, \
    unsigned int* : $cuda_atomicMin_uint, \
    unsigned long long int* : $cuda_atomicMin_ullint) (X, Y, $thread)

/* atomicMax()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the maximum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMax(X,Y) _Generic(X, \
    default : $cuda_atomicMax_int, \
    unsigned int* : $cuda_atomicMax_uint, \
    unsigned long long int* : $cuda_atomicMax_ullint) (X, Y, $thread)

/* atomicInc()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicInc(address, val) $cuda_atomicInc(address, val, $thread)

/* atomicDec()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
 * the result back to memory at the same address. These three operations are
 * performed in one atomic transaction. The function returns old.
 */
#define atomicDec(address, val) $cuda_atomicDec(address, val, $thread)

/* atomicCAS()
 * reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old == compare ? val : old) , and stores the
 * result back to memory at the same address. These three operations are performed
 * in one atomic transaction. The function returns old (Compare And Swap).
 */
#define atomicCAS(address, compare, val) _Generic(address, \
    default : $cuda_atomicCAS_int, \
    unsigned int* : $cuda_atomicCAS_uint, \
    unsigned long long int* : $cuda_atomicCAS_ullint, \
    unsigned short int* : $cuda_atomicCAS_usint) (address, compare, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old & val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicAnd(address, val) _Generic(address, \
    default : $cuda_atomicAnd_int, \
    unsigned int* : $cuda_atomicAnd_uint, \
    unsigned long long int* : $cuda_atomicAnd_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old | val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicOr(address, val) _Generic(address, \
    default : $cuda_atomicOr_int, \
    unsigned int* : $cuda_atomicOr_uint, \
    unsigned long long int* : $cuda_atomicOr_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old ^ val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicXor(address, val) _Generic(address, \
    default : $cuda_atomicXor_int, \
    unsigned int* : $cuda_atomicXor_uint, \
    unsigned long long int* : $cuda_atomicXor_ullint) (address, val, $thread)
|
|---|
| 1009 |
|
|---|
/* Prologue shared by all atomic models: close the current kernel region
 * and yield (letting other threads interleave before the atomic takes
 * effect), open a new region, and snapshot the pre-update value into a
 * local `old` of type T.  Relies on `address` and `thread` being in the
 * caller's scope. */
#define $CUDA_ATOMIC_PRE_ACTION(T) \
    $cuda_complete_kernel_region(thread); \
    $yield(); \
    $cuda_start_kernel_region(thread); \
    T old = *address;

/* Epilogue shared by all atomic models: close the region containing the
 * atomic update, open the next one, and return the pre-update value. */
#define $CUDA_ATOMIC_POST_ACTION() \
    $cuda_complete_kernel_region(thread); \
    $cuda_start_kernel_region(thread); \
    return old;
|
|---|
| 1020 |
|
|---|
/* $CUDA_DEFINE_ATOMIC_ADD(NAME, T): defines model function NAME that
 * atomically performs *address += val and returns the old value.  All the
 * definition macros below follow the same shape: PRE_ACTION yields and
 * snapshots `old`, the body applies the update, POST_ACTION returns `old`. */
#define $CUDA_DEFINE_ATOMIC_ADD(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address += val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_int, int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_float, float)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_double, double)

/* Atomic subtract: *address -= val; returns old. */
#define $CUDA_DEFINE_ATOMIC_SUB(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address -= val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_int, int)
$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_uint, unsigned int)

/* Atomic exchange: *address = val; returns old. */
#define $CUDA_DEFINE_ATOMIC_EXCH(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_int, int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_float, float)

/* Atomic minimum: *address = min(old, val); returns old.
 * NOTE(review): the unchanged value is stored back in the old <= val
 * branch — presumably so the write is always visible to the race
 * checker; confirm. */
#define $CUDA_DEFINE_ATOMIC_MIN(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old <= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_int, int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_ullint, unsigned long long int)

/* Atomic maximum: *address = max(old, val); returns old. */
#define $CUDA_DEFINE_ATOMIC_MAX(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old >= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_int, int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_ullint, unsigned long long int)
|
|---|
| 1079 |
|
|---|
| 1080 | unsigned int $cuda_atomicInc(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
|
|---|
| 1081 | $CUDA_ATOMIC_PRE_ACTION(unsigned int);
|
|---|
| 1082 | if (old >= val) *address = 0;
|
|---|
| 1083 | else *address = old + 1;
|
|---|
| 1084 | $CUDA_ATOMIC_POST_ACTION()
|
|---|
| 1085 | }
|
|---|
| 1086 |
|
|---|
| 1087 | unsigned int $cuda_atomicDec(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
|
|---|
| 1088 | $CUDA_ATOMIC_PRE_ACTION(unsigned int);
|
|---|
| 1089 | if (old == 0 || old > val) *address = val;
|
|---|
| 1090 | else *address = old - 1;
|
|---|
| 1091 | $CUDA_ATOMIC_POST_ACTION()
|
|---|
| 1092 | }
|
|---|
| 1093 |
|
|---|
/* Atomic compare-and-swap: stores val only when old == compare; always
 * returns old so callers can detect success. */
#define $CUDA_DEFINE_ATOMIC_CAS(NAME, T) \
    T NAME(T* address, T compare, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old == compare) *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_int, int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_usint, unsigned short int)

/* Atomic bitwise AND: *address = old & val; returns old. */
#define $CUDA_DEFINE_ATOMIC_AND(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old & val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_int, int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_ullint, unsigned long long int)

/* Atomic bitwise OR: *address = old | val; returns old. */
#define $CUDA_DEFINE_ATOMIC_OR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old | val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_int, int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_ullint, unsigned long long int)

/* Atomic bitwise XOR: *address = old ^ val; returns old. */
#define $CUDA_DEFINE_ATOMIC_XOR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old ^ val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_int, int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_ullint, unsigned long long int)
|
|---|
| 1138 |
|
|---|
//////////////////////
// Global Variables //
//////////////////////

// Two-place global communicator connecting the host (place 0) and the
// device (place 1), plus the host's endpoint into it.
$gcomm $cuda_gcomm = $gcomm_create($here, 2);
const int $CUDA_PLACE_HOST = 0;
const int $CUDA_PLACE_DEVICE = 1;
$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
|
|---|
| 1147 |
|
|---|
/**
 * Tags used for message-passing between host and device.  Each CUDA API
 * request sent over $cuda_gcomm carries one of these tags, and the
 * device's acknowledgement reuses the same tag.
 */
enum $cuda_tag {
    // Predefined tags
    $CUDA_TAG_TEARDOWN,
    $CUDA_TAG_SCOPE_REQUEST,
    $CUDA_TAG_cudaFree,
    $CUDA_TAG_cudaMemcpy,
    $CUDA_TAG_cudaMemcpyAsync,
    // Generated tags (by transformer) — one launch tag per kernel in the
    // verified program.
    $CUDA_TAG_LAUNCH_kernel_1
};
|
|---|
| 1161 |
|
|---|
| 1162 | ///////////////////
|
|---|
| 1163 | // CIVL-CUDA API //
|
|---|
| 1164 | ///////////////////
|
|---|
| 1165 |
|
|---|
/* Asks the device place for the CUDA scope: sends a SCOPE_REQUEST message
 * (empty payload), blocks for the reply, and unpacks the $scope it
 * carries. */
$scope $cuda_host_request_device_scope() {
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
    $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
    $scope result;
    $message_unpack(response, &result, sizeof($scope));

    return result;
}
|
|---|
| 1174 |
|
|---|
/* Descriptor for one cudaMemcpy request, marshalled between the host and
 * device places. */
typedef struct $cuda_memcpy_data {
    void* dst;            // destination pointer
    const void* src;      // source pointer
    size_t count;         // number of bytes to copy
    cudaMemcpyKind kind;  // direction (host/device combinations)
} $cuda_memcpy_data;
|
|---|
| 1181 |
|
|---|
/* Host side of cudaMemcpy/cudaMemcpyAsync.  A host-to-host copy is a plain
 * memcpy; every other kind is marshalled into a $cuda_memcpy_data record
 * and round-tripped through the host<->device communicator.  The trailing
 * dequeue blocks until the device's acknowledgement arrives, so even the
 * async flavor waits here for the device to accept the request. */
void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
    if (kind == cudaMemcpyHostToHost) {
        memcpy(dst, src, count);
    } else {
        $cuda_memcpy_data args;
        args.dst = dst;
        args.src = src;
        args.count = count;
        args.kind = kind;

        // the tag pairs this request with the matching acknowledgement
        int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

        $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
        $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
    }
}
|
|---|
| 1198 |
|
|---|
/* Allocates a fresh, empty, live stream in the CUDA scope and wraps it in
 * a new list node (prev/next NULL); the stream keeps a back-pointer to its
 * containing node so it can later be unlinked.  Returns the new node. */
$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
    cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream));
    newStream->head = NULL;
    newStream->tail = NULL;
    newStream->numOps = 0;
    newStream->alive = true;

    $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node));
    newHead->stream = newStream;
    newStream->containingNode = newHead;
    newHead->prev = NULL;
    newHead->next = NULL;

    return newHead;
}
|
|---|
| 1214 |
|
|---|
/* Atomically unlinks `node` from the stream list, frees the node, and
 * marks its stream dead so no further ops can be enqueued.  The stream
 * struct itself is freed by a spawned helper only after the stream's last
 * pending operation (if any) has terminated; that helper's $proc is
 * returned so the caller can wait for teardown to finish. */
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
    $proc lastOpProc = $proc_null;
    cudaStream_t stream = node->stream;

    // unlink from the doubly-linked stream list
    if (node->prev != NULL) {
        node->prev->next = node->next;
    }
    if (node->next != NULL) {
        node->next->prev = node->prev;
    }
    free(node);

    stream->alive = false;
    // if ops are still queued, the tail op is the last one to finish
    if(stream->tail != NULL)
        lastOpProc = stream->tail->opState->op;

    // nested function: free the stream once its final op has completed
    void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
        $wait(lastOpProc);
        free(stream);
    }

    return $spawn destroyStreamWhenComplete(lastOpProc, stream);
}
|
|---|
| 1242 |
|
|---|
/* Atomically appends a new operation to `stream`: spawns opProc with the
 * request message and a fresh op-state, links the state at the tail of the
 * stream's FIFO, and — when the stream was empty — starts the op at once
 * by setting its start flag.  Returns the spawned $proc so callers can
 * wait on the operation.  Asserts the stream is still alive. */
/*@ depends_on \access(stream);
  @ assigns stream;
  @ reads \nothing;
  @*/
$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
    $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");

    // op starts suspended; the spawned proc blocks on opState->start
    $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state));
    newOpState->start = false;
    newOpState->op = $spawn opProc(opParams, newOpState, stream);

    $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node));
    newOpStateNode->opState = newOpState;
    newOpStateNode->next = NULL;

    if (stream->tail == NULL) {
        // empty stream: this op is both head and tail, and may run now
        stream->head = newOpStateNode;
        stream->tail = newOpStateNode;
        newOpState->start = true;
    } else {
        stream->tail->next = newOpStateNode;
        stream->tail = newOpStateNode;
    }
    stream->numOps++;

    return newOpState->op;
}
|
|---|
| 1270 |
|
|---|
/* Atomically removes the completed head operation of `stream`, starts the
 * next queued op (if any) by setting its start flag, and frees the removed
 * node and its op-state.  Called by each op proc as its final action.
 * NOTE(review): the contract claims `assigns \nothing` although the stream
 * is mutated — confirm this is the intended CIVL annotation. */
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
$atomic_f void $stream_dequeue(cudaStream_t stream) {
    $assert(stream->head != NULL, "Attempt to dequeue an empty stream");

    // removing the only element empties the tail too
    if (stream->head == stream->tail) {
        stream->tail = NULL;
    }

    $cuda_op_state_node_t oldHead = stream->head;
    stream->head = oldHead->next;
    if (stream->head != NULL) {
        // release the next operation in FIFO order
        stream->head->opState->start = true;
    }

    stream->numOps--;
    free(oldHead->opState);
    free(oldHead);
}
|
|---|
| 1292 |
|
|---|
| 1293 | // Helper function
|
|---|
| 1294 | int $dim3_index(dim3 size, uint3 location) {
|
|---|
| 1295 | return location.x + size.x * (location.y + size.y * location.z);
|
|---|
| 1296 | }
|
|---|
| 1297 |
|
|---|
| 1298 | // Helper function
|
|---|
| 1299 | int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
|
|---|
| 1300 | return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
|
|---|
| 1301 | }
|
|---|
| 1302 |
|
|---|
/* Spawns dim.x*dim.y*dim.z instances of spawningFunction — one per point
 * of the 3-D domain, each receiving its own uint3 coordinate — then waits
 * for all of them to terminate.  Spawning happens inside a
 * $local_start/$local_end section. */
void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
    //TODO: calculate length and index, replace this function in the kernel
    $local_start();
    int length = dim.x * dim.y * dim.z;
    $proc procArray[length];
    $range rx = 0 .. dim.x - 1;
    $range ry = 0 .. dim.y - 1;
    $range rz = 0 .. dim.z - 1;
    $domain(3) dom = ($domain(3)){rx, ry, rz};
    $for(int x,y,z : dom){
        uint3 id = { x, y, z };
        int index = $dim3_index(dim, id);
        procArray[index] = $spawn spawningFunction(id);
    }
    $local_end();
    $waitall(procArray,length);
}
|
|---|
| 1320 |
|
|---|
| 1321 |
|
|---|
| 1322 | // CUDA Ops //
|
|---|
| 1323 |
|
|---|
| 1324 | void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
|
|---|
| 1325 |
|
|---|
| 1326 | $when(opState->start);
|
|---|
| 1327 | $cuda_memcpy_data args;
|
|---|
| 1328 | $message_unpack(m, &args, sizeof($cuda_memcpy_data));
|
|---|
| 1329 |
|
|---|
| 1330 | if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
|
|---|
| 1331 | args.dst = $reveal(args.dst);
|
|---|
| 1332 | }
|
|---|
| 1333 | if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
|
|---|
| 1334 | args.src = $reveal(args.src);
|
|---|
| 1335 | }
|
|---|
| 1336 | memcpy(args.dst, args.src, args.count);
|
|---|
| 1337 |
|
|---|
| 1338 | $stream_dequeue(stream);
|
|---|
| 1339 | }
|
|---|
| 1340 |
|
|---|
// Device-side handler for a cudaMemcpy/cudaMemcpyAsync request: enqueues a
// $cuda_memcpy_proc op on the given stream and, for the synchronous
// variant, waits for it to complete before acknowledging the host.
$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
    $cuda_memcpy_data args;
    // Only args.kind is consulted here; the op process re-unpacks the rest.
    $message_unpack(request, &args, sizeof($cuda_memcpy_data));

    $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);

    // NOTE(review): device-to-device copies are never waited on, even for
    // the synchronous API — presumably because the host cannot observe
    // them directly; confirm this matches the intended CUDA semantics.
    if (!async && args.kind != cudaMemcpyDeviceToDevice) {
        $wait(memcpyProc);
    }
    int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

    return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
}
|
|---|
| 1354 |
|
|---|
| 1355 | $message $cuda_free($message request) {
|
|---|
| 1356 | void* devPtr;
|
|---|
| 1357 | $message_unpack(request, &devPtr, sizeof(void*));
|
|---|
| 1358 | free($reveal(devPtr));
|
|---|
| 1359 |
|
|---|
| 1360 | return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
|
|---|
| 1361 | }
|
|---|
| 1362 |
|
|---|
| 1363 |
|
|---|
| 1364 |
|
|---|
| 1365 | ////////////////////////////////////////////
|
|---|
| 1366 | // CUDA API Functions (For Host-use Only) //
|
|---|
| 1367 | ////////////////////////////////////////////
|
|---|
| 1368 |
|
|---|
| 1369 | cudaError_t cudaFree(void* devPtr) {
|
|---|
| 1370 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
|
|---|
| 1371 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
|
|---|
| 1372 |
|
|---|
| 1373 | return cudaSuccess;
|
|---|
| 1374 | }
|
|---|
| 1375 |
|
|---|
| 1376 | cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
|
|---|
| 1377 | $cuda_host_memcpy(dst, src, count, kind, false);
|
|---|
| 1378 | return cudaSuccess;
|
|---|
| 1379 | }
|
|---|
| 1380 |
|
|---|
// Host-facing asynchronous cudaMemcpy.
// NOTE(review): the stream argument is currently ignored — elsewhere the
// model states only the default stream is supported — confirm when adding
// multi-stream support.
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
        cudaMemcpyKind kind, cudaStream_t stream) {
    $cuda_host_memcpy(dst, src, count, kind, true);
    return cudaSuccess;
}
|
|---|
| 1386 |
|
|---|
| 1387 | /*
|
|---|
| 1388 | cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
|
|---|
| 1389 | // Create new stream node in linked list
|
|---|
| 1390 | $cuda_stream_node_t newHead = $create_new_stream_node();
|
|---|
| 1391 | newHead->next = $cuda_global_context.head;
|
|---|
| 1392 | $cuda_global_context.head->prev = newHead;
|
|---|
| 1393 |
|
|---|
| 1394 | // Update cuda context's head to be the new node we created
|
|---|
| 1395 | $cuda_global_context.head = newHead;
|
|---|
| 1396 | $cuda_global_context.numStreams++;
|
|---|
| 1397 |
|
|---|
| 1398 | return cudaSuccess;
|
|---|
| 1399 | }
|
|---|
| 1400 | */
|
|---|
| 1401 |
|
|---|
| 1402 | /*
|
|---|
| 1403 | cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
|---|
| 1404 | stream = $default_stream_if_null(stream);
|
|---|
| 1405 | $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
|
|---|
| 1406 | $when(stream->head == NULL) return cudaSuccess;
|
|---|
| 1407 | }
|
|---|
| 1408 | */
|
|---|
| 1409 |
|
|---|
| 1410 | /*
|
|---|
| 1411 | cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
|---|
| 1412 | $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
|
|---|
| 1413 | $assert(stream->alive, "Attempt to destroy an already destroyed stream");
|
|---|
| 1414 | $destroy_stream_node(stream->containingNode);
|
|---|
| 1415 | return cudaSuccess;
|
|---|
| 1416 | }
|
|---|
| 1417 | */
|
|---|
| 1418 |
|
|---|
| 1419 | /*
|
|---|
| 1420 | cudaError_t cudaDeviceSynchronize() {
|
|---|
| 1421 | $proc* opsToWaitOn;
|
|---|
| 1422 | int numOps = 0;
|
|---|
| 1423 |
|
|---|
| 1424 | $atomic {
|
|---|
| 1425 | opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
|
|---|
| 1426 |
|
|---|
| 1427 | for ($cuda_stream_node_t node = $cuda_global_context.head;
|
|---|
| 1428 | node != NULL;
|
|---|
| 1429 | node = node->next) {
|
|---|
| 1430 | if (node->stream->tail != NULL) {
|
|---|
| 1431 | opsToWaitOn[numOps] = node->stream->tail->opState->op;
|
|---|
| 1432 | numOps++;
|
|---|
| 1433 | }
|
|---|
| 1434 | }
|
|---|
| 1435 | }
|
|---|
| 1436 | $waitall(opsToWaitOn, numOps);
|
|---|
| 1437 |
|
|---|
| 1438 | return cudaSuccess;
|
|---|
| 1439 | }
|
|---|
| 1440 | */
|
|---|
| 1441 |
|
|---|
| 1442 | //////////////////////////////////
|
|---|
| 1443 | // Generated code from kernel_1 //
|
|---|
| 1444 | //////////////////////////////////
|
|---|
| 1445 |
|
|---|
// Argument bundle marshalled from host to device for a kernel_1 launch.
typedef struct {
    dim3 gridDim;               // launch grid dimensions
    dim3 blockDim;              // threads per block
    size_t $cudaMemSize;        // shared-memory size param (not used by kernel_1)
    cudaStream_t $cudaStream;   // target stream (only default stream supported)
    float* A;                   // input/scratch vector (hidden device pointer)
    const float* B;             // second input (not referenced by kernel_1)
    float* C;                   // per-block output slots (hidden device pointer)
    int numElements;            // logical length of A
} $cuda_kernel_1_params;
|
|---|
| 1456 |
|
|---|
// Converts the hidden device pointers in a marshalled kernel_1 argument
// bundle back into directly usable pointers ($reveal is the counterpart of
// the host's $hide).
void $cuda_reveal_kernel_1_args($cuda_kernel_1_params* args) {
    args->A = $reveal(args->A);
    args->B = $reveal(args->B);
    args->C = $reveal(args->C);
}
|
|---|
| 1462 |
|
|---|
| 1463 | void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
|
|---|
| 1464 | size_t $cudaMemSize, cudaStream_t $cudaStream,
|
|---|
| 1465 | float* A, const float* B, float* C, int numElements) {
|
|---|
| 1466 | $cuda_kernel_1_params args;
|
|---|
| 1467 | args.gridDim = gridDim;
|
|---|
| 1468 | args.blockDim = blockDim;
|
|---|
| 1469 | args.$cudaMemSize = $cudaMemSize;
|
|---|
| 1470 | args.$cudaStream = $cudaStream;
|
|---|
| 1471 | args.A = A;
|
|---|
| 1472 | args.B = B;
|
|---|
| 1473 | args.C = C;
|
|---|
| 1474 | args.numElements = numElements;
|
|---|
| 1475 |
|
|---|
| 1476 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_params)));
|
|---|
| 1477 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
|
|---|
| 1478 | }
|
|---|
| 1479 |
|
|---|
// CIVL model of kernel_1: a parallel sum reduction over A whose total is
// left in C[0]. Blocks and threads are modeled as nested spawned
// processes; only the "REDUCTION 2" section is active, the rest is kept
// commented out for reference.
void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
        float *A, const float *B, float *C, int numElements) {
    $cuda_kernel_data_t $kernel = $create_cuda_kernel_data($here, gridDim, blockDim);
    void $cuda_block(uint3 blockIdx) {
        void $cuda_thread(uint3 threadIdx) {
            $local_start();
            // cudaMemset currently not supported so this is small hack to initialize C ahead of time
            // NOTE(review): this zeroing by block 0 / thread 0 is not
            // synchronized against other blocks' later atomicAdd on C —
            // confirm the model's scheduling makes this safe.
            if (blockIdx.x == 0 && threadIdx.x == 0) {
                for (int i = 0; i < gridDim.x; i++) {
                    C[i] = 0;
                }
            }
            //$clear_mem_sets($kernel, _cuda_kid);
            // Index within this block, and kernel-wide unique index.
            int $cuda_tid = $dim3_index(blockDim, threadIdx);
            int $cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
            $cuda_thread_data_t $thread = $create_cuda_thread_data($here, $kernel, $cuda_kid/(blockDim.x * blockDim.y * blockDim.z), $cuda_tid/warpSize, $cuda_tid % warpSize);

            // Kernel REDUCTION start
            /*
            int lane = threadIdx.x % warpSize;
            int thisWarpSize = warpSize;
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;
            int warpStart = i - lane;
            //printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
            int remainingElements = numElements;

            while (remainingElements > 1) {
                //printf("%d,%d - remainingElements: %d - numElements: %d\n", blockIdx.x, threadIdx.x, remainingElements, numElements);
                if (remainingElements < numElements) {
                    // __syncThreads()
                    //printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x);

                    $cuda__syncthreads($thread, 0);
                    //printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x);
                }

                if (warpStart + 1 < remainingElements) {
                    float val = i < numElements ? A[i] : 0;
                    //printf("%d,%d - val: %d\n", blockIdx.x, threadIdx.x, val);

                    for (int offset = warpSize/2; offset > 0; offset /= 2) {
                        //__syncwarp();
                        float tmp = __shfl_down_sync(0, val, offset);
                        if (lane + offset < thisWarpSize) {
                            val += tmp;
                        }
                        //printf("%d,%d - offset: %d - val: %d\n", blockIdx.x, threadIdx.x, offset, val);
                    }

                    if (i < numElements) {
                        A[i] = val;
                    }
                }

                i *= warpSize;
                //warpStart *= warpSize;
                remainingElements = ((remainingElements - 1) / warpSize) + 1;
            }

            if (i == 0) {
                *C = A[0];
            }
            */
            // Kernel REDUCTION end

            // Kernel REDUCTION 2 start

            int lane = threadIdx.x % warpSize;   // lane index within the warp
            int thisWarpSize = warpSize;
            // The block's last warp may be partial; shrink its lane count.
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;   // global element index
            int warpStart = i - lane;

            // Phase 1: warp-level shuffle reduction; each warp's partial sum
            // ends up in A[warpStart].
            // NOTE(review): the mask argument to __shfl_down_sync is 0, which
            // would select no lanes in real CUDA — presumably the CIVL model
            // ignores the mask; confirm against the model's shuffle semantics.
            if (warpStart + 1 < numElements) {
                float val = i < numElements ? A[i] : 0;
                for (int offset = warpSize/2; offset > 0; offset /= 2) {
                    float tmp = __shfl_down_sync(0, val, offset);
                    //float tmp = i + offset < numElements ? A[i + offset] : 0;
                    if (lane + offset < thisWarpSize) {
                        val += tmp;
                    }
                }

                if (i < numElements) {
                    A[i] = val;
                }
            }

            $cuda__syncthreads($thread, 0);
            // Phase 2: thread 0 of each block folds that block's per-warp
            // partials together, then flags completion in C[blockIdx.x].
            if (threadIdx.x == 0) {
                int blockEnd = blockDim.x * (blockIdx.x + 1);
                if (blockEnd > numElements) {
                    blockEnd = numElements;
                }
                for (int j = i + warpSize; j < blockEnd; j += warpSize) {
                    A[i] += A[j];
                }
                // NOTE(review): int literal 1 added to a float slot — works as
                // a nonzero "done" flag (C was zeroed above), but 1.0f would
                // be cleaner.
                atomicAdd(C + blockIdx.x, 1);
            }

            // Phase 3: global thread 0 spin-waits on every other block's flag,
            // then accumulates all block sums into C[0]. (Block 0's own flag
            // in C[0] is overwritten here, but no one reads it.)
            if (i == 0) {
                C[0] = A[0];
                for (int j = 1; j < gridDim.x; j++) {
                    while(atomicAdd(C+j,0) == 0) {}   // atomicAdd(x, 0) == atomic read
                    C[0] += A[j * blockDim.x];
                }
            }
            // Kernel REDUCTION 2 end

            // Kernel BALLOT TEST start

            /*
            int i = threadIdx.x;
            if (i < numElements) {
                int result = __ballot_sync(~0, A[i] > 0);
                if (i == 0) {
                    printf("Result: %d\n", result);
                    *C = 0;
                    while(result > 0) {
                        if (result % 2)
                            *C += 1;
                        result /= 2;
                    }
                    printf("done calculating result\n");
                }
            }
            */
            // Kernel BALLOT TEST end
            //$check_data_race($kernel, _cuda_kid);
            //$read_set_pop();
            //$write_set_pop();
            $destroy_cuda_thread_data($thread);
            $local_end();
        }
        $cuda_run_and_wait_on_procs(blockDim, $cuda_thread);
    }
    $cuda_run_and_wait_on_procs(gridDim, $cuda_block);
    $destroy_cuda_kernel_data($kernel);
}
|
|---|
| 1626 |
|
|---|
// Stream-op body for a kernel_1 launch: waits for its turn in the stream,
// unpacks and reveals the kernel arguments, runs the kernel, then pops
// itself off the stream so the next op may start.
void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
    // Blocks until the preceding op's $stream_dequeue sets start = true.
    $when(opState->start);

    $cuda_kernel_1_params args;
    $message_unpack(request, &args, sizeof($cuda_kernel_1_params));
    $cuda_reveal_kernel_1_args(&args);

    $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
    $stream_dequeue(cudaStream);
}
|
|---|
| 1637 |
|
|---|
| 1638 | /////////////////
|
|---|
| 1639 | // CUDA "file" //
|
|---|
| 1640 | /////////////////
|
|---|
| 1641 |
|
|---|
// The device process: owns the device scope and the default stream, and
// services host requests (scope queries, memcpy, free, kernel launches)
// over $cuda_device_comm until a TEARDOWN message arrives.
void $cuda_main() {

    // Device Variables

    $scope $cuda_scope = $here;   // scope that device allocations live in

    $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
    $cuda_context $cuda_global_context;   // linked list of streams
    cudaStream_t $cuda_default_stream;

    // Helper function to get the default stream if passed NULL, and just returns stream otherwise
    // Currently unused until we support streams other than the default one.
    cudaStream_t $default_stream_if_null(cudaStream_t stream) {
        return stream == NULL ? $cuda_default_stream : stream;
    }

    // Device Logic

    // Set up the context with just the default stream.
    $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
    $cuda_default_stream = defaultStreamNode->stream;

    $cuda_global_context.head = defaultStreamNode;
    $cuda_global_context.numStreams = 1;

    // Request-service loop: each host request gets exactly one response,
    // except TEARDOWN which exits without replying.
    while (true) {
        $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
        $message response;
        const int tag = $message_tag(request);

        switch(tag) {
        case $CUDA_TAG_SCOPE_REQUEST :
            // Host needs the device scope to emulate cudaMalloc.
            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
            break;
        case $CUDA_TAG_cudaFree :
            response = $cuda_free(request);
            break;
        case $CUDA_TAG_cudaMemcpy :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
            break;
        case $CUDA_TAG_cudaMemcpyAsync :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
            break;
        case $CUDA_TAG_LAUNCH_kernel_1 :
            // Launch is asynchronous from the host's perspective: the op is
            // queued on the default stream and the host is acked immediately.
            $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);

            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
            break;
        case $CUDA_TAG_TEARDOWN : {
            // Drain the default stream, then shut the device down.
            $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
            $wait(destructor);
            $comm_destroy($cuda_device_comm);
            return;
        }
        default :
            $assert(false, "Unknown CUDA request");
        }

        $comm_enqueue($cuda_device_comm, response);
    }
}
|
|---|
| 1702 |
|
|---|
| 1703 | ///////////////
|
|---|
| 1704 | // Host file //
|
|---|
| 1705 | ///////////////
|
|---|
| 1706 |
|
|---|
// Symbolic verification inputs for the host program.
$input int N = 16;   // vector length
// ceil(N/2) threads per block, so the launch uses at least two blocks' worth
// of geometry when N > 2.
$input int threadsPerBlock = N%2 == 0 ? N/2 : (N+1)/2;
//$input int threadsPerBlock = N;
$input float A[N];   // data summed by the reduction kernel
// Currently unused but left in to save time
$input float B[N];
|
|---|
| 1713 |
|
|---|
| 1714 | void $host_main() {
|
|---|
| 1715 | int size = N * sizeof(float);
|
|---|
| 1716 | int numBlocks = (N-1)/threadsPerBlock + 1;
|
|---|
| 1717 | //int numBlocks = 1;
|
|---|
| 1718 |
|
|---|
| 1719 | float* cuda_A;
|
|---|
| 1720 | // cudaMalloc((void **)&cuda_A, size);
|
|---|
| 1721 | {
|
|---|
| 1722 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1723 | cuda_A = $hide((float*)$malloc(deviceScope, size));
|
|---|
| 1724 | }
|
|---|
| 1725 | cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
|
|---|
| 1726 |
|
|---|
| 1727 | float* cuda_B;
|
|---|
| 1728 | // cudaMalloc((void **)&cuda_B, size);
|
|---|
| 1729 | {
|
|---|
| 1730 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1731 | cuda_B = $hide((float*)$malloc(deviceScope, size));
|
|---|
| 1732 | }
|
|---|
| 1733 | cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
|
|---|
| 1734 |
|
|---|
| 1735 | float* cuda_C;
|
|---|
| 1736 | // cudaMalloc((void **)&cuda_C, sizeof(float) * numBlocks);
|
|---|
| 1737 | {
|
|---|
| 1738 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1739 | cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float) * numBlocks));
|
|---|
| 1740 | }
|
|---|
| 1741 |
|
|---|
| 1742 | dim3 gridDim = {numBlocks, 1, 1};
|
|---|
| 1743 | dim3 blockDim = {threadsPerBlock, 1, 1};
|
|---|
| 1744 | // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
|
|---|
| 1745 | $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
|
|---|
| 1746 |
|
|---|
| 1747 | // Checking correctness
|
|---|
| 1748 | float* C = (float *)malloc(size);
|
|---|
| 1749 |
|
|---|
| 1750 | cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
|
|---|
| 1751 |
|
|---|
| 1752 | // REDUCTION ASSERTION
|
|---|
| 1753 |
|
|---|
| 1754 | float sum = 0;
|
|---|
| 1755 | for(int i = 0; i < N; i++)
|
|---|
| 1756 | sum += A[i];
|
|---|
| 1757 |
|
|---|
| 1758 | $assert(*C == sum);
|
|---|
| 1759 |
|
|---|
| 1760 | // BALLOT ASSERTION
|
|---|
| 1761 | /*
|
|---|
| 1762 | float count = 0;
|
|---|
| 1763 | for (int i = 0; i < N; i++) {
|
|---|
| 1764 | if (A[i] > 0)
|
|---|
| 1765 | count++;
|
|---|
| 1766 | }
|
|---|
| 1767 | $assert(*C == count);
|
|---|
| 1768 | */
|
|---|
| 1769 |
|
|---|
| 1770 | free(C);
|
|---|
| 1771 |
|
|---|
| 1772 | cudaFree(cuda_A);
|
|---|
| 1773 | cudaFree(cuda_B);
|
|---|
| 1774 | cudaFree(cuda_C);
|
|---|
| 1775 |
|
|---|
| 1776 | }
|
|---|
| 1777 |
|
|---|
// Entry point: runs the host program and the device service loop as
// concurrent processes, then tears everything down in order.
int main() {
    $proc host = $spawn $host_main();
    $proc cuda = $spawn $cuda_main();
    $wait(host);
    // Tell the device loop to shut down once the host program has finished.
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
    // NOTE(review): the host comm is destroyed before waiting on the device
    // process — presumably safe because after TEARDOWN the device only
    // touches its own comm handle, but worth confirming.
    $comm_destroy($cuda_host_comm);
    $wait(cuda);
    $gcomm_destroy($cuda_gcomm, NULL);
}
|
|---|