source: CIVL/examples/cuda/newCudaMockup.cvl

main
Last change on this file was 9dbe9864, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5890 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 67.4 KB
RevLine 
[30cb1de]1#include <concurrency.cvh>
[6dd0322]2#include <comm.cvh>
[30cb1de]3#include <stdlib.h>
[874f9d1]4#include <stdio.h>
[30cb1de]5#include <stdbool.h>
6#include <string.h>
[9cabba4]7#include <mem.cvh>
[66ea4f9]8#pragma CIVL ACSL
[30cb1de]9
[6dd0322]10///////////
11// Types //
12///////////
13
/* Minimal CUDA error-code enumeration for this mockup: only the
 * success value is modeled; all mocked API calls report cudaSuccess. */
enum cudaError {
    cudaSuccess
};
typedef enum cudaError cudaError_t;
18
[9cabba4]19//typedef $cuda_memcpy_kind cudaMemcpyKind;
[109d05e]20
/* Dimensions of a grid or block in up to three dimensions, mirroring
 * CUDA's dim3 (unused dimensions are expected to be 1). */
typedef struct {
    unsigned int x, y, z;
} dim3;

/* used to represent a location in a three dimensional grid
 */
typedef struct {
    unsigned int x, y, z;
} uint3;
30
/* Direction of a cudaMemcpy transfer, mirroring the CUDA runtime's
 * cudaMemcpyKind enumeration (enumerator order matches the real API). */
typedef enum {
    cudaMemcpyHostToHost,
    cudaMemcpyHostToDevice,
    cudaMemcpyDeviceToHost,
    cudaMemcpyDeviceToDevice,
    cudaMemcpyDefault
} cudaMemcpyKind;
38
/* State of one asynchronous CUDA operation: the CIVL process executing
 * it and a flag saying whether it has been started. */
typedef struct $cuda_op_state* $cuda_op_state_t;
typedef struct $cuda_op_state {
    _Bool start;   // has this operation been started?
    $proc op;      // CIVL process carrying out the operation
} $cuda_op_state;

/* Singly-linked list node holding one operation state; streams chain
 * their queued operations through these nodes. */
typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
typedef struct $cuda_op_state_node {
    $cuda_op_state_t opState;
    $cuda_op_state_node_t next;
} $cuda_op_state_node;
[109d05e]50
/* A mock CUDA stream: a FIFO queue of operation states plus liveness
 * bookkeeping.  cudaStream_t is an alias for a pointer to this struct. */
typedef struct $cuda_stream_node* $cuda_stream_node_t;
typedef struct $cuda_stream* $cuda_stream_t;
typedef $cuda_stream_t cudaStream_t;
typedef struct $cuda_stream {
    $cuda_op_state_node_t head;           // oldest queued operation
    $cuda_op_state_node_t tail;           // newest queued operation
    int numOps;                           // number of queued operations
    $cuda_stream_node_t containingNode;   // back-pointer into the context's stream list
    _Bool alive;                          // false once the stream is destroyed
} $cuda_stream;

/* Doubly-linked list node used by the context to track its streams. */
typedef struct $cuda_stream_node{
    cudaStream_t stream;
    $cuda_stream_node_t prev;
    $cuda_stream_node_t next;
} $cuda_stream_node;
[109d05e]67
/* Per-device context: the doubly-linked list of live streams and its
 * length. */
typedef struct $cuda_context* $cuda_context_t;
typedef struct $cuda_context {
    $cuda_stream_node_t head;   // first node of the stream list
    int numStreams;             // number of streams in the list
} $cuda_context;
73
/* Record of one thread's memory footprint: the sets of locations read
 * and written, used by the data-race checks below. */
typedef struct $cuda_mem_set* $cuda_mem_set_t;
typedef struct $cuda_mem_set {
    $mem reads;    // locations read
    $mem writes;   // locations written
} $cuda_mem_set;
79
80void $cuda_mem_set_clear($cuda_mem_set_t memSet) {
81 $mem emptyset = $mem_empty();
82 memSet->reads = emptyset;
83 memSet->writes = emptyset;
84}
[6dd0322]85
[9dbe9864]86$cuda_mem_set_t $cuda_mem_set_create($scope scope) {
87 $cuda_mem_set_t newMemSet = ($cuda_mem_set_t) $malloc(scope, sizeof($cuda_mem_set));
[6dd0322]88
[9dbe9864]89 $cuda_mem_set_clear(newMemSet);
[6dd0322]90
[9dbe9864]91 return newMemSet;
[0ffc6c8]92}
93
/* Release a record previously created by $cuda_mem_set_create. */
void $cuda_mem_set_destroy($cuda_mem_set_t memSet) {
    free(memSet);
}
97
[9dbe9864]98void $cuda_mem_set_add($cuda_mem_set_t memSet, $mem reads, $mem writes) {
99 memSet->reads = $mem_union(memSet->reads, reads);
100 memSet->writes = $mem_union(memSet->writes, writes);
[a67c590]101}
102
/* Assert that the footprints of two threads do not race: m1's reads
 * must not intersect m2's writes, and vice versa, and their write sets
 * must be disjoint.  (b,w,l) triples identify the two threads as
 * <block, warp, lane> for the diagnostic messages.  On intersection,
 * $mem_no_intersect reports a witness location through out1/out2. */
void $cuda_check_mems_disjoint($cuda_mem_set_t m1, int b1, int w1, int l1,
        $cuda_mem_set_t m2, int b2, int w2, int l2) {
    $mem out1 = $mem_empty();
    $mem out2 = $mem_empty();

    // read/write conflict
    $assert($mem_no_intersect(m1->reads, m2->writes, &out1, &out2),
        "Data-race detected: %p read by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
        out1, b1, w1, l1, out2, b2, w2, l2);
    // write/read conflict
    $assert($mem_no_intersect(m1->writes, m2->reads, &out1, &out2),
        "Data-race detected: %p written by thread <%d, %d, %d> intersects %p read by thread <%d, %d, %d>\n",
        out1, b1, w1, l1, out2, b2, w2, l2);
    // write/write conflict
    $assert($mem_no_intersect(m1->writes, m2->writes, &out1, &out2),
        "Data-race detected: %p written by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
        out1, b1, w1, l1, out2, b2, w2, l2);
}
118
/* Tag identifying which warp-collective operation a warp is currently
 * executing; EMPTY means no collective is in progress.  Also used as
 * the message tag on the warp's communicator. */
typedef enum {
    $CUDA_WARP_TAG_EMPTY,
    $CUDA_WARP_TAG_warpsync,
    $CUDA_WARP_TAG_shfl_sync,
    $CUDA_WARP_TAG_shfl_up_sync,
    $CUDA_WARP_TAG_shfl_down_sync,
    $CUDA_WARP_TAG_shfl_xor_sync,
    $CUDA_WARP_TAG_all_sync,
    $CUDA_WARP_TAG_any_sync,
    $CUDA_WARP_TAG_ballot_sync
} $cuda_warp_tag;
[a67c590]130
/* Lanes per warp; a CIVL $input symbol so verification can vary it,
 * defaulting to the hardware value 32. */
$input int warpSize = 32;
[a67c590]132
/* Shared bookkeeping for one warp of `size` lanes. */
typedef struct $cuda_warp_data* $cuda_warp_data_t;
typedef struct $cuda_warp_data {
    int size;                     // number of lanes in this warp
    // size-by-size matrix of footprint records; memSets[i][i] is lane i's
    // own footprint (off-diagonal entries are filled during "patching" —
    // see $cuda_update_patches).
    $cuda_mem_set_t** memSets;
    _Bool patching[];             // per lane: currently collecting patches?
    int num_alive;                // lanes that have not yet exited
    _Bool alive[];                // per-lane liveness flags
    int num_in_barrier;           // lanes currently waiting in the warp barrier
    _Bool in_barrier[];           // per-lane in-barrier flags
    $cuda_warp_tag currOp;        // collective currently in progress (or EMPTY)
    int reductionLane;            // lane acting as reducer, or -1 if none
    $gcomm gcomm;                 // global communicator for lane exchanges
} $cuda_warp_data;
[a67c590]146
/*@ depends_on \access(warp);
  @ executes_when \true;
  @ */
/* Release the warp barrier once every live lane has entered: reset the
 * barrier counters, mark no collective in progress, and clear each
 * lane's own (diagonal) footprint record.  Runs atomically. */
$atomic_f void $cuda_warp_barrier_update($cuda_warp_data_t warp) {
    if (warp->num_in_barrier == warp->num_alive) {
        warp->num_in_barrier = 0;
        warp->currOp = $CUDA_WARP_TAG_EMPTY;
        for (int i = 0; i < warp->size; i++) {
            warp->in_barrier[i] = $false;
            $cuda_mem_set_clear(warp->memSets[i][i]);
        }
    }
}
160
[9dbe9864]161$cuda_warp_data_t $create_cuda_warp_data($scope warpScope, int size) {
162 $cuda_warp_data_t newWarp = ($cuda_warp_data_t) malloc(sizeof($cuda_warp_data));
163 newWarp->size = size;
164 newWarp->memSets = ($cuda_mem_set_t**) $malloc(warpScope, size * sizeof($cuda_mem_set_t*));
165 for (int i = 0; i < size; i++) {
166 newWarp->memSets[i] = ($cuda_mem_set_t*) $malloc(warpScope, size * sizeof($cuda_mem_set_t));
167 for (int j = 0; j < size; j++) {
168 newWarp->memSets[i][j] = $cuda_mem_set_create(warpScope);
169 }
170 }
171 newWarp->patching = (_Bool[size])$lambda(int i) $false;
172 newWarp->num_alive = size;
173 newWarp->alive = (_Bool[size])$lambda(int i) $true;
174 newWarp->num_in_barrier = 0;
175 newWarp->in_barrier = (_Bool[size])$lambda(int i) $false;
176 newWarp->currOp = $CUDA_WARP_TAG_EMPTY;
177 newWarp->reductionLane = -1;
178 newWarp->gcomm = $gcomm_create(warpScope, size);
179
180 return newWarp;
[a67c590]181}
182
/* Tear down a warp record created by $create_cuda_warp_data: every
 * footprint record, the matrix rows, the matrix itself, the global
 * communicator, and finally the record. */
void $destroy_cuda_warp_data($cuda_warp_data_t warp) {
    $assert(warp != NULL, "Attempt to destroy a NULL warp");

    for (int i = 0; i < warp->size; i++) {
        for (int j = 0; j < warp->size; j++) {
            $cuda_mem_set_destroy(warp->memSets[i][j]);
        }
        free(warp->memSets[i]);
    }
    free(warp->memSets);
    $gcomm_destroy(warp->gcomm, NULL);
    free(warp);
}
196
/* Shared bookkeeping for one thread block of `size` threads, divided
 * into numWarps warps. */
typedef struct $cuda_block_data* $cuda_block_data_t;
typedef struct $cuda_block_data {
    int size;                  // threads per block
    $cuda_mem_set_t* memSets;  // one footprint record per thread in the block
    int predArray[];           // scratch array for __syncthreads_* reductions
    $gbarrier gbarrier;        // block-wide barrier
    int currBarrierID;         // source index of the barrier in progress; -1 if none
    int numWarps;              // number of warps in the block
    $cuda_warp_data_t* warps;  // per-warp records
} $cuda_block_data;
[a67c590]207
/* Create the bookkeeping record for a block of `size` threads in the
 * given scope.  All warps are full (warpSize lanes) except possibly the
 * last, which holds the remaining ((size-1) % warpSize) + 1 lanes. */
$cuda_block_data_t $create_cuda_block_data($scope scope, int size) {
    $cuda_block_data_t newBlock = ($cuda_block_data_t) $malloc(scope, sizeof($cuda_block_data));

    newBlock->size = size;
    // ceiling division: number of warps needed to cover `size` threads
    newBlock->numWarps = (size - 1)/warpSize + 1;

    newBlock->predArray = (int[size]) $lambda(int i) 0;

    newBlock->gbarrier = $gbarrier_create(scope, size);
    newBlock->currBarrierID = -1;   // no __syncthreads in progress

    newBlock->memSets = ($cuda_mem_set_t*) $malloc(scope, size * sizeof($cuda_mem_set_t));
    for (int i = 0; i < size; i++) {
        newBlock->memSets[i] = $cuda_mem_set_create(scope);
    }
    newBlock->warps = ($cuda_warp_data_t*) $malloc(scope, newBlock->numWarps * sizeof($cuda_warp_data_t));
    // all but the last warp are full
    for (int i = 0; i < newBlock->numWarps - 1; i++) {
        newBlock->warps[i] = $create_cuda_warp_data(scope, warpSize);
    }
    // the last warp holds whatever lanes remain
    int lastIndex = newBlock->numWarps - 1;
    newBlock->warps[lastIndex] = $create_cuda_warp_data(scope, ((size - 1) % warpSize) + 1);

    return newBlock;
}
232
/* Tear down a block record: its per-thread footprints, its warps, its
 * barrier, and finally the record itself. */
void $destroy_cuda_block_data($cuda_block_data_t block) {
    for (int i = 0; i < block->size; i++) {
        $cuda_mem_set_destroy(block->memSets[i]);
    }
    free(block->memSets);

    for (int i = 0; i < block->numWarps; i++) {
        $destroy_cuda_warp_data(block->warps[i]);
    }
    free(block->warps);

    $gbarrier_destroy(block->gbarrier);

    free(block);
}
248
/* Shared bookkeeping for one kernel launch: per-thread footprint
 * records across the whole grid, plus per-block records. */
typedef struct $cuda_kernel_data* $cuda_kernel_data_t;
typedef struct $cuda_kernel_data {
    int size;                   // total threads in the grid
    $cuda_mem_set_t* memSets;   // one footprint record per thread in the grid
    int numBlocks;              // blocks in the grid
    $cuda_block_data_t* blocks; // per-block records
} $cuda_kernel_data;
[a67c590]256
/* Create the bookkeeping record for a kernel launch with the given grid
 * and block dimensions; the 3-D shapes are flattened to counts. */
$cuda_kernel_data_t $create_cuda_kernel_data($scope scope, dim3 gridDim, dim3 blockDim){
    int numBlocks = (gridDim.x * gridDim.y) * gridDim.z;
    int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z;

    $cuda_kernel_data_t newKernel = ($cuda_kernel_data_t)$malloc(scope, sizeof($cuda_kernel_data));

    newKernel->size = threadsPerBlock * numBlocks;
    newKernel->numBlocks = numBlocks;

    // one footprint record per thread in the grid
    newKernel->memSets = ($cuda_mem_set_t*) $malloc(scope, newKernel->size * sizeof($cuda_mem_set_t));
    for (int i = 0; i < newKernel->size; i++) {
        newKernel->memSets[i] = $cuda_mem_set_create(scope);
    }

    newKernel->blocks = ($cuda_block_data_t*) $malloc(scope, newKernel->numBlocks * sizeof($cuda_block_data_t));
    for (int i = 0; i < newKernel->numBlocks; i++) {
        newKernel->blocks[i] = $create_cuda_block_data(scope, threadsPerBlock);
    }

    return newKernel;
}
278
/* Tear down a kernel record: every per-thread footprint, every block
 * record, and the record itself. */
void $destroy_cuda_kernel_data($cuda_kernel_data_t kernel){
    for (int i = 0; i < kernel->size; i++) {
        $cuda_mem_set_destroy(kernel->memSets[i]);
    }
    free(kernel->memSets);

    for (int i = 0; i < kernel->numBlocks; i++) {
        $destroy_cuda_block_data(kernel->blocks[i]);
    }
    free(kernel->blocks);
    free(kernel);
}
291
/* Per-thread view of the kernel: back-pointers to the shared kernel,
 * block, and warp records, the thread's coordinates, and its private
 * communication endpoints. */
typedef struct $cuda_thread_data* $cuda_thread_data_t;
typedef struct $cuda_thread_data {
    $cuda_kernel_data_t kernel;  // enclosing kernel record
    $cuda_block_data_t block;    // enclosing block record
    $cuda_warp_data_t warp;      // enclosing warp record
    int blockID;                 // flat block index within the grid
    int warpID;                  // warp index within the block
    int laneID;                  // lane index within the warp
    $comm lane_comm;             // this lane's endpoint on the warp communicator
    $barrier block_barrier;      // this thread's endpoint on the block barrier
} $cuda_thread_data;
[9cabba4]303
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Check this lane's own footprint (the diagonal entry of the warp's
 * matrix) against every other lane's for races within the warp. */
$atomic_f void $cuda_check_warp_data_race($cuda_thread_data_t thread) {
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    $cuda_mem_set_t** warpMems = thread->warp->memSets;

    for (int i = 0; i < thread->warp->size; i++) {
        if (i != laneID) {
            $cuda_check_mems_disjoint(warpMems[laneID][laneID], blockID, warpID, laneID,
                warpMems[i][i], blockID, warpID, i);
        }
    }
}
320
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Check this thread's block-level footprint against every thread in
 * every OTHER warp of the same block (intra-warp races are handled by
 * $cuda_check_warp_data_race).  Block-level indexing assumes all warps
 * except the last are full (warpSize lanes), matching construction. */
$atomic_f void $cuda_check_block_data_race($cuda_thread_data_t thread) {
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    int indexInBlock = warpID * warpSize + laneID;
    $cuda_mem_set_t* blockMems = thread->block->memSets;

    for (int i = 0; i < thread->block->numWarps; i++) {
        if (i != warpID) {
            int currWarpSize = thread->block->warps[i]->size;
            for (int j = 0; j < currWarpSize; j++) {
                $cuda_check_mems_disjoint(blockMems[indexInBlock], blockID, warpID, laneID,
                    blockMems[i * warpSize + j], blockID, i, j);
            }
        }
    }
}
341
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Check this thread's kernel-level footprint against every thread in
 * every OTHER block of the grid (same-block races are handled by
 * $cuda_check_block_data_race). */
$atomic_f void $cuda_check_kernel_data_race($cuda_thread_data_t thread) {
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    int threadsPerBlock = thread->block->size;
    int indexInKernel = blockID * threadsPerBlock + warpID * warpSize + laneID;
    $cuda_mem_set_t* kernelMems = thread->kernel->memSets;
    //printf("%d, %d, %d - Checking kernel data race. km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, indexInKernel);

    for (int i = 0; i < thread->kernel->numBlocks; i++) {
        if (i != blockID) {
            $cuda_block_data_t block = thread->kernel->blocks[i];
            for (int j = 0; j < block->numWarps; j++) {
                int currWarpSize = block->warps[j]->size;
                for (int k = 0; k < currWarpSize; k++) {
                    //printf("%d, %d, %d - Checking kdr against <%d,%d,%d> with km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, i,j,k, i * threadsPerBlock + j * warpSize + k);
                    $cuda_check_mems_disjoint(kernelMems[indexInKernel], blockID, warpID, laneID,
                        kernelMems[i * threadsPerBlock + j * warpSize + k], i, j, k);
                }
            }
        }
    }
}
368
[9dbe9864]369/*@ depends_on \access(thread);
370 @ executes_when \true;
371 @*/
372$atomic_f void $cuda_start_mem_patching($cuda_thread_data_t thread) {
373 thread->warp->patching[thread->laneID] = $true;
[109d05e]374}
375
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Deliver this lane's own footprint to every lane currently in
 * patching mode, by folding it into that lane's row of the warp's
 * footprint matrix. */
$atomic_f void $cuda_update_patches($cuda_thread_data_t thread) {
    $cuda_warp_data_t warp = thread->warp;
    int lane = thread->laneID;

    for (int i = 0; i < warp->size; i++) {
        if (warp->patching[i]) {
            $cuda_mem_set_add(warp->memSets[i][lane], warp->memSets[lane][lane]->reads, warp->memSets[lane][lane]->writes);
        }
    }
}
389
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Finish a patching window for this lane: if it was patching, turn the
 * flag off, check the patches collected from every other lane against
 * this lane's own footprint for races, then clear the patch entries.
 * No-op if the lane was not patching. */
$atomic_f void $cuda_end_mem_patching($cuda_thread_data_t thread) {
    if (thread->warp->patching[thread->laneID]) {
        int size = thread->warp->size;
        int blockID = thread->blockID;
        int warpID = thread->warpID;
        int laneID = thread->laneID;
        // this lane's row of the footprint matrix
        $cuda_mem_set_t* threadMems = thread->warp->memSets[laneID];

        thread->warp->patching[laneID] = $false;

        for (int i = 0; i < size; i++) {
            if (i != laneID) {
                $cuda_check_mems_disjoint(threadMems[laneID], blockID, warpID, laneID,
                    threadMems[i], blockID, warpID, i);
                $cuda_mem_set_clear(threadMems[i]);
            }
        }
    }
}
[9cabba4]412
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Begin a new race-check region for this thread: publish pending
 * patches, clear this thread's footprint at warp, block, and kernel
 * level, then open fresh read/write tracking sets. */
$atomic_f void $cuda_start_kernel_region($cuda_thread_data_t thread) {
    int lane = thread->laneID;
    //printf("%d, %d, %d - Starting region.\n\n", thread->blockID, thread->warpID, thread->laneID);
    $cuda_update_patches(thread);
    $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
    $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + thread->laneID]);
    $cuda_mem_set_clear(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID]);
    //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
    //printf("%d, %d, %d - kernel mem set after clear:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
    $read_set_push();
    $write_set_push();
}
[9cabba4]428
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Close the current tracking region at warp level: pop the recorded
 * read/write sets, fold them into this thread's footprint at warp,
 * block, and kernel level, finish any patching window, and check for
 * intra-warp races. */
$atomic_f void $cuda_complete_warp_region($cuda_thread_data_t thread) {
    $mem reads = $read_set_pop();
    $mem writes = $write_set_pop();
    int lane = thread->laneID;
    $cuda_mem_set_t laneMem = thread->warp->memSets[lane][lane];
    //printf("%d, %d, %d - completing warp region.\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, reads, writes);

    $cuda_mem_set_add(laneMem, reads, writes);
    $cuda_mem_set_add(thread->block->memSets[thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
    $cuda_mem_set_add(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
    //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
    //printf("%d, %d, %d - kernel mem set after add:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
    $cuda_end_mem_patching(thread);
    $cuda_check_warp_data_race(thread);
}
447
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Close the current region at block level: complete the warp region,
 * then check for races against the other warps of this block. */
$atomic_f void $cuda_complete_block_region($cuda_thread_data_t thread) {
    //printf("%d, %d, %d - completing block region\n\n", thread->blockID, thread->warpID, thread->laneID);
    $cuda_complete_warp_region(thread);
    $cuda_check_block_data_race(thread);
}
456
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Close the current region at kernel level: complete the block region,
 * then check for races against the other blocks of the grid. */
$atomic_f void $cuda_complete_kernel_region($cuda_thread_data_t thread) {
    //printf("%d, %d, %d - completing kernel region\n\n", thread->blockID, thread->warpID, thread->laneID);
    $cuda_complete_block_region(thread);
    $cuda_check_kernel_data_race(thread);
}
465
/* Create the per-thread record for the thread at <blockID, warpID,
 * laneID>, wiring it to the shared kernel/block/warp records and
 * creating its communicator and barrier endpoints.  Also opens the
 * thread's initial read/write tracking sets. */
$cuda_thread_data_t $create_cuda_thread_data($scope scope, $cuda_kernel_data_t kernel, int blockID, int warpID, int laneID) {
    $cuda_thread_data_t newThread = ($cuda_thread_data_t) $malloc(scope, sizeof($cuda_thread_data));

    newThread->kernel = kernel;
    newThread->block = kernel->blocks[blockID];
    newThread->warp = newThread->block->warps[warpID];
    newThread->blockID = blockID;
    newThread->warpID = warpID;
    newThread->laneID = laneID;
    newThread->lane_comm = $comm_create(scope, kernel->blocks[blockID]->warps[warpID]->gcomm, laneID);
    // block-barrier rank is the thread's flat index within the block
    newThread->block_barrier = $barrier_create(scope, kernel->blocks[blockID]->gbarrier, warpID * warpSize + laneID);

    $read_set_push();
    $write_set_push();

    return newThread;
}
[9cabba4]483
/*@ depends_on \access(thread);
  @ executes_when \true;
  @ */
/* Thread exit: flush and race-check its final region, mark the lane
 * dead in its warp (possibly releasing a pending warp barrier), then
 * destroy its barrier/communicator endpoints and the record. */
$atomic_f void $destroy_cuda_thread_data($cuda_thread_data_t thread) {
    $assert(thread != NULL, "Attempt to destroy NULL cuda thread");
    $cuda_complete_kernel_region(thread);

    $cuda_warp_data_t warp = thread->warp;
    warp->alive[thread->laneID] = $false;
    warp->num_alive--;
    // a warp barrier waiting only on this lane can now complete
    $cuda_warp_barrier_update(warp);

    $barrier_destroy(thread->block_barrier);

    $comm_destroy(thread->lane_comm);
    free(thread);
}
501
/*@ depends_on \access(thread);
  @ executes_when \true;
  @ */
/* Enter the warp barrier for collective `tag`.  A lane may not enter
 * twice, and all lanes must agree on which collective is running (the
 * first lane in sets currOp).  If this lane is the last live one in,
 * the barrier is released immediately. */
$atomic_f void $cuda_warp_barrier_enter($cuda_thread_data_t thread, $cuda_warp_tag tag) {
    $cuda_warp_data_t warp = thread->warp;
    $assert(!warp->in_barrier[thread->laneID]);
    // divergent collectives within one warp are an error
    $assert(warp->currOp == tag || warp->currOp == $CUDA_WARP_TAG_EMPTY);

    warp->in_barrier[thread->laneID] = $true;
    warp->currOp = tag;
    warp->num_in_barrier++;
    $cuda_warp_barrier_update(warp);
}
515
// Block until this lane's in_barrier flag has been cleared by
// $cuda_warp_barrier_update.  NOTE(review): the original author left the
// question "Seems this needs to be atomic to work. Why?" — presumably the
// guard must be evaluated atomically with respect to the updater resetting
// the flags; confirm against CIVL's $when/$atomic_f semantics.
/*@ depends_on \access(thread);
  @*/
$atomic_f void $cuda_warp_barrier_exit($cuda_thread_data_t thread) {
    $when(!thread->warp->in_barrier[thread->laneID]);
}
522
/* Full warp-barrier round for collective `tag`: enter, then wait until
 * every live lane has entered and the barrier has been released. */
void $cuda_warp_barrier_call($cuda_thread_data_t thread, $cuda_warp_tag tag) {
    $cuda_warp_barrier_enter(thread, tag);
    $cuda_warp_barrier_exit(thread);
}
527
/* Mockup of __syncthreads.  barrierIndex identifies the call site so
 * that divergent barriers (different threads reaching different
 * __syncthreads calls) are detected.  Completes the block-level region
 * first, runs two block-barrier rounds (the second so thread 0's reset
 * of currBarrierID is ordered after everyone has read it), clears this
 * thread's warp/block footprints, and opens fresh tracking sets. */
void $cuda__syncthreads($cuda_thread_data_t thread, int barrierIndex) {
    $cuda_complete_block_region(thread);
    int indexInBlock = thread->warpID * warpSize + thread->laneID;

    // first thread to arrive records which barrier this is
    if (thread->block->currBarrierID == -1) {
        thread->block->currBarrierID = barrierIndex;
    }
    $assert(thread->block->currBarrierID == barrierIndex,
        "Divergent calls to __syncthreads.");

    $local_end();
    $barrier_call(thread->block_barrier);
    $local_start();
    int lane = thread->laneID;

    $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
    $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + lane]);

    // thread 0 resets the barrier ID for the next call site
    if (indexInBlock == 0) {
        thread->block->currBarrierID = -1;
    }

    $local_end();
    $barrier_call(thread->block_barrier);
    $local_start();

    $read_set_push();
    $write_set_push();
}
557
/* Generates a __syncthreads_{count,or,and} variant named NAME: each
 * thread publishes its predicate into the block's predArray, thread 0
 * folds the array with REDUCTION starting from INIT, and every thread
 * reads back the result (three barrier rounds keep the phases ordered).
 * The divergence message now uses the stringized macro argument #NAME:
 * previously the literal said "Divergent calls to NAME." because macro
 * parameters are not expanded inside string literals. */
#define $CUDA_DEFINE_SYNCTHREADS_VARIANT(NAME, INIT, REDUCTION) \
    int NAME(int predicate, $cuda_thread_data_t thread, int barrierIndex) { \
        $cuda_complete_block_region(thread); \
        int indexInBlock = thread->warpID * warpSize + thread->laneID; \
        \
        if (thread->block->currBarrierID == -1) { \
            thread->block->currBarrierID = barrierIndex; \
        } \
        $assert(thread->block->currBarrierID == barrierIndex, \
            "Divergent calls to " #NAME "."); \
        \
        if (predicate) { \
            thread->block->predArray[indexInBlock] = 1; \
        } else { \
            thread->block->predArray[indexInBlock] = 0; \
        } \
        \
        $local_end(); \
        $barrier_call(thread->block_barrier); \
        $local_start(); \
        int lane = thread->laneID; \
        \
        $cuda_mem_set_clear(thread->warp->memSets[lane][lane]); \
        $cuda_mem_set_clear(thread->block->memSets[indexInBlock]); \
        \
        if (indexInBlock == 0) { \
            int result = INIT; \
            for (int i = 0; i < thread->block->size; i++) { \
                REDUCTION; \
            } \
            thread->block->predArray[0] = result; \
            thread->block->currBarrierID = -1; \
        } \
        \
        $local_end(); \
        $barrier_call(thread->block_barrier); \
        $local_start(); \
        int result = thread->block->predArray[0]; \
        \
        $local_end(); \
        $barrier_call(thread->block_barrier); \
        $local_start(); \
        \
        $read_set_push(); \
        $write_set_push(); \
        return result; \
    }
605
/* Instantiate the three predicate-reduction barriers: count sums the
 * predicates; or/and short-circuit per element via continue. */
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_count, 0, result += thread->block->predArray[i])
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_or, 0, if (thread->block->predArray[i]) {
        result = 1;
        continue;
    })
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_and, 1, if (!thread->block->predArray[i]) {
        result = 0;
        continue;
    })
[9cabba4]615
/* Variadic helpers: select the first/second argument, used below to
 * supply defaults for the optional width/delta shuffle parameters. */
#define $GET_ARG_1(_1, ...) _1
#define $GET_ARG_2(_1, _2, ...) _2
[9dbe9864]618
/* Mockup of __syncwarp: complete the warp-level region (racechecking
 * it), run one warp-barrier round, and open fresh tracking sets.
 * The macro injects the implicit $thread argument. */
#define __syncwarp() $cuda__syncwarp($thread)
void $cuda__syncwarp($cuda_thread_data_t thread) {
    $cuda_complete_warp_region(thread);
    $local_end();
    $cuda_warp_barrier_call(thread, $CUDA_WARP_TAG_warpsync);
    $local_start();
    $read_set_push();
    $write_set_push();
}
628
/* Expands the user's optional trailing shuffle arguments to exactly
 * (delta/srcLane, width), defaulting width to warpSize.  (The trailing
 * "0" is a dummy so $GET_ARG_2 always has a second argument.) */
#define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0)

/* Type-dispatching front end for __shfl_sync; selects the variant
 * matching var's type and injects the implicit $thread argument. */
#define __shfl_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_sync_int, \
        unsigned int: $cuda__shfl_sync_uint, \
        long: $cuda__shfl_sync_long, \
        unsigned long: $cuda__shfl_sync_ulong, \
        long long: $cuda__shfl_sync_ll, \
        unsigned long long: $cuda__shfl_sync_ull, \
        float: $cuda__shfl_sync_float, \
        double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)
[9cabba4]641
/* Type-dispatching front ends for the up/down/xor shuffle variants;
 * identical in structure to __shfl_sync above. */
#define __shfl_up_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_up_sync_int, \
        unsigned int: $cuda__shfl_up_sync_uint, \
        long: $cuda__shfl_up_sync_long, \
        unsigned long: $cuda__shfl_up_sync_ulong, \
        long long: $cuda__shfl_up_sync_ll, \
        unsigned long long: $cuda__shfl_up_sync_ull, \
        float: $cuda__shfl_up_sync_float, \
        double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_down_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_down_sync_int, \
        unsigned int: $cuda__shfl_down_sync_uint, \
        long: $cuda__shfl_down_sync_long, \
        unsigned long: $cuda__shfl_down_sync_ulong, \
        long long: $cuda__shfl_down_sync_ll, \
        unsigned long long: $cuda__shfl_down_sync_ull, \
        float: $cuda__shfl_down_sync_float, \
        double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_xor_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_xor_sync_int, \
        unsigned int: $cuda__shfl_xor_sync_uint, \
        long: $cuda__shfl_xor_sync_long, \
        unsigned long: $cuda__shfl_xor_sync_ulong, \
        long long: $cuda__shfl_xor_sync_ll, \
        unsigned long long: $cuda__shfl_xor_sync_ull, \
        float: $cuda__shfl_xor_sync_float, \
        double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)
674
/* Opens/closes the race-check window around a warp intrinsic.  PRE pops
 * the current tracking sets into the lane's own footprint and puts the
 * lane into patching mode (so warp-mates' footprints are delivered to
 * it); POST opens fresh tracking sets.  Expects `thread` in scope. */
#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE() \
    $mem writes = $write_set_pop(); \
    $mem reads = $read_set_pop(); \
    $cuda_mem_set_add(thread->warp->memSets[thread->laneID][thread->laneID], \
        reads, writes); \
    thread->warp->patching[thread->laneID] = $true;

#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST() \
    $read_set_push(); \
    $write_set_push();
[9cabba4]685
/* Common body of all shuffle variants.  Expects in scope: thread, tag,
 * var (the value to publish), laneParam, width (a power of two at most
 * warpSize), typeSize, and resultVal (receives the shuffled value).
 * Protocol: each lane sends a zero-byte request to the lane it wants to
 * read from, barriers, answers the requests it received with its own
 * var, barriers again, then dequeues its answer; lanes whose source is
 * out of range get a havoced result, as on real hardware.
 * NOTE(review): `subWarpStart = laneID/width` and the shfl_sync case
 * `laneID/width + laneParam % width` look like they are missing a
 * `* width` (sub-warp BASE lane, not sub-warp index) when width <
 * warpSize — confirm against CUDA's __shfl_sync width semantics before
 * relying on width != warpSize. */
#define $CUDA_GENERIC_SHFL_BODY() \
    $assert (width <= warpSize); \
    for (int v = width; v > 1; v /= 2) { \
        $assert(v % 2 == 0); \
    } \
    \
    int requestLane; \
    switch(tag) { \
        case $CUDA_WARP_TAG_shfl_sync: \
            requestLane = thread->laneID/width + laneParam % width; \
            break; \
        case $CUDA_WARP_TAG_shfl_up_sync: \
            requestLane = thread->laneID - laneParam; \
            break; \
        case $CUDA_WARP_TAG_shfl_down_sync: \
            requestLane = thread->laneID + laneParam; \
            break; \
        case $CUDA_WARP_TAG_shfl_xor_sync: \
            requestLane = thread->laneID ^ laneParam; \
            break; \
    } \
    $cuda_warp_data_t warp = thread->warp; \
    _Bool validSrcLane = requestLane >= 0 && requestLane < warp->size; \
    if (validSrcLane) { \
        $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, requestLane, tag, NULL, 0)); \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    \
    _Bool requested[width] = (_Bool[width])$lambda(int i) $false; \
    int subWarpStart = thread->laneID/width; \
    while ($comm_probe(thread->lane_comm, $COMM_ANY_SOURCE, tag)) { \
        $message request = $comm_dequeue(thread->lane_comm, $COMM_ANY_SOURCE, tag); \
        requested[$message_source(request) - subWarpStart] = $true; \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    for(int i = 0; i < width; i++) { \
        if (requested[i]) { \
            $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, i + subWarpStart, tag, &var, typeSize)); \
        } \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    if (validSrcLane) { \
        $message result = $comm_dequeue(thread->lane_comm, requestLane, tag); \
        $message_unpack(result, &resultVal, typeSize); \
    } else { \
        $havoc(&resultVal); \
    }
745
/* Generates one typed shuffle function named NAME for element type T
 * and warp-collective TAG, wrapping $CUDA_GENERIC_SHFL_BODY in the
 * intrinsic race-check protocol. */
#define $CUDA_DEFINE_SHFL(NAME, T, TAG) \
    T NAME(unsigned mask, T var, int laneParam, int width, $cuda_thread_data_t thread) { \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); \
        T resultVal; \
        int typeSize = sizeof(T); \
        $cuda_warp_tag tag = TAG; \
        \
        $CUDA_GENERIC_SHFL_BODY(); \
        \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); \
        return resultVal; \
    }
758
/* Instantiate the typed shuffle variants targeted by the _Generic
 * dispatch macros above: one function per (operation, element type). */
$CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync)
794
/* Front ends for the warp vote intrinsics; inject the implicit $thread
 * argument. */
#define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, $thread)
#define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $thread)
#define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $thread)
798
/* Common body of the warp vote/reduction intrinsics.  Expects in scope:
 * thread, tag, value (this lane's contribution), initialValue,
 * typeSize, result, operand.  The first lane to arrive becomes the
 * reducer (warp->reductionLane): it gathers each live lane's value (its
 * own directly, others via messages; dead lanes contribute
 * initialValue), folds them with result = COND ? T_REDUCTION :
 * F_REDUCTION per lane (loop index `i` is the lane), then broadcasts
 * the result back to the other live lanes.  Non-reducer lanes send
 * their value and wait for the broadcast. */
#define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \
    $cuda_warp_data_t warp = thread->warp; \
    int laneID = thread->laneID; \
    $comm comm = thread->lane_comm; \
    if (warp->reductionLane == -1) { \
        warp->reductionLane = laneID; \
        \
        result = initialValue; \
        for (int i = 0; i < warp->size; i++) { \
            if (i == laneID) { \
                operand = value; \
            } else { \
                $local_end(); \
                $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start(); \
                \
                if (!warp->alive[i]) { \
                    operand = initialValue; \
                } else { \
                    $local_end(); \
                    $message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \
                    $local_start(); \
                } \
            } \
            \
            if (COND) { \
                result = T_REDUCTION; \
            } else { \
                result = F_REDUCTION; \
            } \
        } \
        \
        warp->reductionLane = -1; \
        \
        for (int i = 0; i< warp->size; i++) { \
            if (i != laneID && warp->alive[i]) { \
                $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \
            } \
        } \
    } else { \
        int reductionLane = warp->reductionLane; \
        $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \
        $local_end(); \
        $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \
        $local_start(); \
    }

/* Unconditional variant: always applies REDUCTION. */
#define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result)
846
/* Mock of CUDA __all_sync(): returns 1 iff `value` is nonzero on every
 * live lane of the calling thread's warp, 0 otherwise.
 * NOTE(review): `mask` is ignored by the underlying reduction protocol.
 */
int $cuda__all_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync;
    int typeSize = sizeof(int);
    int initialValue = 1;   // identity of logical AND (also used for dead lanes)
    int result, operand;

    // result := AND over all lanes of (operand != 0)
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
861
/* Mock of CUDA __any_sync(): returns 1 iff `value` is nonzero on at least
 * one live lane of the calling thread's warp, 0 otherwise.
 * NOTE(review): `mask` is ignored by the underlying reduction protocol.
 */
int $cuda__any_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync;
    int typeSize = sizeof(int);
    int initialValue = 0;   // identity of logical OR (also used for dead lanes)
    int result, operand;

    // result := OR over all lanes of (operand != 0)
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 || operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
876
877unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
878 $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();
879
880 $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
881 int initialValue = 0;
882 int typeSize = sizeof(int);
883 unsigned result;
884 int operand;
885
886 $CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1);
887
888 $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();
889
890 return result;
891}
892
/* Kernel-visible CUDA atomic API.  Each macro dispatches on the pointer
 * type via _Generic to a type-specific CIVL implementation defined below,
 * appending the caller's $thread handle (in scope inside transformed
 * kernel bodies).  `default` handles the int* case.
 */

/* atomicAdd()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old + val), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicAdd(X,Y) _Generic(X, \
    default : $cuda_atomicAdd_int, \
    unsigned int* : $cuda_atomicAdd_uint, \
    unsigned long long int* : $cuda_atomicAdd_ullint, \
    float* : $cuda_atomicAdd_float, \
    double* : $cuda_atomicAdd_double) (X, Y, $thread)

/* atomicSub()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (old - val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction. The
 * function returns old.
 */
#define atomicSub(X,Y) _Generic(X, \
    default : $cuda_atomicSub_int, \
    unsigned int* : $cuda_atomicSub_uint) (X, Y, $thread)

/* atomicExch()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory and stores val back to memory at the same address. These two
 * operations are performed in one atomic transaction. The function returns old.
 */
#define atomicExch(X,Y) _Generic(X, \
    default : $cuda_atomicExch_int, \
    unsigned int* : $cuda_atomicExch_uint, \
    unsigned long long int* : $cuda_atomicExch_ullint, \
    float* : $cuda_atomicExch_float) (X, Y, $thread)

/* atomicMin()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the minimum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMin(X,Y) _Generic(X, \
    default : $cuda_atomicMin_int, \
    unsigned int* : $cuda_atomicMin_uint, \
    unsigned long long int* : $cuda_atomicMin_ullint) (X, Y, $thread)

/* atomicMax()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the maximum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMax(X,Y) _Generic(X, \
    default : $cuda_atomicMax_int, \
    unsigned int* : $cuda_atomicMax_uint, \
    unsigned long long int* : $cuda_atomicMax_ullint) (X, Y, $thread)

/* atomicInc()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicInc(address, val) $cuda_atomicInc(address, val, $thread)

/* atomicDec()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
 * the result back to memory at the same address. These three operations are
 * performed in one atomic transaction. The function returns old.
 */
#define atomicDec(address, val) $cuda_atomicDec(address, val, $thread)

/* atomicCAS()
 * reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old == compare ? val : old) , and stores the
 * result back to memory at the same address. These three operations are performed
 * in one atomic transaction. The function returns old (Compare And Swap).
 */
#define atomicCAS(address, compare, val) _Generic(address, \
    default : $cuda_atomicCAS_int, \
    unsigned int* : $cuda_atomicCAS_uint, \
    unsigned long long int* : $cuda_atomicCAS_ullint, \
    unsigned short int* : $cuda_atomicCAS_usint) (address, compare, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old & val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicAnd(address, val) _Generic(address, \
    default : $cuda_atomicAnd_int, \
    unsigned int* : $cuda_atomicAnd_uint, \
    unsigned long long int* : $cuda_atomicAnd_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old | val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicOr(address, val) _Generic(address, \
    default : $cuda_atomicOr_int, \
    unsigned int* : $cuda_atomicOr_uint, \
    unsigned long long int* : $cuda_atomicOr_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old ^ val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicXor(address, val) _Generic(address, \
    default : $cuda_atomicXor_int, \
    unsigned int* : $cuda_atomicXor_uint, \
    unsigned long long int* : $cuda_atomicXor_ullint) (address, val, $thread)
1009
/* Common prologue for every atomic mock: close the thread's race-checked
 * kernel region, $yield() (the atomic access is a separate, serializable
 * step), re-open the region, then snapshot the target word into `old`.
 * Declares `old` (of type T) in the enclosing function's scope; `address`
 * and `thread` must be in scope at the expansion site.
 */
#define $CUDA_ATOMIC_PRE_ACTION(T) \
    $cuda_complete_kernel_region(thread); \
    $yield(); \
    $cuda_start_kernel_region(thread); \
    T old = *address;

/* Common epilogue: close and immediately re-open the kernel region and
 * return the pre-update value snapshotted by the PRE action.
 */
#define $CUDA_ATOMIC_POST_ACTION() \
    $cuda_complete_kernel_region(thread); \
    $cuda_start_kernel_region(thread); \
    return old;
1020
/* Generator macros: each expands to a function that snapshots *address
 * into `old` (PRE action), applies its update, and returns the snapshot
 * (POST action).  One instantiation per supported element type.
 */

#define $CUDA_DEFINE_ATOMIC_ADD(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address += val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_int, int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_float, float)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_double, double)

#define $CUDA_DEFINE_ATOMIC_SUB(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address -= val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_int, int)
$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_uint, unsigned int)

#define $CUDA_DEFINE_ATOMIC_EXCH(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_int, int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_float, float)

/* Note: the no-change branch re-stores `old` — redundant but harmless. */
#define $CUDA_DEFINE_ATOMIC_MIN(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old <= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_int, int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_ullint, unsigned long long int)

/* Note: the no-change branch re-stores `old` — redundant but harmless. */
#define $CUDA_DEFINE_ATOMIC_MAX(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old >= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_int, int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_ullint, unsigned long long int)
1079
1080unsigned int $cuda_atomicInc(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
1081 $CUDA_ATOMIC_PRE_ACTION(unsigned int);
1082 if (old >= val) *address = 0;
1083 else *address = old + 1;
1084 $CUDA_ATOMIC_POST_ACTION()
1085}
1086
1087unsigned int $cuda_atomicDec(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
1088 $CUDA_ATOMIC_PRE_ACTION(unsigned int);
1089 if (old == 0 || old > val) *address = val;
1090 else *address = old - 1;
1091 $CUDA_ATOMIC_POST_ACTION()
1092}
1093
/* Compare-and-swap: store val only when old == compare; always return old. */
#define $CUDA_DEFINE_ATOMIC_CAS(NAME, T) \
    T NAME(T* address, T compare, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old == compare) *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_int, int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_usint, unsigned short int)

/* Bitwise AND: store (old & val), return old. */
#define $CUDA_DEFINE_ATOMIC_AND(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old & val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_int, int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_ullint, unsigned long long int)

/* Bitwise OR: store (old | val), return old. */
#define $CUDA_DEFINE_ATOMIC_OR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old | val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_int, int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_ullint, unsigned long long int)

/* Bitwise XOR: store (old ^ val), return old. */
#define $CUDA_DEFINE_ATOMIC_XOR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old ^ val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_int, int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_ullint, unsigned long long int)
1138
1139//////////////////////
1140// Global Variables //
1141//////////////////////
1142
// Global communicator with two places: host (place 0) and device (place 1).
$gcomm $cuda_gcomm = $gcomm_create($here, 2);
const int $CUDA_PLACE_HOST = 0;
const int $CUDA_PLACE_DEVICE = 1;
// Host-side endpoint of the host<->device channel (the device endpoint is
// created inside $cuda_main).
$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);

/**
 * Tags used for message-passing between host and device
 */
enum $cuda_tag {
    // Predefined tags
    $CUDA_TAG_TEARDOWN,
    $CUDA_TAG_SCOPE_REQUEST,
    $CUDA_TAG_cudaFree,
    $CUDA_TAG_cudaMemcpy,
    $CUDA_TAG_cudaMemcpyAsync,
    // Generated tags (by transformer)
    $CUDA_TAG_LAUNCH_kernel_1
};
1161
1162///////////////////
1163// CIVL-CUDA API //
1164///////////////////
1165
1166$scope $cuda_host_request_device_scope() {
1167 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
1168 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
1169 $scope result;
1170 $message_unpack(response, &result, sizeof($scope));
1171
1172 return result;
1173}
1174
/* Payload of a cudaMemcpy/cudaMemcpyAsync request shipped from host to
 * device: source/destination pointers, byte count, and transfer kind.
 */
typedef struct $cuda_memcpy_data {
    void* dst;
    const void* src;
    size_t count;
    cudaMemcpyKind kind;
} $cuda_memcpy_data;
1181
1182void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
1183 if (kind == cudaMemcpyHostToHost) {
1184 memcpy(dst, src, count);
1185 } else {
1186 $cuda_memcpy_data args;
1187 args.dst = dst;
1188 args.src = src;
1189 args.count = count;
1190 args.kind = kind;
1191
1192 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
1193
1194 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
1195 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
1196 }
1197}
1198
1199$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
1200 cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream));
1201 newStream->head = NULL;
1202 newStream->tail = NULL;
1203 newStream->numOps = 0;
1204 newStream->alive = true;
1205
1206 $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node));
1207 newHead->stream = newStream;
1208 newStream->containingNode = newHead;
1209 newHead->prev = NULL;
1210 newHead->next = NULL;
1211
1212 return newHead;
1213}
1214
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
/* Unlink `node` from the stream list and mark its stream dead.  The
 * stream struct itself is freed by a freshly spawned helper once the last
 * operation pending on the stream (stream->tail at destruction time, if
 * any) has terminated; that helper's $proc is returned so the caller can
 * $wait for full reclamation.
 * NOTE(review): the ACSL contract above claims assigns/reads \nothing
 * although the body mutates list links — confirm this is the intended
 * CIVL idiom for $atomic_f functions.
 */
$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
    $proc lastOpProc = $proc_null;
    cudaStream_t stream = node->stream;

    // Splice the node out of the doubly-linked stream list.
    if (node->prev != NULL) {
        node->prev->next = node->next;
    }
    if (node->next != NULL) {
        node->next->prev = node->prev;
    }
    free(node);

    stream->alive = false;
    if(stream->tail != NULL)
        lastOpProc = stream->tail->opState->op;

    // Deferred reclamation: wait for the stream's last pending op, then
    // free the stream struct.  (lastOpProc stays $proc_null when the
    // stream has no pending ops.)
    void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
        $wait(lastOpProc);
        free(stream);
    }

    return $spawn destroyStreamWhenComplete(lastOpProc, stream);
}
1242
/*@ depends_on \access(stream);
  @ assigns stream;
  @ reads \nothing;
  @*/
/* Append a new operation to `stream`.  The op runs in its own spawned
 * process (opProc); ops block on opState->start, which becomes true when
 * they reach the head of the stream — immediately if the stream was
 * empty.  Returns the spawned op's $proc handle.
 */
$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
    $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");

    $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state));
    newOpState->start = false;
    newOpState->op = $spawn opProc(opParams, newOpState, stream);

    $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node));
    newOpStateNode->opState = newOpState;
    newOpStateNode->next = NULL;

    if (stream->tail == NULL) {
        // Empty stream: new op is both head and tail, and may start now.
        stream->head = newOpStateNode;
        stream->tail = newOpStateNode;
        newOpState->start = true;
    } else {
        stream->tail->next = newOpStateNode;
        stream->tail = newOpStateNode;
    }
    stream->numOps++;

    return newOpState->op;
}
1270
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
/* Remove the head operation from `stream` (the op processes in this file
 * call this themselves when they finish) and, if another op is queued,
 * release it by setting its start flag.  Frees the op's state and node.
 * NOTE(review): the ACSL contract claims assigns \nothing although the
 * body mutates the stream — confirm intended CIVL semantics.
 */
$atomic_f void $stream_dequeue(cudaStream_t stream) {
    $assert(stream->head != NULL, "Attempt to dequeue an empty stream");

    if (stream->head == stream->tail) {
        stream->tail = NULL;
    }

    $cuda_op_state_node_t oldHead = stream->head;
    stream->head = oldHead->next;
    if (stream->head != NULL) {
        // Wake the next queued op.
        stream->head->opState->start = true;
    }

    stream->numOps--;
    free(oldHead->opState);
    free(oldHead);
}
1292
1293// Helper function
1294int $dim3_index(dim3 size, uint3 location) {
1295 return location.x + size.x * (location.y + size.y * location.z);
1296}
1297
1298// Helper function
1299int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
1300 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
1301}
1302
/* Spawn one process per point of the 3-D grid `dim`, invoking
 * spawningFunction with that point's coordinates, then wait for all of
 * them to terminate.  Used for both the per-block and per-thread levels
 * of a kernel launch.
 */
void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
    //TODO: calculate length and index, replace this function in the kernel
    $local_start();
    int length = dim.x * dim.y * dim.z;
    $proc procArray[length];
    $range rx = 0 .. dim.x - 1;
    $range ry = 0 .. dim.y - 1;
    $range rz = 0 .. dim.z - 1;
    $domain(3) dom = ($domain(3)){rx, ry, rz};
    $for(int x,y,z : dom){
        uint3 id = { x, y, z };
        int index = $dim3_index(dim, id);
        procArray[index] = $spawn spawningFunction(id);
    }
    $local_end();
    $waitall(procArray,length);
}
1320
[9cabba4]1321
[9dbe9864]1322// CUDA Ops //
[9cabba4]1323
[9dbe9864]1324void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
1325
1326 $when(opState->start);
1327 $cuda_memcpy_data args;
1328 $message_unpack(m, &args, sizeof($cuda_memcpy_data));
1329
1330 if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
1331 args.dst = $reveal(args.dst);
1332 }
1333 if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
1334 args.src = $reveal(args.src);
1335 }
1336 memcpy(args.dst, args.src, args.count);
1337
1338 $stream_dequeue(stream);
1339}
[9cabba4]1340
/* Device-side front end for cudaMemcpy/cudaMemcpyAsync requests: enqueue a
 * $cuda_memcpy_proc op on `stream` and build the acknowledgement message
 * for the host.  A synchronous, non device-to-device copy is awaited
 * before acknowledging.
 * NOTE(review): synchronous device-to-device copies are NOT awaited here —
 * presumably to keep the device loop responsive; confirm this matches the
 * intended cudaMemcpy blocking semantics.
 */
$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
    $cuda_memcpy_data args;
    $message_unpack(request, &args, sizeof($cuda_memcpy_data));

    $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);

    if (!async && args.kind != cudaMemcpyDeviceToDevice) {
        $wait(memcpyProc);
    }
    int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

    return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
}
[9cabba4]1354
[9dbe9864]1355$message $cuda_free($message request) {
1356 void* devPtr;
1357 $message_unpack(request, &devPtr, sizeof(void*));
1358 free($reveal(devPtr));
[9cabba4]1359
[9dbe9864]1360 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
[9cabba4]1361}
1362
1363
1364
[9dbe9864]1365////////////////////////////////////////////
1366// CUDA API Functions (For Host-use Only) //
1367////////////////////////////////////////////
[9cabba4]1368
/* Host API: ship the device pointer to the device process for release and
 * wait for the acknowledgement.  Always reports cudaSuccess.
 */
cudaError_t cudaFree(void* devPtr) {
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
    $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);

    return cudaSuccess;
}
[9cabba4]1375
/* Host API: synchronous memcpy; delegates to $cuda_host_memcpy with
 * async == false.  Always reports cudaSuccess.
 */
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
    $cuda_host_memcpy(dst, src, count, kind, false);
    return cudaSuccess;
}
[9cabba4]1380
/* Host API: asynchronous memcpy; delegates to $cuda_host_memcpy with
 * async == true.
 * NOTE(review): the `stream` argument is not forwarded — the device loop
 * always targets the default stream; confirm once non-default streams are
 * supported.
 */
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
        cudaMemcpyKind kind, cudaStream_t stream) {
    $cuda_host_memcpy(dst, src, count, kind, true);
    return cudaSuccess;
}
1386
1387/*
[9dbe9864]1388cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
1389 // Create new stream node in linked list
1390 $cuda_stream_node_t newHead = $create_new_stream_node();
1391 newHead->next = $cuda_global_context.head;
1392 $cuda_global_context.head->prev = newHead;
1393
1394 // Update cuda context's head to be the new node we created
1395 $cuda_global_context.head = newHead;
1396 $cuda_global_context.numStreams++;
1397
1398 return cudaSuccess;
1399}
1400*/
[9cabba4]1401
[9dbe9864]1402/*
1403cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
1404 stream = $default_stream_if_null(stream);
1405 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
1406 $when(stream->head == NULL) return cudaSuccess;
1407}
1408*/
[9cabba4]1409
[9dbe9864]1410/*
1411cudaError_t cudaStreamDestroy(cudaStream_t stream) {
1412 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
1413 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
1414 $destroy_stream_node(stream->containingNode);
1415 return cudaSuccess;
1416}
1417*/
[9cabba4]1418
[9dbe9864]1419/*
1420cudaError_t cudaDeviceSynchronize() {
1421 $proc* opsToWaitOn;
1422 int numOps = 0;
1423
1424 $atomic {
1425 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
1426
1427 for ($cuda_stream_node_t node = $cuda_global_context.head;
1428 node != NULL;
1429 node = node->next) {
1430 if (node->stream->tail != NULL) {
1431 opsToWaitOn[numOps] = node->stream->tail->opState->op;
1432 numOps++;
[9cabba4]1433 }
1434 }
1435 }
[9dbe9864]1436 $waitall(opsToWaitOn, numOps);
1437
1438 return cudaSuccess;
[9cabba4]1439}
1440*/
1441
[a67c590]1442//////////////////////////////////
1443// Generated code from kernel_1 //
1444//////////////////////////////////
1445
1446typedef struct {
1447 dim3 gridDim;
1448 dim3 blockDim;
1449 size_t $cudaMemSize;
1450 cudaStream_t $cudaStream;
[9cabba4]1451 float* A;
[a67c590]1452 const float* B;
1453 float* C;
1454 int numElements;
[9dbe9864]1455} $cuda_kernel_1_params;
[a67c590]1456
/* Convert kernel_1's pointer arguments from their host-visible hidden
 * form (created with $hide at allocation) back into real device pointers.
 */
void $cuda_reveal_kernel_1_args($cuda_kernel_1_params* args) {
    args->A = $reveal(args->A);
    args->B = $reveal(args->B);
    args->C = $reveal(args->C);
}
1462
[a67c590]1463void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
1464 size_t $cudaMemSize, cudaStream_t $cudaStream,
[9cabba4]1465 float* A, const float* B, float* C, int numElements) {
[9dbe9864]1466 $cuda_kernel_1_params args;
[0ffc6c8]1467 args.gridDim = gridDim;
1468 args.blockDim = blockDim;
1469 args.$cudaMemSize = $cudaMemSize;
1470 args.$cudaStream = $cudaStream;
1471 args.A = A;
1472 args.B = B;
1473 args.C = C;
1474 args.numElements = numElements;
1475
[9dbe9864]1476 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_params)));
[0ffc6c8]1477 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
[109d05e]1478}
1479
/* Transformed body of kernel_1: a warp-shuffle sum reduction of A into C.
 * Structure: one process per block ($cuda_block), each spawning one
 * process per thread ($cuda_thread) via $cuda_run_and_wait_on_procs.
 * Thread (0,0) of block 0 zeroes C first (cudaMemset stand-in); C[j] then
 * doubles as a per-block "done" flag (incremented by each block's thread
 * 0) before thread (0,0) of block 0 combines the per-block partial sums
 * into C[0].  Two earlier kernel variants are retained in comments.
 */
void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
        float *A, const float *B, float *C, int numElements) {
    $cuda_kernel_data_t $kernel = $create_cuda_kernel_data($here, gridDim, blockDim);
    void $cuda_block(uint3 blockIdx) {
        void $cuda_thread(uint3 threadIdx) {
            $local_start();
            // cudaMemSet currently not supported so this is small hack to initialize C ahead of time
            if (blockIdx.x == 0 && threadIdx.x == 0) {
                for (int i = 0; i < gridDim.x; i++) {
                    C[i] = 0;
                }
            }
            //$clear_mem_sets($kernel, _cuda_kid);
            int $cuda_tid = $dim3_index(blockDim, threadIdx);
            int $cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
            // Per-thread CIVL data: block id, warp id within block, lane id within warp.
            $cuda_thread_data_t $thread = $create_cuda_thread_data($here, $kernel, $cuda_kid/(blockDim.x * blockDim.y * blockDim.z), $cuda_tid/warpSize, $cuda_tid % warpSize);

            // Kernel REDUCTION start
            /*
            int lane = threadIdx.x % warpSize;
            int thisWarpSize = warpSize;
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;
            int warpStart = i - lane;
            //printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
            int remainingElements = numElements;

            while (remainingElements > 1) {
                //printf("%d,%d - remainingElements: %d - numElements: %d\n", blockIdx.x, threadIdx.x, remainingElements, numElements);
                if (remainingElements < numElements) {
                    // __syncThreads()
                    //printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x);

                    $cuda__syncthreads($thread, 0);
                    //printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x);
                }

                if (warpStart + 1 < remainingElements) {
                    float val = i < numElements ? A[i] : 0;
                    //printf("%d,%d - val: %d\n", blockIdx.x, threadIdx.x, val);

                    for (int offset = warpSize/2; offset > 0; offset /= 2) {
                        //__syncwarp();
                        float tmp = __shfl_down_sync(0, val, offset);
                        if (lane + offset < thisWarpSize) {
                            val += tmp;
                        }
                        //printf("%d,%d - offset: %d - val: %d\n", blockIdx.x, threadIdx.x, offset, val);
                    }

                    if (i < numElements) {
                        A[i] = val;
                    }
                }

                i *= warpSize;
                //warpStart *= warpSize;
                remainingElements = ((remainingElements - 1) / warpSize) + 1;
            }

            if (i == 0) {
                *C = A[0];
            }
            */
            // Kernel REDUCTION end

            // Kernel REDUCTION 2 start

            int lane = threadIdx.x % warpSize;
            // Size of this thread's warp (the last warp of a block may be partial).
            int thisWarpSize = warpSize;
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;
            int warpStart = i - lane;

            // Warp-level tree sum via shuffles: lane 0 of each warp ends up
            // holding the warp's partial sum in A[warpStart].
            if (warpStart + 1 < numElements) {
                float val = i < numElements ? A[i] : 0;
                for (int offset = warpSize/2; offset > 0; offset /= 2) {
                    float tmp = __shfl_down_sync(0, val, offset);
                    //float tmp = i + offset < numElements ? A[i + offset] : 0;
                    if (lane + offset < thisWarpSize) {
                        val += tmp;
                    }
                }

                if (i < numElements) {
                    A[i] = val;
                }
            }

            $cuda__syncthreads($thread, 0);
            // Thread 0 of each block folds its block's per-warp partials into
            // A[blockDim.x * blockIdx.x], then raises the block's done flag in
            // C[blockIdx.x].
            if (threadIdx.x == 0) {
                int blockEnd = blockDim.x * (blockIdx.x + 1);
                if (blockEnd > numElements) {
                    blockEnd = numElements;
                }
                for (int j = i + warpSize; j < blockEnd; j += warpSize) {
                    A[i] += A[j];
                }
                atomicAdd(C + blockIdx.x, 1);
            }

            // Thread (0,0) spins on each block's flag (atomicAdd of 0 is an
            // atomic read), then accumulates the per-block partials into C[0].
            if (i == 0) {
                C[0] = A[0];
                for (int j = 1; j < gridDim.x; j++) {
                    while(atomicAdd(C+j,0) == 0) {}
                    C[0] += A[j * blockDim.x];
                }
            }
            // Kernel REDUCTION 2 end

            // Kernel BALLOT TEST start

            /*
            int i = threadIdx.x;
            if (i < numElements) {
                int result = __ballot_sync(~0, A[i] > 0);
                if (i == 0) {
                    printf("Result: %d\n", result);
                    *C = 0;
                    while(result > 0) {
                        if (result % 2)
                            *C += 1;
                        result /= 2;
                    }
                    printf("done calculating result\n");
                }
            }
            */
            // Kernel BALLOT TEST end
            //$check_data_race($kernel, _cuda_kid);
            //$read_set_pop();
            //$write_set_pop();
            $destroy_cuda_thread_data($thread);
            $local_end();
        }
        $cuda_run_and_wait_on_procs(blockDim, $cuda_thread);
    }
    $cuda_run_and_wait_on_procs(gridDim, $cuda_block);
    $destroy_cuda_kernel_data($kernel);
}
1626
/* Stream op wrapping a kernel_1 launch: wait until this op reaches the
 * head of its stream, unpack and reveal the kernel arguments, run the
 * kernel, then pop this op off the stream.
 */
void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
    $when(opState->start);

    $cuda_kernel_1_params args;
    $message_unpack(request, &args, sizeof($cuda_kernel_1_params));
    $cuda_reveal_kernel_1_args(&args);

    $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
    $stream_dequeue(cudaStream);
}
1637
[6dd0322]1638/////////////////
1639// CUDA "file" //
1640/////////////////
1641
/* Device process: owns the device scope, the device comm endpoint, and the
 * stream list (with the default stream).  Runs a request/reply loop that
 * services host messages until a TEARDOWN request arrives, at which point
 * it destroys the default stream, waits for reclamation, and exits without
 * replying.
 */
void $cuda_main() {

    // Device Variables

    $scope $cuda_scope = $here;

    $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
    $cuda_context $cuda_global_context;
    cudaStream_t $cuda_default_stream;

    // Helper function to get the default stream if passed NULL, and just returns stream otherwise
    // Currently unused until we support streams other than the default one.
    cudaStream_t $default_stream_if_null(cudaStream_t stream) {
        return stream == NULL ? $cuda_default_stream : stream;
    }

    // Device Logic

    $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
    $cuda_default_stream = defaultStreamNode->stream;

    $cuda_global_context.head = defaultStreamNode;
    $cuda_global_context.numStreams = 1;

    // Request/reply server loop: one reply is enqueued per request,
    // except for TEARDOWN which terminates the loop.
    while (true) {
        $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
        $message response;
        const int tag = $message_tag(request);

        switch(tag) {
        case $CUDA_TAG_SCOPE_REQUEST :
            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
            break;
        case $CUDA_TAG_cudaFree :
            response = $cuda_free(request);
            break;
        case $CUDA_TAG_cudaMemcpy :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
            break;
        case $CUDA_TAG_cudaMemcpyAsync :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
            break;
        case $CUDA_TAG_LAUNCH_kernel_1 :
            // Kernel launches are acknowledged immediately; the kernel runs
            // as a stream op.
            $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);

            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
            break;
        case $CUDA_TAG_TEARDOWN : {
            $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
            $wait(destructor);
            $comm_destroy($cuda_device_comm);
            return;
        }
        default :
            $assert(false, "Unknown CUDA request");
        }

        $comm_enqueue($cuda_device_comm, response);
    }
}
1702
1703///////////////
1704// Host file //
1705///////////////
1706
// Verification inputs: problem size N, threads per block (default: about
// half of N), and the symbolic input arrays.
$input int N = 16;
$input int threadsPerBlock = N%2 == 0 ? N/2 : (N+1)/2;
//$input int threadsPerBlock = N;
$input float A[N];
// Currently unused but left in to save time
$input float B[N];
1713
/* Host process: allocates device buffers (cudaMalloc stand-in: request the
 * device scope, $malloc there, and $hide the pointer), copies the inputs
 * over, launches kernel_1, and checks that the reduction result equals the
 * sequential sum of A.
 */
void $host_main() {
    int size = N * sizeof(float);
    int numBlocks = (N-1)/threadsPerBlock + 1;   // ceiling division
    //int numBlocks = 1;

    float* cuda_A;
    // cudaMalloc((void **)&cuda_A, size);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_A = $hide((float*)$malloc(deviceScope, size));
    }
    cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);

    float* cuda_B;
    // cudaMalloc((void **)&cuda_B, size);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_B = $hide((float*)$malloc(deviceScope, size));
    }
    cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);

    float* cuda_C;
    // cudaMalloc((void **)&cuda_C, sizeof(float) * numBlocks);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float) * numBlocks));
    }

    dim3 gridDim = {numBlocks, 1, 1};
    dim3 blockDim = {threadsPerBlock, 1, 1};
    // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
    $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);

    // Checking correctness
    float* C = (float *)malloc(size);

    // Only C[0] (the total) is checked, so one float is copied back.
    cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);

    // REDUCTION ASSERTION

    float sum = 0;
    for(int i = 0; i < N; i++)
        sum += A[i];

    $assert(*C == sum);

    // BALLOT ASSERTION
    /*
    float count = 0;
    for (int i = 0; i < N; i++) {
        if (A[i] > 0)
            count++;
    }
    $assert(*C == count);
    */

    free(C);

    cudaFree(cuda_A);
    cudaFree(cuda_B);
    cudaFree(cuda_C);

}
1777
/* Entry point: run the host program and the device process concurrently.
 * After the host finishes, send TEARDOWN to the device, tear down the
 * host comm, wait for the device to exit, and destroy the global comm.
 */
int main() {
    $proc host = $spawn $host_main();
    $proc cuda = $spawn $cuda_main();
    $wait(host);
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
    $comm_destroy($cuda_host_comm);
    $wait(cuda);
    $gcomm_destroy($cuda_gcomm, NULL);
}
Note: See TracBrowser for help on using the repository browser.