#include #include #include #include #include #include #include #pragma CIVL ACSL /////////// // Types // /////////// enum cudaError { cudaSuccess }; typedef enum cudaError cudaError_t; //typedef $cuda_memcpy_kind cudaMemcpyKind; typedef struct { unsigned int x, y, z; } dim3; /* used to represent a location in a three dimensional grid */ typedef struct { unsigned int x, y, z; } uint3; typedef enum { cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, cudaMemcpyDefault } cudaMemcpyKind; typedef struct $cuda_op_state* $cuda_op_state_t; typedef struct $cuda_op_state { _Bool start; $proc op; } $cuda_op_state; typedef struct $cuda_op_state_node* $cuda_op_state_node_t; typedef struct $cuda_op_state_node { $cuda_op_state_t opState; $cuda_op_state_node_t next; } $cuda_op_state_node; typedef struct $cuda_stream_node* $cuda_stream_node_t; typedef struct $cuda_stream* $cuda_stream_t; typedef $cuda_stream_t cudaStream_t; typedef struct $cuda_stream { $cuda_op_state_node_t head; $cuda_op_state_node_t tail; int numOps; $cuda_stream_node_t containingNode; _Bool alive; } $cuda_stream; typedef struct $cuda_stream_node{ cudaStream_t stream; $cuda_stream_node_t prev; $cuda_stream_node_t next; } $cuda_stream_node; typedef struct $cuda_context* $cuda_context_t; typedef struct $cuda_context { $cuda_stream_node_t head; int numStreams; } $cuda_context; typedef struct $cuda_mem_set* $cuda_mem_set_t; typedef struct $cuda_mem_set { $mem reads; $mem writes; } $cuda_mem_set; void $cuda_mem_set_clear($cuda_mem_set_t memSet) { $mem emptyset = $mem_empty(); memSet->reads = emptyset; memSet->writes = emptyset; } $cuda_mem_set_t $cuda_mem_set_create($scope scope) { $cuda_mem_set_t newMemSet = ($cuda_mem_set_t) $malloc(scope, sizeof($cuda_mem_set)); $cuda_mem_set_clear(newMemSet); return newMemSet; } void $cuda_mem_set_destroy($cuda_mem_set_t memSet) { free(memSet); } void $cuda_mem_set_add($cuda_mem_set_t memSet, $mem reads, $mem writes) { memSet->reads = 
$mem_union(memSet->reads, reads); memSet->writes = $mem_union(memSet->writes, writes); } void $cuda_check_mems_disjoint($cuda_mem_set_t m1, int b1, int w1, int l1, $cuda_mem_set_t m2, int b2, int w2, int l2) { $mem out1 = $mem_empty(); $mem out2 = $mem_empty(); $assert($mem_no_intersect(m1->reads, m2->writes, &out1, &out2), "Data-race detected: %p read by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n", out1, b1, w1, l1, out2, b2, w2, l2); $assert($mem_no_intersect(m1->writes, m2->reads, &out1, &out2), "Data-race detected: %p written by thread <%d, %d, %d> intersects %p read by thread <%d, %d, %d>\n", out1, b1, w1, l1, out2, b2, w2, l2); $assert($mem_no_intersect(m1->writes, m2->writes, &out1, &out2), "Data-race detected: %p written by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n", out1, b1, w1, l1, out2, b2, w2, l2); } typedef enum { $CUDA_WARP_TAG_EMPTY, $CUDA_WARP_TAG_warpsync, $CUDA_WARP_TAG_shfl_sync, $CUDA_WARP_TAG_shfl_up_sync, $CUDA_WARP_TAG_shfl_down_sync, $CUDA_WARP_TAG_shfl_xor_sync, $CUDA_WARP_TAG_all_sync, $CUDA_WARP_TAG_any_sync, $CUDA_WARP_TAG_ballot_sync } $cuda_warp_tag; $input int warpSize = 32; typedef struct $cuda_warp_data* $cuda_warp_data_t; typedef struct $cuda_warp_data { int size; $cuda_mem_set_t** memSets; _Bool patching[]; int num_alive; _Bool alive[]; int num_in_barrier; _Bool in_barrier[]; $cuda_warp_tag currOp; int reductionLane; $gcomm gcomm; } $cuda_warp_data; /*@ depends_on \access(warp); @ executes_when \true; @ */ $atomic_f void $cuda_warp_barrier_update($cuda_warp_data_t warp) { if (warp->num_in_barrier == warp->num_alive) { warp->num_in_barrier = 0; warp->currOp = $CUDA_WARP_TAG_EMPTY; for (int i = 0; i < warp->size; i++) { warp->in_barrier[i] = $false; $cuda_mem_set_clear(warp->memSets[i][i]); } } } $cuda_warp_data_t $create_cuda_warp_data($scope warpScope, int size) { $cuda_warp_data_t newWarp = ($cuda_warp_data_t) malloc(sizeof($cuda_warp_data)); newWarp->size = size; newWarp->memSets 
= ($cuda_mem_set_t**) $malloc(warpScope, size * sizeof($cuda_mem_set_t*)); for (int i = 0; i < size; i++) { newWarp->memSets[i] = ($cuda_mem_set_t*) $malloc(warpScope, size * sizeof($cuda_mem_set_t)); for (int j = 0; j < size; j++) { newWarp->memSets[i][j] = $cuda_mem_set_create(warpScope); } } newWarp->patching = (_Bool[size])$lambda(int i) $false; newWarp->num_alive = size; newWarp->alive = (_Bool[size])$lambda(int i) $true; newWarp->num_in_barrier = 0; newWarp->in_barrier = (_Bool[size])$lambda(int i) $false; newWarp->currOp = $CUDA_WARP_TAG_EMPTY; newWarp->reductionLane = -1; newWarp->gcomm = $gcomm_create(warpScope, size); return newWarp; } void $destroy_cuda_warp_data($cuda_warp_data_t warp) { $assert(warp != NULL, "Attempt to destroy a NULL warp"); for (int i = 0; i < warp->size; i++) { for (int j = 0; j < warp->size; j++) { $cuda_mem_set_destroy(warp->memSets[i][j]); } free(warp->memSets[i]); } free(warp->memSets); $gcomm_destroy(warp->gcomm, NULL); free(warp); } typedef struct $cuda_block_data* $cuda_block_data_t; typedef struct $cuda_block_data { int size; $cuda_mem_set_t* memSets; int predArray[]; $gbarrier gbarrier; int currBarrierID; int numWarps; $cuda_warp_data_t* warps; } $cuda_block_data; $cuda_block_data_t $create_cuda_block_data($scope scope, int size) { $cuda_block_data_t newBlock = ($cuda_block_data_t) $malloc(scope, sizeof($cuda_block_data)); newBlock->size = size; newBlock->numWarps = (size - 1)/warpSize + 1; newBlock->predArray = (int[size]) $lambda(int i) 0; newBlock->gbarrier = $gbarrier_create(scope, size); newBlock->currBarrierID = -1; newBlock->memSets = ($cuda_mem_set_t*) $malloc(scope, size * sizeof($cuda_mem_set_t)); for (int i = 0; i < size; i++) { newBlock->memSets[i] = $cuda_mem_set_create(scope); } newBlock->warps = ($cuda_warp_data_t*) $malloc(scope, newBlock->numWarps * sizeof($cuda_warp_data_t)); for (int i = 0; i < newBlock->numWarps - 1; i++) { newBlock->warps[i] = $create_cuda_warp_data(scope, warpSize); } int lastIndex = 
newBlock->numWarps - 1; newBlock->warps[lastIndex] = $create_cuda_warp_data(scope, ((size - 1) % warpSize) + 1); return newBlock; } void $destroy_cuda_block_data($cuda_block_data_t block) { for (int i = 0; i < block->size; i++) { $cuda_mem_set_destroy(block->memSets[i]); } free(block->memSets); for (int i = 0; i < block->numWarps; i++) { $destroy_cuda_warp_data(block->warps[i]); } free(block->warps); $gbarrier_destroy(block->gbarrier); free(block); } typedef struct $cuda_kernel_data* $cuda_kernel_data_t; typedef struct $cuda_kernel_data { int size; $cuda_mem_set_t* memSets; int numBlocks; $cuda_block_data_t* blocks; } $cuda_kernel_data; $cuda_kernel_data_t $create_cuda_kernel_data($scope scope, dim3 gridDim, dim3 blockDim){ int numBlocks = (gridDim.x * gridDim.y) * gridDim.z; int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z; $cuda_kernel_data_t newKernel = ($cuda_kernel_data_t)$malloc(scope, sizeof($cuda_kernel_data)); newKernel->size = threadsPerBlock * numBlocks; newKernel->numBlocks = numBlocks; newKernel->memSets = ($cuda_mem_set_t*) $malloc(scope, newKernel->size * sizeof($cuda_mem_set_t)); for (int i = 0; i < newKernel->size; i++) { newKernel->memSets[i] = $cuda_mem_set_create(scope); } newKernel->blocks = ($cuda_block_data_t*) $malloc(scope, newKernel->numBlocks * sizeof($cuda_block_data_t)); for (int i = 0; i < newKernel->numBlocks; i++) { newKernel->blocks[i] = $create_cuda_block_data(scope, threadsPerBlock); } return newKernel; } void $destroy_cuda_kernel_data($cuda_kernel_data_t kernel){ for (int i = 0; i < kernel->size; i++) { $cuda_mem_set_destroy(kernel->memSets[i]); } free(kernel->memSets); for (int i = 0; i < kernel->numBlocks; i++) { $destroy_cuda_block_data(kernel->blocks[i]); } free(kernel->blocks); free(kernel); } typedef struct $cuda_thread_data* $cuda_thread_data_t; typedef struct $cuda_thread_data { $cuda_kernel_data_t kernel; $cuda_block_data_t block; $cuda_warp_data_t warp; int blockID; int warpID; int laneID; $comm lane_comm; 
$barrier block_barrier; } $cuda_thread_data; /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_check_warp_data_race($cuda_thread_data_t thread) { int blockID = thread->blockID; int warpID = thread->warpID; int laneID = thread->laneID; $cuda_mem_set_t** warpMems = thread->warp->memSets; for (int i = 0; i < thread->warp->size; i++) { if (i != laneID) { $cuda_check_mems_disjoint(warpMems[laneID][laneID], blockID, warpID, laneID, warpMems[i][i], blockID, warpID, i); } } } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_check_block_data_race($cuda_thread_data_t thread) { int blockID = thread->blockID; int warpID = thread->warpID; int laneID = thread->laneID; int indexInBlock = warpID * warpSize + laneID; $cuda_mem_set_t* blockMems = thread->block->memSets; for (int i = 0; i < thread->block->numWarps; i++) { if (i != warpID) { int currWarpSize = thread->block->warps[i]->size; for (int j = 0; j < currWarpSize; j++) { $cuda_check_mems_disjoint(blockMems[indexInBlock], blockID, warpID, laneID, blockMems[i * warpSize + j], blockID, i, j); } } } } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_check_kernel_data_race($cuda_thread_data_t thread) { int blockID = thread->blockID; int warpID = thread->warpID; int laneID = thread->laneID; int threadsPerBlock = thread->block->size; int indexInKernel = blockID * threadsPerBlock + warpID * warpSize + laneID; $cuda_mem_set_t* kernelMems = thread->kernel->memSets; //printf("%d, %d, %d - Checking kernel data race. 
km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, indexInKernel); for (int i = 0; i < thread->kernel->numBlocks; i++) { if (i != blockID) { $cuda_block_data_t block = thread->kernel->blocks[i]; for (int j = 0; j < block->numWarps; j++) { int currWarpSize = block->warps[j]->size; for (int k = 0; k < currWarpSize; k++) { //printf("%d, %d, %d - Checking kdr against <%d,%d,%d> with km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, i,j,k, i * threadsPerBlock + j * warpSize + k); $cuda_check_mems_disjoint(kernelMems[indexInKernel], blockID, warpID, laneID, kernelMems[i * threadsPerBlock + j * warpSize + k], i, j, k); } } } } } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_start_mem_patching($cuda_thread_data_t thread) { thread->warp->patching[thread->laneID] = $true; } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_update_patches($cuda_thread_data_t thread) { $cuda_warp_data_t warp = thread->warp; int lane = thread->laneID; for (int i = 0; i < warp->size; i++) { if (warp->patching[i]) { $cuda_mem_set_add(warp->memSets[i][lane], warp->memSets[lane][lane]->reads, warp->memSets[lane][lane]->writes); } } } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_end_mem_patching($cuda_thread_data_t thread) { if (thread->warp->patching[thread->laneID]) { int size = thread->warp->size; int blockID = thread->blockID; int warpID = thread->warpID; int laneID = thread->laneID; $cuda_mem_set_t* threadMems = thread->warp->memSets[laneID]; thread->warp->patching[laneID] = $false; for (int i = 0; i < size; i++) { if (i != laneID) { $cuda_check_mems_disjoint(threadMems[laneID], blockID, warpID, laneID, threadMems[i], blockID, warpID, i); $cuda_mem_set_clear(threadMems[i]); } } } } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_start_kernel_region($cuda_thread_data_t thread) { int lane = thread->laneID; //printf("%d, %d, %d - 
Starting region.\n\n", thread->blockID, thread->warpID, thread->laneID); $cuda_update_patches(thread); $cuda_mem_set_clear(thread->warp->memSets[lane][lane]); $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + thread->laneID]); $cuda_mem_set_clear(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID]); //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID]; //printf("%d, %d, %d - kernel mem set after clear:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes); $read_set_push(); $write_set_push(); } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_complete_warp_region($cuda_thread_data_t thread) { $mem reads = $read_set_pop(); $mem writes = $write_set_pop(); int lane = thread->laneID; $cuda_mem_set_t laneMem = thread->warp->memSets[lane][lane]; //printf("%d, %d, %d - completing warp region.\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, reads, writes); $cuda_mem_set_add(laneMem, reads, writes); $cuda_mem_set_add(thread->block->memSets[thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes); $cuda_mem_set_add(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes); //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID]; //printf("%d, %d, %d - kernel mem set after add:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes); $cuda_end_mem_patching(thread); $cuda_check_warp_data_race(thread); } /*@ depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_complete_block_region($cuda_thread_data_t thread) { //printf("%d, %d, %d - completing block region\n\n", thread->blockID, thread->warpID, thread->laneID); $cuda_complete_warp_region(thread); $cuda_check_block_data_race(thread); } /*@ 
depends_on \access(thread); @ executes_when \true; @*/ $atomic_f void $cuda_complete_kernel_region($cuda_thread_data_t thread) { //printf("%d, %d, %d - completing kernel region\n\n", thread->blockID, thread->warpID, thread->laneID); $cuda_complete_block_region(thread); $cuda_check_kernel_data_race(thread); } $cuda_thread_data_t $create_cuda_thread_data($scope scope, $cuda_kernel_data_t kernel, int blockID, int warpID, int laneID) { $cuda_thread_data_t newThread = ($cuda_thread_data_t) $malloc(scope, sizeof($cuda_thread_data)); newThread->kernel = kernel; newThread->block = kernel->blocks[blockID]; newThread->warp = newThread->block->warps[warpID]; newThread->blockID = blockID; newThread->warpID = warpID; newThread->laneID = laneID; newThread->lane_comm = $comm_create(scope, kernel->blocks[blockID]->warps[warpID]->gcomm, laneID); newThread->block_barrier = $barrier_create(scope, kernel->blocks[blockID]->gbarrier, warpID * warpSize + laneID); $read_set_push(); $write_set_push(); return newThread; } /*@ depends_on \access(thread); @ executes_when \true; @ */ $atomic_f void $destroy_cuda_thread_data($cuda_thread_data_t thread) { $assert(thread != NULL, "Attempt to destroy NULL cuda thread"); $cuda_complete_kernel_region(thread); $cuda_warp_data_t warp = thread->warp; warp->alive[thread->laneID] = $false; warp->num_alive--; $cuda_warp_barrier_update(warp); $barrier_destroy(thread->block_barrier); $comm_destroy(thread->lane_comm); free(thread); } /*@ depends_on \access(thread); @ executes_when \true; @ */ $atomic_f void $cuda_warp_barrier_enter($cuda_thread_data_t thread, $cuda_warp_tag tag) { $cuda_warp_data_t warp = thread->warp; $assert(!warp->in_barrier[thread->laneID]); $assert(warp->currOp == tag || warp->currOp == $CUDA_WARP_TAG_EMPTY); warp->in_barrier[thread->laneID] = $true; warp->currOp = tag; warp->num_in_barrier++; $cuda_warp_barrier_update(warp); } // Seems this needs to be atomic to work. Why? 
/*@ depends_on \access(thread); @*/ $atomic_f void $cuda_warp_barrier_exit($cuda_thread_data_t thread) { $when(!thread->warp->in_barrier[thread->laneID]); } void $cuda_warp_barrier_call($cuda_thread_data_t thread, $cuda_warp_tag tag) { $cuda_warp_barrier_enter(thread, tag); $cuda_warp_barrier_exit(thread); } void $cuda__syncthreads($cuda_thread_data_t thread, int barrierIndex) { $cuda_complete_block_region(thread); int indexInBlock = thread->warpID * warpSize + thread->laneID; if (thread->block->currBarrierID == -1) { thread->block->currBarrierID = barrierIndex; } $assert(thread->block->currBarrierID == barrierIndex, "Divergent calls to __syncthreads."); $local_end(); $barrier_call(thread->block_barrier); $local_start(); int lane = thread->laneID; $cuda_mem_set_clear(thread->warp->memSets[lane][lane]); $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + lane]); if (indexInBlock == 0) { thread->block->currBarrierID = -1; } $local_end(); $barrier_call(thread->block_barrier); $local_start(); $read_set_push(); $write_set_push(); } #define $CUDA_DEFINE_SYNCTHREADS_VARIANT(NAME, INIT, REDUCTION) \ int NAME(int predicate, $cuda_thread_data_t thread, int barrierIndex) { \ $cuda_complete_block_region(thread); \ int indexInBlock = thread->warpID * warpSize + thread->laneID; \ \ if (thread->block->currBarrierID == -1) { \ thread->block->currBarrierID = barrierIndex; \ } \ $assert(thread->block->currBarrierID == barrierIndex, \ "Divergent calls to NAME."); \ \ if (predicate) { \ thread->block->predArray[indexInBlock] = 1; \ } else { \ thread->block->predArray[indexInBlock] = 0; \ } \ \ $local_end(); \ $barrier_call(thread->block_barrier); \ $local_start(); \ int lane = thread->laneID; \ \ $cuda_mem_set_clear(thread->warp->memSets[lane][lane]); \ $cuda_mem_set_clear(thread->block->memSets[indexInBlock]); \ \ if (indexInBlock == 0) { \ int result = INIT; \ for (int i = 0; i < thread->block->size; i++) { \ REDUCTION; \ } \ thread->block->predArray[0] = result; 
\ thread->block->currBarrierID = -1; \ } \ \ $local_end(); \ $barrier_call(thread->block_barrier); \ $local_start(); \ int result = thread->block->predArray[0]; \ \ $local_end(); \ $barrier_call(thread->block_barrier); \ $local_start(); \ \ $read_set_push(); \ $write_set_push(); \ return result; \ } $CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_count, 0, result += thread->block->predArray[i]) $CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_or, 0, if (thread->block->predArray[i]) { result = 1; continue; }) $CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_and, 1, if (!thread->block->predArray[i]) { result = 0; continue; }) #define $GET_ARG_1(_1, ...) _1 #define $GET_ARG_2(_1, _2, ...) _2 #define __syncwarp() $cuda__syncwarp($thread) void $cuda__syncwarp($cuda_thread_data_t thread) { $cuda_complete_warp_region(thread); $local_end(); $cuda_warp_barrier_call(thread, $CUDA_WARP_TAG_warpsync); $local_start(); $read_set_push(); $write_set_push(); } #define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0) #define __shfl_sync(mask, var, ...) \ _Generic(var, \ default: $cuda__shfl_sync_int, \ unsigned int: $cuda__shfl_sync_uint, \ long: $cuda__shfl_sync_long, \ unsigned long: $cuda__shfl_sync_ulong, \ long long: $cuda__shfl_sync_ll, \ unsigned long long: $cuda__shfl_sync_ull, \ float: $cuda__shfl_sync_float, \ double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread) #define __shfl_up_sync(mask, var, ...) \ _Generic(var, \ default: $cuda__shfl_up_sync_int, \ unsigned int: $cuda__shfl_up_sync_uint, \ long: $cuda__shfl_up_sync_long, \ unsigned long: $cuda__shfl_up_sync_ulong, \ long long: $cuda__shfl_up_sync_ll, \ unsigned long long: $cuda__shfl_up_sync_ull, \ float: $cuda__shfl_up_sync_float, \ double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread) #define __shfl_down_sync(mask, var, ...) 
\ _Generic(var, \ default: $cuda__shfl_down_sync_int, \ unsigned int: $cuda__shfl_down_sync_uint, \ long: $cuda__shfl_down_sync_long, \ unsigned long: $cuda__shfl_down_sync_ulong, \ long long: $cuda__shfl_down_sync_ll, \ unsigned long long: $cuda__shfl_down_sync_ull, \ float: $cuda__shfl_down_sync_float, \ double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread) #define __shfl_xor_sync(mask, var, ...) \ _Generic(var, \ default: $cuda__shfl_xor_sync_int, \ unsigned int: $cuda__shfl_xor_sync_uint, \ long: $cuda__shfl_xor_sync_long, \ unsigned long: $cuda__shfl_xor_sync_ulong, \ long long: $cuda__shfl_xor_sync_ll, \ unsigned long long: $cuda__shfl_xor_sync_ull, \ float: $cuda__shfl_xor_sync_float, \ double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread) #define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE() \ $mem writes = $write_set_pop(); \ $mem reads = $read_set_pop(); \ $cuda_mem_set_add(thread->warp->memSets[thread->laneID][thread->laneID], \ reads, writes); \ thread->warp->patching[thread->laneID] = $true; #define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST() \ $read_set_push(); \ $write_set_push(); #define $CUDA_GENERIC_SHFL_BODY() \ $assert (width <= warpSize); \ for (int v = width; v > 1; v /= 2) { \ $assert(v % 2 == 0); \ } \ \ int requestLane; \ switch(tag) { \ case $CUDA_WARP_TAG_shfl_sync: \ requestLane = thread->laneID/width + laneParam % width; \ break; \ case $CUDA_WARP_TAG_shfl_up_sync: \ requestLane = thread->laneID - laneParam; \ break; \ case $CUDA_WARP_TAG_shfl_down_sync: \ requestLane = thread->laneID + laneParam; \ break; \ case $CUDA_WARP_TAG_shfl_xor_sync: \ requestLane = thread->laneID ^ laneParam; \ break; \ } \ $cuda_warp_data_t warp = thread->warp; \ _Bool validSrcLane = requestLane >= 0 && requestLane < warp->size; \ if (validSrcLane) { \ $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, requestLane, tag, NULL, 0)); \ } \ \ $local_end(); \ 
$cuda_warp_barrier_call(thread, tag); \ $local_start(); \ \ \ _Bool requested[width] = (_Bool[width])$lambda(int i) $false; \ int subWarpStart = thread->laneID/width; \ while ($comm_probe(thread->lane_comm, $COMM_ANY_SOURCE, tag)) { \ $message request = $comm_dequeue(thread->lane_comm, $COMM_ANY_SOURCE, tag); \ requested[$message_source(request) - subWarpStart] = $true; \ } \ \ $local_end(); \ $cuda_warp_barrier_call(thread, tag); \ $local_start(); \ \ for(int i = 0; i < width; i++) { \ if (requested[i]) { \ $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, i + subWarpStart, tag, &var, typeSize)); \ } \ } \ \ $local_end(); \ $cuda_warp_barrier_call(thread, tag); \ $local_start(); \ \ if (validSrcLane) { \ $message result = $comm_dequeue(thread->lane_comm, requestLane, tag); \ $message_unpack(result, &resultVal, typeSize); \ } else { \ $havoc(&resultVal); \ } #define $CUDA_DEFINE_SHFL(NAME, T, TAG) \ T NAME(unsigned mask, T var, int laneParam, int width, $cuda_thread_data_t thread) { \ $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); \ T resultVal; \ int typeSize = sizeof(T); \ $cuda_warp_tag tag = TAG; \ \ $CUDA_GENERIC_SHFL_BODY(); \ \ $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); \ return resultVal; \ } $CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, 
$CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync) $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync) #define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, 
$thread) #define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $thread) #define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $thread) #define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \ $cuda_warp_data_t warp = thread->warp; \ int laneID = thread->laneID; \ $comm comm = thread->lane_comm; \ if (warp->reductionLane == -1) { \ warp->reductionLane = laneID; \ \ result = initialValue; \ for (int i = 0; i < warp->size; i++) { \ if (i == laneID) { \ operand = value; \ } else { \ $local_end(); \ $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start(); \ \ if (!warp->alive[i]) { \ operand = initialValue; \ } else { \ $local_end(); \ $message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \ $local_start(); \ } \ } \ \ if (COND) { \ result = T_REDUCTION; \ } else { \ result = F_REDUCTION; \ } \ } \ \ warp->reductionLane = -1; \ \ for (int i = 0; i< warp->size; i++) { \ if (i != laneID && warp->alive[i]) { \ $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \ } \ } \ } else { \ int reductionLane = warp->reductionLane; \ $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \ $local_end(); \ $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \ $local_start(); \ } #define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result) int $cuda__all_sync(unsigned mask, int value, $cuda_thread_data_t thread) { $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); $cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync; int typeSize = sizeof(int); int initialValue = 1; int result, operand; $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0); $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); return result; } int $cuda__any_sync(unsigned mask, int value, $cuda_thread_data_t thread) { $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); $cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync; int typeSize = sizeof(int); int 
initialValue = 0; int result, operand; $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 || operand != 0, 1, 0); $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); return result; } unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_thread_data_t thread) { $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync; int initialValue = 0; int typeSize = sizeof(int); unsigned result; int operand; $CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1); $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); return result; } /* atomicAdd() * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in * global or shared memory, computes (old + val), and stores the result back to * memory at the same address. These three operations are performed in one atomic * transaction. The function returns old. */ #define atomicAdd(X,Y) _Generic(X, \ default : $cuda_atomicAdd_int, \ unsigned int* : $cuda_atomicAdd_uint, \ unsigned long long int* : $cuda_atomicAdd_ullint, \ float* : $cuda_atomicAdd_float, \ double* : $cuda_atomicAdd_double) (X, Y, $thread) /* atomicSub() * reads the 32-bit word old located at the address address in global or shared * memory, computes (old - val), and stores the result back to memory at the same * address. These three operations are performed in one atomic transaction. The * function returns old. */ #define atomicSub(X,Y) _Generic(X, \ default : $cuda_atomicSub_int, \ unsigned int* : $cuda_atomicSub_uint) (X, Y, $thread) /* atomicExch() * reads the 32-bit or 64-bit word old located at the address address in global * or shared memory and stores val back to memory at the same address. These two * operations are performed in one atomic transaction. The function returns old. 
*/
// NOTE: every atomic* macro below expands to a call that passes $thread as a
// trailing argument, so these macros may only be used where a variable named
// $thread (the CIVL per-thread data, created in the transformed kernel body)
// is in scope. Dispatch on the pointer type is done with C11 _Generic.
#define atomicExch(X,Y) _Generic(X, \
    default : $cuda_atomicExch_int, \
    unsigned int* : $cuda_atomicExch_uint, \
    unsigned long long int* : $cuda_atomicExch_ullint, \
    float* : $cuda_atomicExch_float) (X, Y, $thread)

/* atomicMin()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the minimum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMin(X,Y) _Generic(X, \
    default : $cuda_atomicMin_int, \
    unsigned int* : $cuda_atomicMin_uint, \
    unsigned long long int* : $cuda_atomicMin_ullint) (X, Y, $thread)

/* atomicMax()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the maximum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMax(X,Y) _Generic(X, \
    default : $cuda_atomicMax_int, \
    unsigned int* : $cuda_atomicMax_uint, \
    unsigned long long int* : $cuda_atomicMax_ullint) (X, Y, $thread)

/* atomicInc()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicInc(address, val) $cuda_atomicInc(address, val, $thread)

/* atomicDec()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
 * the result back to memory at the same address. These three operations are
 * performed in one atomic transaction. The function returns old.
 */
#define atomicDec(address, val) $cuda_atomicDec(address, val, $thread)

/* atomicCAS()
 * reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old == compare ? val : old) , and stores the
 * result back to memory at the same address. These three operations are performed
 * in one atomic transaction. The function returns old (Compare And Swap).
 */
#define atomicCAS(address, compare, val) _Generic(address, \
    default : $cuda_atomicCAS_int, \
    unsigned int* : $cuda_atomicCAS_uint, \
    unsigned long long int* : $cuda_atomicCAS_ullint, \
    unsigned short int* : $cuda_atomicCAS_usint) (address, compare, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old & val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicAnd(address, val) _Generic(address, \
    default : $cuda_atomicAnd_int, \
    unsigned int* : $cuda_atomicAnd_uint, \
    unsigned long long int* : $cuda_atomicAnd_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old | val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicOr(address, val) _Generic(address, \
    default : $cuda_atomicOr_int, \
    unsigned int* : $cuda_atomicOr_uint, \
    unsigned long long int* : $cuda_atomicOr_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old ^ val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicXor(address, val) _Generic(address, \
    default : $cuda_atomicXor_int, \
    unsigned int* : $cuda_atomicXor_uint, \
    unsigned long long int* : $cuda_atomicXor_ullint) (address, val, $thread)

// Common prologue of every atomic implementation: close the caller's kernel
// region, $yield() so the CIVL scheduler can explore interleavings at this
// point, reopen the kernel region, and snapshot the current value at
// 'address' into a local named 'old' (referenced by the bodies and by
// $CUDA_ATOMIC_POST_ACTION below).
#define $CUDA_ATOMIC_PRE_ACTION(T) \
    $cuda_complete_kernel_region(thread); \
    $yield(); \
    $cuda_start_kernel_region(thread); \
    T old = *address;

// Common epilogue: close and immediately reopen the kernel region — note
// there is deliberately no $yield() here, so the read-modify-write plus this
// epilogue form one uninterruptible (atomic) step — then return the value
// snapshotted by the prologue.
#define $CUDA_ATOMIC_POST_ACTION() \
    $cuda_complete_kernel_region(thread); \
    $cuda_start_kernel_region(thread); \
    return old;

// Generator: atomicAdd for element type T — stores old + val, returns old.
#define $CUDA_DEFINE_ATOMIC_ADD(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address += val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_int, int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_float, float)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_double, double)

// Generator: atomicSub for element type T — stores old - val, returns old.
#define $CUDA_DEFINE_ATOMIC_SUB(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address -= val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_int, int)
$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_uint, unsigned int)

// Generator: atomicExch for element type T — stores val, returns old.
#define $CUDA_DEFINE_ATOMIC_EXCH(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_int, int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_float, float)

// Generator: atomicMin for element type T — stores min(old, val), returns old.
// (When old <= val the store of 'old' back into *address is a self-assignment;
// it is kept so that every path performs the "store result" step.)
#define $CUDA_DEFINE_ATOMIC_MIN(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old <= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_int, int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_ullint, unsigned long long int)

// Generator: atomicMax for element type T — stores max(old, val), returns old.
// (When old >= val the store of 'old' is a self-assignment, kept so every
// path performs the "store result" step.)
#define $CUDA_DEFINE_ATOMIC_MAX(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old >= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_int, int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_ullint, unsigned long long int)

// atomicInc implementation: stores ((old >= val) ? 0 : old + 1), returns old.
unsigned int $cuda_atomicInc(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
    $CUDA_ATOMIC_PRE_ACTION(unsigned int);
    if (old >= val) *address = 0;
    else *address = old + 1;
    $CUDA_ATOMIC_POST_ACTION()
}

// atomicDec implementation: stores (((old == 0) || (old > val)) ? val : old - 1),
// returns old.
unsigned int $cuda_atomicDec(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
    $CUDA_ATOMIC_PRE_ACTION(unsigned int);
    if (old == 0 || old > val) *address = val;
    else *address = old - 1;
    $CUDA_ATOMIC_POST_ACTION()
}

// Generator: atomicCAS for element type T — stores val only if old == compare,
// returns old (compare-and-swap).
#define $CUDA_DEFINE_ATOMIC_CAS(NAME, T) \
    T NAME(T* address, T compare, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old == compare) *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_int, int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_usint, unsigned short int)

// Generator: atomicAnd for element type T — stores old & val, returns old.
#define $CUDA_DEFINE_ATOMIC_AND(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old & val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_int, int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_ullint, unsigned long long int)

// Generator: atomicOr for element type T — stores old | val, returns old.
#define $CUDA_DEFINE_ATOMIC_OR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old | val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_int, int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_uint, unsigned int) $CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_ullint, unsigned long long int) #define $CUDA_DEFINE_ATOMIC_XOR(NAME, T) \ T NAME(T* address, T val, $cuda_thread_data_t thread) { \ $CUDA_ATOMIC_PRE_ACTION(T) \ *address = old ^ val; \ $CUDA_ATOMIC_POST_ACTION() \ } $CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_int, int) $CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_uint, unsigned int) $CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_ullint, unsigned long long int) ////////////////////// // Global Variables // ////////////////////// $gcomm $cuda_gcomm = $gcomm_create($here, 2); const int $CUDA_PLACE_HOST = 0; const int $CUDA_PLACE_DEVICE = 1; $comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST); /** * Tags used for message-passing between host and device */ enum $cuda_tag { // Predefined tags $CUDA_TAG_TEARDOWN, $CUDA_TAG_SCOPE_REQUEST, $CUDA_TAG_cudaFree, $CUDA_TAG_cudaMemcpy, $CUDA_TAG_cudaMemcpyAsync, // Generated tags (by transformer) $CUDA_TAG_LAUNCH_kernel_1 }; /////////////////// // CIVL-CUDA API // /////////////////// $scope $cuda_host_request_device_scope() { $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0)); $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST); $scope result; $message_unpack(response, &result, sizeof($scope)); return result; } typedef struct $cuda_memcpy_data { void* dst; const void* src; size_t count; cudaMemcpyKind kind; } $cuda_memcpy_data; void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) { if (kind == cudaMemcpyHostToHost) { memcpy(dst, src, count); } else { $cuda_memcpy_data args; args.dst = dst; args.src = src; args.count = count; args.kind = kind; int tag = async ? 
$CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy; $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data))); $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag); } } $cuda_stream_node_t $create_new_stream_node($scope cudaScope) { cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream)); newStream->head = NULL; newStream->tail = NULL; newStream->numOps = 0; newStream->alive = true; $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node)); newHead->stream = newStream; newStream->containingNode = newHead; newHead->prev = NULL; newHead->next = NULL; return newHead; } /*@ depends_on \nothing; @ assigns \nothing; @ reads \nothing; @*/ $atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) { $proc lastOpProc = $proc_null; cudaStream_t stream = node->stream; if (node->prev != NULL) { node->prev->next = node->next; } if (node->next != NULL) { node->next->prev = node->prev; } free(node); stream->alive = false; if(stream->tail != NULL) lastOpProc = stream->tail->opState->op; void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) { $wait(lastOpProc); free(stream); } return $spawn destroyStreamWhenComplete(lastOpProc, stream); } /*@ depends_on \access(stream); @ assigns stream; @ reads \nothing; @*/ $atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) { $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream"); $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state)); newOpState->start = false; newOpState->op = $spawn opProc(opParams, newOpState, stream); $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node)); newOpStateNode->opState = newOpState; newOpStateNode->next = NULL; if (stream->tail == NULL) { 
stream->head = newOpStateNode; stream->tail = newOpStateNode; newOpState->start = true; } else { stream->tail->next = newOpStateNode; stream->tail = newOpStateNode; } stream->numOps++; return newOpState->op; } /*@ depends_on \nothing; @ assigns \nothing; @ reads \nothing; @*/ $atomic_f void $stream_dequeue(cudaStream_t stream) { $assert(stream->head != NULL, "Attempt to dequeue an empty stream"); if (stream->head == stream->tail) { stream->tail = NULL; } $cuda_op_state_node_t oldHead = stream->head; stream->head = oldHead->next; if (stream->head != NULL) { stream->head->opState->start = true; } stream->numOps--; free(oldHead->opState); free(oldHead); } // Helper function int $dim3_index(dim3 size, uint3 location) { return location.x + size.x * (location.y + size.y * location.z); } // Helper function int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) { return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx); } void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) { //TODO: calculate length and index, replace this function in the kernel $local_start(); int length = dim.x * dim.y * dim.z; $proc procArray[length]; $range rx = 0 .. dim.x - 1; $range ry = 0 .. dim.y - 1; $range rz = 0 .. 
dim.z - 1; $domain(3) dom = ($domain(3)){rx, ry, rz}; $for(int x,y,z : dom){ uint3 id = { x, y, z }; int index = $dim3_index(dim, id); procArray[index] = $spawn spawningFunction(id); } $local_end(); $waitall(procArray,length); } // CUDA Ops // void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) { $when(opState->start); $cuda_memcpy_data args; $message_unpack(m, &args, sizeof($cuda_memcpy_data)); if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) { args.dst = $reveal(args.dst); } if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) { args.src = $reveal(args.src); } memcpy(args.dst, args.src, args.count); $stream_dequeue(stream); } $message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) { $cuda_memcpy_data args; $message_unpack(request, &args, sizeof($cuda_memcpy_data)); $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc); if (!async && args.kind != cudaMemcpyDeviceToDevice) { $wait(memcpyProc); } int tag = async ? 
$CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy; return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0); } $message $cuda_free($message request) { void* devPtr; $message_unpack(request, &devPtr, sizeof(void*)); free($reveal(devPtr)); return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0); } //////////////////////////////////////////// // CUDA API Functions (For Host-use Only) // //////////////////////////////////////////// cudaError_t cudaFree(void* devPtr) { $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*))); $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree); return cudaSuccess; } cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { $cuda_host_memcpy(dst, src, count, kind, false); return cudaSuccess; } cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) { $cuda_host_memcpy(dst, src, count, kind, true); return cudaSuccess; } /* cudaError_t cudaStreamCreate(cudaStream_t * pStream) { // Create new stream node in linked list $cuda_stream_node_t newHead = $create_new_stream_node(); newHead->next = $cuda_global_context.head; $cuda_global_context.head->prev = newHead; // Update cuda context's head to be the new node we created $cuda_global_context.head = newHead; $cuda_global_context.numStreams++; return cudaSuccess; } */ /* cudaError_t cudaStreamSynchronize(cudaStream_t stream) { stream = $default_stream_if_null(stream); $assert(stream->alive, "Attempt to synchronize with a destroyed stream"); $when(stream->head == NULL) return cudaSuccess; } */ /* cudaError_t cudaStreamDestroy(cudaStream_t stream) { $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream"); $assert(stream->alive, "Attempt to destroy an already destroyed stream"); $destroy_stream_node(stream->containingNode); return 
cudaSuccess; } */ /* cudaError_t cudaDeviceSynchronize() { $proc* opsToWaitOn; int numOps = 0; $atomic { opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams); for ($cuda_stream_node_t node = $cuda_global_context.head; node != NULL; node = node->next) { if (node->stream->tail != NULL) { opsToWaitOn[numOps] = node->stream->tail->opState->op; numOps++; } } } $waitall(opsToWaitOn, numOps); return cudaSuccess; } */ ////////////////////////////////// // Generated code from kernel_1 // ////////////////////////////////// typedef struct { dim3 gridDim; dim3 blockDim; size_t $cudaMemSize; cudaStream_t $cudaStream; float* A; const float* B; float* C; int numElements; } $cuda_kernel_1_params; void $cuda_reveal_kernel_1_args($cuda_kernel_1_params* args) { args->A = $reveal(args->A); args->B = $reveal(args->B); args->C = $reveal(args->C); } void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream, float* A, const float* B, float* C, int numElements) { $cuda_kernel_1_params args; args.gridDim = gridDim; args.blockDim = blockDim; args.$cudaMemSize = $cudaMemSize; args.$cudaStream = $cudaStream; args.A = A; args.B = B; args.C = C; args.numElements = numElements; $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_params))); $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1); } void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size, float *A, const float *B, float *C, int numElements) { $cuda_kernel_data_t $kernel = $create_cuda_kernel_data($here, gridDim, blockDim); void $cuda_block(uint3 blockIdx) { void $cuda_thread(uint3 threadIdx) { $local_start(); // cudaMemSet currently not supported so this is small hack to initialize C ahead of time if (blockIdx.x == 0 && threadIdx.x == 0) { for (int i = 0; i < gridDim.x; i++) { C[i] = 0; } } //$clear_mem_sets($kernel, _cuda_kid); int 
$cuda_tid = $dim3_index(blockDim, threadIdx); int $cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx); $cuda_thread_data_t $thread = $create_cuda_thread_data($here, $kernel, $cuda_kid/(blockDim.x * blockDim.y * blockDim.z), $cuda_tid/warpSize, $cuda_tid % warpSize); // Kernel REDUCTION start /* int lane = threadIdx.x % warpSize; int thisWarpSize = warpSize; if (threadIdx.x - lane + warpSize > blockDim.x) { thisWarpSize = ((blockDim.x - 1) % warpSize) + 1; } int i = blockDim.x * blockIdx.x + threadIdx.x; int warpStart = i - lane; //printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize); int remainingElements = numElements; while (remainingElements > 1) { //printf("%d,%d - remainingElements: %d - numElements: %d\n", blockIdx.x, threadIdx.x, remainingElements, numElements); if (remainingElements < numElements) { // __syncThreads() //printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x); $cuda__syncthreads($thread, 0); //printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x); } if (warpStart + 1 < remainingElements) { float val = i < numElements ? 
A[i] : 0; //printf("%d,%d - val: %d\n", blockIdx.x, threadIdx.x, val); for (int offset = warpSize/2; offset > 0; offset /= 2) { //__syncwarp(); float tmp = __shfl_down_sync(0, val, offset); if (lane + offset < thisWarpSize) { val += tmp; } //printf("%d,%d - offset: %d - val: %d\n", blockIdx.x, threadIdx.x, offset, val); } if (i < numElements) { A[i] = val; } } i *= warpSize; //warpStart *= warpSize; remainingElements = ((remainingElements - 1) / warpSize) + 1; } if (i == 0) { *C = A[0]; } */ // Kernel REDUCTION end // Kernel REDUCTION 2 start int lane = threadIdx.x % warpSize; int thisWarpSize = warpSize; if (threadIdx.x - lane + warpSize > blockDim.x) { thisWarpSize = ((blockDim.x - 1) % warpSize) + 1; } int i = blockDim.x * blockIdx.x + threadIdx.x; int warpStart = i - lane; if (warpStart + 1 < numElements) { float val = i < numElements ? A[i] : 0; for (int offset = warpSize/2; offset > 0; offset /= 2) { float tmp = __shfl_down_sync(0, val, offset); //float tmp = i + offset < numElements ? 
A[i + offset] : 0; if (lane + offset < thisWarpSize) { val += tmp; } } if (i < numElements) { A[i] = val; } } $cuda__syncthreads($thread, 0); if (threadIdx.x == 0) { int blockEnd = blockDim.x * (blockIdx.x + 1); if (blockEnd > numElements) { blockEnd = numElements; } for (int j = i + warpSize; j < blockEnd; j += warpSize) { A[i] += A[j]; } atomicAdd(C + blockIdx.x, 1); } if (i == 0) { C[0] = A[0]; for (int j = 1; j < gridDim.x; j++) { while(atomicAdd(C+j,0) == 0) {} C[0] += A[j * blockDim.x]; } } // Kernel REDUCTION 2 end // Kernel BALLOT TEST start /* int i = threadIdx.x; if (i < numElements) { int result = __ballot_sync(~0, A[i] > 0); if (i == 0) { printf("Result: %d\n", result); *C = 0; while(result > 0) { if (result % 2) *C += 1; result /= 2; } printf("done calculating result\n"); } } */ // Kernel BALLOT TEST end //$check_data_race($kernel, _cuda_kid); //$read_set_pop(); //$write_set_pop(); $destroy_cuda_thread_data($thread); $local_end(); } $cuda_run_and_wait_on_procs(blockDim, $cuda_thread); } $cuda_run_and_wait_on_procs(gridDim, $cuda_block); $destroy_cuda_kernel_data($kernel); } void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) { $when(opState->start); $cuda_kernel_1_params args; $message_unpack(request, &args, sizeof($cuda_kernel_1_params)); $cuda_reveal_kernel_1_args(&args); $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements); $stream_dequeue(cudaStream); } ///////////////// // CUDA "file" // ///////////////// void $cuda_main() { // Device Variables $scope $cuda_scope = $here; $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1); $cuda_context $cuda_global_context; cudaStream_t $cuda_default_stream; // Helper function to get the default stream if passed NULL, and just returns stream otherwise // Currently unused until we support streams other than the default one. 
cudaStream_t $default_stream_if_null(cudaStream_t stream) { return stream == NULL ? $cuda_default_stream : stream; } // Device Logic $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope); $cuda_default_stream = defaultStreamNode->stream; $cuda_global_context.head = defaultStreamNode; $cuda_global_context.numStreams = 1; while (true) { $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG); $message response; const int tag = $message_tag(request); switch(tag) { case $CUDA_TAG_SCOPE_REQUEST : response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope)); break; case $CUDA_TAG_cudaFree : response = $cuda_free(request); break; case $CUDA_TAG_cudaMemcpy : response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false); break; case $CUDA_TAG_cudaMemcpyAsync : response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true); break; case $CUDA_TAG_LAUNCH_kernel_1 : $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc); response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0); break; case $CUDA_TAG_TEARDOWN : { $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode); $wait(destructor); $comm_destroy($cuda_device_comm); return; } default : $assert(false, "Unknown CUDA request"); } $comm_enqueue($cuda_device_comm, response); } } /////////////// // Host file // /////////////// $input int N = 16; $input int threadsPerBlock = N%2 == 0 ? 
N/2 : (N+1)/2; //$input int threadsPerBlock = N; $input float A[N]; // Currently unused but left in to save time $input float B[N]; void $host_main() { int size = N * sizeof(float); int numBlocks = (N-1)/threadsPerBlock + 1; //int numBlocks = 1; float* cuda_A; // cudaMalloc((void **)&cuda_A, size); { $scope deviceScope = $cuda_host_request_device_scope(); cuda_A = $hide((float*)$malloc(deviceScope, size)); } cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice); float* cuda_B; // cudaMalloc((void **)&cuda_B, size); { $scope deviceScope = $cuda_host_request_device_scope(); cuda_B = $hide((float*)$malloc(deviceScope, size)); } cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice); float* cuda_C; // cudaMalloc((void **)&cuda_C, sizeof(float) * numBlocks); { $scope deviceScope = $cuda_host_request_device_scope(); cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float) * numBlocks)); } dim3 gridDim = {numBlocks, 1, 1}; dim3 blockDim = {threadsPerBlock, 1, 1}; // kernel_1<<>>(cuda_A, cuda_B, cuda_C, N); $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N); // Checking correctness float* C = (float *)malloc(size); cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost); // REDUCTION ASSERTION float sum = 0; for(int i = 0; i < N; i++) sum += A[i]; $assert(*C == sum); // BALLOT ASSERTION /* float count = 0; for (int i = 0; i < N; i++) { if (A[i] > 0) count++; } $assert(*C == count); */ free(C); cudaFree(cuda_A); cudaFree(cuda_B); cudaFree(cuda_C); } int main() { $proc host = $spawn $host_main(); $proc cuda = $spawn $cuda_main(); $wait(host); $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0)); $comm_destroy($cuda_host_comm); $wait(cuda); $gcomm_destroy($cuda_gcomm, NULL); }