#include /* This header file contains useful helper functions for manipulating * the CIVL versions of various Cuda objects. */ #include // To print for debugging #include #include #include #include #include #include #include #pragma CIVL ACSL /* type used to represent an instance of a Cuda kernel * TODO: Update with read/write sets */ struct $cuda_kernel_instance { $proc process; // the actual process executing this kernel $cuda_kernel_status status; // the kernel status // Read and Write sets $mem read_sets[]; $mem write_sets[]; // Number of threads in kernel int size; }; /* a type that wraps a kernel instance for insertion into a list */ struct $cuda_kernel_instance_node { $cuda_kernel_instance_t *instance; $cuda_kernel_instance_node_t *next; }; /* $cuda_kernel_instance_node_t interface */ $cuda_kernel_instance_t *$cuda_get_instance($cuda_kernel_instance_node_t *node) { return node->instance; } /* a type used to represent a Cuda stream */ struct _CUstream { $cuda_kernel_instance_node_t *mostRecent; // the most recently enqueued instance _Bool usable; // indicates whether or not this stream // has been marked for deletion }; /* _CUstream interface */ $cuda_kernel_instance_node_t *$cuda_get_most_recent(cudaStream_t s) { return s->mostRecent; } _Bool $cuda_is_usable(cudaStream_t s) { return s->usable; } void $cuda_set_usable(cudaStream_t s, _Bool b) { s->usable = b; } /* a type that wraps a stream for insertion into a list */ struct $cuda_stream_node { cudaStream_t stream; $cuda_stream_node_t *next; }; /* $cuda_stream_node_t encapsulation functions */ void $cuda_set_stream($cuda_stream_node_t* node, cudaStream_t stream) { node->stream = stream; } void $cuda_set_next($cuda_stream_node_t* node, $cuda_stream_node_t* next) { node->next = next; } /* a type used to represent a Cuda event */ struct _CUevent { $cuda_kernel_instance_t **instances; int numInstances; }; /* _CUevent encapsulation functions */ $cuda_kernel_instance_t **$cuda_get_instances(cudaEvent_t e) { return e->instances; } void $cuda_set_instances(cudaEvent_t e, $cuda_kernel_instance_t** instances, int numInstances) { e->instances = instances; e->numInstances = numInstances; } int $cuda_get_num_instances(cudaEvent_t e) { return e->numInstances; } /* a type representing the state of a Cuda device */ struct $cuda_context { $cuda_stream_node_t *headNode; cudaStream_t nullStream; int numStreams; }; /* $cuda_context_t encapsulation functions */ int $cuda_get_num_streams($cuda_context_t *_context) { return _context->numStreams; } $cuda_stream_node_t *$cuda_get_head_node($cuda_context_t *_context) { return _context->headNode; } cudaStream_t $cuda_get_null_stream($cuda_context_t *_context) { return _context->nullStream; } void $cuda_add_new_stream($cuda_context_t *_context, $cuda_stream_node_t *newNode) { _context->headNode = newNode; _context->numStreams++; } /* Computes the one dimensional index of a grid cell at a given location * in a three dimensional grid of a given size */ int $cuda_index (dim3 size, uint3 location) { return location.x + size.x * (location.y + size.y * location.z); } /* Computes the one dimensional index of a specific thread in the grid given * the grid dimension, block dimension, block index, and thread index */ int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) { return $cuda_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $cuda_index(bDim, tIdx); } /* Lifts a single integer x into a three dimensional vector representing * a one dimensional grid of length x */ dim3 $cuda_to_dim3(int x) { dim3 d = { x, 1, 1 }; return d; } /* Given a three dimensional vector representing a grid of size dim, * create and destroy a process, in parallel, for each cell in the grid. * The location of the cell is passed to the spawning function. */ void $cuda_run_procs(dim3 dim, void spawningFunction(uint3)) { $range rx = 0 .. (dim.x == 0 ? -1 : dim.x - 1); $range ry = 0 .. (dim.y == 0 ? -1 : dim.y - 1); $range rz = 0 .. (dim.z == 0 ? -1 : dim.z - 1); $domain(3) dom = ($domain){rx, ry, rz}; $parfor(int x,y,z : dom){ uint3 id = { x, y, z }; spawningFunction(id); } } // ------------------------------------------------ /* $wait on a given process is it is non-null */ void $cuda_try_wait($proc p) { if (p != $proc_null) { $yield(); $wait(p); } } /* The current state of the GPU */ $cuda_context_t $cuda_current_context = { .headNode = NULL, .nullStream = NULL, .numStreams = 0 }; /* malloc and initialize a new $cuda_kernel_instance_t */ // TODO: Determine if passing parameters is better or not $cuda_kernel_instance_t *$cuda_kernel_instance_create(dim3 gDim, dim3 bDim) { //printf("mallocing kernel instance\n"); $cuda_kernel_instance_t *i = ($cuda_kernel_instance_t*)malloc(sizeof($cuda_kernel_instance_t)); i->process = $proc_null; i->status = $cuda_kernel_status_waiting; int threadNum = gDim.x * gDim.y * gDim.z * bDim.x * bDim.y * bDim.z; i->size = threadNum; // TODO: either sequentially or non-sequentially? $mem empty = $mem_empty(); $seq_init(&i->read_sets, threadNum, &empty); $seq_init(&i->write_sets, threadNum, &empty); return i; } /* cleanup and free a given $cuda_kernel_instance_t */ void $cuda_kernel_instance_destroy($cuda_kernel_instance_t *i) { $cuda_try_wait(i->process); //printf("freeing kernel instance\n"); $free(i); } /* malloc and initialize a new $cuda_kernel_instance_node_t */ $cuda_kernel_instance_node_t *$cuda_kernel_instance_node_create() { //printf("mallocing kernel instance node\n"); $cuda_kernel_instance_node_t *node = ($cuda_kernel_instance_node_t*)malloc(sizeof($cuda_kernel_instance_node_t)); node->instance = NULL; node->next = NULL; return node; } /* cleanup and free a given $cuda_kernel_instance_node_t */ void $cuda_kernel_instance_node_destroy($cuda_kernel_instance_node_t *node) { $cuda_kernel_instance_destroy(node->instance); //printf("freeing kernel instance node\n"); $free(node); } /* malloc and initialize a new stream */ cudaStream_t $cuda_stream_create() { cudaStream_t s; //printf("mallocing cuda stream\n"); s = (cudaStream_t)malloc(sizeof(_CUstream)); s->mostRecent = $cuda_kernel_instance_node_create(); dim3 arbitrary = { 1, 1, 1 }; s->mostRecent->instance = $cuda_kernel_instance_create(arbitrary, arbitrary); s->mostRecent->instance->status = $cuda_kernel_status_finished; s->usable = $true; return s; } /* block until the most recently enqueued process on the given stream * has terminated (meaning all kernels in that stream have completed) */ void $cuda_stream_wait(cudaStream_t s) { $cuda_kernel_instance_t *mostRecentInstance = s->mostRecent->instance; $yield(); $unidirectional_when(mostRecentInstance->status == $cuda_kernel_status_finished); } /* block until no more streams have kernels executing */ void $cuda_stream_wait_all() { $cuda_stream_node_t *curNode = $cuda_current_context.headNode; while (curNode != NULL) { $cuda_stream_wait(curNode->stream); curNode = curNode->next; } } /* cleanup and free a given stream */ void $cuda_stream_destroy(cudaStream_t s) { $cuda_kernel_instance_node_t *curNode = s->mostRecent; $cuda_kernel_instance_node_t *nextNode; while (curNode != NULL) { nextNode = curNode->next; $cuda_kernel_instance_node_destroy(curNode); curNode = nextNode; } //printf("freeing cuda stream\n"); $free(s); } /* malloc and initialize a new $cuda_stream_node_t */ $cuda_stream_node_t *$cuda_stream_node_create() { //printf("mallocing cuda stream node\n"); $cuda_stream_node_t *node = ($cuda_stream_node_t*)malloc(sizeof($cuda_stream_node_t)); node->stream = NULL; node->next = NULL; return node; } /* cleanup and free a given $cuda_stream_node_t */ void $cuda_stream_node_destroy($cuda_stream_node_t *node) { $assert((!node->stream->usable)); $cuda_stream_destroy(node->stream); //printf("freeing cuda stream node\n"); $free(node); } /* destroy all stream nodes contained in the context */ void $cuda_stream_node_destroy_all() { $cuda_stream_node_t *curNode = $cuda_current_context.headNode; $cuda_stream_node_t *nextNode; while (curNode != NULL) { nextNode = curNode->next; $cuda_stream_node_destroy(curNode); curNode = nextNode; } $cuda_current_context.headNode = NULL; } /* malloc and initialize a new event */ cudaEvent_t $cuda_event_create() { //printf("mallocing event\n"); cudaEvent_t e = (cudaEvent_t)malloc(sizeof(_CUevent)); e->numInstances = 0; e->instances = NULL; return e; } /* block until all $cuda_kernel_instance_ts contained in this event have * completed */ /*@ depends_on \access(e); @*/ void $cuda_event_wait(cudaEvent_t e) { for (int i = 0; i < e->numInstances; i++) { $yield(); $unidirectional_when(e->instances[i]->status == $cuda_kernel_status_finished); } } /* cleanup and free a given event */ void $cuda_event_destroy(cudaEvent_t e) { if (e->instances != NULL) { //printf("freeing instance list a\n"); $free(e->instances); } //printf("freeing event\n"); $free(e); } /* initialize the cuda context. must be called before any cuda functions. */ void $cuda_init() { $cuda_current_context.nullStream = $cuda_stream_create(); } /* cleanup the cuda context. must be called after all cuda functions. */ void $cuda_finalize() { $cuda_stream_wait_all(); $cuda_stream_wait($cuda_current_context.nullStream); $cuda_stream_node_destroy_all(); $cuda_stream_destroy($cuda_current_context.nullStream); } /* returns an array of pointers to the most recently enqueued kernel * of each stream. */ $cuda_kernel_instance_t **$cuda_all_most_recent_kernels() { int n = $cuda_current_context.numStreams + 1; $cuda_stream_node_t *curNode = $cuda_current_context.headNode; //printf("mallocing instance list a\n"); $cuda_kernel_instance_t **insts = ($cuda_kernel_instance_t**)malloc(n * sizeof($cuda_kernel_instance_t*)) ; insts[0] = $cuda_current_context.nullStream->mostRecent->instance; for (int i = 1; i < n; i++, curNode = curNode->next) { insts[i] = curNode->stream->mostRecent->instance; } return insts; } /* create a kernel instance for the given function k, and enqueue it * onto the given stream. */ void $cuda_enqueue_kernel(cudaStream_t stream, void (*k)($cuda_kernel_instance_t*, cudaEvent_t), dim3 gDim, dim3 bDim) { cudaStream_t s; cudaEvent_t e = $cuda_event_create(); $cuda_kernel_instance_node_t *newNode = $cuda_kernel_instance_node_create(); if (stream == NULL) { e->numInstances = $cuda_current_context.numStreams + 1; e->instances = $cuda_all_most_recent_kernels(); s = $cuda_current_context.nullStream; } else { e->numInstances = 2; //printf("mallocing instance list b\n"); e->instances = ($cuda_kernel_instance_t**)malloc(2 * sizeof($cuda_kernel_instance_t*)) ; e->instances[0] = stream->mostRecent->instance; e->instances[1] = $cuda_current_context.nullStream->mostRecent->instance; s = stream; } $assert((s->usable)); newNode->instance = $cuda_kernel_instance_create(gDim, bDim); newNode->next = s->mostRecent; s->mostRecent = newNode; s->mostRecent->instance->process = $spawn k(s->mostRecent->instance, e); } /* called by kernel processes. wait on the given event, then update * the status of the calling kernel to indicate it has finished waiting */ void $cuda_wait_in_queue ($cuda_kernel_instance_t *this, cudaEvent_t e) { $cuda_event_wait(e); $cuda_event_destroy(e); this->status = $cuda_kernel_status_running; } /* called by kernel processes. update the status of the calling kernel * to indicate that it has completed execution */ void $cuda_kernel_finish($cuda_kernel_instance_t *k) { k->status = $cuda_kernel_status_finished; } /* TODO: Finish check_data_race */ void $cuda_barrier($cuda_kernel_instance_t *k, int kernel_id, $barrier g) { $check_data_race(k, kernel_id); // We have to push a new read and write set before the barrier call to ignore it's reads and writes $read_set_push(); $write_set_push(); void captured_clear_mems(){ $clear_all_mem_sets(k); } $barrier_call_execute(g, captured_clear_mems); $read_set_pop(); $write_set_pop(); } /* TODO: Finish read_write_sets */ $atomic_f void $check_data_race($cuda_kernel_instance_t *k, int cur_tid) { //printf("Current id: %d\n", cur_tid); $mem out_s0 = $mem_empty(); $mem out_s1 = $mem_empty(); $mem cur_mw = $write_set_pop(); $mem cur_mr = $read_set_pop(); // Update current R/W sets k->write_sets[cur_tid] = cur_mw; k->read_sets[cur_tid] = cur_mr; /* printf("CHECKING DATA RACE %d [\n", cur_tid); for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) { printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]); printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]); } printf("]\n"); */ // Check data race for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) { if (tmp_tid == cur_tid) continue; $mem tmp_mr = k->read_sets[tmp_tid]; $mem tmp_mw = k->write_sets[tmp_tid]; $assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1), "Data-race detected: %p read by thread %d intersects %p written by thread %d\n", out_s0, cur_tid, out_s1, tmp_tid); $assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1), "Data-race detected: %p read by thread %d intersects %p written by thread %d\n", out_s0, cur_tid, out_s1, tmp_tid); $assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1), "Data-race detected: %p written by thread %d intersects %p written by thread %d\n", out_s0, cur_tid, out_s1, tmp_tid); } // Update current R/W sets //k->write_sets[cur_tid] = $mem_empty(); //k->read_sets[cur_tid] = $mem_empty(); $read_set_push(); $write_set_push(); } /* Clears read and write memory sets of the given thread */ void $clear_mem_sets($cuda_kernel_instance_t *k, int cur_tid) { k->write_sets[cur_tid] = $mem_empty(); k->read_sets[cur_tid] = $mem_empty(); } void $clear_all_mem_sets($cuda_kernel_instance_t *k){ for(int i = 0; i < k->size; i++) $clear_mem_sets(k, i); } /* Publishes current read a write sets to global arrays. Local sets are not cleared */ void $publish($cuda_kernel_instance_t *k, int cur_tid) { k->write_sets[cur_tid] = $write_set_peek(); k->read_sets[cur_tid] = $read_set_peek(); } int is_valid_width(int width){ for(int i = 32; i > 1; i /= 2){ if(width == i){ return 1; } } return 0; } int exchange_data(unsigned mask, int var, int srcLane, int tid, $comm comm, $gbarrier* warpBarriers){ $read_set_push(); $write_set_push(); int laneID = tid % 32; int warpID = tid / 32; int currLaneInMask = (mask >> laneID) & 1; int srcLaneInMask = (mask >> srcLane) & 1; int dest; $gbarrier gbarrier = warpBarriers[warpID]; int numActiveThreads = 0; for (int i = 0; i < $get_nprocs(gbarrier); i++){ numActiveThreads += (mask >> i) & 1; } if(currLaneInMask){ $barrier barrier = $barrier_create($here, gbarrier, laneID); if(srcLaneInMask){ $message request_message = $message_pack(tid, srcLane, 0, &tid, sizeof(int)); $comm_enqueue(comm, request_message); } $barrier_call_subset(barrier, numActiveThreads); while ($comm_probe(comm, $COMM_ANY_SOURCE, 0)){ $yield(); $message recv_request = $comm_dequeue(comm, $COMM_ANY_SOURCE, 0); $message_unpack(recv_request, &dest, sizeof(int)); $message send_message = $message_pack(tid, dest, 1, &var, sizeof(int)); $comm_enqueue(comm, send_message); } if(srcLaneInMask){ $yield(); $message recv_message = $comm_dequeue(comm, srcLane, 1); $message_unpack(recv_message, &var, sizeof(int)); } else{ $havoc(&var); } $barrier_destroy(barrier); } else{ if(laneID != srcLane) $havoc(&var); } $read_set_pop(); $write_set_pop(); return var; } int _cuda__shfl_sync(unsigned mask, int var, int srcLane, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){ $assert(is_valid_width(width)); int subWarpID = tid / width; srcLane = srcLane % width + subWarpID * width; if(srcLane >= numThreads){ int lastSubWarpSize = numThreads % width; srcLane = srcLane % lastSubWarpSize + subWarpID * width; } return exchange_data(mask, var, srcLane, tid, comm, warpBarriers); } int _cuda__shfl_up_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){ $assert(is_valid_width(width)); int subWarpLaneID = tid % width; int srcLane = tid; if (subWarpLaneID - delta >= 0){ srcLane = tid - delta; } return exchange_data(mask, var, srcLane, tid, comm, warpBarriers); } int _cuda__shfl_down_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){ $assert(is_valid_width(width)); int subWarpLaneID = tid % width; int srcLane = tid; if (subWarpLaneID + delta < width && tid + delta < numThreads) { srcLane = tid + delta; } return exchange_data(mask, var, srcLane, tid, comm, warpBarriers); } int _cuda__shfl_xor_sync(unsigned mask, int var, int laneMask, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){ $assert(is_valid_width(width)); int laneID = tid % 32; int warpID = tid / 32; int subWarpID = tid / width; int srcLane = laneID ^ laneMask; if(!(srcLane / 32 == warpID && srcLane / width <= subWarpID && srcLane < numThreads)){ srcLane = tid; } return exchange_data(mask, var, srcLane, tid, comm, warpBarriers); }