/** * TODO: * - implement cudaMemset and cudaMemsetAsync * - flesh out basic structure of cuda kernel: * - spawn gridDim blocks * - spawn blockDim threads * Alternatively, spawn blockDim warps and then spawn warp's threads * - wait for blocks to finish * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier. * - Add in data race checking support and implement atomicAdd for integers */ enum cudaError { cudaSuccess }; typedef enum cudaError cudaError_t; typedef enum cudaMemcpyKind { cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, cudaMemcpyDefault } cudaMemcpyKind; typedef struct $cuda_op_state { _Bool start; $proc op; }* $cuda_op_state_t; typdef struct $cuda_op_state_node { $cuda_op_state_t opState; $cuda_op_state_node_t next; }* $cuda_op_state_node_t; typedef struct cudaStream { $cuda_op_state_node_t head; $cuda_op_state_node_t tail; int numOps; $cuda_stream_node_t containingNode; _Bool alive; }* cudaStream_t; cudaStream_t $cuda_default_stream; typedef struct $cuda_stream_node { $cuda_stream_t stream; $cuda_stream_node_t prev; $cuda_stream_node_t next; }* $cuda_stream_node_t; typedef struct $cuda_context { $cuda_stream_node_t head; int numStreams; } $cuda_context; $cuda_context $cuda_global_context; void $cuda_op_wait_start($cuda_op_state_t cudaOpState) { $when(cudaOpState->start) cudaOpState->op = $this; } // Helper function to get the default stream if passed NULL, and just returns stream otherwise cudaStream_t $default_stream_if_null(cudaStream_t stream) { return stream == NULL ? $cuda_default_stream : stream; } $cuda_stream_node_t $create_new_stream_node() { cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream)); newStream->head = NULL; newStream->tail = NULL; newStream->numOps = 0; newStream->alive = true; $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node)); newHead->stream = newStream; newStream->containingNode = newHead; newHead->prev = NULL; newHead->next = NULL; } // TODO: atomic cudaError_t cudaStreamCreate(cudaStream_t * pStream) { // Create new stream node in linked list $cuda_stream_node_t newHead = $create_new_stream_node(); newHead->next = $cuda_global_context.head; $cuda_global_context.head->prev = newHead; // Update cuda context's head to be the new node we created $cuda_global_context.head = newHead; $cuda_global_context.numStreams++; return cudaSuccess } cudaError_t cudaStreamSynchronize(cudaStream_t stream) { stream = $default_stream_if_null(stream); $assert(stream->alive, "Attempt to synchronize with a destroyed stream"); $when(stream->head == NULL) return cudaSuccess; } $proc $destroy_stream_node($cuda_stream_node_t node) { if (node->prev != NULL) { node->prev->next = node->next; } if (node->next != NULL) { node->next->prev = node->prev; } free(node); node->stream->alive = false; void $destroy_stream_when_complete(cudaStream_t stream) { $when(stream->head==NULL) free(stream); } return $spawn $destroy_when_complete(node->stream); } // TODO: atomic cudaError_t cudaStreamDestroy(cudaStream_t stream) { $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream"); $assert(stream->alive, "Attempt to destroy an already destroyed stream); $destroy_stream_node(stream->containingNode); return cudaSuccess; } // TODO: atomic $cuda_op_state_t $stream_enqueue(cudaStream_t stream) { stream = $default_stream_if_null(stream); $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream"); $cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state)); newOpState->start = false; newOpState->op = NULL; $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof($cuda_op_state_node)); newOpStateNode->opState = newOpState; newOpStateNode->next = NULL; if (stream->tail == NULL) { stream->head = newOpStateNode; stream->tail = newOpStateNode; newOpState->start = true; } else { stream->tail->next = newOpStateNode; stream->tail = newOpStateNode; } stream->numOps++; return newOpState; } // TODO: atomic void $stream_dequeue(cudaStream_t stream) { stream = $default_stream_if_null(stream); $assert(stream->head != NULL, "Attempt to dequeue an empty stream"); if (stream->head == stream->tail) { stream->tail = NULL; } $cuda_op_state_node_t oldHead = stream->head; stream->head = oldHead->next; if (stream->head != NULL) { stream->head->op->start = true; } free(oldHead->opState); stream->numOps--; } void $cuda_memcpy_proc(void* dst, const void* src, size_t count, $cuda_op_state_t opState, cudaStream_t stream) { $cuda_op_wait_start(opState); memcpy(dst, src, count); $stream_dequeue(stream); } cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) { if (kind == cudaHostToHost) { memcpy(dst, src, count); } else { $cuda_op_state_t opState = $stream_enqueue($cuda_default_stream); $proc memcpyProc = $spawn $cuda_memcpy_proc(dst, src, count, opState, $cuda_default_stream); if (kind != cudaMemcpyDeviceToDevice) { $wait(memcpyProc); } } return cudaSuccess; } cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) { if (kind == cudaMemcpyHostToHost) { memcpy(dst, src, count); } else { $cuda_op_state_t opState = $stream_enqueue(stream); $spawn $cuda_memcpy_proc(dst, src, count, opState, stream); } return cudaSuccess; } cudaError_t cudaDeviceSynchronize() { $proc opsToWaitOn[] = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams); int numOps = 0; $atomic { for ($cuda_stream_node_t node = $cuda_global_context.head; node != NULL; node = node->next) { if (node->stream->tail != NULL) { opsToWaitOn[numOps] = node->stream->tail->opState->op; numOps++; } } } $waitall(opsToWaitOn, numOps); return cudaSuccess; } void $cuda_setup() { $cuda_stream_node_t defaultStreamNode = $create_new_stream_node(); $cuda_default_stream = defaultStreamNode->stream; $cuda_global_context.head = defaultStreamNode; $cuda_global_context.count = 1; } void $cuda_teardown() { $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode); $wait(destructor); } void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size, int x) { } void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize, cudaStream_t _cudaStream, $cuda_op_state_t _cudaOpState, int x) { $cuda_op_wait_start(_cudaOpState); _cuda_kernel_1(gridDim, blockDim, _cudaMemSize, x); $stream_dequeue(_cudaStream); } void _civl_main() { { $cuda_op_state_t _newOpState = $stream_enqueue(stream); $spawn $proc_kernel_1(gridDim, blockDim, 0, stream, _newOpState, x); } } int main() { $cuda_setup(); _civl_main(); $cuda_teardown(); }