Context Navigation

newCudaMockup.cvl@ 53f9d15

1.23 2.0 main test-branch

Last change on this file since 53f9d15 was 874f9d1, checked in by Zane Greenholt <zgrnhlt@…>, 3 years ago

Added vecAdd example to newCudaMockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5715 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 11.2 KB

Rev	Line
[29eb398]	1	/**
	2	* TODO:
	3	* - implement cudaMemset and cudaMemsetAsync
	4	* - flesh out basic structure of cuda kernel:
	5	* - spawn gridDim blocks
	6	* - spawn blockDim threads
	7	* Alternatively, spawn blockDim warps and then spawn warp's threads
	8	* - wait for blocks to finish
	9	* - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
	10	* - Add in data race checking support and implement atomicAdd for integers
[874f9d1]	11	* - Handle dependencies at atomic blocks (replace some with local blocks if possible)
	12	* - Create device scope for cudaMallocs
[29eb398]	13	*/
	14
[30cb1de]	15	#include <concurrency.cvh>
	16	#include <stdlib.h>
[874f9d1]	17	#include <stdio.h>
[30cb1de]	18	#include <stdbool.h>
	19	#include <string.h>
	20
[109d05e]	21	enum cudaError {
	22	cudaSuccess
	23	};
	24	typedef enum cudaError cudaError_t;
	25
	26	typedef enum cudaMemcpyKind {
	27	cudaMemcpyHostToHost,
	28	cudaMemcpyHostToDevice,
	29	cudaMemcpyDeviceToHost,
	30	cudaMemcpyDeviceToDevice,
	31	cudaMemcpyDefault
	32	} cudaMemcpyKind;
	33
[30cb1de]	34	typedef struct {
	35	unsigned int x, y, z;
	36	} dim3;
	37
	38	/* used to represent a location in a three dimensional grid
	39	*/
	40	typedef struct {
	41	unsigned int x, y, z;
	42	} uint3;
	43
	44	typedef struct $cuda_op_state* $cuda_op_state_t;
	45	struct $cuda_op_state {
[109d05e]	46	_Bool start;
[e76acca]	47	$proc op;
[30cb1de]	48	};
[109d05e]	49
[30cb1de]	50	typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
	51	struct $cuda_op_state_node {
[e76acca]	52	$cuda_op_state_t opState;
	53	$cuda_op_state_node_t next;
[30cb1de]	54	};
[109d05e]	55
[30cb1de]	56	typedef struct cudaStream* cudaStream_t;
	57	typedef struct $cuda_stream_node* $cuda_stream_node_t;
	58	struct cudaStream {
[e76acca]	59	$cuda_op_state_node_t head;
	60	$cuda_op_state_node_t tail;
[109d05e]	61	int numOps;
	62	$cuda_stream_node_t containingNode;
	63	_Bool alive;
[30cb1de]	64	};
[109d05e]	65	cudaStream_t $cuda_default_stream;
	66
[30cb1de]	67	struct $cuda_stream_node {
	68	cudaStream_t stream;
[109d05e]	69	$cuda_stream_node_t prev;
	70	$cuda_stream_node_t next;
[30cb1de]	71	};
[109d05e]	72
	73	typedef struct $cuda_context {
[30cb1de]	74	$cuda_stream_node_t head; //list of streams
[109d05e]	75	int numStreams;
	76	} $cuda_context;
	77	$cuda_context $cuda_global_context;
	78
	79	// Helper function to get the default stream if passed NULL, and just returns stream otherwise
	80	cudaStream_t $default_stream_if_null(cudaStream_t stream) {
	81	return stream == NULL ? $cuda_default_stream : stream;
	82	}
	83
	84	$cuda_stream_node_t $create_new_stream_node() {
	85	cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
	86	newStream->head = NULL;
	87	newStream->tail = NULL;
	88	newStream->numOps = 0;
	89	newStream->alive = true;
	90
	91	$cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
	92	newHead->stream = newStream;
	93	newStream->containingNode = newHead;
	94	newHead->prev = NULL;
	95	newHead->next = NULL;
[30cb1de]	96
	97	return newHead;
[109d05e]	98	}
	99
[30cb1de]	100	/**
	101	* TODO:
	102	* - test
	103	* - atomic?
	104	*/
[109d05e]	105	cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
	106	// Create new stream node in linked list
	107	$cuda_stream_node_t newHead = $create_new_stream_node();
	108	newHead->next = $cuda_global_context.head;
	109	$cuda_global_context.head->prev = newHead;
	110
	111	// Update cuda context's head to be the new node we created
	112	$cuda_global_context.head = newHead;
	113	$cuda_global_context.numStreams++;
	114
[30cb1de]	115	return cudaSuccess;
[109d05e]	116	}
	117
[30cb1de]	118	/**
	119	* TODO:
	120	* - test
	121	* - atomic?
	122	*/
[109d05e]	123	cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
	124	stream = $default_stream_if_null(stream);
	125	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
	126	$when(stream->head == NULL) return cudaSuccess;
	127	}
	128
	129	$proc $destroy_stream_node($cuda_stream_node_t node) {
[874f9d1]	130	$proc lastOpProc = $proc_null;
	131	cudaStream_t stream = node->stream;
[30cb1de]	132
[874f9d1]	133	$local_start();
[30cb1de]	134	if (node->prev != NULL) {
	135	node->prev->next = node->next;
	136	}
	137	if (node->next != NULL) {
	138	node->next->prev = node->prev;
	139	}
	140	free(node);
	141
	142	stream->alive = false;
	143	if(stream->tail != NULL)
	144	lastOpProc = stream->tail->opState->op;
[874f9d1]	145	$local_end();
[109d05e]	146
[30cb1de]	147	void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
	148	$wait(lastOpProc);
	149	free(stream);
[109d05e]	150	}
[30cb1de]	151
	152	return $spawn $destroy_stream_when_complete(lastOpProc, stream);
[109d05e]	153	}
	154
	155	// TODO: atomic
	156	cudaError_t cudaStreamDestroy(cudaStream_t stream) {
	157	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
[30cb1de]	158	$assert(stream->alive, "Attempt to destroy an already destroyed stream");
[109d05e]	159	$destroy_stream_node(stream->containingNode);
	160	return cudaSuccess;
	161	}
	162
[30cb1de]	163	/**
	164	* Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
	165	*/
	166	$cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
	167	$cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
	168	newOpState->start = false;
	169	newOpState->op = $self;
[109d05e]	170
[30cb1de]	171	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof(struct $cuda_op_state_node));
	172	newOpStateNode->opState = newOpState;
	173	newOpStateNode->next = NULL;
	174
[874f9d1]	175	$local_start();
	176	stream = $default_stream_if_null(stream);
	177	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
[109d05e]	178
[874f9d1]	179	if (stream->tail == NULL) {
	180	stream->head = newOpStateNode;
	181	stream->tail = newOpStateNode;
	182	newOpState->start = true;
	183	} else {
	184	stream->tail->next = newOpStateNode;
	185	stream->tail = newOpStateNode;
[30cb1de]	186	}
[874f9d1]	187	stream->numOps++;
	188	*enqueuedFlag = true;
	189	$local_end();
[30cb1de]	190
	191	return newOpState;
[109d05e]	192	}
	193
	194	void $stream_dequeue(cudaStream_t stream) {
	195	stream = $default_stream_if_null(stream);
	196	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
[30cb1de]	197
[874f9d1]	198	$local_start();
[30cb1de]	199	if (stream->head == stream->tail) {
	200	stream->tail = NULL;
	201	}
[109d05e]	202
[30cb1de]	203	$cuda_op_state_node_t oldHead = stream->head;
	204	stream->head = oldHead->next;
	205	if (stream->head != NULL) {
	206	stream->head->opState->start = true;
	207	}
[109d05e]	208
[30cb1de]	209	stream->numOps--;
	210	free(oldHead->opState);
	211	free(oldHead);
[874f9d1]	212	$local_end();
[109d05e]	213	}
	214
[30cb1de]	215	void $cuda_memcpy_proc(void* dst, const void* src, size_t count, _Bool* enqueuedFlag, cudaStream_t stream) {
	216	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
	217	$when(opState->start);
[109d05e]	218	memcpy(dst, src, count);
	219	$stream_dequeue(stream);
	220	}
	221
	222	cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
[30cb1de]	223	if (kind == cudaMemcpyHostToHost) {
[29eb398]	224	memcpy(dst, src, count);
[109d05e]	225	} else {
[30cb1de]	226	_Bool enqueuedFlag = false;
	227	$proc memcpyProc = $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, $cuda_default_stream);
	228	$when(enqueuedFlag);
[e76acca]	229	if (kind != cudaMemcpyDeviceToDevice) {
[109d05e]	230	$wait(memcpyProc);
	231	}
	232	}
	233
	234	return cudaSuccess;
	235	}
	236
	237	cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
	238	cudaMemcpyKind kind, cudaStream_t stream) {
[e76acca]	239	if (kind == cudaMemcpyHostToHost) {
[29eb398]	240	memcpy(dst, src, count);
[109d05e]	241	} else {
[30cb1de]	242	_Bool enqueuedFlag = false;
	243	$spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, stream);
	244	$when(enqueuedFlag);
[109d05e]	245	}
	246
	247	return cudaSuccess;
	248	}
	249
	250	cudaError_t cudaDeviceSynchronize() {
[30cb1de]	251	$proc* opsToWaitOn;
[109d05e]	252	int numOps = 0;
	253
	254	$atomic {
[30cb1de]	255	opsToWaitOn = ($proc) malloc(sizeof($proc) $cuda_global_context.numStreams);
	256
[109d05e]	257	for ($cuda_stream_node_t node = $cuda_global_context.head;
[30cb1de]	258	node != NULL;
	259	node = node->next) {
	260	if (node->stream->tail != NULL) {
	261	opsToWaitOn[numOps] = node->stream->tail->opState->op;
	262	numOps++;
	263	}
	264	}
	265	}
[e76acca]	266	$waitall(opsToWaitOn, numOps);
[109d05e]	267
[30cb1de]	268	return cudaSuccess;
[109d05e]	269	}
	270
[30cb1de]	271	/**
	272	* Only called at start of program
	273	*/
[109d05e]	274	void $cuda_setup() {
[30cb1de]	275	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
	276	$cuda_default_stream = defaultStreamNode->stream;
[109d05e]	277
[30cb1de]	278	$cuda_global_context.head = defaultStreamNode;
	279	$cuda_global_context.numStreams = 1;
[109d05e]	280	}
	281
[30cb1de]	282	/**
	283	* Only called at end of program
	284	*/
[109d05e]	285	void $cuda_teardown() {
[30cb1de]	286	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
	287	$wait(destructor);
	288	}
	289
	290	// Helper function
	291	int $dim3_index(dim3 size, uint3 location) {
	292	return location.x + size.x * (location.y + size.y * location.z);
	293	}
	294
	295	// Helper function
	296	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
	297	return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
	298	}
	299
	300	void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
	301	//TODO: calculate length and index, replace this function in the kernel
[874f9d1]	302	//$local_start();
[30cb1de]	303	int length = dim.x * dim.y * dim.z;
	304	$proc proc_array[length];
	305	$range rx = 0 .. dim.x - 1;
	306	$range ry = 0 .. dim.y - 1;
	307	$range rz = 0 .. dim.z - 1;
	308	$domain(3) dom = ($domain){rx, ry, rz};
[874f9d1]	309	#For some reason there is depends on all here
[30cb1de]	310	$for(int x,y,z : dom){
	311	uint3 id = { x, y, z };
	312	int index = $dim3_index(dim, id);
	313	proc_array[index] = $spawn spawningFunction(id);
	314	}
[874f9d1]	315	//$local_end();
[30cb1de]	316	$waitall(proc_array,length);
[109d05e]	317	}
	318
	319	void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
[4bd0090]	320	const float A, const float B, float *C, int numElements) {
[30cb1de]	321	void _cuda_block(uint3 blockIdx) {
	322	int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
	323	$scope _block_root = $here;
	324	$gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
	325	void _cuda_thread(uint3 threadIdx) {
	326	int _cuda_tid = $dim3_index(blockDim, threadIdx);
	327	int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
	328	$barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
[4bd0090]	329	$local_start();
	330	// Kernel definition start
	331
	332	int i = blockDim.x * blockIdx.x + threadIdx.x;
	333
	334	if (i < numElements)
	335	{
	336	C[i] = A[i] + B[i];
	337	}
	338
	339	// Kernel definition end
	340	$local_end();
[30cb1de]	341	$barrier_destroy(_cuda_thread_barrier);
	342	}
	343	$cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
	344	$gbarrier_destroy(_cuda_block_barrier);
	345	}
	346	$cuda_run_and_wait_on_procs(gridDim, _cuda_block);
[109d05e]	347	}
[30cb1de]	348
[e76acca]	349	void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
[4bd0090]	350	_Bool* enqueuedFlag, cudaStream_t _cudaStream,
	351	const float A, const float B, float *C, int numElements) {
[30cb1de]	352	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, _cudaStream);
	353	$when(opState->start);
[4bd0090]	354	_cuda_kernel_1(gridDim, blockDim, _cudaMemSize, A, B, C, numElements);
[30cb1de]	355	$stream_dequeue(_cudaStream);
[109d05e]	356	}
	357
[4bd0090]	358	$input int N;
	359	$assume (N > 0);
	360	$input float A[N];
	361	$input float B[N];
	362
[109d05e]	363	void _civl_main() {
[4bd0090]	364	int size = N * sizeof(float);
[874f9d1]	365	int numBlocks = 2;
	366	int numThreads = N%2 == 0? N/2 : (N+1)/2;
[4bd0090]	367
[874f9d1]	368	float* cuda_A;
[4bd0090]	369	// cudaMalloc((void **)&cuda_A, size);
	370	{
	371	cuda_A = (float *) malloc(size);
	372	}
[874f9d1]	373	cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
[4bd0090]	374
[874f9d1]	375	float* cuda_B;
[4bd0090]	376	// cudaMalloc((void **)&cuda_B, size);
[30cb1de]	377	{
[4bd0090]	378	cuda_B = (float *) malloc(size);
	379	}
[874f9d1]	380	cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
[4bd0090]	381
[874f9d1]	382	float* cuda_C;
[4bd0090]	383	// cudaMalloc((void **)&cuda_C, size);
	384	{
	385	cuda_C = (float *) malloc(size);
	386	}
	387
[874f9d1]	388	{ // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
	389	dim3 gridDim = {numBlocks, 1, 1};
	390	dim3 blockDim = {numThreads, 1, 1};
[30cb1de]	391	_Bool enqueuedFlag = false;
[4bd0090]	392	$spawn $proc_kernel_1(gridDim, blockDim, 0, &enqueuedFlag, NULL, cuda_A, cuda_B, cuda_C, N);
[30cb1de]	393	$when(enqueuedFlag);
	394	}
[874f9d1]	395
	396	//Checking correctness
	397	float* C = (float *)malloc(size);
	398
	399	cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
	400
	401	for(int i = 0; i < N; i++)
	402	$assert(C[i] == A[i] + B[i]);
	403
	404	free(C);
	405
	406	//cudaFree(cuda_A);...
	407	free(cuda_A);
	408	free(cuda_B);
	409	free(cuda_C);
[109d05e]	410	}
	411
	412	int main() {
[30cb1de]	413	$cuda_setup();
	414	_civl_main();
	415	$cuda_teardown();
[e76acca]	416	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format