Context Navigation

newCudaMockup.cvl@ 1aaefd4

main test-branch

Last change on this file since 1aaefd4 was 7e3e7af, checked in by Alex Wilton <awilton@…>, 3 years ago

Fixed a small POR issue in cuda mockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5749 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 18.7 KB

Rev	Line
[29eb398]	1	/**
	2	* TODO:
	3	* - implement cudaMemset and cudaMemsetAsync
	4	* - flesh out basic structure of cuda kernel:
	5	* - spawn gridDim blocks
	6	* - spawn blockDim threads
	7	* Alternatively, spawn blockDim warps and then spawn warp's threads
	8	* - wait for blocks to finish
	9	* - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
	10	* - Add in data race checking support and implement atomicAdd for integers
[874f9d1]	11	* - Handle dependencies at atomic blocks (replace some with local blocks if possible)
[29eb398]	12	*/
	13
[30cb1de]	14	#include <concurrency.cvh>
[6dd0322]	15	#include <comm.cvh>
[30cb1de]	16	#include <stdlib.h>
[874f9d1]	17	#include <stdio.h>
[30cb1de]	18	#include <stdbool.h>
	19	#include <string.h>
	20
[6dd0322]	21	///////////
	22	// Types //
	23	///////////
	24
[109d05e]	25	enum cudaError {
	26	cudaSuccess
	27	};
	28	typedef enum cudaError cudaError_t;
	29
	30	typedef enum cudaMemcpyKind {
	31	cudaMemcpyHostToHost,
	32	cudaMemcpyHostToDevice,
	33	cudaMemcpyDeviceToHost,
	34	cudaMemcpyDeviceToDevice,
	35	cudaMemcpyDefault
	36	} cudaMemcpyKind;
	37
[30cb1de]	38	typedef struct {
	39	unsigned int x, y, z;
	40	} dim3;
	41
	42	/* used to represent a location in a three dimensional grid
	43	*/
	44	typedef struct {
	45	unsigned int x, y, z;
	46	} uint3;
	47
	48	typedef struct $cuda_op_state* $cuda_op_state_t;
	49	struct $cuda_op_state {
[109d05e]	50	_Bool start;
[e76acca]	51	$proc op;
[30cb1de]	52	};
[109d05e]	53
[30cb1de]	54	typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
	55	struct $cuda_op_state_node {
[e76acca]	56	$cuda_op_state_t opState;
	57	$cuda_op_state_node_t next;
[30cb1de]	58	};
[109d05e]	59
[30cb1de]	60	typedef struct cudaStream* cudaStream_t;
	61	typedef struct $cuda_stream_node* $cuda_stream_node_t;
	62	struct cudaStream {
[e76acca]	63	$cuda_op_state_node_t head;
	64	$cuda_op_state_node_t tail;
[109d05e]	65	int numOps;
	66	$cuda_stream_node_t containingNode;
	67	_Bool alive;
[30cb1de]	68	};
[109d05e]	69
[30cb1de]	70	struct $cuda_stream_node {
	71	cudaStream_t stream;
[109d05e]	72	$cuda_stream_node_t prev;
	73	$cuda_stream_node_t next;
[30cb1de]	74	};
[109d05e]	75
	76	typedef struct $cuda_context {
[30cb1de]	77	$cuda_stream_node_t head; //list of streams
[109d05e]	78	int numStreams;
	79	} $cuda_context;
	80
[0ffc6c8]	81	typedef struct $cuda_memcpy_data {
	82	void* dst;
	83	const void* src;
	84	size_t count;
	85	cudaMemcpyKind kind;
	86	} $cuda_memcpy_data;
	87
	88	typedef struct $cuda_kernel_1_data {
	89	dim3 gridDim;
	90	dim3 blockDim;
	91	size_t $cudaMemSize;
	92	cudaStream_t $cudaStream;
	93	const float* A;
	94	const float* B;
	95	float* C;
	96	int numElements;
	97	} $cuda_kernel_1_data;
	98
[6dd0322]	99	//////////////////////
	100	// Global Variables //
	101	//////////////////////
	102
	103	$gcomm $cuda_gcomm = $gcomm_create($here, 2);
	104	const int $CUDA_PLACE_HOST = 0;
	105	const int $CUDA_PLACE_DEVICE = 1;
[0ffc6c8]	106	$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
[6dd0322]	107
	108	/**
	109	* Tags used for message-passing between host and device
	110	*/
[0ffc6c8]	111
	112	enum $cuda_tag {
	113	// Predefined tags
	114	$CUDA_TAG_TEARDOWN,
	115	$CUDA_TAG_SCOPE_REQUEST,
	116	$CUDA_TAG_cudaFree,
	117	$CUDA_TAG_cudaMemcpy,
	118	$CUDA_TAG_cudaMemcpyAsync,
	119	// Generated tags (by transformer)
	120	$CUDA_TAG_LAUNCH_kernel_1
	121	};
[6dd0322]	122
	123	////////////////////////////////////////////
	124	// CUDA API Functions (For Host-use Only) //
	125	////////////////////////////////////////////
	126
[0ffc6c8]	127	/*
[6dd0322]	128	cudaError_t cudaMalloc(void** devPtr, size_t size) {
	129	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
	130	$message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
	131	$message_unpack(response, devPtr, sizeof(void*));
[0ffc6c8]	132
	133	return cudaSuccess;
[6dd0322]	134	}
[0ffc6c8]	135	*/
[6dd0322]	136
[0ffc6c8]	137	$scope $cuda_host_request_device_scope() {
	138	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
	139	$message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
	140	$scope result;
	141	$message_unpack(response, &result, sizeof($scope));
[6dd0322]	142
[0ffc6c8]	143	return result;
	144	}
	145
	146	cudaError_t cudaFree(void* devPtr) {
	147	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
	148	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
	149
	150	return cudaSuccess;
	151	}
	152
	153	void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
	154	if (kind == cudaMemcpyHostToHost) {
[6dd0322]	155	memcpy(dst, src, count);
	156	} else {
[0ffc6c8]	157	$cuda_memcpy_data args;
[6dd0322]	158	args.dst = dst;
	159	args.src = src;
	160	args.count = count;
	161	args.kind = kind;
[0ffc6c8]	162
	163	int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
	164
	165	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
	166	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
[6dd0322]	167	}
[0ffc6c8]	168	}
	169
	170	cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
	171	$cuda_helper_host_memcpy(dst, src, count, kind, false);
[6dd0322]	172	return cudaSuccess;
[109d05e]	173	}
	174
[0ffc6c8]	175	cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
	176	cudaMemcpyKind kind, cudaStream_t stream) {
	177	$cuda_helper_host_memcpy(dst, src, count, kind, true);
	178	return cudaSuccess;
[109d05e]	179	}
	180
[30cb1de]	181	/**
	182	* TODO:
	183	* - test
	184	* - atomic?
	185	*/
[0ffc6c8]	186	/*
[109d05e]	187	cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
	188	// Create new stream node in linked list
	189	$cuda_stream_node_t newHead = $create_new_stream_node();
	190	newHead->next = $cuda_global_context.head;
	191	$cuda_global_context.head->prev = newHead;
	192
	193	// Update cuda context's head to be the new node we created
	194	$cuda_global_context.head = newHead;
	195	$cuda_global_context.numStreams++;
	196
[30cb1de]	197	return cudaSuccess;
[109d05e]	198	}
[0ffc6c8]	199	*/
[109d05e]	200
[30cb1de]	201	/**
	202	* TODO:
	203	* - test
	204	* - atomic?
	205	*/
[0ffc6c8]	206	/*
[109d05e]	207	cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
	208	stream = $default_stream_if_null(stream);
	209	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
	210	$when(stream->head == NULL) return cudaSuccess;
	211	}
[0ffc6c8]	212	*/
[109d05e]	213
	214	// TODO: atomic
[0ffc6c8]	215	/*
[109d05e]	216	cudaError_t cudaStreamDestroy(cudaStream_t stream) {
	217	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
[30cb1de]	218	$assert(stream->alive, "Attempt to destroy an already destroyed stream");
[109d05e]	219	$destroy_stream_node(stream->containingNode);
	220	return cudaSuccess;
	221	}
[0ffc6c8]	222	*/
[109d05e]	223
[0ffc6c8]	224	/*
[109d05e]	225	cudaError_t cudaDeviceSynchronize() {
[30cb1de]	226	$proc* opsToWaitOn;
[109d05e]	227	int numOps = 0;
	228
	229	$atomic {
[30cb1de]	230	opsToWaitOn = ($proc) malloc(sizeof($proc) $cuda_global_context.numStreams);
	231
[109d05e]	232	for ($cuda_stream_node_t node = $cuda_global_context.head;
[30cb1de]	233	node != NULL;
	234	node = node->next) {
	235	if (node->stream->tail != NULL) {
	236	opsToWaitOn[numOps] = node->stream->tail->opState->op;
	237	numOps++;
	238	}
	239	}
	240	}
[e76acca]	241	$waitall(opsToWaitOn, numOps);
[109d05e]	242
[30cb1de]	243	return cudaSuccess;
[109d05e]	244	}
[0ffc6c8]	245	*/
	246
	247	void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream,
	248	const float* A, const float* B, float* C, int numElements) {
	249	$cuda_kernel_1_data args;
	250	args.gridDim = gridDim;
	251	args.blockDim = blockDim;
	252	args.$cudaMemSize = $cudaMemSize;
	253	args.$cudaStream = $cudaStream;
	254	args.A = A;
	255	args.B = B;
	256	args.C = C;
	257	args.numElements = numElements;
	258
	259	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
	260	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
[109d05e]	261	}
	262
[6dd0322]	263	/////////////////
	264	// CUDA "file" //
	265	/////////////////
	266
	267	void _cuda_main() {
	268
	269	//////////////////////
	270	// Device Variables //
	271	//////////////////////
	272
[0ffc6c8]	273	$scope $cuda_scope = $here;
	274
	275	$comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
[6dd0322]	276	$cuda_context $cuda_global_context;
	277	cudaStream_t $cuda_default_stream;
	278
[0ffc6c8]	279	/////////////////////////////////
	280	// Context & Stream Management //
	281	/////////////////////////////////
	282
	283	// Helper function to get the default stream if passed NULL, and just returns stream otherwise
	284	cudaStream_t $default_stream_if_null(cudaStream_t stream) {
	285	return stream == NULL ? $cuda_default_stream : stream;
	286	}
	287
	288	$cuda_stream_node_t $create_new_stream_node() {
	289	cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
	290	newStream->head = NULL;
	291	newStream->tail = NULL;
	292	newStream->numOps = 0;
	293	newStream->alive = true;
	294
	295	$cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
	296	newHead->stream = newStream;
	297	newStream->containingNode = newHead;
	298	newHead->prev = NULL;
	299	newHead->next = NULL;
	300
	301	return newHead;
	302	}
	303
	304	//@ depends_on \nothing;
	305	$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
	306	$proc lastOpProc = $proc_null;
	307	cudaStream_t stream = node->stream;
	308
	309	if (node->prev != NULL) {
	310	node->prev->next = node->next;
	311	}
	312	if (node->next != NULL) {
	313	node->next->prev = node->prev;
	314	}
	315	free(node);
	316
	317	stream->alive = false;
	318	if(stream->tail != NULL)
	319	lastOpProc = stream->tail->opState->op;
	320
	321	void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
	322	$wait(lastOpProc);
	323	free(stream);
	324	}
	325
	326	return $spawn $destroy_stream_when_complete(lastOpProc, stream);
	327	}
	328
	329	/**
	330	* Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
	331	*
	332	* Reasoning behind using enqueuedFlag:
	333	* + Enforces in the interface more explicitly that device proc shouldn't continue until new op state is created and properly filled out (including the $proc field)
	334	* + Keeps symmetry since this method means the op does both enqueueing and dequeueing. The alternative technique would have device enqueuing and op dequeuing.
	335	* + Reduces dependencies since device proc will not be manipulating the streams nor have direct access to the newly created op state.
	336	* + Keeps responsibility of device proc strictly to interpreting messages, spawning appropriate ops and sending message.
	337	*/
	338	//@ depends_on \nothing;
[7e3e7af]	339	$atomic_f $cuda_op_state_t $stream_enqueue(_Bool** enqueuedFlag, cudaStream_t stream) {
[0ffc6c8]	340	$cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state));
	341	newOpState->start = false;
	342	newOpState->op = $self;
	343
	344	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state_node));
	345	newOpStateNode->opState = newOpState;
	346	newOpStateNode->next = NULL;
	347
	348	stream = $default_stream_if_null(stream);
	349	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
	350
	351	if (stream->tail == NULL) {
	352	stream->head = newOpStateNode;
	353	stream->tail = newOpStateNode;
	354	newOpState->start = true;
	355	} else {
	356	stream->tail->next = newOpStateNode;
	357	stream->tail = newOpStateNode;
	358	}
	359	stream->numOps++;
[7e3e7af]	360	**enqueuedFlag = true;
	361	*enqueuedFlag = NULL;
[0ffc6c8]	362
	363	return newOpState;
	364	}
	365
	366	//@ depends_on \nothing;
	367	$atomic_f void $stream_dequeue(cudaStream_t stream) {
	368	stream = $default_stream_if_null(stream);
	369	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
	370
	371	if (stream->head == stream->tail) {
	372	stream->tail = NULL;
	373	}
	374
	375	$cuda_op_state_node_t oldHead = stream->head;
	376	stream->head = oldHead->next;
	377	if (stream->head != NULL) {
	378	stream->head->opState->start = true;
	379	}
	380
	381	stream->numOps--;
	382	free(oldHead->opState);
	383	free(oldHead);
	384	}
	385
[6dd0322]	386	///////////////////////////////
	387	// CUDA Function Definitions //
	388	///////////////////////////////
[0ffc6c8]	389
[6dd0322]	390	/**
	391	* Only called at start of program
	392	*/
	393	void $cuda_setup() {
	394	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
	395	$cuda_default_stream = defaultStreamNode->stream;
	396
	397	$cuda_global_context.head = defaultStreamNode;
	398	$cuda_global_context.numStreams = 1;
	399	}
	400
	401	/**
	402	* Only called at end of program
	403	*/
	404	void $cuda_teardown() {
	405	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
	406	$wait(destructor);
[0ffc6c8]	407	$comm_destroy($cuda_device_comm);
[6dd0322]	408	}
	409
[0ffc6c8]	410	$message $cuda_free($message request) {
	411	void* devPtr;
	412	$message_unpack(request, &devPtr, sizeof(void*));
[cb46bb6]	413	free($reveal(devPtr));
	414	//free(devPtr);
[6dd0322]	415
[0ffc6c8]	416	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
[6dd0322]	417	}
	418
[0ffc6c8]	419	void $cuda_memcpy_proc(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool* enqueuedFlag, cudaStream_t stream) {
[7e3e7af]	420	$cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, stream);
[6dd0322]	421	$when(opState->start);
[0ffc6c8]	422
	423	if (kind == cudaMemcpyHostToDevice \|\| cudaMemcpyDeviceToDevice) {
[cb46bb6]	424	dst = $reveal(dst);
[0ffc6c8]	425	}
	426	if (kind == cudaMemcpyDeviceToHost \|\| cudaMemcpyDeviceToDevice) {
[cb46bb6]	427	src = $reveal(src);
[0ffc6c8]	428	}
[6dd0322]	429	memcpy(dst, src, count);
[0ffc6c8]	430
[6dd0322]	431	$stream_dequeue(stream);
	432	}
	433
[0ffc6c8]	434	$message $cuda_memcpy($message request, _Bool async) {
	435	$cuda_memcpy_data args;
	436	$message_unpack(request, &args, sizeof($cuda_memcpy_data));
[6dd0322]	437
	438	_Bool enqueuedFlag = false;
[0ffc6c8]	439	$proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, args.kind, &enqueuedFlag, $cuda_default_stream);
[6dd0322]	440	$when(enqueuedFlag);
[0ffc6c8]	441
	442	if (!async && args.kind != cudaMemcpyDeviceToDevice) {
[6dd0322]	443	$wait(memcpyProc);
	444	}
[0ffc6c8]	445	int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
[6dd0322]	446
[0ffc6c8]	447	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
[6dd0322]	448	}
	449
	450	////////////////////////
	451	// Kernel Definitions //
	452	////////////////////////
[0ffc6c8]	453
	454	// Helper function
	455	int $dim3_index(dim3 size, uint3 location) {
	456	return location.x + size.x * (location.y + size.y * location.z);
	457	}
	458
	459	// Helper function
	460	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
	461	return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
	462	}
	463
	464	void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
	465	//TODO: calculate length and index, replace this function in the kernel
	466	$local_start();
	467	int length = dim.x * dim.y * dim.z;
	468	$proc proc_array[length];
	469	$range rx = 0 .. dim.x - 1;
	470	$range ry = 0 .. dim.y - 1;
	471	$range rz = 0 .. dim.z - 1;
	472	$domain(3) dom = ($domain(3)){rx, ry, rz};
	473	$for(int x,y,z : dom){
	474	uint3 id = { x, y, z };
	475	int index = $dim3_index(dim, id);
	476	proc_array[index] = $spawn spawningFunction(id);
	477	}
	478	$local_end();
	479	$waitall(proc_array,length);
	480	}
[6dd0322]	481
	482	// Generated from kernel_1 definition
[0ffc6c8]	483	void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
[6dd0322]	484	const float A, const float B, float *C, int numElements) {
	485	void _cuda_block(uint3 blockIdx) {
	486	int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
	487	$scope _block_root = $here;
	488	$gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
	489	void _cuda_thread(uint3 threadIdx) {
	490	int _cuda_tid = $dim3_index(blockDim, threadIdx);
	491	int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
	492	$barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
	493	$local_start();
	494	// Kernel definition start
	495
	496	int i = blockDim.x * blockIdx.x + threadIdx.x;
	497
	498	if (i < numElements)
	499	{
	500	C[i] = A[i] + B[i];
	501	}
	502
	503	// Kernel definition end
	504	$local_end();
	505	$barrier_destroy(_cuda_thread_barrier);
	506	}
	507	$cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
	508	$gbarrier_destroy(_cuda_block_barrier);
	509	}
	510	$cuda_run_and_wait_on_procs(gridDim, _cuda_block);
	511	}
[0ffc6c8]	512
	513	void $cuda_kernel_1_proc (_Bool* enqueuedFlag, dim3 gridDim, dim3 blockDim,
	514	size_t $cudaMemSize, cudaStream_t $cudaStream,
	515	const float A, const float B, float *C, int numElements) {
[7e3e7af]	516	$cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, $cudaStream);
[0ffc6c8]	517	$when(opState->start);
	518	$cuda_kernel_1(gridDim, blockDim, $cudaMemSize, A, B, C, numElements);
	519	$stream_dequeue($cudaStream);
	520	}
	521
	522	$message $cuda_device_launch_kernel_1($message request) {
	523	$cuda_kernel_1_data args;
	524	$message_unpack(request, &args, sizeof($cuda_kernel_1_data));
	525
	526	_Bool enqueuedFlag = false;
[6698ce0]	527	$spawn $cuda_kernel_1_proc(&enqueuedFlag, args.gridDim, args.blockDim, args.$cudaMemSize, args.$cudaStream, $reveal(args.A), $reveal(args.B), $reveal(args.C), args.numElements);
[0ffc6c8]	528	$when(enqueuedFlag);
	529
	530	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_LAUNCH_kernel_1, NULL, 0);
	531	}
[6dd0322]	532
	533	/////////////////
	534	// Device main //
	535	/////////////////
	536
	537	$cuda_setup();
	538
	539	while (true) {
	540	$message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
	541	$message response;
[0ffc6c8]	542	const int tag = $message_tag(request);
[6dd0322]	543
	544	switch(tag) {
[0ffc6c8]	545	case $CUDA_TAG_SCOPE_REQUEST :
	546	response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
	547	break;
	548	case $CUDA_TAG_cudaFree :
	549	response = $cuda_free(request);
[6dd0322]	550	break;
	551	case $CUDA_TAG_cudaMemcpy :
[0ffc6c8]	552	response = $cuda_memcpy(request, false);
	553	break;
	554	case $CUDA_TAG_cudaMemcpyAsync :
	555	response = $cuda_memcpy(request, true);
[6dd0322]	556	break;
	557	case $CUDA_TAG_LAUNCH_kernel_1 :
[0ffc6c8]	558	response = $cuda_device_launch_kernel_1(request);
	559	break;
	560	case $CUDA_TAG_TEARDOWN :
	561	$cuda_teardown();
	562	return;
[6dd0322]	563	default :
	564	$assert(false, "Unknown CUDA request");
	565	}
	566
	567	$comm_enqueue($cuda_device_comm, response);
	568	}
	569	}
	570
	571	///////////////
	572	// Host file //
	573	///////////////
	574
[4bd0090]	575	$input int N;
	576	$assume (N > 0);
	577	$input float A[N];
	578	$input float B[N];
	579
[6dd0322]	580	void _host_main() {
[4bd0090]	581	int size = N * sizeof(float);
[874f9d1]	582	int numBlocks = 2;
	583	int numThreads = N%2 == 0? N/2 : (N+1)/2;
[4bd0090]	584
[874f9d1]	585	float* cuda_A;
[0ffc6c8]	586	// cudaMalloc((void **)&cuda_A, size);
[4bd0090]	587	{
[0ffc6c8]	588	$scope deviceScope = $cuda_host_request_device_scope();
[cb46bb6]	589	cuda_A = $hide((float*)$malloc(deviceScope, size));
	590	//cuda_A = (float*)$malloc(deviceScope, size);
[4bd0090]	591	}
[874f9d1]	592	cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
[4bd0090]	593
[874f9d1]	594	float* cuda_B;
[4bd0090]	595	// cudaMalloc((void **)&cuda_B, size);
[30cb1de]	596	{
[0ffc6c8]	597	$scope deviceScope = $cuda_host_request_device_scope();
[cb46bb6]	598	cuda_B = $hide((float*)$malloc(deviceScope, size));
	599	//cuda_B = (float*)$malloc(deviceScope, size);
[4bd0090]	600	}
[874f9d1]	601	cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
[4bd0090]	602
[874f9d1]	603	float* cuda_C;
[4bd0090]	604	// cudaMalloc((void **)&cuda_C, size);
	605	{
[0ffc6c8]	606	$scope deviceScope = $cuda_host_request_device_scope();
[cb46bb6]	607	cuda_C = $hide((float*)$malloc(deviceScope, size));
	608	//cuda_C = (float*)$malloc(deviceScope, size);
[4bd0090]	609	}
	610
[0ffc6c8]	611	dim3 gridDim = {numBlocks, 1, 1};
	612	dim3 blockDim = {numThreads, 1, 1};
	613	// kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
	614	$cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
[874f9d1]	615
	616	//Checking correctness
	617	float* C = (float *)malloc(size);
	618
	619	cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
	620
	621	for(int i = 0; i < N; i++)
	622	$assert(C[i] == A[i] + B[i]);
	623
	624	free(C);
	625
[7e3e7af]	626	cudaFree(cuda_A);
[0ffc6c8]	627	cudaFree(cuda_B);
	628	cudaFree(cuda_C);
[6dd0322]	629
	630	// inserted by transformer
	631	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
[0ffc6c8]	632	$comm_destroy($cuda_host_comm);
[109d05e]	633	}
	634
	635	int main() {
[6dd0322]	636	$proc host = $spawn _host_main();
	637	$proc cuda = $spawn _cuda_main();
	638	$wait(host);
	639	$wait(cuda);
[0ffc6c8]	640	$gcomm_destroy($cuda_gcomm, NULL);
[e76acca]	641	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format