Context Navigation

newCudaMockup.cvl@ 7d77e64

1.23 2.0 main test-branch

Last change on this file since 7d77e64 was b2ca0b6, checked in by Alex Wilton <awilton@…>, 3 years ago

Factored out $reveal code in cuda mockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5763 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 16.1 KB

Line
1	#include <concurrency.cvh>
2	#include <comm.cvh>
3	#include <stdlib.h>
4	#include <stdio.h>
5	#include <stdbool.h>
6	#include <string.h>
7	#pragma CIVL ACSL
8
9	///////////
10	// Types //
11	///////////
12
13	enum cudaError {
14	cudaSuccess
15	};
16	typedef enum cudaError cudaError_t;
17
18	typedef enum cudaMemcpyKind {
19	cudaMemcpyHostToHost,
20	cudaMemcpyHostToDevice,
21	cudaMemcpyDeviceToHost,
22	cudaMemcpyDeviceToDevice,
23	cudaMemcpyDefault
24	} cudaMemcpyKind;
25
26	typedef struct {
27	unsigned int x, y, z;
28	} dim3;
29
30	/* used to represent a location in a three dimensional grid
31	*/
32	typedef struct {
33	unsigned int x, y, z;
34	} uint3;
35
36	typedef struct $cuda_op_state* $cuda_op_state_t;
37	struct $cuda_op_state {
38	_Bool start;
39	$proc op;
40	};
41
42	typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
43	struct $cuda_op_state_node {
44	$cuda_op_state_t opState;
45	$cuda_op_state_node_t next;
46	};
47
48	typedef struct cudaStream* cudaStream_t;
49	typedef struct $cuda_stream_node* $cuda_stream_node_t;
50	struct cudaStream {
51	$cuda_op_state_node_t head;
52	$cuda_op_state_node_t tail;
53	int numOps;
54	$cuda_stream_node_t containingNode;
55	_Bool alive;
56	};
57
58	struct $cuda_stream_node {
59	cudaStream_t stream;
60	$cuda_stream_node_t prev;
61	$cuda_stream_node_t next;
62	};
63
64	typedef struct $cuda_context {
65	$cuda_stream_node_t head;
66	int numStreams;
67	} $cuda_context;
68
69	typedef struct {
70	$mem readSets[];
71	$mem writeSets[];
72	int size;
73	} $cuda_kernel_instance;
74
75	typedef struct $cuda_memcpy_data {
76	void* dst;
77	const void* src;
78	size_t count;
79	cudaMemcpyKind kind;
80	} $cuda_memcpy_data;
81
82	//////////////////////
83	// Global Variables //
84	//////////////////////
85
86	$gcomm $cuda_gcomm = $gcomm_create($here, 2);
87	const int $CUDA_PLACE_HOST = 0;
88	const int $CUDA_PLACE_DEVICE = 1;
89	$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
90
91	/**
92	* Tags used for message-passing between host and device
93	*/
94	enum $cuda_tag {
95	// Predefined tags
96	$CUDA_TAG_TEARDOWN,
97	$CUDA_TAG_SCOPE_REQUEST,
98	$CUDA_TAG_cudaFree,
99	$CUDA_TAG_cudaMemcpy,
100	$CUDA_TAG_cudaMemcpyAsync,
101	// Generated tags (by transformer)
102	$CUDA_TAG_LAUNCH_kernel_1
103	};
104
105	///////////////////
106	// CIVL-CUDA API //
107	///////////////////
108
109	$scope $cuda_host_request_device_scope() {
110	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
111	$message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
112	$scope result;
113	$message_unpack(response, &result, sizeof($scope));
114
115	return result;
116	}
117
118	void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
119	if (kind == cudaMemcpyHostToHost) {
120	memcpy(dst, src, count);
121	} else {
122	$cuda_memcpy_data args;
123	args.dst = dst;
124	args.src = src;
125	args.count = count;
126	args.kind = kind;
127
128	int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
129
130	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
131	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
132	}
133	}
134
135	$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
136	cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof(struct cudaStream));
137	newStream->head = NULL;
138	newStream->tail = NULL;
139	newStream->numOps = 0;
140	newStream->alive = true;
141
142	$cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof(struct $cuda_stream_node));
143	newHead->stream = newStream;
144	newStream->containingNode = newHead;
145	newHead->prev = NULL;
146	newHead->next = NULL;
147
148	return newHead;
149	}
150
151	/*@ depends_on \nothing;
152	@ assigns \nothing;
153	@ reads \nothing;
154	@*/
155	$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
156	$proc lastOpProc = $proc_null;
157	cudaStream_t stream = node->stream;
158
159	if (node->prev != NULL) {
160	node->prev->next = node->next;
161	}
162	if (node->next != NULL) {
163	node->next->prev = node->prev;
164	}
165	free(node);
166
167	stream->alive = false;
168	if(stream->tail != NULL)
169	lastOpProc = stream->tail->opState->op;
170
171	void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
172	$wait(lastOpProc);
173	free(stream);
174	}
175
176	return $spawn destroyStreamWhenComplete(lastOpProc, stream);
177	}
178
179	/*@ depends_on \access(stream);
180	@ assigns stream;
181	@ reads \nothing;
182	@*/
183	$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
184	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
185
186	$cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof(struct $cuda_op_state));
187	newOpState->start = false;
188	newOpState->op = $spawn opProc(opParams, newOpState, stream);
189
190	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof(struct $cuda_op_state_node));
191	newOpStateNode->opState = newOpState;
192	newOpStateNode->next = NULL;
193
194	if (stream->tail == NULL) {
195	stream->head = newOpStateNode;
196	stream->tail = newOpStateNode;
197	newOpState->start = true;
198	} else {
199	stream->tail->next = newOpStateNode;
200	stream->tail = newOpStateNode;
201	}
202	stream->numOps++;
203
204	return newOpState->op;
205	}
206
207	/*@ depends_on \nothing;
208	@ assigns \nothing;
209	@ reads \nothing;
210	@*/
211	$atomic_f void $stream_dequeue(cudaStream_t stream) {
212	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
213
214	if (stream->head == stream->tail) {
215	stream->tail = NULL;
216	}
217
218	$cuda_op_state_node_t oldHead = stream->head;
219	stream->head = oldHead->next;
220	if (stream->head != NULL) {
221	stream->head->opState->start = true;
222	}
223
224	stream->numOps--;
225	free(oldHead->opState);
226	free(oldHead);
227	}
228
229	// Helper function
230	int $dim3_index(dim3 size, uint3 location) {
231	return location.x + size.x * (location.y + size.y * location.z);
232	}
233
234	// Helper function
235	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
236	return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
237	}
238
239	void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
240	//TODO: calculate length and index, replace this function in the kernel
241	$local_start();
242	int length = dim.x * dim.y * dim.z;
243	$proc procArray[length];
244	$range rx = 0 .. dim.x - 1;
245	$range ry = 0 .. dim.y - 1;
246	$range rz = 0 .. dim.z - 1;
247	$domain(3) dom = ($domain(3)){rx, ry, rz};
248	$for(int x,y,z : dom){
249	uint3 id = { x, y, z };
250	int index = $dim3_index(dim, id);
251	procArray[index] = $spawn spawningFunction(id);
252	}
253	$local_end();
254	$waitall(procArray,length);
255	}
256
257
258	// CUDA Ops //
259
260	void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
261
262	$when(opState->start);
263	$cuda_memcpy_data args;
264	$message_unpack(m, &args, sizeof($cuda_memcpy_data));
265
266	if (args.kind == cudaMemcpyHostToDevice \|\| cudaMemcpyDeviceToDevice) {
267	args.dst = $reveal(args.dst);
268	}
269	if (args.kind == cudaMemcpyDeviceToHost \|\| cudaMemcpyDeviceToDevice) {
270	args.src = $reveal(args.src);
271	}
272	memcpy(args.dst, args.src, args.count);
273
274	$stream_dequeue(stream);
275	}
276
277	$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
278	$cuda_memcpy_data args;
279	$message_unpack(request, &args, sizeof($cuda_memcpy_data));
280
281	$proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);
282
283	if (!async && args.kind != cudaMemcpyDeviceToDevice) {
284	$wait(memcpyProc);
285	}
286	int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
287
288	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
289	}
290
291	$message $cuda_free($message request) {
292	void* devPtr;
293	$message_unpack(request, &devPtr, sizeof(void*));
294	free($reveal(devPtr));
295
296	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
297	}
298
299	////////////////////////////////////////////
300	// CUDA API Functions (For Host-use Only) //
301	////////////////////////////////////////////
302
303	cudaError_t cudaFree(void* devPtr) {
304	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
305	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
306
307	return cudaSuccess;
308	}
309
310	cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
311	$cuda_helper_host_memcpy(dst, src, count, kind, false);
312	return cudaSuccess;
313	}
314
315	cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
316	cudaMemcpyKind kind, cudaStream_t stream) {
317	$cuda_helper_host_memcpy(dst, src, count, kind, true);
318	return cudaSuccess;
319	}
320
321	/*
322	cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
323	// Create new stream node in linked list
324	$cuda_stream_node_t newHead = $create_new_stream_node();
325	newHead->next = $cuda_global_context.head;
326	$cuda_global_context.head->prev = newHead;
327
328	// Update cuda context's head to be the new node we created
329	$cuda_global_context.head = newHead;
330	$cuda_global_context.numStreams++;
331
332	return cudaSuccess;
333	}
334	*/
335
336	/*
337	cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
338	stream = $default_stream_if_null(stream);
339	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
340	$when(stream->head == NULL) return cudaSuccess;
341	}
342	*/
343
344	/*
345	cudaError_t cudaStreamDestroy(cudaStream_t stream) {
346	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
347	$assert(stream->alive, "Attempt to destroy an already destroyed stream");
348	$destroy_stream_node(stream->containingNode);
349	return cudaSuccess;
350	}
351	*/
352
353	/*
354	cudaError_t cudaDeviceSynchronize() {
355	$proc* opsToWaitOn;
356	int numOps = 0;
357
358	$atomic {
359	opsToWaitOn = ($proc) malloc(sizeof($proc) $cuda_global_context.numStreams);
360
361	for ($cuda_stream_node_t node = $cuda_global_context.head;
362	node != NULL;
363	node = node->next) {
364	if (node->stream->tail != NULL) {
365	opsToWaitOn[numOps] = node->stream->tail->opState->op;
366	numOps++;
367	}
368	}
369	}
370	$waitall(opsToWaitOn, numOps);
371
372	return cudaSuccess;
373	}
374	*/
375
376	//////////////////////////////////
377	// Generated code from kernel_1 //
378	//////////////////////////////////
379
380	typedef struct {
381	dim3 gridDim;
382	dim3 blockDim;
383	size_t $cudaMemSize;
384	cudaStream_t $cudaStream;
385	const float* A;
386	const float* B;
387	float* C;
388	int numElements;
389	} $cuda_kernel_1_data;
390
391	void $cuda_reveal_kernel_1_args($cuda_kernel_1_data* args) {
392	args->A = $reveal(args->A);
393	args->B = $reveal(args->B);
394	args->C = $reveal(args->C);
395	}
396
397	void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
398	size_t $cudaMemSize, cudaStream_t $cudaStream,
399	const float* A, const float* B, float* C, int numElements) {
400	$cuda_kernel_1_data args;
401	args.gridDim = gridDim;
402	args.blockDim = blockDim;
403	args.$cudaMemSize = $cudaMemSize;
404	args.$cudaStream = $cudaStream;
405	args.A = A;
406	args.B = B;
407	args.C = C;
408	args.numElements = numElements;
409
410	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
411	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
412	}
413
414	void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
415	const float A, const float B, float *C, int numElements) {
416	$cuda_kernel_instance $kernel;
417	void _cuda_block(uint3 blockIdx) {
418	int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
419	$scope _block_root = $here;
420	$gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
421	void _cuda_thread(uint3 threadIdx) {
422	int _cuda_tid = $dim3_index(blockDim, threadIdx);
423	int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
424	$barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
425	$local_start();
426	// Kernel definition start
427
428	int i = blockDim.x * blockIdx.x + threadIdx.x;
429
430	if (i < numElements)
431	{
432	C[i] = A[i] + B[i];
433	}
434
435	// Kernel definition end
436	$local_end();
437	$barrier_destroy(_cuda_thread_barrier);
438	}
439	$cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
440	$gbarrier_destroy(_cuda_block_barrier);
441	}
442	$cuda_run_and_wait_on_procs(gridDim, _cuda_block);
443	}
444
445	void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
446	$when(opState->start);
447
448	$cuda_kernel_1_data args;
449	$message_unpack(request, &args, sizeof($cuda_kernel_1_data));
450	$cuda_reveal_kernel_1_args(&args);
451
452	$cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
453	$stream_dequeue(cudaStream);
454	}
455
456	/////////////////
457	// CUDA "file" //
458	/////////////////
459
460	void $cuda_main() {
461
462	// Device Variables
463
464	$scope $cuda_scope = $here;
465
466	$comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
467	$cuda_context $cuda_global_context;
468	cudaStream_t $cuda_default_stream;
469
470	// Helper function to get the default stream if passed NULL, and just returns stream otherwise
471	// Currently unused until we support streams other than the default one.
472	cudaStream_t $default_stream_if_null(cudaStream_t stream) {
473	return stream == NULL ? $cuda_default_stream : stream;
474	}
475
476	// Device Logic
477
478	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
479	$cuda_default_stream = defaultStreamNode->stream;
480
481	$cuda_global_context.head = defaultStreamNode;
482	$cuda_global_context.numStreams = 1;
483
484	while (true) {
485	$message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
486	$message response;
487	const int tag = $message_tag(request);
488
489	switch(tag) {
490	case $CUDA_TAG_SCOPE_REQUEST :
491	response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
492	break;
493	case $CUDA_TAG_cudaFree :
494	response = $cuda_free(request);
495	break;
496	case $CUDA_TAG_cudaMemcpy :
497	response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
498	break;
499	case $CUDA_TAG_cudaMemcpyAsync :
500	response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
501	break;
502	case $CUDA_TAG_LAUNCH_kernel_1 :
503	$stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);
504
505	response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
506	break;
507	case $CUDA_TAG_TEARDOWN : {
508	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
509	$wait(destructor);
510	$comm_destroy($cuda_device_comm);
511	return;
512	}
513	default :
514	$assert(false, "Unknown CUDA request");
515	}
516
517	$comm_enqueue($cuda_device_comm, response);
518	}
519	}
520
521	///////////////
522	// Host file //
523	///////////////
524
525	$input int N;
526	$assume (N > 0);
527	$input float A[N];
528	$input float B[N];
529
530	void $host_main() {
531	int size = N * sizeof(float);
532	int numBlocks = 2;
533	int numThreads = N%2 == 0? N/2 : (N+1)/2;
534
535	float* cuda_A;
536	// cudaMalloc((void **)&cuda_A, size);
537	{
538	$scope deviceScope = $cuda_host_request_device_scope();
539	cuda_A = $hide((float*)$malloc(deviceScope, size));
540	}
541	cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
542
543	float* cuda_B;
544	// cudaMalloc((void **)&cuda_B, size);
545	{
546	$scope deviceScope = $cuda_host_request_device_scope();
547	cuda_B = $hide((float*)$malloc(deviceScope, size));
548	}
549	cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
550
551	float* cuda_C;
552	// cudaMalloc((void **)&cuda_C, size);
553	{
554	$scope deviceScope = $cuda_host_request_device_scope();
555	cuda_C = $hide((float*)$malloc(deviceScope, size));
556	}
557
558	dim3 gridDim = {numBlocks, 1, 1};
559	dim3 blockDim = {numThreads, 1, 1};
560	// kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
561	$cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
562
563	// Checking correctness
564	float* C = (float *)malloc(size);
565
566	cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
567
568	for(int i = 0; i < N; i++)
569	$assert(C[i] == A[i] + B[i]);
570
571	free(C);
572
573	cudaFree(cuda_A);
574	cudaFree(cuda_B);
575	cudaFree(cuda_C);
576
577	// inserted by transformer
578	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
579	$comm_destroy($cuda_host_comm);
580	}
581
582	int main() {
583	$proc host = $spawn $host_main();
584	$proc cuda = $spawn $cuda_main();
585	$wait(host);
586	$wait(cuda);
587	$gcomm_destroy($cuda_gcomm, NULL);
588	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format