Context Navigation

newCudaMockup.cvl@ a8faba8

1.23 2.0 main test-branch

Last change on this file since a8faba8 was 6dd0322, checked in by Alex Wilton <awilton@…>, 3 years ago

Fixed domain type bug. Added first iteration of unreachable pointers for CUDA. Started refactoring newCudaMockup to use message-passing and unreachable pointers.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5734 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 14.6 KB

Line
1	/**
2	* TODO:
3	* - implement cudaMemset and cudaMemsetAsync
4	* - flesh out basic structure of cuda kernel:
5	* - spawn gridDim blocks
6	* - spawn blockDim threads
7	* Alternatively, spawn blockDim warps and then spawn warp's threads
8	* - wait for blocks to finish
9	* - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10	* - Add in data race checking support and implement atomicAdd for integers
11	* - Handle dependencies at atomic blocks (replace some with local blocks if possible)
12	* - Create device scope for cudaMallocs
13	*/
14
15	#include <concurrency.cvh>
16	#include <comm.cvh>
17	#include <stdlib.h>
18	#include <stdio.h>
19	#include <stdbool.h>
20	#include <string.h>
21
22	///////////
23	// Types //
24	///////////
25
26	enum cudaError {
27	cudaSuccess
28	};
29	typedef enum cudaError cudaError_t;
30
31	typedef enum cudaMemcpyKind {
32	cudaMemcpyHostToHost,
33	cudaMemcpyHostToDevice,
34	cudaMemcpyDeviceToHost,
35	cudaMemcpyDeviceToDevice,
36	cudaMemcpyDefault
37	} cudaMemcpyKind;
38
39	typedef struct {
40	unsigned int x, y, z;
41	} dim3;
42
43	/* used to represent a location in a three dimensional grid
44	*/
45	typedef struct {
46	unsigned int x, y, z;
47	} uint3;
48
49	typedef struct $cuda_op_state* $cuda_op_state_t;
50	struct $cuda_op_state {
51	_Bool start;
52	$proc op;
53	};
54
55	typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
56	struct $cuda_op_state_node {
57	$cuda_op_state_t opState;
58	$cuda_op_state_node_t next;
59	};
60
61	typedef struct cudaStream* cudaStream_t;
62	typedef struct $cuda_stream_node* $cuda_stream_node_t;
63	struct cudaStream {
64	$cuda_op_state_node_t head;
65	$cuda_op_state_node_t tail;
66	int numOps;
67	$cuda_stream_node_t containingNode;
68	_Bool alive;
69	};
70
71	struct $cuda_stream_node {
72	cudaStream_t stream;
73	$cuda_stream_node_t prev;
74	$cuda_stream_node_t next;
75	};
76
77	typedef struct $cuda_context {
78	$cuda_stream_node_t head; //list of streams
79	int numStreams;
80	} $cuda_context;
81
82	//////////////////////
83	// Global Variables //
84	//////////////////////
85
86	$gcomm $cuda_gcomm = $gcomm_create($here, 2);
87	const int $CUDA_PLACE_HOST = 0;
88	const int $CUDA_PLACE_DEVICE = 1;
89	$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $cuda_place_host);
90
91	/**
92	* Tags used for message-passing between host and device
93	*/
94	// Predefined tags
95	const int $CUDA_TAG_TEARDOWN = 0;
96	const int $CUDA_TAG_cudaMalloc = 1;
97	const int $CUDA_TAG_cudaMemcpy = 2;
98	// Generated tags (by transformer)
99	const int $CUDA_TAG_LAUNCH_kernel_1 = 5;
100
101	////////////////////////////////////////////
102	// CUDA API Functions (For Host-use Only) //
103	////////////////////////////////////////////
104
105	cudaError_t cudaMalloc(void** devPtr, size_t size) {
106	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
107	$message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
108	$message_unpack(response, devPtr, sizeof(void*));
109	}
110
111	typedef struct $cuda_memcpy_args {
112	void* dst;
113	void src;
114	size_t count;
115	cudaMemcpyKind kind;
116	} $cuda_memcpy_args;
117
118	cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
119	if (kind == cudaMemcpyHostToHost) {
120	memcpy(dst, src, count);
121	} else {
122	$cuda_memcpy_args args;
123	args.dst = dst;
124	args.src = src;
125	args.count = count;
126	args.kind = kind;
127
128	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMemcpy, &args, sizeof($cuda_memcpy_args)));
129	$comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
130	}
131
132	return cudaSuccess;
133	}
134
135	$cuda_stream_node_t $create_new_stream_node() {
136	cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
137	newStream->head = NULL;
138	newStream->tail = NULL;
139	newStream->numOps = 0;
140	newStream->alive = true;
141
142	$cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
143	newHead->stream = newStream;
144	newStream->containingNode = newHead;
145	newHead->prev = NULL;
146	newHead->next = NULL;
147
148	return newHead;
149	}
150
151	/**
152	* TODO:
153	* - test
154	* - atomic?
155	*/
156	cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
157	// Create new stream node in linked list
158	$cuda_stream_node_t newHead = $create_new_stream_node();
159	newHead->next = $cuda_global_context.head;
160	$cuda_global_context.head->prev = newHead;
161
162	// Update cuda context's head to be the new node we created
163	$cuda_global_context.head = newHead;
164	$cuda_global_context.numStreams++;
165
166	return cudaSuccess;
167	}
168
169	/**
170	* TODO:
171	* - test
172	* - atomic?
173	*/
174	cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
175	stream = $default_stream_if_null(stream);
176	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
177	$when(stream->head == NULL) return cudaSuccess;
178	}
179
180	//@ depends_on \nothing;
181	$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
182	$proc lastOpProc = $proc_null;
183	cudaStream_t stream = node->stream;
184
185	if (node->prev != NULL) {
186	node->prev->next = node->next;
187	}
188	if (node->next != NULL) {
189	node->next->prev = node->prev;
190	}
191	free(node);
192
193	stream->alive = false;
194	if(stream->tail != NULL)
195	lastOpProc = stream->tail->opState->op;
196
197	void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
198	$wait(lastOpProc);
199	free(stream);
200	}
201
202	return $spawn $destroy_stream_when_complete(lastOpProc, stream);
203	}
204
205	// TODO: atomic
206	cudaError_t cudaStreamDestroy(cudaStream_t stream) {
207	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
208	$assert(stream->alive, "Attempt to destroy an already destroyed stream");
209	$destroy_stream_node(stream->containingNode);
210	return cudaSuccess;
211	}
212
213	/**
214	* Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
215	*/
216	//@ depends_on \nothing;
217	$atomic_f $cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
218	$cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
219	newOpState->start = false;
220	newOpState->op = $self;
221
222	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof(struct $cuda_op_state_node));
223	newOpStateNode->opState = newOpState;
224	newOpStateNode->next = NULL;
225
226	stream = $default_stream_if_null(stream);
227	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
228
229	if (stream->tail == NULL) {
230	stream->head = newOpStateNode;
231	stream->tail = newOpStateNode;
232	newOpState->start = true;
233	} else {
234	stream->tail->next = newOpStateNode;
235	stream->tail = newOpStateNode;
236	}
237	stream->numOps++;
238	*enqueuedFlag = true;
239
240	return newOpState;
241	}
242
243	//@ depends_on \nothing;
244	$atomic_f void $stream_dequeue(cudaStream_t stream) {
245	stream = $default_stream_if_null(stream);
246	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
247
248	if (stream->head == stream->tail) {
249	stream->tail = NULL;
250	}
251
252	$cuda_op_state_node_t oldHead = stream->head;
253	stream->head = oldHead->next;
254	if (stream->head != NULL) {
255	stream->head->opState->start = true;
256	}
257
258	stream->numOps--;
259	free(oldHead->opState);
260	free(oldHead);
261	}
262
263	cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
264	cudaMemcpyKind kind, cudaStream_t stream) {
265	if (kind == cudaMemcpyHostToHost) {
266	memcpy(dst, src, count);
267	} else {
268	_Bool enqueuedFlag = false;
269	$spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, stream);
270	$when(enqueuedFlag);
271	}
272
273	return cudaSuccess;
274	}
275
276	cudaError_t cudaDeviceSynchronize() {
277	$proc* opsToWaitOn;
278	int numOps = 0;
279
280	$atomic {
281	opsToWaitOn = ($proc) malloc(sizeof($proc) $cuda_global_context.numStreams);
282
283	for ($cuda_stream_node_t node = $cuda_global_context.head;
284	node != NULL;
285	node = node->next) {
286	if (node->stream->tail != NULL) {
287	opsToWaitOn[numOps] = node->stream->tail->opState->op;
288	numOps++;
289	}
290	}
291	}
292	$waitall(opsToWaitOn, numOps);
293
294	return cudaSuccess;
295	}
296
297	// Helper function
298	int $dim3_index(dim3 size, uint3 location) {
299	return location.x + size.x * (location.y + size.y * location.z);
300	}
301
302	// Helper function
303	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
304	return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
305	}
306
307	void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
308	//TODO: calculate length and index, replace this function in the kernel
309	$local_start();
310	int length = dim.x * dim.y * dim.z;
311	$proc proc_array[length];
312	$range rx = 0 .. dim.x - 1;
313	$range ry = 0 .. dim.y - 1;
314	$range rz = 0 .. dim.z - 1;
315	$domain(3) dom = ($domain(3)){rx, ry, rz};
316	$for(int x,y,z : dom){
317	uint3 id = { x, y, z };
318	int index = $dim3_index(dim, id);
319	proc_array[index] = $spawn spawningFunction(id);
320	}
321	$local_end();
322	$waitall(proc_array,length);
323	}
324
325	void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
326	_Bool* enqueuedFlag, cudaStream_t _cudaStream,
327	const float A, const float B, float *C, int numElements) {
328	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, _cudaStream);
329	$when(opState->start);
330	_cuda_kernel_1(gridDim, blockDim, _cudaMemSize, A, B, C, numElements);
331	$stream_dequeue(_cudaStream);
332	}
333
334	/////////////////
335	// CUDA "file" //
336	/////////////////
337
338	void _cuda_main() {
339
340	//////////////////////
341	// Device Variables //
342	//////////////////////
343
344	$comm $cuda_device_comm = $comm_create($here, $cuda_gcomm, 1);
345	$cuda_context $cuda_global_context;
346	cudaStream_t $cuda_default_stream;
347
348	///////////////////////////////
349	// CUDA Function Definitions //
350	///////////////////////////////
351
352	/**
353	* Only called at start of program
354	*/
355	void $cuda_setup() {
356	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
357	$cuda_default_stream = defaultStreamNode->stream;
358
359	$cuda_global_context.head = defaultStreamNode;
360	$cuda_global_context.numStreams = 1;
361	}
362
363	/**
364	* Only called at end of program
365	*/
366	void $cuda_teardown() {
367	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
368	$wait(destructor);
369	}
370
371	$message $cuda_malloc($message request) {
372	size_t size;
373	$message_unpack(request, &size, sizeof(size_t));
374	void* ptr = $malloc($here, size);
375
376	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaMalloc, &$make_unreachable(ptr), sizeof(void*));
377	}
378
379	void $cuda_memcpy_proc(void* dst, const void* src, size_t count, _Bool* enqueuedFlag, cudaStream_t stream) {
380	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
381	$when(opState->start);
382	memcpy(dst, src, count);
383	$stream_dequeue(stream);
384	}
385
386	$message $cuda_memcpy($message request) {
387	$cuda_memcpy_args args;
388	$message_unpack(request, &args, sizeof($cuda_memcpy_args));
389
390	_Bool enqueuedFlag = false;
391	$proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, &enqueuedFlag, $cuda_default_stream);
392	$when(enqueuedFlag);
393	if (args.kind != cudaMemcpyDeviceToDevice) {
394	$wait(memcpyProc);
395	}
396
397	return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaMemcpy, null, 0);
398	}
399
400	// Helper function to get the default stream if passed NULL, and just returns stream otherwise
401	cudaStream_t $default_stream_if_null(cudaStream_t stream) {
402	return stream == NULL ? $cuda_default_stream : stream;
403	}
404
405	////////////////////////
406	// Kernel Definitions //
407	////////////////////////
408
409	// Generated from kernel_1 definition
410	void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
411	const float A, const float B, float *C, int numElements) {
412	void _cuda_block(uint3 blockIdx) {
413	int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
414	$scope _block_root = $here;
415	$gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
416	void _cuda_thread(uint3 threadIdx) {
417	int _cuda_tid = $dim3_index(blockDim, threadIdx);
418	int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
419	$barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
420	$local_start();
421	// Kernel definition start
422
423	int i = blockDim.x * blockIdx.x + threadIdx.x;
424
425	if (i < numElements)
426	{
427	C[i] = A[i] + B[i];
428	}
429
430	// Kernel definition end
431	$local_end();
432	$barrier_destroy(_cuda_thread_barrier);
433	}
434	$cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
435	$gbarrier_destroy(_cuda_block_barrier);
436	}
437	$cuda_run_and_wait_on_procs(gridDim, _cuda_block);
438	}
439
440	/////////////////
441	// Device main //
442	/////////////////
443
444	$cuda_setup();
445
446	while (true) {
447	$message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
448	$message response;
449	int tag = $message_tag(request);
450
451	switch(tag) {
452	case $CUDA_TAG_TEARDOWN :
453	$cuda_teardown();
454	return;
455	case $CUDA_TAG_cudaMalloc :
456	response = $cuda_malloc(request);
457	break;
458	case $CUDA_TAG_cudaMemcpy :
459	response = $cuda_memcpy(request);
460	break;
461	case $CUDA_TAG_LAUNCH_kernel_1 :
462	response = ???;
463	default :
464	$assert(false, "Unknown CUDA request");
465	}
466
467	$comm_enqueue($cuda_device_comm, response);
468	}
469	}
470
471	///////////////
472	// Host file //
473	///////////////
474
475	$input int N;
476	$assume (N > 0);
477	$input float A[N];
478	$input float B[N];
479
480	void _host_main() {
481	int size = N * sizeof(float);
482	int numBlocks = 2;
483	int numThreads = N%2 == 0? N/2 : (N+1)/2;
484
485	float* cuda_A;
486	foo((void **)&cuda_A, size);
487	/*
488	{
489	cuda_A = (float *) malloc(size);
490	}
491	*/
492	cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
493
494	float* cuda_B;
495	// cudaMalloc((void **)&cuda_B, size);
496	{
497	cuda_B = (float *) malloc(size);
498	}
499	cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
500
501	float* cuda_C;
502	// cudaMalloc((void **)&cuda_C, size);
503	{
504	cuda_C = (float *) malloc(size);
505	}
506
507	{ // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
508	dim3 gridDim = {numBlocks, 1, 1};
509	dim3 blockDim = {numThreads, 1, 1};
510	_Bool enqueuedFlag = false;
511	$spawn $proc_kernel_1(gridDim, blockDim, 0, &enqueuedFlag, NULL, cuda_A, cuda_B, cuda_C, N);
512	$when(enqueuedFlag);
513	}
514
515	//Checking correctness
516	float* C = (float *)malloc(size);
517
518	cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
519
520	for(int i = 0; i < N; i++)
521	$assert(C[i] == A[i] + B[i]);
522
523	free(C);
524
525	//cudaFree(cuda_A);...
526	free(cuda_A);
527	free(cuda_B);
528	free(cuda_C);
529
530	// inserted by transformer
531	$comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
532	}
533
534	int main() {
535	$proc host = $spawn _host_main();
536	$proc cuda = $spawn _cuda_main();
537	$wait(host);
538	$wait(cuda);
539	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format