Context Navigation

newCudaMockup.cvl@ b161e2f

1.23 2.0 main test-branch

Last change on this file since b161e2f was 874f9d1, checked in by Zane Greenholt <zgrnhlt@…>, 3 years ago

Added vecAdd example to newCudaMockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5715 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 11.2 KB

Line
1	/**
2	* TODO:
3	* - implement cudaMemset and cudaMemsetAsync
4	* - flesh out basic structure of cuda kernel:
5	* - spawn gridDim blocks
6	* - spawn blockDim threads
7	* Alternatively, spawn blockDim warps and then spawn warp's threads
8	* - wait for blocks to finish
9	* - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10	* - Add in data race checking support and implement atomicAdd for integers
11	* - Handle dependencies at atomic blocks (replace some with local blocks if possible)
12	* - Create device scope for cudaMallocs
13	*/
14
15	#include <concurrency.cvh>
16	#include <stdlib.h>
17	#include <stdio.h>
18	#include <stdbool.h>
19	#include <string.h>
20
21	enum cudaError {
22	cudaSuccess
23	};
24	typedef enum cudaError cudaError_t;
25
26	typedef enum cudaMemcpyKind {
27	cudaMemcpyHostToHost,
28	cudaMemcpyHostToDevice,
29	cudaMemcpyDeviceToHost,
30	cudaMemcpyDeviceToDevice,
31	cudaMemcpyDefault
32	} cudaMemcpyKind;
33
34	typedef struct {
35	unsigned int x, y, z;
36	} dim3;
37
38	/* used to represent a location in a three dimensional grid
39	*/
40	typedef struct {
41	unsigned int x, y, z;
42	} uint3;
43
44	typedef struct $cuda_op_state* $cuda_op_state_t;
45	struct $cuda_op_state {
46	_Bool start;
47	$proc op;
48	};
49
50	typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
51	struct $cuda_op_state_node {
52	$cuda_op_state_t opState;
53	$cuda_op_state_node_t next;
54	};
55
56	typedef struct cudaStream* cudaStream_t;
57	typedef struct $cuda_stream_node* $cuda_stream_node_t;
58	struct cudaStream {
59	$cuda_op_state_node_t head;
60	$cuda_op_state_node_t tail;
61	int numOps;
62	$cuda_stream_node_t containingNode;
63	_Bool alive;
64	};
65	cudaStream_t $cuda_default_stream;
66
67	struct $cuda_stream_node {
68	cudaStream_t stream;
69	$cuda_stream_node_t prev;
70	$cuda_stream_node_t next;
71	};
72
73	typedef struct $cuda_context {
74	$cuda_stream_node_t head; //list of streams
75	int numStreams;
76	} $cuda_context;
77	$cuda_context $cuda_global_context;
78
79	// Helper function to get the default stream if passed NULL, and just returns stream otherwise
80	cudaStream_t $default_stream_if_null(cudaStream_t stream) {
81	return stream == NULL ? $cuda_default_stream : stream;
82	}
83
84	$cuda_stream_node_t $create_new_stream_node() {
85	cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
86	newStream->head = NULL;
87	newStream->tail = NULL;
88	newStream->numOps = 0;
89	newStream->alive = true;
90
91	$cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
92	newHead->stream = newStream;
93	newStream->containingNode = newHead;
94	newHead->prev = NULL;
95	newHead->next = NULL;
96
97	return newHead;
98	}
99
100	/**
101	* TODO:
102	* - test
103	* - atomic?
104	*/
105	cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
106	// Create new stream node in linked list
107	$cuda_stream_node_t newHead = $create_new_stream_node();
108	newHead->next = $cuda_global_context.head;
109	$cuda_global_context.head->prev = newHead;
110
111	// Update cuda context's head to be the new node we created
112	$cuda_global_context.head = newHead;
113	$cuda_global_context.numStreams++;
114
115	return cudaSuccess;
116	}
117
118	/**
119	* TODO:
120	* - test
121	* - atomic?
122	*/
123	cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
124	stream = $default_stream_if_null(stream);
125	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
126	$when(stream->head == NULL) return cudaSuccess;
127	}
128
129	$proc $destroy_stream_node($cuda_stream_node_t node) {
130	$proc lastOpProc = $proc_null;
131	cudaStream_t stream = node->stream;
132
133	$local_start();
134	if (node->prev != NULL) {
135	node->prev->next = node->next;
136	}
137	if (node->next != NULL) {
138	node->next->prev = node->prev;
139	}
140	free(node);
141
142	stream->alive = false;
143	if(stream->tail != NULL)
144	lastOpProc = stream->tail->opState->op;
145	$local_end();
146
147	void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
148	$wait(lastOpProc);
149	free(stream);
150	}
151
152	return $spawn $destroy_stream_when_complete(lastOpProc, stream);
153	}
154
155	// TODO: atomic
156	cudaError_t cudaStreamDestroy(cudaStream_t stream) {
157	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
158	$assert(stream->alive, "Attempt to destroy an already destroyed stream");
159	$destroy_stream_node(stream->containingNode);
160	return cudaSuccess;
161	}
162
163	/**
164	* Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
165	*/
166	$cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
167	$cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
168	newOpState->start = false;
169	newOpState->op = $self;
170
171	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof(struct $cuda_op_state_node));
172	newOpStateNode->opState = newOpState;
173	newOpStateNode->next = NULL;
174
175	$local_start();
176	stream = $default_stream_if_null(stream);
177	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
178
179	if (stream->tail == NULL) {
180	stream->head = newOpStateNode;
181	stream->tail = newOpStateNode;
182	newOpState->start = true;
183	} else {
184	stream->tail->next = newOpStateNode;
185	stream->tail = newOpStateNode;
186	}
187	stream->numOps++;
188	*enqueuedFlag = true;
189	$local_end();
190
191	return newOpState;
192	}
193
194	void $stream_dequeue(cudaStream_t stream) {
195	stream = $default_stream_if_null(stream);
196	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
197
198	$local_start();
199	if (stream->head == stream->tail) {
200	stream->tail = NULL;
201	}
202
203	$cuda_op_state_node_t oldHead = stream->head;
204	stream->head = oldHead->next;
205	if (stream->head != NULL) {
206	stream->head->opState->start = true;
207	}
208
209	stream->numOps--;
210	free(oldHead->opState);
211	free(oldHead);
212	$local_end();
213	}
214
215	void $cuda_memcpy_proc(void* dst, const void* src, size_t count, _Bool* enqueuedFlag, cudaStream_t stream) {
216	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
217	$when(opState->start);
218	memcpy(dst, src, count);
219	$stream_dequeue(stream);
220	}
221
222	cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
223	if (kind == cudaMemcpyHostToHost) {
224	memcpy(dst, src, count);
225	} else {
226	_Bool enqueuedFlag = false;
227	$proc memcpyProc = $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, $cuda_default_stream);
228	$when(enqueuedFlag);
229	if (kind != cudaMemcpyDeviceToDevice) {
230	$wait(memcpyProc);
231	}
232	}
233
234	return cudaSuccess;
235	}
236
237	cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
238	cudaMemcpyKind kind, cudaStream_t stream) {
239	if (kind == cudaMemcpyHostToHost) {
240	memcpy(dst, src, count);
241	} else {
242	_Bool enqueuedFlag = false;
243	$spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, stream);
244	$when(enqueuedFlag);
245	}
246
247	return cudaSuccess;
248	}
249
250	cudaError_t cudaDeviceSynchronize() {
251	$proc* opsToWaitOn;
252	int numOps = 0;
253
254	$atomic {
255	opsToWaitOn = ($proc) malloc(sizeof($proc) $cuda_global_context.numStreams);
256
257	for ($cuda_stream_node_t node = $cuda_global_context.head;
258	node != NULL;
259	node = node->next) {
260	if (node->stream->tail != NULL) {
261	opsToWaitOn[numOps] = node->stream->tail->opState->op;
262	numOps++;
263	}
264	}
265	}
266	$waitall(opsToWaitOn, numOps);
267
268	return cudaSuccess;
269	}
270
271	/**
272	* Only called at start of program
273	*/
274	void $cuda_setup() {
275	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
276	$cuda_default_stream = defaultStreamNode->stream;
277
278	$cuda_global_context.head = defaultStreamNode;
279	$cuda_global_context.numStreams = 1;
280	}
281
282	/**
283	* Only called at end of program
284	*/
285	void $cuda_teardown() {
286	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
287	$wait(destructor);
288	}
289
290	// Helper function
291	int $dim3_index(dim3 size, uint3 location) {
292	return location.x + size.x * (location.y + size.y * location.z);
293	}
294
295	// Helper function
296	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
297	return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
298	}
299
300	void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
301	//TODO: calculate length and index, replace this function in the kernel
302	//$local_start();
303	int length = dim.x * dim.y * dim.z;
304	$proc proc_array[length];
305	$range rx = 0 .. dim.x - 1;
306	$range ry = 0 .. dim.y - 1;
307	$range rz = 0 .. dim.z - 1;
308	$domain(3) dom = ($domain){rx, ry, rz};
309	#For some reason there is depends on all here
310	$for(int x,y,z : dom){
311	uint3 id = { x, y, z };
312	int index = $dim3_index(dim, id);
313	proc_array[index] = $spawn spawningFunction(id);
314	}
315	//$local_end();
316	$waitall(proc_array,length);
317	}
318
319	void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
320	const float A, const float B, float *C, int numElements) {
321	void _cuda_block(uint3 blockIdx) {
322	int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
323	$scope _block_root = $here;
324	$gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
325	void _cuda_thread(uint3 threadIdx) {
326	int _cuda_tid = $dim3_index(blockDim, threadIdx);
327	int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
328	$barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
329	$local_start();
330	// Kernel definition start
331
332	int i = blockDim.x * blockIdx.x + threadIdx.x;
333
334	if (i < numElements)
335	{
336	C[i] = A[i] + B[i];
337	}
338
339	// Kernel definition end
340	$local_end();
341	$barrier_destroy(_cuda_thread_barrier);
342	}
343	$cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
344	$gbarrier_destroy(_cuda_block_barrier);
345	}
346	$cuda_run_and_wait_on_procs(gridDim, _cuda_block);
347	}
348
349	void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
350	_Bool* enqueuedFlag, cudaStream_t _cudaStream,
351	const float A, const float B, float *C, int numElements) {
352	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, _cudaStream);
353	$when(opState->start);
354	_cuda_kernel_1(gridDim, blockDim, _cudaMemSize, A, B, C, numElements);
355	$stream_dequeue(_cudaStream);
356	}
357
358	$input int N;
359	$assume (N > 0);
360	$input float A[N];
361	$input float B[N];
362
363	void _civl_main() {
364	int size = N * sizeof(float);
365	int numBlocks = 2;
366	int numThreads = N%2 == 0? N/2 : (N+1)/2;
367
368	float* cuda_A;
369	// cudaMalloc((void **)&cuda_A, size);
370	{
371	cuda_A = (float *) malloc(size);
372	}
373	cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
374
375	float* cuda_B;
376	// cudaMalloc((void **)&cuda_B, size);
377	{
378	cuda_B = (float *) malloc(size);
379	}
380	cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
381
382	float* cuda_C;
383	// cudaMalloc((void **)&cuda_C, size);
384	{
385	cuda_C = (float *) malloc(size);
386	}
387
388	{ // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
389	dim3 gridDim = {numBlocks, 1, 1};
390	dim3 blockDim = {numThreads, 1, 1};
391	_Bool enqueuedFlag = false;
392	$spawn $proc_kernel_1(gridDim, blockDim, 0, &enqueuedFlag, NULL, cuda_A, cuda_B, cuda_C, N);
393	$when(enqueuedFlag);
394	}
395
396	//Checking correctness
397	float* C = (float *)malloc(size);
398
399	cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
400
401	for(int i = 0; i < N; i++)
402	$assert(C[i] == A[i] + B[i]);
403
404	free(C);
405
406	//cudaFree(cuda_A);...
407	free(cuda_A);
408	free(cuda_B);
409	free(cuda_C);
410	}
411
412	int main() {
413	$cuda_setup();
414	_civl_main();
415	$cuda_teardown();
416	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format