Context Navigation

newCudaMockup.cvl@ a8a5acb

1.23 2.0 main test-branch

Last change on this file since a8a5acb was 30cb1de, checked in by Alex Wilton <awilton@…>, 3 years ago

Added changes to newCudaMockup that Zane has been making as well as a few new fixes.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5686 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 9.6 KB

Line
1	/**
2	* TODO:
3	* - implement cudaMemset and cudaMemsetAsync
4	* - flesh out basic structure of cuda kernel:
5	* - spawn gridDim blocks
6	* - spawn blockDim threads
7	* Alternatively, spawn blockDim warps and then spawn warp's threads
8	* - wait for blocks to finish
9	* - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10	* - Add in data race checking support and implement atomicAdd for integers
11	*/
12
13	#include <concurrency.cvh>
14	#include <stdlib.h>
15	#include <stdbool.h>
16	#include <string.h>
17
18	enum cudaError {
19	cudaSuccess
20	};
21	typedef enum cudaError cudaError_t;
22
23	typedef enum cudaMemcpyKind {
24	cudaMemcpyHostToHost,
25	cudaMemcpyHostToDevice,
26	cudaMemcpyDeviceToHost,
27	cudaMemcpyDeviceToDevice,
28	cudaMemcpyDefault
29	} cudaMemcpyKind;
30
31	typedef struct {
32	unsigned int x, y, z;
33	} dim3;
34
35	/* used to represent a location in a three dimensional grid
36	*/
37	typedef struct {
38	unsigned int x, y, z;
39	} uint3;
40
41	typedef struct $cuda_op_state* $cuda_op_state_t;
42	struct $cuda_op_state {
43	_Bool start;
44	$proc op;
45	};
46
47	typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
48	struct $cuda_op_state_node {
49	$cuda_op_state_t opState;
50	$cuda_op_state_node_t next;
51	};
52
53	typedef struct cudaStream* cudaStream_t;
54	typedef struct $cuda_stream_node* $cuda_stream_node_t;
55	struct cudaStream {
56	$cuda_op_state_node_t head;
57	$cuda_op_state_node_t tail;
58	int numOps;
59	$cuda_stream_node_t containingNode;
60	_Bool alive;
61	};
62	cudaStream_t $cuda_default_stream;
63
64	struct $cuda_stream_node {
65	cudaStream_t stream;
66	$cuda_stream_node_t prev;
67	$cuda_stream_node_t next;
68	};
69
70	typedef struct $cuda_context {
71	$cuda_stream_node_t head; //list of streams
72	int numStreams;
73	} $cuda_context;
74	$cuda_context $cuda_global_context;
75
76	// Helper function to get the default stream if passed NULL, and just returns stream otherwise
77	cudaStream_t $default_stream_if_null(cudaStream_t stream) {
78	return stream == NULL ? $cuda_default_stream : stream;
79	}
80
81	$cuda_stream_node_t $create_new_stream_node() {
82	cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
83	newStream->head = NULL;
84	newStream->tail = NULL;
85	newStream->numOps = 0;
86	newStream->alive = true;
87
88	$cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
89	newHead->stream = newStream;
90	newStream->containingNode = newHead;
91	newHead->prev = NULL;
92	newHead->next = NULL;
93
94	return newHead;
95	}
96
97	/**
98	* TODO:
99	* - test
100	* - atomic?
101	*/
102	cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
103	// Create new stream node in linked list
104	$cuda_stream_node_t newHead = $create_new_stream_node();
105	newHead->next = $cuda_global_context.head;
106	$cuda_global_context.head->prev = newHead;
107
108	// Update cuda context's head to be the new node we created
109	$cuda_global_context.head = newHead;
110	$cuda_global_context.numStreams++;
111
112	return cudaSuccess;
113	}
114
115	/**
116	* TODO:
117	* - test
118	* - atomic?
119	*/
120	cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
121	stream = $default_stream_if_null(stream);
122	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
123	$when(stream->head == NULL) return cudaSuccess;
124	}
125
126	$proc $destroy_stream_node($cuda_stream_node_t node) {
127	$proc lastOpProc = $proc_null;
128	cudaStream_t stream = node->stream;
129
130	$atomic {
131	if (node->prev != NULL) {
132	node->prev->next = node->next;
133	}
134	if (node->next != NULL) {
135	node->next->prev = node->prev;
136	}
137	free(node);
138
139	stream->alive = false;
140	if(stream->tail != NULL)
141	lastOpProc = stream->tail->opState->op;
142	}
143
144	void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
145	$wait(lastOpProc);
146	free(stream);
147	}
148
149	return $spawn $destroy_stream_when_complete(lastOpProc, stream);
150	}
151
152	// TODO: atomic
153	cudaError_t cudaStreamDestroy(cudaStream_t stream) {
154	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
155	$assert(stream->alive, "Attempt to destroy an already destroyed stream");
156	$destroy_stream_node(stream->containingNode);
157	return cudaSuccess;
158	}
159
160	/**
161	* Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
162	*/
163	$cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
164	$cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
165	newOpState->start = false;
166	newOpState->op = $self;
167
168	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof(struct $cuda_op_state_node));
169	newOpStateNode->opState = newOpState;
170	newOpStateNode->next = NULL;
171
172	$atomic {
173	stream = $default_stream_if_null(stream);
174	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
175
176	if (stream->tail == NULL) {
177	stream->head = newOpStateNode;
178	stream->tail = newOpStateNode;
179	newOpState->start = true;
180	} else {
181	stream->tail->next = newOpStateNode;
182	stream->tail = newOpStateNode;
183	}
184	stream->numOps++;
185	*enqueuedFlag = true;
186	}
187
188	return newOpState;
189	}
190
191	void $stream_dequeue(cudaStream_t stream) {
192	stream = $default_stream_if_null(stream);
193	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
194
195	$atomic {
196	if (stream->head == stream->tail) {
197	stream->tail = NULL;
198	}
199
200	$cuda_op_state_node_t oldHead = stream->head;
201	stream->head = oldHead->next;
202	if (stream->head != NULL) {
203	stream->head->opState->start = true;
204	}
205
206	stream->numOps--;
207	free(oldHead->opState);
208	free(oldHead);
209	}
210	}
211
212	void $cuda_memcpy_proc(void* dst, const void* src, size_t count, _Bool* enqueuedFlag, cudaStream_t stream) {
213	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
214	$when(opState->start);
215	memcpy(dst, src, count);
216	$stream_dequeue(stream);
217	}
218
219	cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
220	if (kind == cudaMemcpyHostToHost) {
221	memcpy(dst, src, count);
222	} else {
223	_Bool enqueuedFlag = false;
224	$proc memcpyProc = $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, $cuda_default_stream);
225	$when(enqueuedFlag);
226	if (kind != cudaMemcpyDeviceToDevice) {
227	$wait(memcpyProc);
228	}
229	}
230
231	return cudaSuccess;
232	}
233
234	cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
235	cudaMemcpyKind kind, cudaStream_t stream) {
236	if (kind == cudaMemcpyHostToHost) {
237	memcpy(dst, src, count);
238	} else {
239	_Bool enqueuedFlag = false;
240	$spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, stream);
241	$when(enqueuedFlag);
242	}
243
244	return cudaSuccess;
245	}
246
247	cudaError_t cudaDeviceSynchronize() {
248	$proc* opsToWaitOn;
249	int numOps = 0;
250
251	$atomic {
252	opsToWaitOn = ($proc) malloc(sizeof($proc) $cuda_global_context.numStreams);
253
254	for ($cuda_stream_node_t node = $cuda_global_context.head;
255	node != NULL;
256	node = node->next) {
257	if (node->stream->tail != NULL) {
258	opsToWaitOn[numOps] = node->stream->tail->opState->op;
259	numOps++;
260	}
261	}
262	}
263	$waitall(opsToWaitOn, numOps);
264
265	return cudaSuccess;
266	}
267
268	/**
269	* Only called at start of program
270	*/
271	void $cuda_setup() {
272	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
273	$cuda_default_stream = defaultStreamNode->stream;
274
275	$cuda_global_context.head = defaultStreamNode;
276	$cuda_global_context.numStreams = 1;
277	}
278
279	/**
280	* Only called at end of program
281	*/
282	void $cuda_teardown() {
283	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
284	$wait(destructor);
285	}
286
287	// Helper function
288	int $dim3_index(dim3 size, uint3 location) {
289	return location.x + size.x * (location.y + size.y * location.z);
290	}
291
292	// Helper function
293	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
294	return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
295	}
296
297	void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
298	//TODO: calculate length and index, replace this function in the kernel
299	int length = dim.x * dim.y * dim.z;
300	$proc proc_array[length];
301	$range rx = 0 .. dim.x - 1;
302	$range ry = 0 .. dim.y - 1;
303	$range rz = 0 .. dim.z - 1;
304	$domain(3) dom = ($domain){rx, ry, rz};
305	$for(int x,y,z : dom){
306	uint3 id = { x, y, z };
307	int index = $dim3_index(dim, id);
308	proc_array[index] = $spawn spawningFunction(id);
309	}
310	$waitall(proc_array,length);
311	}
312
313	void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
314	int x) {
315	void _cuda_block(uint3 blockIdx) {
316	int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
317	$scope _block_root = $here;
318	$gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
319	void _cuda_thread(uint3 threadIdx) {
320	int _cuda_tid = $dim3_index(blockDim, threadIdx);
321	int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
322	$barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
323	//Kernel definition
324	$barrier_destroy(_cuda_thread_barrier);
325	}
326	$cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
327	$gbarrier_destroy(_cuda_block_barrier);
328	}
329	$cuda_run_and_wait_on_procs(gridDim, _cuda_block);
330	}
331
332	void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
333	_Bool* enqueuedFlag, cudaStream_t _cudaStream, int x) {
334	$cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, _cudaStream);
335	$when(opState->start);
336	_cuda_kernel_1(gridDim, blockDim, _cudaMemSize, x);
337	$stream_dequeue(_cudaStream);
338	}
339
340	void _civl_main() {
341	{
342	dim3 gridDim = {3, 2, 1};
343	dim3 blockDim = {3, 2, 1};
344	_Bool enqueuedFlag = false;
345	$spawn $proc_kernel_1(gridDim, blockDim, 0, &enqueuedFlag, NULL, 5);
346	$when(enqueuedFlag);
347	}
348	}
349
350	int main() {
351	$cuda_setup();
352	_civl_main();
353	$cuda_teardown();
354	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format