source: CIVL/mods/dev.civl.com/examples/cuda/newCudaMockup.cvl@ a8a5acb

1.23 2.0 main test-branch
Last change on this file since a8a5acb was 30cb1de, checked in by Alex Wilton <awilton@…>, 3 years ago

Added changes to newCudaMockup that Zane has been making as well as a few new fixes.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5686 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 9.6 KB
Line 
1/**
2 * TODO:
3 * - implement cudaMemset and cudaMemsetAsync
4 * - flesh out basic structure of cuda kernel:
5 * - spawn gridDim blocks
6 * - spawn blockDim threads
7 * Alternatively, spawn blockDim warps and then spawn warp's threads
8 * - wait for blocks to finish
9 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10 * - Add in data race checking support and implement atomicAdd for integers
11 */
12
13#include <concurrency.cvh>
14#include <stdlib.h>
15#include <stdbool.h>
16#include <string.h>
17
18enum cudaError {
19 cudaSuccess
20};
21typedef enum cudaError cudaError_t;
22
23typedef enum cudaMemcpyKind {
24 cudaMemcpyHostToHost,
25 cudaMemcpyHostToDevice,
26 cudaMemcpyDeviceToHost,
27 cudaMemcpyDeviceToDevice,
28 cudaMemcpyDefault
29} cudaMemcpyKind;
30
31typedef struct {
32 unsigned int x, y, z;
33} dim3;
34
35/* used to represent a location in a three dimensional grid
36 */
37typedef struct {
38 unsigned int x, y, z;
39} uint3;
40
41typedef struct $cuda_op_state* $cuda_op_state_t;
42struct $cuda_op_state {
43 _Bool start;
44 $proc op;
45};
46
47typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
48struct $cuda_op_state_node {
49 $cuda_op_state_t opState;
50 $cuda_op_state_node_t next;
51};
52
53typedef struct cudaStream* cudaStream_t;
54typedef struct $cuda_stream_node* $cuda_stream_node_t;
55struct cudaStream {
56 $cuda_op_state_node_t head;
57 $cuda_op_state_node_t tail;
58 int numOps;
59 $cuda_stream_node_t containingNode;
60 _Bool alive;
61};
62cudaStream_t $cuda_default_stream;
63
64struct $cuda_stream_node {
65 cudaStream_t stream;
66 $cuda_stream_node_t prev;
67 $cuda_stream_node_t next;
68};
69
70typedef struct $cuda_context {
71 $cuda_stream_node_t head; //list of streams
72 int numStreams;
73} $cuda_context;
74$cuda_context $cuda_global_context;
75
76// Helper function to get the default stream if passed NULL, and just returns stream otherwise
77cudaStream_t $default_stream_if_null(cudaStream_t stream) {
78 return stream == NULL ? $cuda_default_stream : stream;
79}
80
81$cuda_stream_node_t $create_new_stream_node() {
82 cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
83 newStream->head = NULL;
84 newStream->tail = NULL;
85 newStream->numOps = 0;
86 newStream->alive = true;
87
88 $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
89 newHead->stream = newStream;
90 newStream->containingNode = newHead;
91 newHead->prev = NULL;
92 newHead->next = NULL;
93
94 return newHead;
95}
96
97/**
98 * TODO:
99 * - test
100 * - atomic?
101 */
102cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
103 // Create new stream node in linked list
104 $cuda_stream_node_t newHead = $create_new_stream_node();
105 newHead->next = $cuda_global_context.head;
106 $cuda_global_context.head->prev = newHead;
107
108 // Update cuda context's head to be the new node we created
109 $cuda_global_context.head = newHead;
110 $cuda_global_context.numStreams++;
111
112 return cudaSuccess;
113}
114
115/**
116 * TODO:
117 * - test
118 * - atomic?
119 */
120cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
121 stream = $default_stream_if_null(stream);
122 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
123 $when(stream->head == NULL) return cudaSuccess;
124}
125
126$proc $destroy_stream_node($cuda_stream_node_t node) {
127 $proc lastOpProc = $proc_null;
128 cudaStream_t stream = node->stream;
129
130 $atomic {
131 if (node->prev != NULL) {
132 node->prev->next = node->next;
133 }
134 if (node->next != NULL) {
135 node->next->prev = node->prev;
136 }
137 free(node);
138
139 stream->alive = false;
140 if(stream->tail != NULL)
141 lastOpProc = stream->tail->opState->op;
142 }
143
144 void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
145 $wait(lastOpProc);
146 free(stream);
147 }
148
149 return $spawn $destroy_stream_when_complete(lastOpProc, stream);
150}
151
152// TODO: atomic
153cudaError_t cudaStreamDestroy(cudaStream_t stream) {
154 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
155 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
156 $destroy_stream_node(stream->containingNode);
157 return cudaSuccess;
158}
159
160/**
161 * Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
162 */
163$cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
164 $cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
165 newOpState->start = false;
166 newOpState->op = $self;
167
168 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof(struct $cuda_op_state_node));
169 newOpStateNode->opState = newOpState;
170 newOpStateNode->next = NULL;
171
172 $atomic {
173 stream = $default_stream_if_null(stream);
174 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
175
176 if (stream->tail == NULL) {
177 stream->head = newOpStateNode;
178 stream->tail = newOpStateNode;
179 newOpState->start = true;
180 } else {
181 stream->tail->next = newOpStateNode;
182 stream->tail = newOpStateNode;
183 }
184 stream->numOps++;
185 *enqueuedFlag = true;
186 }
187
188 return newOpState;
189}
190
191void $stream_dequeue(cudaStream_t stream) {
192 stream = $default_stream_if_null(stream);
193 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
194
195 $atomic {
196 if (stream->head == stream->tail) {
197 stream->tail = NULL;
198 }
199
200 $cuda_op_state_node_t oldHead = stream->head;
201 stream->head = oldHead->next;
202 if (stream->head != NULL) {
203 stream->head->opState->start = true;
204 }
205
206 stream->numOps--;
207 free(oldHead->opState);
208 free(oldHead);
209 }
210}
211
212void $cuda_memcpy_proc(void* dst, const void* src, size_t count, _Bool* enqueuedFlag, cudaStream_t stream) {
213 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
214 $when(opState->start);
215 memcpy(dst, src, count);
216 $stream_dequeue(stream);
217}
218
219cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
220 if (kind == cudaMemcpyHostToHost) {
221 memcpy(dst, src, count);
222 } else {
223 _Bool enqueuedFlag = false;
224 $proc memcpyProc = $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, $cuda_default_stream);
225 $when(enqueuedFlag);
226 if (kind != cudaMemcpyDeviceToDevice) {
227 $wait(memcpyProc);
228 }
229 }
230
231 return cudaSuccess;
232}
233
234cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
235 cudaMemcpyKind kind, cudaStream_t stream) {
236 if (kind == cudaMemcpyHostToHost) {
237 memcpy(dst, src, count);
238 } else {
239 _Bool enqueuedFlag = false;
240 $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, stream);
241 $when(enqueuedFlag);
242 }
243
244 return cudaSuccess;
245}
246
247cudaError_t cudaDeviceSynchronize() {
248 $proc* opsToWaitOn;
249 int numOps = 0;
250
251 $atomic {
252 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
253
254 for ($cuda_stream_node_t node = $cuda_global_context.head;
255 node != NULL;
256 node = node->next) {
257 if (node->stream->tail != NULL) {
258 opsToWaitOn[numOps] = node->stream->tail->opState->op;
259 numOps++;
260 }
261 }
262 }
263 $waitall(opsToWaitOn, numOps);
264
265 return cudaSuccess;
266}
267
268/**
269 * Only called at start of program
270 */
271void $cuda_setup() {
272 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
273 $cuda_default_stream = defaultStreamNode->stream;
274
275 $cuda_global_context.head = defaultStreamNode;
276 $cuda_global_context.numStreams = 1;
277}
278
279/**
280 * Only called at end of program
281 */
282void $cuda_teardown() {
283 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
284 $wait(destructor);
285}
286
287// Helper function
288int $dim3_index(dim3 size, uint3 location) {
289 return location.x + size.x * (location.y + size.y * location.z);
290}
291
292// Helper function
293int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
294 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
295}
296
297void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
298 //TODO: calculate length and index, replace this function in the kernel
299 int length = dim.x * dim.y * dim.z;
300 $proc proc_array[length];
301 $range rx = 0 .. dim.x - 1;
302 $range ry = 0 .. dim.y - 1;
303 $range rz = 0 .. dim.z - 1;
304 $domain(3) dom = ($domain){rx, ry, rz};
305 $for(int x,y,z : dom){
306 uint3 id = { x, y, z };
307 int index = $dim3_index(dim, id);
308 proc_array[index] = $spawn spawningFunction(id);
309 }
310 $waitall(proc_array,length);
311}
312
313void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
314 int x) {
315 void _cuda_block(uint3 blockIdx) {
316 int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
317 $scope _block_root = $here;
318 $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
319 void _cuda_thread(uint3 threadIdx) {
320 int _cuda_tid = $dim3_index(blockDim, threadIdx);
321 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
322 $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
323 //Kernel definition
324 $barrier_destroy(_cuda_thread_barrier);
325 }
326 $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
327 $gbarrier_destroy(_cuda_block_barrier);
328 }
329 $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
330}
331
332void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
333 _Bool* enqueuedFlag, cudaStream_t _cudaStream, int x) {
334 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, _cudaStream);
335 $when(opState->start);
336 _cuda_kernel_1(gridDim, blockDim, _cudaMemSize, x);
337 $stream_dequeue(_cudaStream);
338}
339
340void _civl_main() {
341 {
342 dim3 gridDim = {3, 2, 1};
343 dim3 blockDim = {3, 2, 1};
344 _Bool enqueuedFlag = false;
345 $spawn $proc_kernel_1(gridDim, blockDim, 0, &enqueuedFlag, NULL, 5);
346 $when(enqueuedFlag);
347 }
348}
349
350int main() {
351 $cuda_setup();
352 _civl_main();
353 $cuda_teardown();
354}
Note: See TracBrowser for help on using the repository browser.