source: CIVL/examples/cuda/newCudaMockup.cvl@ a8faba8

1.23 2.0 main test-branch
Last change on this file since a8faba8 was 6dd0322, checked in by Alex Wilton <awilton@…>, 3 years ago

Fixed domain type bug. Added first iteration of unreachable pointers for CUDA. Started refactoring newCudaMockup to use message-passing and unreachable pointers.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5734 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 14.6 KB
Line 
1/**
2 * TODO:
3 * - implement cudaMemset and cudaMemsetAsync
4 * - flesh out basic structure of cuda kernel:
5 * - spawn gridDim blocks
6 * - spawn blockDim threads
7 * Alternatively, spawn blockDim warps and then spawn warp's threads
8 * - wait for blocks to finish
9 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10 * - Add in data race checking support and implement atomicAdd for integers
11 * - Handle dependencies at atomic blocks (replace some with local blocks if possible)
12 * - Create device scope for cudaMallocs
13 */
14
15#include <concurrency.cvh>
16#include <comm.cvh>
17#include <stdlib.h>
18#include <stdio.h>
19#include <stdbool.h>
20#include <string.h>
21
22///////////
23// Types //
24///////////
25
26enum cudaError {
27 cudaSuccess
28};
29typedef enum cudaError cudaError_t;
30
31typedef enum cudaMemcpyKind {
32 cudaMemcpyHostToHost,
33 cudaMemcpyHostToDevice,
34 cudaMemcpyDeviceToHost,
35 cudaMemcpyDeviceToDevice,
36 cudaMemcpyDefault
37} cudaMemcpyKind;
38
39typedef struct {
40 unsigned int x, y, z;
41} dim3;
42
43/* used to represent a location in a three dimensional grid
44 */
45typedef struct {
46 unsigned int x, y, z;
47} uint3;
48
49typedef struct $cuda_op_state* $cuda_op_state_t;
50struct $cuda_op_state {
51 _Bool start;
52 $proc op;
53};
54
55typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
56struct $cuda_op_state_node {
57 $cuda_op_state_t opState;
58 $cuda_op_state_node_t next;
59};
60
61typedef struct cudaStream* cudaStream_t;
62typedef struct $cuda_stream_node* $cuda_stream_node_t;
63struct cudaStream {
64 $cuda_op_state_node_t head;
65 $cuda_op_state_node_t tail;
66 int numOps;
67 $cuda_stream_node_t containingNode;
68 _Bool alive;
69};
70
71struct $cuda_stream_node {
72 cudaStream_t stream;
73 $cuda_stream_node_t prev;
74 $cuda_stream_node_t next;
75};
76
77typedef struct $cuda_context {
78 $cuda_stream_node_t head; //list of streams
79 int numStreams;
80} $cuda_context;
81
82//////////////////////
83// Global Variables //
84//////////////////////
85
86$gcomm $cuda_gcomm = $gcomm_create($here, 2);
87const int $CUDA_PLACE_HOST = 0;
88const int $CUDA_PLACE_DEVICE = 1;
89$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $cuda_place_host);
90
91/**
92 * Tags used for message-passing between host and device
93 */
94// Predefined tags
95const int $CUDA_TAG_TEARDOWN = 0;
96const int $CUDA_TAG_cudaMalloc = 1;
97const int $CUDA_TAG_cudaMemcpy = 2;
98// Generated tags (by transformer)
99const int $CUDA_TAG_LAUNCH_kernel_1 = 5;
100
101////////////////////////////////////////////
102// CUDA API Functions (For Host-use Only) //
103////////////////////////////////////////////
104
105cudaError_t cudaMalloc(void** devPtr, size_t size) {
106 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
107 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
108 $message_unpack(response, devPtr, sizeof(void*));
109}
110
111typedef struct $cuda_memcpy_args {
112 void* dst;
113 void src;
114 size_t count;
115 cudaMemcpyKind kind;
116} $cuda_memcpy_args;
117
118cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
119 if (kind == cudaMemcpyHostToHost) {
120 memcpy(dst, src, count);
121 } else {
122 $cuda_memcpy_args args;
123 args.dst = dst;
124 args.src = src;
125 args.count = count;
126 args.kind = kind;
127
128 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMemcpy, &args, sizeof($cuda_memcpy_args)));
129 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
130 }
131
132 return cudaSuccess;
133}
134
135$cuda_stream_node_t $create_new_stream_node() {
136 cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
137 newStream->head = NULL;
138 newStream->tail = NULL;
139 newStream->numOps = 0;
140 newStream->alive = true;
141
142 $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
143 newHead->stream = newStream;
144 newStream->containingNode = newHead;
145 newHead->prev = NULL;
146 newHead->next = NULL;
147
148 return newHead;
149}
150
151/**
152 * TODO:
153 * - test
154 * - atomic?
155 */
156cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
157 // Create new stream node in linked list
158 $cuda_stream_node_t newHead = $create_new_stream_node();
159 newHead->next = $cuda_global_context.head;
160 $cuda_global_context.head->prev = newHead;
161
162 // Update cuda context's head to be the new node we created
163 $cuda_global_context.head = newHead;
164 $cuda_global_context.numStreams++;
165
166 return cudaSuccess;
167}
168
169/**
170 * TODO:
171 * - test
172 * - atomic?
173 */
174cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
175 stream = $default_stream_if_null(stream);
176 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
177 $when(stream->head == NULL) return cudaSuccess;
178}
179
180//@ depends_on \nothing;
181$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
182 $proc lastOpProc = $proc_null;
183 cudaStream_t stream = node->stream;
184
185 if (node->prev != NULL) {
186 node->prev->next = node->next;
187 }
188 if (node->next != NULL) {
189 node->next->prev = node->prev;
190 }
191 free(node);
192
193 stream->alive = false;
194 if(stream->tail != NULL)
195 lastOpProc = stream->tail->opState->op;
196
197 void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
198 $wait(lastOpProc);
199 free(stream);
200 }
201
202 return $spawn $destroy_stream_when_complete(lastOpProc, stream);
203}
204
205// TODO: atomic
206cudaError_t cudaStreamDestroy(cudaStream_t stream) {
207 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
208 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
209 $destroy_stream_node(stream->containingNode);
210 return cudaSuccess;
211}
212
213/**
214 * Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
215 */
216//@ depends_on \nothing;
217$atomic_f $cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
218 $cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
219 newOpState->start = false;
220 newOpState->op = $self;
221
222 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof(struct $cuda_op_state_node));
223 newOpStateNode->opState = newOpState;
224 newOpStateNode->next = NULL;
225
226 stream = $default_stream_if_null(stream);
227 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
228
229 if (stream->tail == NULL) {
230 stream->head = newOpStateNode;
231 stream->tail = newOpStateNode;
232 newOpState->start = true;
233 } else {
234 stream->tail->next = newOpStateNode;
235 stream->tail = newOpStateNode;
236 }
237 stream->numOps++;
238 *enqueuedFlag = true;
239
240 return newOpState;
241}
242
243//@ depends_on \nothing;
244$atomic_f void $stream_dequeue(cudaStream_t stream) {
245 stream = $default_stream_if_null(stream);
246 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
247
248 if (stream->head == stream->tail) {
249 stream->tail = NULL;
250 }
251
252 $cuda_op_state_node_t oldHead = stream->head;
253 stream->head = oldHead->next;
254 if (stream->head != NULL) {
255 stream->head->opState->start = true;
256 }
257
258 stream->numOps--;
259 free(oldHead->opState);
260 free(oldHead);
261}
262
263cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
264 cudaMemcpyKind kind, cudaStream_t stream) {
265 if (kind == cudaMemcpyHostToHost) {
266 memcpy(dst, src, count);
267 } else {
268 _Bool enqueuedFlag = false;
269 $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, stream);
270 $when(enqueuedFlag);
271 }
272
273 return cudaSuccess;
274}
275
276cudaError_t cudaDeviceSynchronize() {
277 $proc* opsToWaitOn;
278 int numOps = 0;
279
280 $atomic {
281 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
282
283 for ($cuda_stream_node_t node = $cuda_global_context.head;
284 node != NULL;
285 node = node->next) {
286 if (node->stream->tail != NULL) {
287 opsToWaitOn[numOps] = node->stream->tail->opState->op;
288 numOps++;
289 }
290 }
291 }
292 $waitall(opsToWaitOn, numOps);
293
294 return cudaSuccess;
295}
296
297// Helper function
298int $dim3_index(dim3 size, uint3 location) {
299 return location.x + size.x * (location.y + size.y * location.z);
300}
301
302// Helper function
303int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
304 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
305}
306
307void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
308 //TODO: calculate length and index, replace this function in the kernel
309 $local_start();
310 int length = dim.x * dim.y * dim.z;
311 $proc proc_array[length];
312 $range rx = 0 .. dim.x - 1;
313 $range ry = 0 .. dim.y - 1;
314 $range rz = 0 .. dim.z - 1;
315 $domain(3) dom = ($domain(3)){rx, ry, rz};
316 $for(int x,y,z : dom){
317 uint3 id = { x, y, z };
318 int index = $dim3_index(dim, id);
319 proc_array[index] = $spawn spawningFunction(id);
320 }
321 $local_end();
322 $waitall(proc_array,length);
323}
324
325void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
326 _Bool* enqueuedFlag, cudaStream_t _cudaStream,
327 const float *A, const float *B, float *C, int numElements) {
328 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, _cudaStream);
329 $when(opState->start);
330 _cuda_kernel_1(gridDim, blockDim, _cudaMemSize, A, B, C, numElements);
331 $stream_dequeue(_cudaStream);
332}
333
334/////////////////
335// CUDA "file" //
336/////////////////
337
338void _cuda_main() {
339
340 //////////////////////
341 // Device Variables //
342 //////////////////////
343
344 $comm $cuda_device_comm = $comm_create($here, $cuda_gcomm, 1);
345 $cuda_context $cuda_global_context;
346 cudaStream_t $cuda_default_stream;
347
348 ///////////////////////////////
349 // CUDA Function Definitions //
350 ///////////////////////////////
351
352 /**
353 * Only called at start of program
354 */
355 void $cuda_setup() {
356 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
357 $cuda_default_stream = defaultStreamNode->stream;
358
359 $cuda_global_context.head = defaultStreamNode;
360 $cuda_global_context.numStreams = 1;
361 }
362
363 /**
364 * Only called at end of program
365 */
366 void $cuda_teardown() {
367 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
368 $wait(destructor);
369 }
370
371 $message $cuda_malloc($message request) {
372 size_t size;
373 $message_unpack(request, &size, sizeof(size_t));
374 void* ptr = $malloc($here, size);
375
376 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaMalloc, &$make_unreachable(ptr), sizeof(void*));
377 }
378
379 void $cuda_memcpy_proc(void* dst, const void* src, size_t count, _Bool* enqueuedFlag, cudaStream_t stream) {
380 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
381 $when(opState->start);
382 memcpy(dst, src, count);
383 $stream_dequeue(stream);
384 }
385
386 $message $cuda_memcpy($message request) {
387 $cuda_memcpy_args args;
388 $message_unpack(request, &args, sizeof($cuda_memcpy_args));
389
390 _Bool enqueuedFlag = false;
391 $proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, &enqueuedFlag, $cuda_default_stream);
392 $when(enqueuedFlag);
393 if (args.kind != cudaMemcpyDeviceToDevice) {
394 $wait(memcpyProc);
395 }
396
397 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaMemcpy, null, 0);
398 }
399
400 // Helper function to get the default stream if passed NULL, and just returns stream otherwise
401 cudaStream_t $default_stream_if_null(cudaStream_t stream) {
402 return stream == NULL ? $cuda_default_stream : stream;
403 }
404
405 ////////////////////////
406 // Kernel Definitions //
407 ////////////////////////
408
409 // Generated from kernel_1 definition
410 void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
411 const float *A, const float *B, float *C, int numElements) {
412 void _cuda_block(uint3 blockIdx) {
413 int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
414 $scope _block_root = $here;
415 $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
416 void _cuda_thread(uint3 threadIdx) {
417 int _cuda_tid = $dim3_index(blockDim, threadIdx);
418 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
419 $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
420 $local_start();
421 // Kernel definition start
422
423 int i = blockDim.x * blockIdx.x + threadIdx.x;
424
425 if (i < numElements)
426 {
427 C[i] = A[i] + B[i];
428 }
429
430 // Kernel definition end
431 $local_end();
432 $barrier_destroy(_cuda_thread_barrier);
433 }
434 $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
435 $gbarrier_destroy(_cuda_block_barrier);
436 }
437 $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
438 }
439
440 /////////////////
441 // Device main //
442 /////////////////
443
444 $cuda_setup();
445
446 while (true) {
447 $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
448 $message response;
449 int tag = $message_tag(request);
450
451 switch(tag) {
452 case $CUDA_TAG_TEARDOWN :
453 $cuda_teardown();
454 return;
455 case $CUDA_TAG_cudaMalloc :
456 response = $cuda_malloc(request);
457 break;
458 case $CUDA_TAG_cudaMemcpy :
459 response = $cuda_memcpy(request);
460 break;
461 case $CUDA_TAG_LAUNCH_kernel_1 :
462 response = ???;
463 default :
464 $assert(false, "Unknown CUDA request");
465 }
466
467 $comm_enqueue($cuda_device_comm, response);
468 }
469}
470
471///////////////
472// Host file //
473///////////////
474
475$input int N;
476$assume (N > 0);
477$input float A[N];
478$input float B[N];
479
480void _host_main() {
481 int size = N * sizeof(float);
482 int numBlocks = 2;
483 int numThreads = N%2 == 0? N/2 : (N+1)/2;
484
485 float* cuda_A;
486 foo((void **)&cuda_A, size);
487 /*
488 {
489 cuda_A = (float *) malloc(size);
490 }
491 */
492 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
493
494 float* cuda_B;
495 // cudaMalloc((void **)&cuda_B, size);
496 {
497 cuda_B = (float *) malloc(size);
498 }
499 cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
500
501 float* cuda_C;
502 // cudaMalloc((void **)&cuda_C, size);
503 {
504 cuda_C = (float *) malloc(size);
505 }
506
507 { // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
508 dim3 gridDim = {numBlocks, 1, 1};
509 dim3 blockDim = {numThreads, 1, 1};
510 _Bool enqueuedFlag = false;
511 $spawn $proc_kernel_1(gridDim, blockDim, 0, &enqueuedFlag, NULL, cuda_A, cuda_B, cuda_C, N);
512 $when(enqueuedFlag);
513 }
514
515 //Checking correctness
516 float* C = (float *)malloc(size);
517
518 cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
519
520 for(int i = 0; i < N; i++)
521 $assert(C[i] == A[i] + B[i]);
522
523 free(C);
524
525 //cudaFree(cuda_A);...
526 free(cuda_A);
527 free(cuda_B);
528 free(cuda_C);
529
530 // inserted by transformer
531 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
532}
533
534int main() {
535 $proc host = $spawn _host_main();
536 $proc cuda = $spawn _cuda_main();
537 $wait(host);
538 $wait(cuda);
539}
Note: See TracBrowser for help on using the repository browser.