source: CIVL/examples/cuda/newCudaMockup.cvl@ 7d77e64

1.23 2.0 main test-branch
Last change on this file since 7d77e64 was b2ca0b6, checked in by Alex Wilton <awilton@…>, 3 years ago

Factored out $reveal code in cuda mockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5763 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 16.1 KB
Line 
1#include <concurrency.cvh>
2#include <comm.cvh>
3#include <stdlib.h>
4#include <stdio.h>
5#include <stdbool.h>
6#include <string.h>
7#pragma CIVL ACSL
8
9///////////
10// Types //
11///////////
12
13enum cudaError {
14 cudaSuccess
15};
16typedef enum cudaError cudaError_t;
17
18typedef enum cudaMemcpyKind {
19 cudaMemcpyHostToHost,
20 cudaMemcpyHostToDevice,
21 cudaMemcpyDeviceToHost,
22 cudaMemcpyDeviceToDevice,
23 cudaMemcpyDefault
24} cudaMemcpyKind;
25
26typedef struct {
27 unsigned int x, y, z;
28} dim3;
29
30/* used to represent a location in a three dimensional grid
31 */
32typedef struct {
33 unsigned int x, y, z;
34} uint3;
35
36typedef struct $cuda_op_state* $cuda_op_state_t;
37struct $cuda_op_state {
38 _Bool start;
39 $proc op;
40};
41
42typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
43struct $cuda_op_state_node {
44 $cuda_op_state_t opState;
45 $cuda_op_state_node_t next;
46};
47
48typedef struct cudaStream* cudaStream_t;
49typedef struct $cuda_stream_node* $cuda_stream_node_t;
50struct cudaStream {
51 $cuda_op_state_node_t head;
52 $cuda_op_state_node_t tail;
53 int numOps;
54 $cuda_stream_node_t containingNode;
55 _Bool alive;
56};
57
58struct $cuda_stream_node {
59 cudaStream_t stream;
60 $cuda_stream_node_t prev;
61 $cuda_stream_node_t next;
62};
63
64typedef struct $cuda_context {
65 $cuda_stream_node_t head;
66 int numStreams;
67} $cuda_context;
68
69typedef struct {
70 $mem readSets[];
71 $mem writeSets[];
72 int size;
73} $cuda_kernel_instance;
74
75typedef struct $cuda_memcpy_data {
76 void* dst;
77 const void* src;
78 size_t count;
79 cudaMemcpyKind kind;
80} $cuda_memcpy_data;
81
82//////////////////////
83// Global Variables //
84//////////////////////
85
86$gcomm $cuda_gcomm = $gcomm_create($here, 2);
87const int $CUDA_PLACE_HOST = 0;
88const int $CUDA_PLACE_DEVICE = 1;
89$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
90
91/**
92 * Tags used for message-passing between host and device
93 */
94enum $cuda_tag {
95 // Predefined tags
96 $CUDA_TAG_TEARDOWN,
97 $CUDA_TAG_SCOPE_REQUEST,
98 $CUDA_TAG_cudaFree,
99 $CUDA_TAG_cudaMemcpy,
100 $CUDA_TAG_cudaMemcpyAsync,
101 // Generated tags (by transformer)
102 $CUDA_TAG_LAUNCH_kernel_1
103};
104
105///////////////////
106// CIVL-CUDA API //
107///////////////////
108
109$scope $cuda_host_request_device_scope() {
110 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
111 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
112 $scope result;
113 $message_unpack(response, &result, sizeof($scope));
114
115 return result;
116}
117
118void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
119 if (kind == cudaMemcpyHostToHost) {
120 memcpy(dst, src, count);
121 } else {
122 $cuda_memcpy_data args;
123 args.dst = dst;
124 args.src = src;
125 args.count = count;
126 args.kind = kind;
127
128 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
129
130 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
131 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
132 }
133}
134
135$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
136 cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof(struct cudaStream));
137 newStream->head = NULL;
138 newStream->tail = NULL;
139 newStream->numOps = 0;
140 newStream->alive = true;
141
142 $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof(struct $cuda_stream_node));
143 newHead->stream = newStream;
144 newStream->containingNode = newHead;
145 newHead->prev = NULL;
146 newHead->next = NULL;
147
148 return newHead;
149}
150
151/*@ depends_on \nothing;
152 @ assigns \nothing;
153 @ reads \nothing;
154 @*/
155$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
156 $proc lastOpProc = $proc_null;
157 cudaStream_t stream = node->stream;
158
159 if (node->prev != NULL) {
160 node->prev->next = node->next;
161 }
162 if (node->next != NULL) {
163 node->next->prev = node->prev;
164 }
165 free(node);
166
167 stream->alive = false;
168 if(stream->tail != NULL)
169 lastOpProc = stream->tail->opState->op;
170
171 void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
172 $wait(lastOpProc);
173 free(stream);
174 }
175
176 return $spawn destroyStreamWhenComplete(lastOpProc, stream);
177}
178
179/*@ depends_on \access(stream);
180 @ assigns stream;
181 @ reads \nothing;
182 @*/
183$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
184 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
185
186 $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof(struct $cuda_op_state));
187 newOpState->start = false;
188 newOpState->op = $spawn opProc(opParams, newOpState, stream);
189
190 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof(struct $cuda_op_state_node));
191 newOpStateNode->opState = newOpState;
192 newOpStateNode->next = NULL;
193
194 if (stream->tail == NULL) {
195 stream->head = newOpStateNode;
196 stream->tail = newOpStateNode;
197 newOpState->start = true;
198 } else {
199 stream->tail->next = newOpStateNode;
200 stream->tail = newOpStateNode;
201 }
202 stream->numOps++;
203
204 return newOpState->op;
205}
206
207/*@ depends_on \nothing;
208 @ assigns \nothing;
209 @ reads \nothing;
210 @*/
211$atomic_f void $stream_dequeue(cudaStream_t stream) {
212 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
213
214 if (stream->head == stream->tail) {
215 stream->tail = NULL;
216 }
217
218 $cuda_op_state_node_t oldHead = stream->head;
219 stream->head = oldHead->next;
220 if (stream->head != NULL) {
221 stream->head->opState->start = true;
222 }
223
224 stream->numOps--;
225 free(oldHead->opState);
226 free(oldHead);
227}
228
229// Helper function
230int $dim3_index(dim3 size, uint3 location) {
231 return location.x + size.x * (location.y + size.y * location.z);
232}
233
234// Helper function
235int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
236 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
237}
238
239void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
240 //TODO: calculate length and index, replace this function in the kernel
241 $local_start();
242 int length = dim.x * dim.y * dim.z;
243 $proc procArray[length];
244 $range rx = 0 .. dim.x - 1;
245 $range ry = 0 .. dim.y - 1;
246 $range rz = 0 .. dim.z - 1;
247 $domain(3) dom = ($domain(3)){rx, ry, rz};
248 $for(int x,y,z : dom){
249 uint3 id = { x, y, z };
250 int index = $dim3_index(dim, id);
251 procArray[index] = $spawn spawningFunction(id);
252 }
253 $local_end();
254 $waitall(procArray,length);
255}
256
257
258// CUDA Ops //
259
260void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
261
262 $when(opState->start);
263 $cuda_memcpy_data args;
264 $message_unpack(m, &args, sizeof($cuda_memcpy_data));
265
266 if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
267 args.dst = $reveal(args.dst);
268 }
269 if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
270 args.src = $reveal(args.src);
271 }
272 memcpy(args.dst, args.src, args.count);
273
274 $stream_dequeue(stream);
275}
276
277$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
278 $cuda_memcpy_data args;
279 $message_unpack(request, &args, sizeof($cuda_memcpy_data));
280
281 $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);
282
283 if (!async && args.kind != cudaMemcpyDeviceToDevice) {
284 $wait(memcpyProc);
285 }
286 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
287
288 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
289}
290
291$message $cuda_free($message request) {
292 void* devPtr;
293 $message_unpack(request, &devPtr, sizeof(void*));
294 free($reveal(devPtr));
295
296 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
297}
298
299////////////////////////////////////////////
300// CUDA API Functions (For Host-use Only) //
301////////////////////////////////////////////
302
303cudaError_t cudaFree(void* devPtr) {
304 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
305 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
306
307 return cudaSuccess;
308}
309
310cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
311 $cuda_helper_host_memcpy(dst, src, count, kind, false);
312 return cudaSuccess;
313}
314
315cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
316 cudaMemcpyKind kind, cudaStream_t stream) {
317 $cuda_helper_host_memcpy(dst, src, count, kind, true);
318 return cudaSuccess;
319}
320
321/*
322cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
323 // Create new stream node in linked list
324 $cuda_stream_node_t newHead = $create_new_stream_node();
325 newHead->next = $cuda_global_context.head;
326 $cuda_global_context.head->prev = newHead;
327
328 // Update cuda context's head to be the new node we created
329 $cuda_global_context.head = newHead;
330 $cuda_global_context.numStreams++;
331
332 return cudaSuccess;
333}
334*/
335
336/*
337cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
338 stream = $default_stream_if_null(stream);
339 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
340 $when(stream->head == NULL) return cudaSuccess;
341}
342*/
343
344/*
345cudaError_t cudaStreamDestroy(cudaStream_t stream) {
346 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
347 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
348 $destroy_stream_node(stream->containingNode);
349 return cudaSuccess;
350}
351*/
352
353/*
354cudaError_t cudaDeviceSynchronize() {
355 $proc* opsToWaitOn;
356 int numOps = 0;
357
358 $atomic {
359 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
360
361 for ($cuda_stream_node_t node = $cuda_global_context.head;
362 node != NULL;
363 node = node->next) {
364 if (node->stream->tail != NULL) {
365 opsToWaitOn[numOps] = node->stream->tail->opState->op;
366 numOps++;
367 }
368 }
369 }
370 $waitall(opsToWaitOn, numOps);
371
372 return cudaSuccess;
373}
374*/
375
376//////////////////////////////////
377// Generated code from kernel_1 //
378//////////////////////////////////
379
380typedef struct {
381 dim3 gridDim;
382 dim3 blockDim;
383 size_t $cudaMemSize;
384 cudaStream_t $cudaStream;
385 const float* A;
386 const float* B;
387 float* C;
388 int numElements;
389} $cuda_kernel_1_data;
390
391void $cuda_reveal_kernel_1_args($cuda_kernel_1_data* args) {
392 args->A = $reveal(args->A);
393 args->B = $reveal(args->B);
394 args->C = $reveal(args->C);
395}
396
397void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
398 size_t $cudaMemSize, cudaStream_t $cudaStream,
399 const float* A, const float* B, float* C, int numElements) {
400 $cuda_kernel_1_data args;
401 args.gridDim = gridDim;
402 args.blockDim = blockDim;
403 args.$cudaMemSize = $cudaMemSize;
404 args.$cudaStream = $cudaStream;
405 args.A = A;
406 args.B = B;
407 args.C = C;
408 args.numElements = numElements;
409
410 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
411 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
412}
413
414void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
415 const float *A, const float *B, float *C, int numElements) {
416 $cuda_kernel_instance $kernel;
417 void _cuda_block(uint3 blockIdx) {
418 int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
419 $scope _block_root = $here;
420 $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
421 void _cuda_thread(uint3 threadIdx) {
422 int _cuda_tid = $dim3_index(blockDim, threadIdx);
423 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
424 $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
425 $local_start();
426 // Kernel definition start
427
428 int i = blockDim.x * blockIdx.x + threadIdx.x;
429
430 if (i < numElements)
431 {
432 C[i] = A[i] + B[i];
433 }
434
435 // Kernel definition end
436 $local_end();
437 $barrier_destroy(_cuda_thread_barrier);
438 }
439 $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
440 $gbarrier_destroy(_cuda_block_barrier);
441 }
442 $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
443}
444
445void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
446 $when(opState->start);
447
448 $cuda_kernel_1_data args;
449 $message_unpack(request, &args, sizeof($cuda_kernel_1_data));
450 $cuda_reveal_kernel_1_args(&args);
451
452 $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
453 $stream_dequeue(cudaStream);
454}
455
456/////////////////
457// CUDA "file" //
458/////////////////
459
460void $cuda_main() {
461
462 // Device Variables
463
464 $scope $cuda_scope = $here;
465
466 $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
467 $cuda_context $cuda_global_context;
468 cudaStream_t $cuda_default_stream;
469
470 // Helper function to get the default stream if passed NULL, and just returns stream otherwise
471 // Currently unused until we support streams other than the default one.
472 cudaStream_t $default_stream_if_null(cudaStream_t stream) {
473 return stream == NULL ? $cuda_default_stream : stream;
474 }
475
476 // Device Logic
477
478 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
479 $cuda_default_stream = defaultStreamNode->stream;
480
481 $cuda_global_context.head = defaultStreamNode;
482 $cuda_global_context.numStreams = 1;
483
484 while (true) {
485 $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
486 $message response;
487 const int tag = $message_tag(request);
488
489 switch(tag) {
490 case $CUDA_TAG_SCOPE_REQUEST :
491 response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
492 break;
493 case $CUDA_TAG_cudaFree :
494 response = $cuda_free(request);
495 break;
496 case $CUDA_TAG_cudaMemcpy :
497 response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
498 break;
499 case $CUDA_TAG_cudaMemcpyAsync :
500 response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
501 break;
502 case $CUDA_TAG_LAUNCH_kernel_1 :
503 $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);
504
505 response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
506 break;
507 case $CUDA_TAG_TEARDOWN : {
508 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
509 $wait(destructor);
510 $comm_destroy($cuda_device_comm);
511 return;
512 }
513 default :
514 $assert(false, "Unknown CUDA request");
515 }
516
517 $comm_enqueue($cuda_device_comm, response);
518 }
519}
520
521///////////////
522// Host file //
523///////////////
524
525$input int N;
526$assume (N > 0);
527$input float A[N];
528$input float B[N];
529
530void $host_main() {
531 int size = N * sizeof(float);
532 int numBlocks = 2;
533 int numThreads = N%2 == 0? N/2 : (N+1)/2;
534
535 float* cuda_A;
536 // cudaMalloc((void **)&cuda_A, size);
537 {
538 $scope deviceScope = $cuda_host_request_device_scope();
539 cuda_A = $hide((float*)$malloc(deviceScope, size));
540 }
541 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
542
543 float* cuda_B;
544 // cudaMalloc((void **)&cuda_B, size);
545 {
546 $scope deviceScope = $cuda_host_request_device_scope();
547 cuda_B = $hide((float*)$malloc(deviceScope, size));
548 }
549 cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
550
551 float* cuda_C;
552 // cudaMalloc((void **)&cuda_C, size);
553 {
554 $scope deviceScope = $cuda_host_request_device_scope();
555 cuda_C = $hide((float*)$malloc(deviceScope, size));
556 }
557
558 dim3 gridDim = {numBlocks, 1, 1};
559 dim3 blockDim = {numThreads, 1, 1};
560 // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
561 $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
562
563 // Checking correctness
564 float* C = (float *)malloc(size);
565
566 cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
567
568 for(int i = 0; i < N; i++)
569 $assert(C[i] == A[i] + B[i]);
570
571 free(C);
572
573 cudaFree(cuda_A);
574 cudaFree(cuda_B);
575 cudaFree(cuda_C);
576
577 // inserted by transformer
578 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
579 $comm_destroy($cuda_host_comm);
580}
581
582int main() {
583 $proc host = $spawn $host_main();
584 $proc cuda = $spawn $cuda_main();
585 $wait(host);
586 $wait(cuda);
587 $gcomm_destroy($cuda_gcomm, NULL);
588}
Note: See TracBrowser for help on using the repository browser.