source: CIVL/examples/cuda/newCudaMockup.cvl@ 1aaefd4

main test-branch
Last change on this file since 1aaefd4 was 7e3e7af, checked in by Alex Wilton <awilton@…>, 3 years ago

Fixed a small POR issue in cuda mockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5749 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 18.7 KB
RevLine 
[29eb398]1/**
2 * TODO:
3 * - implement cudaMemset and cudaMemsetAsync
4 * - flesh out basic structure of cuda kernel:
5 * - spawn gridDim blocks
6 * - spawn blockDim threads
7 * Alternatively, spawn blockDim warps and then spawn warp's threads
8 * - wait for blocks to finish
9 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10 * - Add in data race checking support and implement atomicAdd for integers
[874f9d1]11 * - Handle dependencies at atomic blocks (replace some with local blocks if possible)
[29eb398]12 */
13
[30cb1de]14#include <concurrency.cvh>
[6dd0322]15#include <comm.cvh>
[30cb1de]16#include <stdlib.h>
[874f9d1]17#include <stdio.h>
[30cb1de]18#include <stdbool.h>
19#include <string.h>
20
[6dd0322]21///////////
22// Types //
23///////////
24
[109d05e]25enum cudaError {
26 cudaSuccess
27};
28typedef enum cudaError cudaError_t;
29
30typedef enum cudaMemcpyKind {
31 cudaMemcpyHostToHost,
32 cudaMemcpyHostToDevice,
33 cudaMemcpyDeviceToHost,
34 cudaMemcpyDeviceToDevice,
35 cudaMemcpyDefault
36} cudaMemcpyKind;
37
[30cb1de]38typedef struct {
39 unsigned int x, y, z;
40} dim3;
41
42/* used to represent a location in a three dimensional grid
43 */
44typedef struct {
45 unsigned int x, y, z;
46} uint3;
47
48typedef struct $cuda_op_state* $cuda_op_state_t;
49struct $cuda_op_state {
[109d05e]50 _Bool start;
[e76acca]51 $proc op;
[30cb1de]52};
[109d05e]53
[30cb1de]54typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
55struct $cuda_op_state_node {
[e76acca]56 $cuda_op_state_t opState;
57 $cuda_op_state_node_t next;
[30cb1de]58};
[109d05e]59
[30cb1de]60typedef struct cudaStream* cudaStream_t;
61typedef struct $cuda_stream_node* $cuda_stream_node_t;
62struct cudaStream {
[e76acca]63 $cuda_op_state_node_t head;
64 $cuda_op_state_node_t tail;
[109d05e]65 int numOps;
66 $cuda_stream_node_t containingNode;
67 _Bool alive;
[30cb1de]68};
[109d05e]69
[30cb1de]70struct $cuda_stream_node {
71 cudaStream_t stream;
[109d05e]72 $cuda_stream_node_t prev;
73 $cuda_stream_node_t next;
[30cb1de]74};
[109d05e]75
76typedef struct $cuda_context {
[30cb1de]77 $cuda_stream_node_t head; //list of streams
[109d05e]78 int numStreams;
79} $cuda_context;
80
[0ffc6c8]81typedef struct $cuda_memcpy_data {
82 void* dst;
83 const void* src;
84 size_t count;
85 cudaMemcpyKind kind;
86} $cuda_memcpy_data;
87
88typedef struct $cuda_kernel_1_data {
89 dim3 gridDim;
90 dim3 blockDim;
91 size_t $cudaMemSize;
92 cudaStream_t $cudaStream;
93 const float* A;
94 const float* B;
95 float* C;
96 int numElements;
97} $cuda_kernel_1_data;
98
[6dd0322]99//////////////////////
100// Global Variables //
101//////////////////////
102
103$gcomm $cuda_gcomm = $gcomm_create($here, 2);
104const int $CUDA_PLACE_HOST = 0;
105const int $CUDA_PLACE_DEVICE = 1;
[0ffc6c8]106$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
[6dd0322]107
108/**
109 * Tags used for message-passing between host and device
110 */
[0ffc6c8]111
112enum $cuda_tag {
113 // Predefined tags
114 $CUDA_TAG_TEARDOWN,
115 $CUDA_TAG_SCOPE_REQUEST,
116 $CUDA_TAG_cudaFree,
117 $CUDA_TAG_cudaMemcpy,
118 $CUDA_TAG_cudaMemcpyAsync,
119 // Generated tags (by transformer)
120 $CUDA_TAG_LAUNCH_kernel_1
121};
[6dd0322]122
123////////////////////////////////////////////
124// CUDA API Functions (For Host-use Only) //
125////////////////////////////////////////////
126
[0ffc6c8]127/*
[6dd0322]128cudaError_t cudaMalloc(void** devPtr, size_t size) {
129 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
130 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
131 $message_unpack(response, devPtr, sizeof(void*));
[0ffc6c8]132
133 return cudaSuccess;
[6dd0322]134}
[0ffc6c8]135*/
[6dd0322]136
[0ffc6c8]137$scope $cuda_host_request_device_scope() {
138 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
139 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
140 $scope result;
141 $message_unpack(response, &result, sizeof($scope));
[6dd0322]142
[0ffc6c8]143 return result;
144}
145
146cudaError_t cudaFree(void* devPtr) {
147 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
148 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
149
150 return cudaSuccess;
151}
152
153void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
154 if (kind == cudaMemcpyHostToHost) {
[6dd0322]155 memcpy(dst, src, count);
156 } else {
[0ffc6c8]157 $cuda_memcpy_data args;
[6dd0322]158 args.dst = dst;
159 args.src = src;
160 args.count = count;
161 args.kind = kind;
[0ffc6c8]162
163 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
164
165 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
166 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
[6dd0322]167 }
[0ffc6c8]168}
169
170cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
171 $cuda_helper_host_memcpy(dst, src, count, kind, false);
[6dd0322]172 return cudaSuccess;
[109d05e]173}
174
[0ffc6c8]175cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
176 cudaMemcpyKind kind, cudaStream_t stream) {
177 $cuda_helper_host_memcpy(dst, src, count, kind, true);
178 return cudaSuccess;
[109d05e]179}
180
[30cb1de]181/**
182 * TODO:
183 * - test
184 * - atomic?
185 */
[0ffc6c8]186/*
[109d05e]187cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
188 // Create new stream node in linked list
189 $cuda_stream_node_t newHead = $create_new_stream_node();
190 newHead->next = $cuda_global_context.head;
191 $cuda_global_context.head->prev = newHead;
192
193 // Update cuda context's head to be the new node we created
194 $cuda_global_context.head = newHead;
195 $cuda_global_context.numStreams++;
196
[30cb1de]197 return cudaSuccess;
[109d05e]198}
[0ffc6c8]199*/
[109d05e]200
[30cb1de]201/**
202 * TODO:
203 * - test
204 * - atomic?
205 */
[0ffc6c8]206/*
[109d05e]207cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
208 stream = $default_stream_if_null(stream);
209 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
210 $when(stream->head == NULL) return cudaSuccess;
211}
[0ffc6c8]212*/
[109d05e]213
214// TODO: atomic
[0ffc6c8]215/*
[109d05e]216cudaError_t cudaStreamDestroy(cudaStream_t stream) {
217 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
[30cb1de]218 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
[109d05e]219 $destroy_stream_node(stream->containingNode);
220 return cudaSuccess;
221}
[0ffc6c8]222*/
[109d05e]223
[0ffc6c8]224/*
[109d05e]225cudaError_t cudaDeviceSynchronize() {
[30cb1de]226 $proc* opsToWaitOn;
[109d05e]227 int numOps = 0;
228
229 $atomic {
[30cb1de]230 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
231
[109d05e]232 for ($cuda_stream_node_t node = $cuda_global_context.head;
[30cb1de]233 node != NULL;
234 node = node->next) {
235 if (node->stream->tail != NULL) {
236 opsToWaitOn[numOps] = node->stream->tail->opState->op;
237 numOps++;
238 }
239 }
240 }
[e76acca]241 $waitall(opsToWaitOn, numOps);
[109d05e]242
[30cb1de]243 return cudaSuccess;
[109d05e]244}
[0ffc6c8]245*/
246
247void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream,
248 const float* A, const float* B, float* C, int numElements) {
249 $cuda_kernel_1_data args;
250 args.gridDim = gridDim;
251 args.blockDim = blockDim;
252 args.$cudaMemSize = $cudaMemSize;
253 args.$cudaStream = $cudaStream;
254 args.A = A;
255 args.B = B;
256 args.C = C;
257 args.numElements = numElements;
258
259 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
260 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
[109d05e]261}
262
[6dd0322]263/////////////////
264// CUDA "file" //
265/////////////////
266
267void _cuda_main() {
268
269 //////////////////////
270 // Device Variables //
271 //////////////////////
272
[0ffc6c8]273 $scope $cuda_scope = $here;
274
275 $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
[6dd0322]276 $cuda_context $cuda_global_context;
277 cudaStream_t $cuda_default_stream;
278
[0ffc6c8]279 /////////////////////////////////
280 // Context & Stream Management //
281 /////////////////////////////////
282
283 // Helper function to get the default stream if passed NULL, and just returns stream otherwise
284 cudaStream_t $default_stream_if_null(cudaStream_t stream) {
285 return stream == NULL ? $cuda_default_stream : stream;
286 }
287
288 $cuda_stream_node_t $create_new_stream_node() {
289 cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
290 newStream->head = NULL;
291 newStream->tail = NULL;
292 newStream->numOps = 0;
293 newStream->alive = true;
294
295 $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
296 newHead->stream = newStream;
297 newStream->containingNode = newHead;
298 newHead->prev = NULL;
299 newHead->next = NULL;
300
301 return newHead;
302 }
303
304 //@ depends_on \nothing;
305 $atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
306 $proc lastOpProc = $proc_null;
307 cudaStream_t stream = node->stream;
308
309 if (node->prev != NULL) {
310 node->prev->next = node->next;
311 }
312 if (node->next != NULL) {
313 node->next->prev = node->prev;
314 }
315 free(node);
316
317 stream->alive = false;
318 if(stream->tail != NULL)
319 lastOpProc = stream->tail->opState->op;
320
321 void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
322 $wait(lastOpProc);
323 free(stream);
324 }
325
326 return $spawn $destroy_stream_when_complete(lastOpProc, stream);
327 }
328
329 /**
330 * Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
331 *
332 * Reasoning behind using enqueuedFlag:
333 * + Enforces in the interface more explicitly that device proc shouldn't continue until new op state is created and properly filled out (including the $proc field)
334 * + Keeps symmetry since this method means the op does both enqueueing and dequeueing. The alternative technique would have device enqueuing and op dequeuing.
335 * + Reduces dependencies since device proc will not be manipulating the streams nor have direct access to the newly created op state.
336 * + Keeps responsibility of device proc strictly to interpreting messages, spawning appropriate ops and sending message.
337 */
338 //@ depends_on \nothing;
[7e3e7af]339 $atomic_f $cuda_op_state_t $stream_enqueue(_Bool** enqueuedFlag, cudaStream_t stream) {
[0ffc6c8]340 $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state));
341 newOpState->start = false;
342 newOpState->op = $self;
343
344 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state_node));
345 newOpStateNode->opState = newOpState;
346 newOpStateNode->next = NULL;
347
348 stream = $default_stream_if_null(stream);
349 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
350
351 if (stream->tail == NULL) {
352 stream->head = newOpStateNode;
353 stream->tail = newOpStateNode;
354 newOpState->start = true;
355 } else {
356 stream->tail->next = newOpStateNode;
357 stream->tail = newOpStateNode;
358 }
359 stream->numOps++;
[7e3e7af]360 **enqueuedFlag = true;
361 *enqueuedFlag = NULL;
[0ffc6c8]362
363 return newOpState;
364 }
365
366 //@ depends_on \nothing;
367 $atomic_f void $stream_dequeue(cudaStream_t stream) {
368 stream = $default_stream_if_null(stream);
369 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
370
371 if (stream->head == stream->tail) {
372 stream->tail = NULL;
373 }
374
375 $cuda_op_state_node_t oldHead = stream->head;
376 stream->head = oldHead->next;
377 if (stream->head != NULL) {
378 stream->head->opState->start = true;
379 }
380
381 stream->numOps--;
382 free(oldHead->opState);
383 free(oldHead);
384 }
385
[6dd0322]386 ///////////////////////////////
387 // CUDA Function Definitions //
388 ///////////////////////////////
[0ffc6c8]389
[6dd0322]390 /**
391 * Only called at start of program
392 */
393 void $cuda_setup() {
394 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
395 $cuda_default_stream = defaultStreamNode->stream;
396
397 $cuda_global_context.head = defaultStreamNode;
398 $cuda_global_context.numStreams = 1;
399 }
400
401 /**
402 * Only called at end of program
403 */
404 void $cuda_teardown() {
405 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
406 $wait(destructor);
[0ffc6c8]407 $comm_destroy($cuda_device_comm);
[6dd0322]408 }
409
[0ffc6c8]410 $message $cuda_free($message request) {
411 void* devPtr;
412 $message_unpack(request, &devPtr, sizeof(void*));
[cb46bb6]413 free($reveal(devPtr));
414 //free(devPtr);
[6dd0322]415
[0ffc6c8]416 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
[6dd0322]417 }
418
[0ffc6c8]419 void $cuda_memcpy_proc(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool* enqueuedFlag, cudaStream_t stream) {
[7e3e7af]420 $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, stream);
[6dd0322]421 $when(opState->start);
[0ffc6c8]422
423 if (kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
[cb46bb6]424 dst = $reveal(dst);
[0ffc6c8]425 }
426 if (kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
[cb46bb6]427 src = $reveal(src);
[0ffc6c8]428 }
[6dd0322]429 memcpy(dst, src, count);
[0ffc6c8]430
[6dd0322]431 $stream_dequeue(stream);
432 }
433
[0ffc6c8]434 $message $cuda_memcpy($message request, _Bool async) {
435 $cuda_memcpy_data args;
436 $message_unpack(request, &args, sizeof($cuda_memcpy_data));
[6dd0322]437
438 _Bool enqueuedFlag = false;
[0ffc6c8]439 $proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, args.kind, &enqueuedFlag, $cuda_default_stream);
[6dd0322]440 $when(enqueuedFlag);
[0ffc6c8]441
442 if (!async && args.kind != cudaMemcpyDeviceToDevice) {
[6dd0322]443 $wait(memcpyProc);
444 }
[0ffc6c8]445 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
[6dd0322]446
[0ffc6c8]447 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
[6dd0322]448 }
449
450 ////////////////////////
451 // Kernel Definitions //
452 ////////////////////////
[0ffc6c8]453
454 // Helper function
455 int $dim3_index(dim3 size, uint3 location) {
456 return location.x + size.x * (location.y + size.y * location.z);
457 }
458
459 // Helper function
460 int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
461 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
462 }
463
464 void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
465 //TODO: calculate length and index, replace this function in the kernel
466 $local_start();
467 int length = dim.x * dim.y * dim.z;
468 $proc proc_array[length];
469 $range rx = 0 .. dim.x - 1;
470 $range ry = 0 .. dim.y - 1;
471 $range rz = 0 .. dim.z - 1;
472 $domain(3) dom = ($domain(3)){rx, ry, rz};
473 $for(int x,y,z : dom){
474 uint3 id = { x, y, z };
475 int index = $dim3_index(dim, id);
476 proc_array[index] = $spawn spawningFunction(id);
477 }
478 $local_end();
479 $waitall(proc_array,length);
480 }
[6dd0322]481
482 // Generated from kernel_1 definition
[0ffc6c8]483 void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
[6dd0322]484 const float *A, const float *B, float *C, int numElements) {
485 void _cuda_block(uint3 blockIdx) {
486 int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
487 $scope _block_root = $here;
488 $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
489 void _cuda_thread(uint3 threadIdx) {
490 int _cuda_tid = $dim3_index(blockDim, threadIdx);
491 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
492 $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
493 $local_start();
494 // Kernel definition start
495
496 int i = blockDim.x * blockIdx.x + threadIdx.x;
497
498 if (i < numElements)
499 {
500 C[i] = A[i] + B[i];
501 }
502
503 // Kernel definition end
504 $local_end();
505 $barrier_destroy(_cuda_thread_barrier);
506 }
507 $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
508 $gbarrier_destroy(_cuda_block_barrier);
509 }
510 $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
511 }
[0ffc6c8]512
513 void $cuda_kernel_1_proc (_Bool* enqueuedFlag, dim3 gridDim, dim3 blockDim,
514 size_t $cudaMemSize, cudaStream_t $cudaStream,
515 const float *A, const float *B, float *C, int numElements) {
[7e3e7af]516 $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, $cudaStream);
[0ffc6c8]517 $when(opState->start);
518 $cuda_kernel_1(gridDim, blockDim, $cudaMemSize, A, B, C, numElements);
519 $stream_dequeue($cudaStream);
520 }
521
522 $message $cuda_device_launch_kernel_1($message request) {
523 $cuda_kernel_1_data args;
524 $message_unpack(request, &args, sizeof($cuda_kernel_1_data));
525
526 _Bool enqueuedFlag = false;
[6698ce0]527 $spawn $cuda_kernel_1_proc(&enqueuedFlag, args.gridDim, args.blockDim, args.$cudaMemSize, args.$cudaStream, $reveal(args.A), $reveal(args.B), $reveal(args.C), args.numElements);
[0ffc6c8]528 $when(enqueuedFlag);
529
530 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_LAUNCH_kernel_1, NULL, 0);
531 }
[6dd0322]532
533 /////////////////
534 // Device main //
535 /////////////////
536
537 $cuda_setup();
538
539 while (true) {
540 $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
541 $message response;
[0ffc6c8]542 const int tag = $message_tag(request);
[6dd0322]543
544 switch(tag) {
[0ffc6c8]545 case $CUDA_TAG_SCOPE_REQUEST :
546 response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
547 break;
548 case $CUDA_TAG_cudaFree :
549 response = $cuda_free(request);
[6dd0322]550 break;
551 case $CUDA_TAG_cudaMemcpy :
[0ffc6c8]552 response = $cuda_memcpy(request, false);
553 break;
554 case $CUDA_TAG_cudaMemcpyAsync :
555 response = $cuda_memcpy(request, true);
[6dd0322]556 break;
557 case $CUDA_TAG_LAUNCH_kernel_1 :
[0ffc6c8]558 response = $cuda_device_launch_kernel_1(request);
559 break;
560 case $CUDA_TAG_TEARDOWN :
561 $cuda_teardown();
562 return;
[6dd0322]563 default :
564 $assert(false, "Unknown CUDA request");
565 }
566
567 $comm_enqueue($cuda_device_comm, response);
568 }
569}
570
571///////////////
572// Host file //
573///////////////
574
[4bd0090]575$input int N;
576$assume (N > 0);
577$input float A[N];
578$input float B[N];
579
[6dd0322]580void _host_main() {
[4bd0090]581 int size = N * sizeof(float);
[874f9d1]582 int numBlocks = 2;
583 int numThreads = N%2 == 0? N/2 : (N+1)/2;
[4bd0090]584
[874f9d1]585 float* cuda_A;
[0ffc6c8]586 // cudaMalloc((void **)&cuda_A, size);
[4bd0090]587 {
[0ffc6c8]588 $scope deviceScope = $cuda_host_request_device_scope();
[cb46bb6]589 cuda_A = $hide((float*)$malloc(deviceScope, size));
590 //cuda_A = (float*)$malloc(deviceScope, size);
[4bd0090]591 }
[874f9d1]592 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
[4bd0090]593
[874f9d1]594 float* cuda_B;
[4bd0090]595 // cudaMalloc((void **)&cuda_B, size);
[30cb1de]596 {
[0ffc6c8]597 $scope deviceScope = $cuda_host_request_device_scope();
[cb46bb6]598 cuda_B = $hide((float*)$malloc(deviceScope, size));
599 //cuda_B = (float*)$malloc(deviceScope, size);
[4bd0090]600 }
[874f9d1]601 cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
[4bd0090]602
[874f9d1]603 float* cuda_C;
[4bd0090]604 // cudaMalloc((void **)&cuda_C, size);
605 {
[0ffc6c8]606 $scope deviceScope = $cuda_host_request_device_scope();
[cb46bb6]607 cuda_C = $hide((float*)$malloc(deviceScope, size));
608 //cuda_C = (float*)$malloc(deviceScope, size);
[4bd0090]609 }
610
[0ffc6c8]611 dim3 gridDim = {numBlocks, 1, 1};
612 dim3 blockDim = {numThreads, 1, 1};
613 // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
614 $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
[874f9d1]615
616 //Checking correctness
617 float* C = (float *)malloc(size);
618
619 cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
620
621 for(int i = 0; i < N; i++)
622 $assert(C[i] == A[i] + B[i]);
623
624 free(C);
625
[7e3e7af]626 cudaFree(cuda_A);
[0ffc6c8]627 cudaFree(cuda_B);
628 cudaFree(cuda_C);
[6dd0322]629
630 // inserted by transformer
631 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
[0ffc6c8]632 $comm_destroy($cuda_host_comm);
[109d05e]633}
634
635int main() {
[6dd0322]636 $proc host = $spawn _host_main();
637 $proc cuda = $spawn _cuda_main();
638 $wait(host);
639 $wait(cuda);
[0ffc6c8]640 $gcomm_destroy($cuda_gcomm, NULL);
[e76acca]641}
Note: See TracBrowser for help on using the repository browser.