source: CIVL/examples/cuda/newCudaMockup.cvl@ 53f9d15

1.23 2.0 main test-branch
Last change on this file since 53f9d15 was 874f9d1, checked in by Zane Greenholt <zgrnhlt@…>, 3 years ago

Added vecAdd example to newCudaMockup

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5715 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 11.2 KB
RevLine 
[29eb398]1/**
2 * TODO:
3 * - implement cudaMemset and cudaMemsetAsync
4 * - flesh out basic structure of cuda kernel:
5 * - spawn gridDim blocks
6 * - spawn blockDim threads
7 * Alternatively, spawn blockDim warps and then spawn warp's threads
8 * - wait for blocks to finish
9 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
10 * - Add in data race checking support and implement atomicAdd for integers
[874f9d1]11 * - Handle dependencies at atomic blocks (replace some with local blocks if possible)
12 * - Create device scope for cudaMallocs
[29eb398]13 */
14
[30cb1de]15#include <concurrency.cvh>
16#include <stdlib.h>
[874f9d1]17#include <stdio.h>
[30cb1de]18#include <stdbool.h>
19#include <string.h>
20
[109d05e]21enum cudaError {
22 cudaSuccess
23};
24typedef enum cudaError cudaError_t;
25
26typedef enum cudaMemcpyKind {
27 cudaMemcpyHostToHost,
28 cudaMemcpyHostToDevice,
29 cudaMemcpyDeviceToHost,
30 cudaMemcpyDeviceToDevice,
31 cudaMemcpyDefault
32} cudaMemcpyKind;
33
[30cb1de]34typedef struct {
35 unsigned int x, y, z;
36} dim3;
37
38/* used to represent a location in a three dimensional grid
39 */
40typedef struct {
41 unsigned int x, y, z;
42} uint3;
43
44typedef struct $cuda_op_state* $cuda_op_state_t;
45struct $cuda_op_state {
[109d05e]46 _Bool start;
[e76acca]47 $proc op;
[30cb1de]48};
[109d05e]49
[30cb1de]50typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
51struct $cuda_op_state_node {
[e76acca]52 $cuda_op_state_t opState;
53 $cuda_op_state_node_t next;
[30cb1de]54};
[109d05e]55
[30cb1de]56typedef struct cudaStream* cudaStream_t;
57typedef struct $cuda_stream_node* $cuda_stream_node_t;
58struct cudaStream {
[e76acca]59 $cuda_op_state_node_t head;
60 $cuda_op_state_node_t tail;
[109d05e]61 int numOps;
62 $cuda_stream_node_t containingNode;
63 _Bool alive;
[30cb1de]64};
[109d05e]65cudaStream_t $cuda_default_stream;
66
[30cb1de]67struct $cuda_stream_node {
68 cudaStream_t stream;
[109d05e]69 $cuda_stream_node_t prev;
70 $cuda_stream_node_t next;
[30cb1de]71};
[109d05e]72
73typedef struct $cuda_context {
[30cb1de]74 $cuda_stream_node_t head; //list of streams
[109d05e]75 int numStreams;
76} $cuda_context;
77$cuda_context $cuda_global_context;
78
79// Helper function to get the default stream if passed NULL, and just returns stream otherwise
80cudaStream_t $default_stream_if_null(cudaStream_t stream) {
81 return stream == NULL ? $cuda_default_stream : stream;
82}
83
84$cuda_stream_node_t $create_new_stream_node() {
85 cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
86 newStream->head = NULL;
87 newStream->tail = NULL;
88 newStream->numOps = 0;
89 newStream->alive = true;
90
91 $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
92 newHead->stream = newStream;
93 newStream->containingNode = newHead;
94 newHead->prev = NULL;
95 newHead->next = NULL;
[30cb1de]96
97 return newHead;
[109d05e]98}
99
[30cb1de]100/**
101 * TODO:
102 * - test
103 * - atomic?
104 */
[109d05e]105cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
106 // Create new stream node in linked list
107 $cuda_stream_node_t newHead = $create_new_stream_node();
108 newHead->next = $cuda_global_context.head;
109 $cuda_global_context.head->prev = newHead;
110
111 // Update cuda context's head to be the new node we created
112 $cuda_global_context.head = newHead;
113 $cuda_global_context.numStreams++;
114
[30cb1de]115 return cudaSuccess;
[109d05e]116}
117
[30cb1de]118/**
119 * TODO:
120 * - test
121 * - atomic?
122 */
[109d05e]123cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
124 stream = $default_stream_if_null(stream);
125 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
126 $when(stream->head == NULL) return cudaSuccess;
127}
128
129$proc $destroy_stream_node($cuda_stream_node_t node) {
[874f9d1]130 $proc lastOpProc = $proc_null;
131 cudaStream_t stream = node->stream;
[30cb1de]132
[874f9d1]133 $local_start();
[30cb1de]134 if (node->prev != NULL) {
135 node->prev->next = node->next;
136 }
137 if (node->next != NULL) {
138 node->next->prev = node->prev;
139 }
140 free(node);
141
142 stream->alive = false;
143 if(stream->tail != NULL)
144 lastOpProc = stream->tail->opState->op;
[874f9d1]145 $local_end();
[109d05e]146
[30cb1de]147 void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
148 $wait(lastOpProc);
149 free(stream);
[109d05e]150 }
[30cb1de]151
152 return $spawn $destroy_stream_when_complete(lastOpProc, stream);
[109d05e]153}
154
155// TODO: atomic
156cudaError_t cudaStreamDestroy(cudaStream_t stream) {
157 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
[30cb1de]158 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
[109d05e]159 $destroy_stream_node(stream->containingNode);
160 return cudaSuccess;
161}
162
[30cb1de]163/**
164 * Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
165 */
166$cuda_op_state_t $stream_enqueue(_Bool* enqueuedFlag, cudaStream_t stream) {
167 $cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
168 newOpState->start = false;
169 newOpState->op = $self;
[109d05e]170
[30cb1de]171 $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof(struct $cuda_op_state_node));
172 newOpStateNode->opState = newOpState;
173 newOpStateNode->next = NULL;
174
[874f9d1]175 $local_start();
176 stream = $default_stream_if_null(stream);
177 $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
[109d05e]178
[874f9d1]179 if (stream->tail == NULL) {
180 stream->head = newOpStateNode;
181 stream->tail = newOpStateNode;
182 newOpState->start = true;
183 } else {
184 stream->tail->next = newOpStateNode;
185 stream->tail = newOpStateNode;
[30cb1de]186 }
[874f9d1]187 stream->numOps++;
188 *enqueuedFlag = true;
189 $local_end();
[30cb1de]190
191 return newOpState;
[109d05e]192}
193
194void $stream_dequeue(cudaStream_t stream) {
195 stream = $default_stream_if_null(stream);
196 $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
[30cb1de]197
[874f9d1]198 $local_start();
[30cb1de]199 if (stream->head == stream->tail) {
200 stream->tail = NULL;
201 }
[109d05e]202
[30cb1de]203 $cuda_op_state_node_t oldHead = stream->head;
204 stream->head = oldHead->next;
205 if (stream->head != NULL) {
206 stream->head->opState->start = true;
207 }
[109d05e]208
[30cb1de]209 stream->numOps--;
210 free(oldHead->opState);
211 free(oldHead);
[874f9d1]212 $local_end();
[109d05e]213}
214
[30cb1de]215void $cuda_memcpy_proc(void* dst, const void* src, size_t count, _Bool* enqueuedFlag, cudaStream_t stream) {
216 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, stream);
217 $when(opState->start);
[109d05e]218 memcpy(dst, src, count);
219 $stream_dequeue(stream);
220}
221
222cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
[30cb1de]223 if (kind == cudaMemcpyHostToHost) {
[29eb398]224 memcpy(dst, src, count);
[109d05e]225 } else {
[30cb1de]226 _Bool enqueuedFlag = false;
227 $proc memcpyProc = $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, $cuda_default_stream);
228 $when(enqueuedFlag);
[e76acca]229 if (kind != cudaMemcpyDeviceToDevice) {
[109d05e]230 $wait(memcpyProc);
231 }
232 }
233
234 return cudaSuccess;
235}
236
237cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
238 cudaMemcpyKind kind, cudaStream_t stream) {
[e76acca]239 if (kind == cudaMemcpyHostToHost) {
[29eb398]240 memcpy(dst, src, count);
[109d05e]241 } else {
[30cb1de]242 _Bool enqueuedFlag = false;
243 $spawn $cuda_memcpy_proc(dst, src, count, &enqueuedFlag, stream);
244 $when(enqueuedFlag);
[109d05e]245 }
246
247 return cudaSuccess;
248}
249
250cudaError_t cudaDeviceSynchronize() {
[30cb1de]251 $proc* opsToWaitOn;
[109d05e]252 int numOps = 0;
253
254 $atomic {
[30cb1de]255 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
256
[109d05e]257 for ($cuda_stream_node_t node = $cuda_global_context.head;
[30cb1de]258 node != NULL;
259 node = node->next) {
260 if (node->stream->tail != NULL) {
261 opsToWaitOn[numOps] = node->stream->tail->opState->op;
262 numOps++;
263 }
264 }
265 }
[e76acca]266 $waitall(opsToWaitOn, numOps);
[109d05e]267
[30cb1de]268 return cudaSuccess;
[109d05e]269}
270
[30cb1de]271/**
272 * Only called at start of program
273 */
[109d05e]274void $cuda_setup() {
[30cb1de]275 $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
276 $cuda_default_stream = defaultStreamNode->stream;
[109d05e]277
[30cb1de]278 $cuda_global_context.head = defaultStreamNode;
279 $cuda_global_context.numStreams = 1;
[109d05e]280}
281
[30cb1de]282/**
283 * Only called at end of program
284 */
[109d05e]285void $cuda_teardown() {
[30cb1de]286 $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
287 $wait(destructor);
288}
289
290// Helper function
291int $dim3_index(dim3 size, uint3 location) {
292 return location.x + size.x * (location.y + size.y * location.z);
293}
294
295// Helper function
296int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
297 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
298}
299
300void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
301 //TODO: calculate length and index, replace this function in the kernel
[874f9d1]302 //$local_start();
[30cb1de]303 int length = dim.x * dim.y * dim.z;
304 $proc proc_array[length];
305 $range rx = 0 .. dim.x - 1;
306 $range ry = 0 .. dim.y - 1;
307 $range rz = 0 .. dim.z - 1;
308 $domain(3) dom = ($domain){rx, ry, rz};
[874f9d1]309 #For some reason there is depends on all here
[30cb1de]310 $for(int x,y,z : dom){
311 uint3 id = { x, y, z };
312 int index = $dim3_index(dim, id);
313 proc_array[index] = $spawn spawningFunction(id);
314 }
[874f9d1]315 //$local_end();
[30cb1de]316 $waitall(proc_array,length);
[109d05e]317}
318
319void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
[4bd0090]320 const float *A, const float *B, float *C, int numElements) {
[30cb1de]321 void _cuda_block(uint3 blockIdx) {
322 int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
323 $scope _block_root = $here;
324 $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
325 void _cuda_thread(uint3 threadIdx) {
326 int _cuda_tid = $dim3_index(blockDim, threadIdx);
327 int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
328 $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
[4bd0090]329 $local_start();
330 // Kernel definition start
331
332 int i = blockDim.x * blockIdx.x + threadIdx.x;
333
334 if (i < numElements)
335 {
336 C[i] = A[i] + B[i];
337 }
338
339 // Kernel definition end
340 $local_end();
[30cb1de]341 $barrier_destroy(_cuda_thread_barrier);
342 }
343 $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
344 $gbarrier_destroy(_cuda_block_barrier);
345 }
346 $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
[109d05e]347}
[30cb1de]348
[e76acca]349void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
[4bd0090]350 _Bool* enqueuedFlag, cudaStream_t _cudaStream,
351 const float *A, const float *B, float *C, int numElements) {
[30cb1de]352 $cuda_op_state_t opState = $stream_enqueue(enqueuedFlag, _cudaStream);
353 $when(opState->start);
[4bd0090]354 _cuda_kernel_1(gridDim, blockDim, _cudaMemSize, A, B, C, numElements);
[30cb1de]355 $stream_dequeue(_cudaStream);
[109d05e]356}
357
[4bd0090]358$input int N;
359$assume (N > 0);
360$input float A[N];
361$input float B[N];
362
[109d05e]363void _civl_main() {
[4bd0090]364 int size = N * sizeof(float);
[874f9d1]365 int numBlocks = 2;
366 int numThreads = N%2 == 0? N/2 : (N+1)/2;
[4bd0090]367
[874f9d1]368 float* cuda_A;
[4bd0090]369 // cudaMalloc((void **)&cuda_A, size);
370 {
371 cuda_A = (float *) malloc(size);
372 }
[874f9d1]373 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
[4bd0090]374
[874f9d1]375 float* cuda_B;
[4bd0090]376 // cudaMalloc((void **)&cuda_B, size);
[30cb1de]377 {
[4bd0090]378 cuda_B = (float *) malloc(size);
379 }
[874f9d1]380 cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
[4bd0090]381
[874f9d1]382 float* cuda_C;
[4bd0090]383 // cudaMalloc((void **)&cuda_C, size);
384 {
385 cuda_C = (float *) malloc(size);
386 }
387
[874f9d1]388 { // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
389 dim3 gridDim = {numBlocks, 1, 1};
390 dim3 blockDim = {numThreads, 1, 1};
[30cb1de]391 _Bool enqueuedFlag = false;
[4bd0090]392 $spawn $proc_kernel_1(gridDim, blockDim, 0, &enqueuedFlag, NULL, cuda_A, cuda_B, cuda_C, N);
[30cb1de]393 $when(enqueuedFlag);
394 }
[874f9d1]395
396 //Checking correctness
397 float* C = (float *)malloc(size);
398
399 cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
400
401 for(int i = 0; i < N; i++)
402 $assert(C[i] == A[i] + B[i]);
403
404 free(C);
405
406 //cudaFree(cuda_A);...
407 free(cuda_A);
408 free(cuda_B);
409 free(cuda_C);
[109d05e]410}
411
412int main() {
[30cb1de]413 $cuda_setup();
414 _civl_main();
415 $cuda_teardown();
[e76acca]416}
Note: See TracBrowser for help on using the repository browser.