source: CIVL/examples/cuda/newCudaMockup.cvl

main
Last change on this file was 9dbe9864, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5890 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 67.4 KB
RevLine 
[30cb1de]1#include <concurrency.cvh>
[6dd0322]2#include <comm.cvh>
[30cb1de]3#include <stdlib.h>
[874f9d1]4#include <stdio.h>
[30cb1de]5#include <stdbool.h>
6#include <string.h>
[9cabba4]7#include <mem.cvh>
[66ea4f9]8#pragma CIVL ACSL
[30cb1de]9
[6dd0322]10///////////
11// Types //
12///////////
13
/* Minimal CUDA error-code enumeration for this mockup: only the
 * success value is modeled; all mocked API calls report cudaSuccess. */
enum cudaError {
    cudaSuccess
};
typedef enum cudaError cudaError_t;
18
[9cabba4]19//typedef $cuda_memcpy_kind cudaMemcpyKind;
[109d05e]20
/* Dimensions of a grid or block in up to three dimensions, mirroring
 * CUDA's dim3 (unused dimensions are expected to be 1). */
typedef struct {
    unsigned int x, y, z;
} dim3;

/* used to represent a location in a three dimensional grid
 */
typedef struct {
    unsigned int x, y, z;
} uint3;
30
/* Direction of a cudaMemcpy transfer, mirroring the CUDA runtime's
 * cudaMemcpyKind enumeration (enumerator order matches the real API). */
typedef enum {
    cudaMemcpyHostToHost,
    cudaMemcpyHostToDevice,
    cudaMemcpyDeviceToHost,
    cudaMemcpyDeviceToDevice,
    cudaMemcpyDefault
} cudaMemcpyKind;
38
/* State of one asynchronous CUDA operation: the CIVL process executing
 * it and a flag saying whether it has been started. */
typedef struct $cuda_op_state* $cuda_op_state_t;
typedef struct $cuda_op_state {
    _Bool start;   // has this operation been started?
    $proc op;      // CIVL process carrying out the operation
} $cuda_op_state;

/* Singly-linked list node holding one operation state; streams chain
 * their queued operations through these nodes. */
typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
typedef struct $cuda_op_state_node {
    $cuda_op_state_t opState;
    $cuda_op_state_node_t next;
} $cuda_op_state_node;
[109d05e]50
/* A mock CUDA stream: a FIFO queue of operation states plus liveness
 * bookkeeping.  cudaStream_t is an alias for a pointer to this struct. */
typedef struct $cuda_stream_node* $cuda_stream_node_t;
typedef struct $cuda_stream* $cuda_stream_t;
typedef $cuda_stream_t cudaStream_t;
typedef struct $cuda_stream {
    $cuda_op_state_node_t head;           // oldest queued operation
    $cuda_op_state_node_t tail;           // newest queued operation
    int numOps;                           // number of queued operations
    $cuda_stream_node_t containingNode;   // back-pointer into the context's stream list
    _Bool alive;                          // false once the stream is destroyed
} $cuda_stream;

/* Doubly-linked list node used by the context to track its streams. */
typedef struct $cuda_stream_node{
    cudaStream_t stream;
    $cuda_stream_node_t prev;
    $cuda_stream_node_t next;
} $cuda_stream_node;
[109d05e]67
/* Per-device context: the doubly-linked list of live streams and its
 * length. */
typedef struct $cuda_context* $cuda_context_t;
typedef struct $cuda_context {
    $cuda_stream_node_t head;   // first node of the stream list
    int numStreams;             // number of streams in the list
} $cuda_context;
73
/* Record of one thread's memory footprint: the sets of locations read
 * and written, used by the data-race checks below. */
typedef struct $cuda_mem_set* $cuda_mem_set_t;
typedef struct $cuda_mem_set {
    $mem reads;    // locations read
    $mem writes;   // locations written
} $cuda_mem_set;
79
80void $cuda_mem_set_clear($cuda_mem_set_t memSet) {
81 $mem emptyset = $mem_empty();
82 memSet->reads = emptyset;
83 memSet->writes = emptyset;
84}
[6dd0322]85
[9dbe9864]86$cuda_mem_set_t $cuda_mem_set_create($scope scope) {
87 $cuda_mem_set_t newMemSet = ($cuda_mem_set_t) $malloc(scope, sizeof($cuda_mem_set));
[6dd0322]88
[9dbe9864]89 $cuda_mem_set_clear(newMemSet);
[6dd0322]90
[9dbe9864]91 return newMemSet;
[0ffc6c8]92}
93
/* Release a record previously created by $cuda_mem_set_create. */
void $cuda_mem_set_destroy($cuda_mem_set_t memSet) {
    free(memSet);
}
97
[9dbe9864]98void $cuda_mem_set_add($cuda_mem_set_t memSet, $mem reads, $mem writes) {
99 memSet->reads = $mem_union(memSet->reads, reads);
100 memSet->writes = $mem_union(memSet->writes, writes);
[a67c590]101}
102
/* Assert that the footprints of two threads do not race: m1's reads
 * must not intersect m2's writes, and vice versa, and their write sets
 * must be disjoint.  (b,w,l) triples identify the two threads as
 * <block, warp, lane> for the diagnostic messages.  On intersection,
 * $mem_no_intersect reports a witness location through out1/out2. */
void $cuda_check_mems_disjoint($cuda_mem_set_t m1, int b1, int w1, int l1,
        $cuda_mem_set_t m2, int b2, int w2, int l2) {
    $mem out1 = $mem_empty();
    $mem out2 = $mem_empty();

    // read/write conflict
    $assert($mem_no_intersect(m1->reads, m2->writes, &out1, &out2),
        "Data-race detected: %p read by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
        out1, b1, w1, l1, out2, b2, w2, l2);
    // write/read conflict
    $assert($mem_no_intersect(m1->writes, m2->reads, &out1, &out2),
        "Data-race detected: %p written by thread <%d, %d, %d> intersects %p read by thread <%d, %d, %d>\n",
        out1, b1, w1, l1, out2, b2, w2, l2);
    // write/write conflict
    $assert($mem_no_intersect(m1->writes, m2->writes, &out1, &out2),
        "Data-race detected: %p written by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
        out1, b1, w1, l1, out2, b2, w2, l2);
}
118
/* Tag identifying which warp-collective operation a warp is currently
 * executing; EMPTY means no collective is in progress.  Also used as
 * the message tag on the warp's communicator. */
typedef enum {
    $CUDA_WARP_TAG_EMPTY,
    $CUDA_WARP_TAG_warpsync,
    $CUDA_WARP_TAG_shfl_sync,
    $CUDA_WARP_TAG_shfl_up_sync,
    $CUDA_WARP_TAG_shfl_down_sync,
    $CUDA_WARP_TAG_shfl_xor_sync,
    $CUDA_WARP_TAG_all_sync,
    $CUDA_WARP_TAG_any_sync,
    $CUDA_WARP_TAG_ballot_sync
} $cuda_warp_tag;
[a67c590]130
/* Lanes per warp; a CIVL $input symbol so verification can vary it,
 * defaulting to the hardware value 32. */
$input int warpSize = 32;
[a67c590]132
/* Shared bookkeeping for one warp of `size` lanes. */
typedef struct $cuda_warp_data* $cuda_warp_data_t;
typedef struct $cuda_warp_data {
    int size;                     // number of lanes in this warp
    // size-by-size matrix of footprint records; memSets[i][i] is lane i's
    // own footprint (off-diagonal entries are filled during "patching" —
    // see $cuda_update_patches).
    $cuda_mem_set_t** memSets;
    _Bool patching[];             // per lane: currently collecting patches?
    int num_alive;                // lanes that have not yet exited
    _Bool alive[];                // per-lane liveness flags
    int num_in_barrier;           // lanes currently waiting in the warp barrier
    _Bool in_barrier[];           // per-lane in-barrier flags
    $cuda_warp_tag currOp;        // collective currently in progress (or EMPTY)
    int reductionLane;            // lane acting as reducer, or -1 if none
    $gcomm gcomm;                 // global communicator for lane exchanges
} $cuda_warp_data;
[a67c590]146
/*@ depends_on \access(warp);
  @ executes_when \true;
  @ */
/* Release the warp barrier once every live lane has entered: reset the
 * barrier counters, mark no collective in progress, and clear each
 * lane's own (diagonal) footprint record.  Runs atomically. */
$atomic_f void $cuda_warp_barrier_update($cuda_warp_data_t warp) {
    if (warp->num_in_barrier == warp->num_alive) {
        warp->num_in_barrier = 0;
        warp->currOp = $CUDA_WARP_TAG_EMPTY;
        for (int i = 0; i < warp->size; i++) {
            warp->in_barrier[i] = $false;
            $cuda_mem_set_clear(warp->memSets[i][i]);
        }
    }
}
160
[9dbe9864]161$cuda_warp_data_t $create_cuda_warp_data($scope warpScope, int size) {
162 $cuda_warp_data_t newWarp = ($cuda_warp_data_t) malloc(sizeof($cuda_warp_data));
163 newWarp->size = size;
164 newWarp->memSets = ($cuda_mem_set_t**) $malloc(warpScope, size * sizeof($cuda_mem_set_t*));
165 for (int i = 0; i < size; i++) {
166 newWarp->memSets[i] = ($cuda_mem_set_t*) $malloc(warpScope, size * sizeof($cuda_mem_set_t));
167 for (int j = 0; j < size; j++) {
168 newWarp->memSets[i][j] = $cuda_mem_set_create(warpScope);
169 }
170 }
171 newWarp->patching = (_Bool[size])$lambda(int i) $false;
172 newWarp->num_alive = size;
173 newWarp->alive = (_Bool[size])$lambda(int i) $true;
174 newWarp->num_in_barrier = 0;
175 newWarp->in_barrier = (_Bool[size])$lambda(int i) $false;
176 newWarp->currOp = $CUDA_WARP_TAG_EMPTY;
177 newWarp->reductionLane = -1;
178 newWarp->gcomm = $gcomm_create(warpScope, size);
179
180 return newWarp;
[a67c590]181}
182
/* Tear down a warp record created by $create_cuda_warp_data: every
 * footprint record, the matrix rows, the matrix itself, the global
 * communicator, and finally the record. */
void $destroy_cuda_warp_data($cuda_warp_data_t warp) {
    $assert(warp != NULL, "Attempt to destroy a NULL warp");

    for (int i = 0; i < warp->size; i++) {
        for (int j = 0; j < warp->size; j++) {
            $cuda_mem_set_destroy(warp->memSets[i][j]);
        }
        free(warp->memSets[i]);
    }
    free(warp->memSets);
    $gcomm_destroy(warp->gcomm, NULL);
    free(warp);
}
196
/* Shared bookkeeping for one thread block of `size` threads, divided
 * into numWarps warps. */
typedef struct $cuda_block_data* $cuda_block_data_t;
typedef struct $cuda_block_data {
    int size;                  // threads per block
    $cuda_mem_set_t* memSets;  // one footprint record per thread in the block
    int predArray[];           // scratch array for __syncthreads_* reductions
    $gbarrier gbarrier;        // block-wide barrier
    int currBarrierID;         // source index of the barrier in progress; -1 if none
    int numWarps;              // number of warps in the block
    $cuda_warp_data_t* warps;  // per-warp records
} $cuda_block_data;
[a67c590]207
/* Create the bookkeeping record for a block of `size` threads in the
 * given scope.  All warps are full (warpSize lanes) except possibly the
 * last, which holds the remaining ((size-1) % warpSize) + 1 lanes. */
$cuda_block_data_t $create_cuda_block_data($scope scope, int size) {
    $cuda_block_data_t newBlock = ($cuda_block_data_t) $malloc(scope, sizeof($cuda_block_data));

    newBlock->size = size;
    // ceiling division: number of warps needed to cover `size` threads
    newBlock->numWarps = (size - 1)/warpSize + 1;

    newBlock->predArray = (int[size]) $lambda(int i) 0;

    newBlock->gbarrier = $gbarrier_create(scope, size);
    newBlock->currBarrierID = -1;   // no __syncthreads in progress

    newBlock->memSets = ($cuda_mem_set_t*) $malloc(scope, size * sizeof($cuda_mem_set_t));
    for (int i = 0; i < size; i++) {
        newBlock->memSets[i] = $cuda_mem_set_create(scope);
    }
    newBlock->warps = ($cuda_warp_data_t*) $malloc(scope, newBlock->numWarps * sizeof($cuda_warp_data_t));
    // all but the last warp are full
    for (int i = 0; i < newBlock->numWarps - 1; i++) {
        newBlock->warps[i] = $create_cuda_warp_data(scope, warpSize);
    }
    // the last warp holds whatever lanes remain
    int lastIndex = newBlock->numWarps - 1;
    newBlock->warps[lastIndex] = $create_cuda_warp_data(scope, ((size - 1) % warpSize) + 1);

    return newBlock;
}
232
/* Tear down a block record: its per-thread footprints, its warps, its
 * barrier, and finally the record itself. */
void $destroy_cuda_block_data($cuda_block_data_t block) {
    for (int i = 0; i < block->size; i++) {
        $cuda_mem_set_destroy(block->memSets[i]);
    }
    free(block->memSets);

    for (int i = 0; i < block->numWarps; i++) {
        $destroy_cuda_warp_data(block->warps[i]);
    }
    free(block->warps);

    $gbarrier_destroy(block->gbarrier);

    free(block);
}
248
/* Shared bookkeeping for one kernel launch: per-thread footprint
 * records across the whole grid, plus per-block records. */
typedef struct $cuda_kernel_data* $cuda_kernel_data_t;
typedef struct $cuda_kernel_data {
    int size;                   // total threads in the grid
    $cuda_mem_set_t* memSets;   // one footprint record per thread in the grid
    int numBlocks;              // blocks in the grid
    $cuda_block_data_t* blocks; // per-block records
} $cuda_kernel_data;
[a67c590]256
/* Create the bookkeeping record for a kernel launch with the given grid
 * and block dimensions; the 3-D shapes are flattened to counts. */
$cuda_kernel_data_t $create_cuda_kernel_data($scope scope, dim3 gridDim, dim3 blockDim){
    int numBlocks = (gridDim.x * gridDim.y) * gridDim.z;
    int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z;

    $cuda_kernel_data_t newKernel = ($cuda_kernel_data_t)$malloc(scope, sizeof($cuda_kernel_data));

    newKernel->size = threadsPerBlock * numBlocks;
    newKernel->numBlocks = numBlocks;

    // one footprint record per thread in the grid
    newKernel->memSets = ($cuda_mem_set_t*) $malloc(scope, newKernel->size * sizeof($cuda_mem_set_t));
    for (int i = 0; i < newKernel->size; i++) {
        newKernel->memSets[i] = $cuda_mem_set_create(scope);
    }

    newKernel->blocks = ($cuda_block_data_t*) $malloc(scope, newKernel->numBlocks * sizeof($cuda_block_data_t));
    for (int i = 0; i < newKernel->numBlocks; i++) {
        newKernel->blocks[i] = $create_cuda_block_data(scope, threadsPerBlock);
    }

    return newKernel;
}
278
/* Tear down a kernel record: every per-thread footprint, every block
 * record, and the record itself. */
void $destroy_cuda_kernel_data($cuda_kernel_data_t kernel){
    for (int i = 0; i < kernel->size; i++) {
        $cuda_mem_set_destroy(kernel->memSets[i]);
    }
    free(kernel->memSets);

    for (int i = 0; i < kernel->numBlocks; i++) {
        $destroy_cuda_block_data(kernel->blocks[i]);
    }
    free(kernel->blocks);
    free(kernel);
}
291
/* Per-thread view of the kernel: back-pointers to the shared kernel,
 * block, and warp records, the thread's coordinates, and its private
 * communication endpoints. */
typedef struct $cuda_thread_data* $cuda_thread_data_t;
typedef struct $cuda_thread_data {
    $cuda_kernel_data_t kernel;  // enclosing kernel record
    $cuda_block_data_t block;    // enclosing block record
    $cuda_warp_data_t warp;      // enclosing warp record
    int blockID;                 // flat block index within the grid
    int warpID;                  // warp index within the block
    int laneID;                  // lane index within the warp
    $comm lane_comm;             // this lane's endpoint on the warp communicator
    $barrier block_barrier;      // this thread's endpoint on the block barrier
} $cuda_thread_data;
[9cabba4]303
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Check this lane's own footprint (the diagonal entry of the warp's
 * matrix) against every other lane's for races within the warp. */
$atomic_f void $cuda_check_warp_data_race($cuda_thread_data_t thread) {
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    $cuda_mem_set_t** warpMems = thread->warp->memSets;

    for (int i = 0; i < thread->warp->size; i++) {
        if (i != laneID) {
            $cuda_check_mems_disjoint(warpMems[laneID][laneID], blockID, warpID, laneID,
                warpMems[i][i], blockID, warpID, i);
        }
    }
}
320
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Check this thread's block-level footprint against every thread in
 * every OTHER warp of the same block (intra-warp races are handled by
 * $cuda_check_warp_data_race).  Block-level indexing assumes all warps
 * except the last are full (warpSize lanes), matching construction. */
$atomic_f void $cuda_check_block_data_race($cuda_thread_data_t thread) {
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    int indexInBlock = warpID * warpSize + laneID;
    $cuda_mem_set_t* blockMems = thread->block->memSets;

    for (int i = 0; i < thread->block->numWarps; i++) {
        if (i != warpID) {
            int currWarpSize = thread->block->warps[i]->size;
            for (int j = 0; j < currWarpSize; j++) {
                $cuda_check_mems_disjoint(blockMems[indexInBlock], blockID, warpID, laneID,
                    blockMems[i * warpSize + j], blockID, i, j);
            }
        }
    }
}
341
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Check this thread's kernel-level footprint against every thread in
 * every OTHER block of the grid (same-block races are handled by
 * $cuda_check_block_data_race). */
$atomic_f void $cuda_check_kernel_data_race($cuda_thread_data_t thread) {
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    int threadsPerBlock = thread->block->size;
    int indexInKernel = blockID * threadsPerBlock + warpID * warpSize + laneID;
    $cuda_mem_set_t* kernelMems = thread->kernel->memSets;
    //printf("%d, %d, %d - Checking kernel data race. km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, indexInKernel);

    for (int i = 0; i < thread->kernel->numBlocks; i++) {
        if (i != blockID) {
            $cuda_block_data_t block = thread->kernel->blocks[i];
            for (int j = 0; j < block->numWarps; j++) {
                int currWarpSize = block->warps[j]->size;
                for (int k = 0; k < currWarpSize; k++) {
                    //printf("%d, %d, %d - Checking kdr against <%d,%d,%d> with km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, i,j,k, i * threadsPerBlock + j * warpSize + k);
                    $cuda_check_mems_disjoint(kernelMems[indexInKernel], blockID, warpID, laneID,
                        kernelMems[i * threadsPerBlock + j * warpSize + k], i, j, k);
                }
            }
        }
    }
}
368
[9dbe9864]369/*@ depends_on \access(thread);
370 @ executes_when \true;
371 @*/
372$atomic_f void $cuda_start_mem_patching($cuda_thread_data_t thread) {
373 thread->warp->patching[thread->laneID] = $true;
[109d05e]374}
375
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Deliver this lane's own footprint to every lane currently in
 * patching mode, by folding it into that lane's row of the warp's
 * footprint matrix. */
$atomic_f void $cuda_update_patches($cuda_thread_data_t thread) {
    $cuda_warp_data_t warp = thread->warp;
    int lane = thread->laneID;

    for (int i = 0; i < warp->size; i++) {
        if (warp->patching[i]) {
            $cuda_mem_set_add(warp->memSets[i][lane], warp->memSets[lane][lane]->reads, warp->memSets[lane][lane]->writes);
        }
    }
}
389
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Finish a patching window for this lane: if it was patching, turn the
 * flag off, check the patches collected from every other lane against
 * this lane's own footprint for races, then clear the patch entries.
 * No-op if the lane was not patching. */
$atomic_f void $cuda_end_mem_patching($cuda_thread_data_t thread) {
    if (thread->warp->patching[thread->laneID]) {
        int size = thread->warp->size;
        int blockID = thread->blockID;
        int warpID = thread->warpID;
        int laneID = thread->laneID;
        // this lane's row of the footprint matrix
        $cuda_mem_set_t* threadMems = thread->warp->memSets[laneID];

        thread->warp->patching[laneID] = $false;

        for (int i = 0; i < size; i++) {
            if (i != laneID) {
                $cuda_check_mems_disjoint(threadMems[laneID], blockID, warpID, laneID,
                    threadMems[i], blockID, warpID, i);
                $cuda_mem_set_clear(threadMems[i]);
            }
        }
    }
}
[9cabba4]412
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Begin a new race-check region for this thread: publish pending
 * patches, clear this thread's footprint at warp, block, and kernel
 * level, then open fresh read/write tracking sets. */
$atomic_f void $cuda_start_kernel_region($cuda_thread_data_t thread) {
    int lane = thread->laneID;
    //printf("%d, %d, %d - Starting region.\n\n", thread->blockID, thread->warpID, thread->laneID);
    $cuda_update_patches(thread);
    $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
    $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + thread->laneID]);
    $cuda_mem_set_clear(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID]);
    //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
    //printf("%d, %d, %d - kernel mem set after clear:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
    $read_set_push();
    $write_set_push();
}
[9cabba4]428
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Close the current tracking region at warp level: pop the recorded
 * read/write sets, fold them into this thread's footprint at warp,
 * block, and kernel level, finish any patching window, and check for
 * intra-warp races. */
$atomic_f void $cuda_complete_warp_region($cuda_thread_data_t thread) {
    $mem reads = $read_set_pop();
    $mem writes = $write_set_pop();
    int lane = thread->laneID;
    $cuda_mem_set_t laneMem = thread->warp->memSets[lane][lane];
    //printf("%d, %d, %d - completing warp region.\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, reads, writes);

    $cuda_mem_set_add(laneMem, reads, writes);
    $cuda_mem_set_add(thread->block->memSets[thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
    $cuda_mem_set_add(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
    //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
    //printf("%d, %d, %d - kernel mem set after add:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
    $cuda_end_mem_patching(thread);
    $cuda_check_warp_data_race(thread);
}
447
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Close the current region at block level: complete the warp region,
 * then check for races against the other warps of this block. */
$atomic_f void $cuda_complete_block_region($cuda_thread_data_t thread) {
    //printf("%d, %d, %d - completing block region\n\n", thread->blockID, thread->warpID, thread->laneID);
    $cuda_complete_warp_region(thread);
    $cuda_check_block_data_race(thread);
}
456
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Close the current region at kernel level: complete the block region,
 * then check for races against the other blocks of the grid. */
$atomic_f void $cuda_complete_kernel_region($cuda_thread_data_t thread) {
    //printf("%d, %d, %d - completing kernel region\n\n", thread->blockID, thread->warpID, thread->laneID);
    $cuda_complete_block_region(thread);
    $cuda_check_kernel_data_race(thread);
}
465
/* Create the per-thread record for the thread at <blockID, warpID,
 * laneID>, wiring it to the shared kernel/block/warp records and
 * creating its communicator and barrier endpoints.  Also opens the
 * thread's initial read/write tracking sets. */
$cuda_thread_data_t $create_cuda_thread_data($scope scope, $cuda_kernel_data_t kernel, int blockID, int warpID, int laneID) {
    $cuda_thread_data_t newThread = ($cuda_thread_data_t) $malloc(scope, sizeof($cuda_thread_data));

    newThread->kernel = kernel;
    newThread->block = kernel->blocks[blockID];
    newThread->warp = newThread->block->warps[warpID];
    newThread->blockID = blockID;
    newThread->warpID = warpID;
    newThread->laneID = laneID;
    newThread->lane_comm = $comm_create(scope, kernel->blocks[blockID]->warps[warpID]->gcomm, laneID);
    // block-barrier rank is the thread's flat index within the block
    newThread->block_barrier = $barrier_create(scope, kernel->blocks[blockID]->gbarrier, warpID * warpSize + laneID);

    $read_set_push();
    $write_set_push();

    return newThread;
}
[9cabba4]483
/*@ depends_on \access(thread);
  @ executes_when \true;
  @ */
/* Thread exit: flush and race-check its final region, mark the lane
 * dead in its warp (possibly releasing a pending warp barrier), then
 * destroy its barrier/communicator endpoints and the record. */
$atomic_f void $destroy_cuda_thread_data($cuda_thread_data_t thread) {
    $assert(thread != NULL, "Attempt to destroy NULL cuda thread");
    $cuda_complete_kernel_region(thread);

    $cuda_warp_data_t warp = thread->warp;
    warp->alive[thread->laneID] = $false;
    warp->num_alive--;
    // a warp barrier waiting only on this lane can now complete
    $cuda_warp_barrier_update(warp);

    $barrier_destroy(thread->block_barrier);

    $comm_destroy(thread->lane_comm);
    free(thread);
}
501
/*@ depends_on \access(thread);
  @ executes_when \true;
  @ */
/* Enter the warp barrier for collective `tag`.  A lane may not enter
 * twice, and all lanes must agree on which collective is running (the
 * first lane in sets currOp).  If this lane is the last live one in,
 * the barrier is released immediately. */
$atomic_f void $cuda_warp_barrier_enter($cuda_thread_data_t thread, $cuda_warp_tag tag) {
    $cuda_warp_data_t warp = thread->warp;
    $assert(!warp->in_barrier[thread->laneID]);
    // divergent collectives within one warp are an error
    $assert(warp->currOp == tag || warp->currOp == $CUDA_WARP_TAG_EMPTY);

    warp->in_barrier[thread->laneID] = $true;
    warp->currOp = tag;
    warp->num_in_barrier++;
    $cuda_warp_barrier_update(warp);
}
515
// Block until this lane's in_barrier flag has been cleared by
// $cuda_warp_barrier_update.  NOTE(review): the original author left the
// question "Seems this needs to be atomic to work. Why?" — presumably the
// guard must be evaluated atomically with respect to the updater resetting
// the flags; confirm against CIVL's $when/$atomic_f semantics.
/*@ depends_on \access(thread);
  @*/
$atomic_f void $cuda_warp_barrier_exit($cuda_thread_data_t thread) {
    $when(!thread->warp->in_barrier[thread->laneID]);
}
522
/* Full warp-barrier round for collective `tag`: enter, then wait until
 * every live lane has entered and the barrier has been released. */
void $cuda_warp_barrier_call($cuda_thread_data_t thread, $cuda_warp_tag tag) {
    $cuda_warp_barrier_enter(thread, tag);
    $cuda_warp_barrier_exit(thread);
}
527
/* Mockup of __syncthreads.  barrierIndex identifies the call site so
 * that divergent barriers (different threads reaching different
 * __syncthreads calls) are detected.  Completes the block-level region
 * first, runs two block-barrier rounds (the second so thread 0's reset
 * of currBarrierID is ordered after everyone has read it), clears this
 * thread's warp/block footprints, and opens fresh tracking sets. */
void $cuda__syncthreads($cuda_thread_data_t thread, int barrierIndex) {
    $cuda_complete_block_region(thread);
    int indexInBlock = thread->warpID * warpSize + thread->laneID;

    // first thread to arrive records which barrier this is
    if (thread->block->currBarrierID == -1) {
        thread->block->currBarrierID = barrierIndex;
    }
    $assert(thread->block->currBarrierID == barrierIndex,
        "Divergent calls to __syncthreads.");

    $local_end();
    $barrier_call(thread->block_barrier);
    $local_start();
    int lane = thread->laneID;

    $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
    $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + lane]);

    // thread 0 resets the barrier ID for the next call site
    if (indexInBlock == 0) {
        thread->block->currBarrierID = -1;
    }

    $local_end();
    $barrier_call(thread->block_barrier);
    $local_start();

    $read_set_push();
    $write_set_push();
}
557
/* Generates a __syncthreads_{count,or,and} variant named NAME: each
 * thread publishes its predicate into the block's predArray, thread 0
 * folds the array with REDUCTION starting from INIT, and every thread
 * reads back the result (three barrier rounds keep the phases ordered).
 * The divergence message now uses the stringized macro argument #NAME:
 * previously the literal said "Divergent calls to NAME." because macro
 * parameters are not expanded inside string literals. */
#define $CUDA_DEFINE_SYNCTHREADS_VARIANT(NAME, INIT, REDUCTION) \
    int NAME(int predicate, $cuda_thread_data_t thread, int barrierIndex) { \
        $cuda_complete_block_region(thread); \
        int indexInBlock = thread->warpID * warpSize + thread->laneID; \
        \
        if (thread->block->currBarrierID == -1) { \
            thread->block->currBarrierID = barrierIndex; \
        } \
        $assert(thread->block->currBarrierID == barrierIndex, \
            "Divergent calls to " #NAME "."); \
        \
        if (predicate) { \
            thread->block->predArray[indexInBlock] = 1; \
        } else { \
            thread->block->predArray[indexInBlock] = 0; \
        } \
        \
        $local_end(); \
        $barrier_call(thread->block_barrier); \
        $local_start(); \
        int lane = thread->laneID; \
        \
        $cuda_mem_set_clear(thread->warp->memSets[lane][lane]); \
        $cuda_mem_set_clear(thread->block->memSets[indexInBlock]); \
        \
        if (indexInBlock == 0) { \
            int result = INIT; \
            for (int i = 0; i < thread->block->size; i++) { \
                REDUCTION; \
            } \
            thread->block->predArray[0] = result; \
            thread->block->currBarrierID = -1; \
        } \
        \
        $local_end(); \
        $barrier_call(thread->block_barrier); \
        $local_start(); \
        int result = thread->block->predArray[0]; \
        \
        $local_end(); \
        $barrier_call(thread->block_barrier); \
        $local_start(); \
        \
        $read_set_push(); \
        $write_set_push(); \
        return result; \
    }
605
/* Instantiate the three predicate-reduction barriers: count sums the
 * predicates; or/and short-circuit per element via continue. */
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_count, 0, result += thread->block->predArray[i])
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_or, 0, if (thread->block->predArray[i]) {
        result = 1;
        continue;
    })
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_and, 1, if (!thread->block->predArray[i]) {
        result = 0;
        continue;
    })
[9cabba4]615
/* Variadic helpers: select the first/second argument, used below to
 * supply defaults for the optional width/delta shuffle parameters. */
#define $GET_ARG_1(_1, ...) _1
#define $GET_ARG_2(_1, _2, ...) _2
[9dbe9864]618
/* Mockup of __syncwarp: complete the warp-level region (racechecking
 * it), run one warp-barrier round, and open fresh tracking sets.
 * The macro injects the implicit $thread argument. */
#define __syncwarp() $cuda__syncwarp($thread)
void $cuda__syncwarp($cuda_thread_data_t thread) {
    $cuda_complete_warp_region(thread);
    $local_end();
    $cuda_warp_barrier_call(thread, $CUDA_WARP_TAG_warpsync);
    $local_start();
    $read_set_push();
    $write_set_push();
}
628
/* Expands the user's optional trailing shuffle arguments to exactly
 * (delta/srcLane, width), defaulting width to warpSize.  (The trailing
 * "0" is a dummy so $GET_ARG_2 always has a second argument.) */
#define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0)

/* Type-dispatching front end for __shfl_sync; selects the variant
 * matching var's type and injects the implicit $thread argument. */
#define __shfl_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_sync_int, \
        unsigned int: $cuda__shfl_sync_uint, \
        long: $cuda__shfl_sync_long, \
        unsigned long: $cuda__shfl_sync_ulong, \
        long long: $cuda__shfl_sync_ll, \
        unsigned long long: $cuda__shfl_sync_ull, \
        float: $cuda__shfl_sync_float, \
        double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)
[9cabba4]641
/* Type-dispatching front ends for the up/down/xor shuffle variants;
 * identical in structure to __shfl_sync above. */
#define __shfl_up_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_up_sync_int, \
        unsigned int: $cuda__shfl_up_sync_uint, \
        long: $cuda__shfl_up_sync_long, \
        unsigned long: $cuda__shfl_up_sync_ulong, \
        long long: $cuda__shfl_up_sync_ll, \
        unsigned long long: $cuda__shfl_up_sync_ull, \
        float: $cuda__shfl_up_sync_float, \
        double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_down_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_down_sync_int, \
        unsigned int: $cuda__shfl_down_sync_uint, \
        long: $cuda__shfl_down_sync_long, \
        unsigned long: $cuda__shfl_down_sync_ulong, \
        long long: $cuda__shfl_down_sync_ll, \
        unsigned long long: $cuda__shfl_down_sync_ull, \
        float: $cuda__shfl_down_sync_float, \
        double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_xor_sync(mask, var, ...) \
    _Generic(var, \
        default: $cuda__shfl_xor_sync_int, \
        unsigned int: $cuda__shfl_xor_sync_uint, \
        long: $cuda__shfl_xor_sync_long, \
        unsigned long: $cuda__shfl_xor_sync_ulong, \
        long long: $cuda__shfl_xor_sync_ll, \
        unsigned long long: $cuda__shfl_xor_sync_ull, \
        float: $cuda__shfl_xor_sync_float, \
        double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)
674
/* Opens/closes the race-check window around a warp intrinsic.  PRE pops
 * the current tracking sets into the lane's own footprint and puts the
 * lane into patching mode (so warp-mates' footprints are delivered to
 * it); POST opens fresh tracking sets.  Expects `thread` in scope. */
#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE() \
    $mem writes = $write_set_pop(); \
    $mem reads = $read_set_pop(); \
    $cuda_mem_set_add(thread->warp->memSets[thread->laneID][thread->laneID], \
        reads, writes); \
    thread->warp->patching[thread->laneID] = $true;

#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST() \
    $read_set_push(); \
    $write_set_push();
[9cabba4]685
/* Common body of all shuffle variants.  Expects in scope: thread, tag,
 * var (the value to publish), laneParam, width (a power of two at most
 * warpSize), typeSize, and resultVal (receives the shuffled value).
 * Protocol: each lane sends a zero-byte request to the lane it wants to
 * read from, barriers, answers the requests it received with its own
 * var, barriers again, then dequeues its answer; lanes whose source is
 * out of range get a havoced result, as on real hardware.
 * NOTE(review): `subWarpStart = laneID/width` and the shfl_sync case
 * `laneID/width + laneParam % width` look like they are missing a
 * `* width` (sub-warp BASE lane, not sub-warp index) when width <
 * warpSize — confirm against CUDA's __shfl_sync width semantics before
 * relying on width != warpSize. */
#define $CUDA_GENERIC_SHFL_BODY() \
    $assert (width <= warpSize); \
    for (int v = width; v > 1; v /= 2) { \
        $assert(v % 2 == 0); \
    } \
    \
    int requestLane; \
    switch(tag) { \
        case $CUDA_WARP_TAG_shfl_sync: \
            requestLane = thread->laneID/width + laneParam % width; \
            break; \
        case $CUDA_WARP_TAG_shfl_up_sync: \
            requestLane = thread->laneID - laneParam; \
            break; \
        case $CUDA_WARP_TAG_shfl_down_sync: \
            requestLane = thread->laneID + laneParam; \
            break; \
        case $CUDA_WARP_TAG_shfl_xor_sync: \
            requestLane = thread->laneID ^ laneParam; \
            break; \
    } \
    $cuda_warp_data_t warp = thread->warp; \
    _Bool validSrcLane = requestLane >= 0 && requestLane < warp->size; \
    if (validSrcLane) { \
        $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, requestLane, tag, NULL, 0)); \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    \
    _Bool requested[width] = (_Bool[width])$lambda(int i) $false; \
    int subWarpStart = thread->laneID/width; \
    while ($comm_probe(thread->lane_comm, $COMM_ANY_SOURCE, tag)) { \
        $message request = $comm_dequeue(thread->lane_comm, $COMM_ANY_SOURCE, tag); \
        requested[$message_source(request) - subWarpStart] = $true; \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    for(int i = 0; i < width; i++) { \
        if (requested[i]) { \
            $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, i + subWarpStart, tag, &var, typeSize)); \
        } \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    if (validSrcLane) { \
        $message result = $comm_dequeue(thread->lane_comm, requestLane, tag); \
        $message_unpack(result, &resultVal, typeSize); \
    } else { \
        $havoc(&resultVal); \
    }
745
/* Generates one typed shuffle function named NAME for element type T
 * and warp-collective TAG, wrapping $CUDA_GENERIC_SHFL_BODY in the
 * intrinsic race-check protocol. */
#define $CUDA_DEFINE_SHFL(NAME, T, TAG) \
    T NAME(unsigned mask, T var, int laneParam, int width, $cuda_thread_data_t thread) { \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); \
        T resultVal; \
        int typeSize = sizeof(T); \
        $cuda_warp_tag tag = TAG; \
        \
        $CUDA_GENERIC_SHFL_BODY(); \
        \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); \
        return resultVal; \
    }
758
/* Instantiate the typed shuffle variants targeted by the _Generic
 * dispatch macros above: one function per (operation, element type). */
$CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync)
794
/* Front ends for the warp vote intrinsics; inject the implicit $thread
 * argument. */
#define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, $thread)
#define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $thread)
#define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $thread)
798
/* Common body of the warp vote/reduction intrinsics.  Expects in scope:
 * thread, tag, value (this lane's contribution), initialValue,
 * typeSize, result, operand.  The first lane to arrive becomes the
 * reducer (warp->reductionLane): it gathers each live lane's value (its
 * own directly, others via messages; dead lanes contribute
 * initialValue), folds them with result = COND ? T_REDUCTION :
 * F_REDUCTION per lane (loop index `i` is the lane), then broadcasts
 * the result back to the other live lanes.  Non-reducer lanes send
 * their value and wait for the broadcast. */
#define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \
    $cuda_warp_data_t warp = thread->warp; \
    int laneID = thread->laneID; \
    $comm comm = thread->lane_comm; \
    if (warp->reductionLane == -1) { \
        warp->reductionLane = laneID; \
        \
        result = initialValue; \
        for (int i = 0; i < warp->size; i++) { \
            if (i == laneID) { \
                operand = value; \
            } else { \
                $local_end(); \
                $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start(); \
                \
                if (!warp->alive[i]) { \
                    operand = initialValue; \
                } else { \
                    $local_end(); \
                    $message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \
                    $local_start(); \
                } \
            } \
            \
            if (COND) { \
                result = T_REDUCTION; \
            } else { \
                result = F_REDUCTION; \
            } \
        } \
        \
        warp->reductionLane = -1; \
        \
        for (int i = 0; i< warp->size; i++) { \
            if (i != laneID && warp->alive[i]) { \
                $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \
            } \
        } \
    } else { \
        int reductionLane = warp->reductionLane; \
        $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \
        $local_end(); \
        $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \
        $local_start(); \
    }

/* Unconditional variant: always applies REDUCTION. */
#define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result)
846
/* Mock of CUDA __all_sync(): returns 1 iff `value` is nonzero on every
 * live lane of the calling thread's warp, 0 otherwise.
 * NOTE(review): `mask` is ignored by the underlying reduction protocol.
 */
int $cuda__all_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync;
    int typeSize = sizeof(int);
    int initialValue = 1;   // identity of logical AND (also used for dead lanes)
    int result, operand;

    // result := AND over all lanes of (operand != 0)
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
861
/* Mock of CUDA __any_sync(): returns 1 iff `value` is nonzero on at least
 * one live lane of the calling thread's warp, 0 otherwise.
 * NOTE(review): `mask` is ignored by the underlying reduction protocol.
 */
int $cuda__any_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync;
    int typeSize = sizeof(int);
    int initialValue = 0;   // identity of logical OR (also used for dead lanes)
    int result, operand;

    // result := OR over all lanes of (operand != 0)
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 || operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
876
877unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
878 $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();
879
880 $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
881 int initialValue = 0;
882 int typeSize = sizeof(int);
883 unsigned result;
884 int operand;
885
886 $CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1);
887
888 $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();
889
890 return result;
891}
892
/* Kernel-visible CUDA atomic API.  Each macro dispatches on the pointer
 * type via _Generic to a type-specific CIVL implementation defined below,
 * appending the caller's $thread handle (in scope inside transformed
 * kernel bodies).  `default` handles the int* case.
 */

/* atomicAdd()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old + val), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicAdd(X,Y) _Generic(X, \
    default : $cuda_atomicAdd_int, \
    unsigned int* : $cuda_atomicAdd_uint, \
    unsigned long long int* : $cuda_atomicAdd_ullint, \
    float* : $cuda_atomicAdd_float, \
    double* : $cuda_atomicAdd_double) (X, Y, $thread)

/* atomicSub()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (old - val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction. The
 * function returns old.
 */
#define atomicSub(X,Y) _Generic(X, \
    default : $cuda_atomicSub_int, \
    unsigned int* : $cuda_atomicSub_uint) (X, Y, $thread)

/* atomicExch()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory and stores val back to memory at the same address. These two
 * operations are performed in one atomic transaction. The function returns old.
 */
#define atomicExch(X,Y) _Generic(X, \
    default : $cuda_atomicExch_int, \
    unsigned int* : $cuda_atomicExch_uint, \
    unsigned long long int* : $cuda_atomicExch_ullint, \
    float* : $cuda_atomicExch_float) (X, Y, $thread)

/* atomicMin()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the minimum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMin(X,Y) _Generic(X, \
    default : $cuda_atomicMin_int, \
    unsigned int* : $cuda_atomicMin_uint, \
    unsigned long long int* : $cuda_atomicMin_ullint) (X, Y, $thread)

/* atomicMax()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the maximum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMax(X,Y) _Generic(X, \
    default : $cuda_atomicMax_int, \
    unsigned int* : $cuda_atomicMax_uint, \
    unsigned long long int* : $cuda_atomicMax_ullint) (X, Y, $thread)

/* atomicInc()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicInc(address, val) $cuda_atomicInc(address, val, $thread)

/* atomicDec()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
 * the result back to memory at the same address. These three operations are
 * performed in one atomic transaction. The function returns old.
 */
#define atomicDec(address, val) $cuda_atomicDec(address, val, $thread)

/* atomicCAS()
 * reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old == compare ? val : old) , and stores the
 * result back to memory at the same address. These three operations are performed
 * in one atomic transaction. The function returns old (Compare And Swap).
 */
#define atomicCAS(address, compare, val) _Generic(address, \
    default : $cuda_atomicCAS_int, \
    unsigned int* : $cuda_atomicCAS_uint, \
    unsigned long long int* : $cuda_atomicCAS_ullint, \
    unsigned short int* : $cuda_atomicCAS_usint) (address, compare, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old & val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicAnd(address, val) _Generic(address, \
    default : $cuda_atomicAnd_int, \
    unsigned int* : $cuda_atomicAnd_uint, \
    unsigned long long int* : $cuda_atomicAnd_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old | val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicOr(address, val) _Generic(address, \
    default : $cuda_atomicOr_int, \
    unsigned int* : $cuda_atomicOr_uint, \
    unsigned long long int* : $cuda_atomicOr_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old ^ val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicXor(address, val) _Generic(address, \
    default : $cuda_atomicXor_int, \
    unsigned int* : $cuda_atomicXor_uint, \
    unsigned long long int* : $cuda_atomicXor_ullint) (address, val, $thread)
1009
/* Common prologue for every atomic mock: close the thread's race-checked
 * kernel region, $yield() (the atomic access is a separate, serializable
 * step), re-open the region, then snapshot the target word into `old`.
 * Declares `old` (of type T) in the enclosing function's scope; `address`
 * and `thread` must be in scope at the expansion site.
 */
#define $CUDA_ATOMIC_PRE_ACTION(T) \
    $cuda_complete_kernel_region(thread); \
    $yield(); \
    $cuda_start_kernel_region(thread); \
    T old = *address;

/* Common epilogue: close and immediately re-open the kernel region and
 * return the pre-update value snapshotted by the PRE action.
 */
#define $CUDA_ATOMIC_POST_ACTION() \
    $cuda_complete_kernel_region(thread); \
    $cuda_start_kernel_region(thread); \
    return old;
1020
/* Generator macros: each expands to a function that snapshots *address
 * into `old` (PRE action), applies its update, and returns the snapshot
 * (POST action).  One instantiation per supported element type.
 */

#define $CUDA_DEFINE_ATOMIC_ADD(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address += val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_int, int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_float, float)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_double, double)

#define $CUDA_DEFINE_ATOMIC_SUB(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address -= val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_int, int)
$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_uint, unsigned int)

#define $CUDA_DEFINE_ATOMIC_EXCH(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_int, int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_float, float)

/* Note: the no-change branch re-stores `old` — redundant but harmless. */
#define $CUDA_DEFINE_ATOMIC_MIN(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old <= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_int, int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_ullint, unsigned long long int)

/* Note: the no-change branch re-stores `old` — redundant but harmless. */
#define $CUDA_DEFINE_ATOMIC_MAX(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old >= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_int, int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_ullint, unsigned long long int)
1079
1080unsigned int $cuda_atomicInc(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
1081 $CUDA_ATOMIC_PRE_ACTION(unsigned int);
1082 if (old >= val) *address = 0;
1083 else *address = old + 1;
1084 $CUDA_ATOMIC_POST_ACTION()
1085}
1086
1087unsigned int $cuda_atomicDec(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
1088 $CUDA_ATOMIC_PRE_ACTION(unsigned int);
1089 if (old == 0 || old > val) *address = val;
1090 else *address = old - 1;
1091 $CUDA_ATOMIC_POST_ACTION()
1092}
1093
/* Compare-and-swap: store val only when old == compare; always return old. */
#define $CUDA_DEFINE_ATOMIC_CAS(NAME, T) \
    T NAME(T* address, T compare, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old == compare) *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_int, int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_usint, unsigned short int)

/* Bitwise AND: store (old & val), return old. */
#define $CUDA_DEFINE_ATOMIC_AND(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old & val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_int, int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_ullint, unsigned long long int)

/* Bitwise OR: store (old | val), return old. */
#define $CUDA_DEFINE_ATOMIC_OR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old | val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_int, int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_ullint, unsigned long long int)

/* Bitwise XOR: store (old ^ val), return old. */
#define $CUDA_DEFINE_ATOMIC_XOR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old ^ val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_int, int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_ullint, unsigned long long int)
1138
1139//////////////////////
1140// Global Variables //
1141//////////////////////
1142
// Global communicator with two places: host (place 0) and device (place 1).
$gcomm $cuda_gcomm = $gcomm_create($here, 2);
const int $CUDA_PLACE_HOST = 0;
const int $CUDA_PLACE_DEVICE = 1;
// Host-side endpoint of the host<->device channel (the device endpoint is
// created inside $cuda_main).
$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);

/**
 * Tags used for message-passing between host and device
 */
enum $cuda_tag {
    // Predefined tags
    $CUDA_TAG_TEARDOWN,
    $CUDA_TAG_SCOPE_REQUEST,
    $CUDA_TAG_cudaFree,
    $CUDA_TAG_cudaMemcpy,
    $CUDA_TAG_cudaMemcpyAsync,
    // Generated tags (by transformer)
    $CUDA_TAG_LAUNCH_kernel_1
};
1161
1162///////////////////
1163// CIVL-CUDA API //
1164///////////////////
1165
1166$scope $cuda_host_request_device_scope() {
1167 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
1168 $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
1169 $scope result;
1170 $message_unpack(response, &result, sizeof($scope));
1171
1172 return result;
1173}
1174
/* Payload of a cudaMemcpy/cudaMemcpyAsync request shipped from host to
 * device: source/destination pointers, byte count, and transfer kind.
 */
typedef struct $cuda_memcpy_data {
    void* dst;
    const void* src;
    size_t count;
    cudaMemcpyKind kind;
} $cuda_memcpy_data;
1181
1182void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
1183 if (kind == cudaMemcpyHostToHost) {
1184 memcpy(dst, src, count);
1185 } else {
1186 $cuda_memcpy_data args;
1187 args.dst = dst;
1188 args.src = src;
1189 args.count = count;
1190 args.kind = kind;
1191
1192 int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
1193
1194 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
1195 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
1196 }
1197}
1198
1199$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
1200 cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream));
1201 newStream->head = NULL;
1202 newStream->tail = NULL;
1203 newStream->numOps = 0;
1204 newStream->alive = true;
1205
1206 $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node));
1207 newHead->stream = newStream;
1208 newStream->containingNode = newHead;
1209 newHead->prev = NULL;
1210 newHead->next = NULL;
1211
1212 return newHead;
1213}
1214
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
/* Unlink `node` from the stream list and mark its stream dead.  The
 * stream struct itself is freed by a freshly spawned helper once the last
 * operation pending on the stream (stream->tail at destruction time, if
 * any) has terminated; that helper's $proc is returned so the caller can
 * $wait for full reclamation.
 * NOTE(review): the ACSL contract above claims assigns/reads \nothing
 * although the body mutates list links — confirm this is the intended
 * CIVL idiom for $atomic_f functions.
 */
$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
    $proc lastOpProc = $proc_null;
    cudaStream_t stream = node->stream;

    // Splice the node out of the doubly-linked stream list.
    if (node->prev != NULL) {
        node->prev->next = node->next;
    }
    if (node->next != NULL) {
        node->next->prev = node->prev;
    }
    free(node);

    stream->alive = false;
    if(stream->tail != NULL)
        lastOpProc = stream->tail->opState->op;

    // Deferred reclamation: wait for the stream's last pending op, then
    // free the stream struct.  (lastOpProc stays $proc_null when the
    // stream has no pending ops.)
    void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
        $wait(lastOpProc);
        free(stream);
    }

    return $spawn destroyStreamWhenComplete(lastOpProc, stream);
}
1242
/*@ depends_on \access(stream);
  @ assigns stream;
  @ reads \nothing;
  @*/
/* Append a new operation to `stream`.  The op runs in its own spawned
 * process (opProc); ops block on opState->start, which becomes true when
 * they reach the head of the stream — immediately if the stream was
 * empty.  Returns the spawned op's $proc handle.
 */
$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
    $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");

    $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state));
    newOpState->start = false;
    newOpState->op = $spawn opProc(opParams, newOpState, stream);

    $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node));
    newOpStateNode->opState = newOpState;
    newOpStateNode->next = NULL;

    if (stream->tail == NULL) {
        // Empty stream: new op is both head and tail, and may start now.
        stream->head = newOpStateNode;
        stream->tail = newOpStateNode;
        newOpState->start = true;
    } else {
        stream->tail->next = newOpStateNode;
        stream->tail = newOpStateNode;
    }
    stream->numOps++;

    return newOpState->op;
}
1270
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
/* Remove the head operation from `stream` (the op processes in this file
 * call this themselves when they finish) and, if another op is queued,
 * release it by setting its start flag.  Frees the op's state and node.
 * NOTE(review): the ACSL contract claims assigns \nothing although the
 * body mutates the stream — confirm intended CIVL semantics.
 */
$atomic_f void $stream_dequeue(cudaStream_t stream) {
    $assert(stream->head != NULL, "Attempt to dequeue an empty stream");

    if (stream->head == stream->tail) {
        stream->tail = NULL;
    }

    $cuda_op_state_node_t oldHead = stream->head;
    stream->head = oldHead->next;
    if (stream->head != NULL) {
        // Wake the next queued op.
        stream->head->opState->start = true;
    }

    stream->numOps--;
    free(oldHead->opState);
    free(oldHead);
}
1292
1293// Helper function
1294int $dim3_index(dim3 size, uint3 location) {
1295 return location.x + size.x * (location.y + size.y * location.z);
1296}
1297
1298// Helper function
1299int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
1300 return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
1301}
1302
/* Spawn one process per point of the 3-D grid `dim`, invoking
 * spawningFunction with that point's coordinates, then wait for all of
 * them to terminate.  Used for both the per-block and per-thread levels
 * of a kernel launch.
 */
void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
    //TODO: calculate length and index, replace this function in the kernel
    $local_start();
    int length = dim.x * dim.y * dim.z;
    $proc procArray[length];
    $range rx = 0 .. dim.x - 1;
    $range ry = 0 .. dim.y - 1;
    $range rz = 0 .. dim.z - 1;
    $domain(3) dom = ($domain(3)){rx, ry, rz};
    $for(int x,y,z : dom){
        uint3 id = { x, y, z };
        int index = $dim3_index(dim, id);
        procArray[index] = $spawn spawningFunction(id);
    }
    $local_end();
    $waitall(procArray,length);
}
1320
[9cabba4]1321
[9dbe9864]1322// CUDA Ops //
[9cabba4]1323
[9dbe9864]1324void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
1325
1326 $when(opState->start);
1327 $cuda_memcpy_data args;
1328 $message_unpack(m, &args, sizeof($cuda_memcpy_data));
1329
1330 if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
1331 args.dst = $reveal(args.dst);
1332 }
1333 if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
1334 args.src = $reveal(args.src);
1335 }
1336 memcpy(args.dst, args.src, args.count);
1337
1338 $stream_dequeue(stream);
1339}
[9cabba4]1340
/* Device-side front end for cudaMemcpy/cudaMemcpyAsync requests: enqueue a
 * $cuda_memcpy_proc op on `stream` and build the acknowledgement message
 * for the host.  A synchronous, non device-to-device copy is awaited
 * before acknowledging.
 * NOTE(review): synchronous device-to-device copies are NOT awaited here —
 * presumably to keep the device loop responsive; confirm this matches the
 * intended cudaMemcpy blocking semantics.
 */
$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
    $cuda_memcpy_data args;
    $message_unpack(request, &args, sizeof($cuda_memcpy_data));

    $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);

    if (!async && args.kind != cudaMemcpyDeviceToDevice) {
        $wait(memcpyProc);
    }
    int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

    return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
}
[9cabba4]1354
[9dbe9864]1355$message $cuda_free($message request) {
1356 void* devPtr;
1357 $message_unpack(request, &devPtr, sizeof(void*));
1358 free($reveal(devPtr));
[9cabba4]1359
[9dbe9864]1360 return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
[9cabba4]1361}
1362
1363
1364
[9dbe9864]1365////////////////////////////////////////////
1366// CUDA API Functions (For Host-use Only) //
1367////////////////////////////////////////////
[9cabba4]1368
/* Host API: ship the device pointer to the device process for release and
 * wait for the acknowledgement.  Always reports cudaSuccess.
 */
cudaError_t cudaFree(void* devPtr) {
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
    $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);

    return cudaSuccess;
}
[9cabba4]1375
/* Host API: synchronous memcpy; delegates to $cuda_host_memcpy with
 * async == false.  Always reports cudaSuccess.
 */
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
    $cuda_host_memcpy(dst, src, count, kind, false);
    return cudaSuccess;
}
[9cabba4]1380
/* Host API: asynchronous memcpy; delegates to $cuda_host_memcpy with
 * async == true.
 * NOTE(review): the `stream` argument is not forwarded — the device loop
 * always targets the default stream; confirm once non-default streams are
 * supported.
 */
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
        cudaMemcpyKind kind, cudaStream_t stream) {
    $cuda_host_memcpy(dst, src, count, kind, true);
    return cudaSuccess;
}
1386
1387/*
[9dbe9864]1388cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
1389 // Create new stream node in linked list
1390 $cuda_stream_node_t newHead = $create_new_stream_node();
1391 newHead->next = $cuda_global_context.head;
1392 $cuda_global_context.head->prev = newHead;
1393
1394 // Update cuda context's head to be the new node we created
1395 $cuda_global_context.head = newHead;
1396 $cuda_global_context.numStreams++;
1397
1398 return cudaSuccess;
1399}
1400*/
[9cabba4]1401
[9dbe9864]1402/*
1403cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
1404 stream = $default_stream_if_null(stream);
1405 $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
1406 $when(stream->head == NULL) return cudaSuccess;
1407}
1408*/
[9cabba4]1409
[9dbe9864]1410/*
1411cudaError_t cudaStreamDestroy(cudaStream_t stream) {
1412 $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
1413 $assert(stream->alive, "Attempt to destroy an already destroyed stream");
1414 $destroy_stream_node(stream->containingNode);
1415 return cudaSuccess;
1416}
1417*/
[9cabba4]1418
[9dbe9864]1419/*
1420cudaError_t cudaDeviceSynchronize() {
1421 $proc* opsToWaitOn;
1422 int numOps = 0;
1423
1424 $atomic {
1425 opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
1426
1427 for ($cuda_stream_node_t node = $cuda_global_context.head;
1428 node != NULL;
1429 node = node->next) {
1430 if (node->stream->tail != NULL) {
1431 opsToWaitOn[numOps] = node->stream->tail->opState->op;
1432 numOps++;
[9cabba4]1433 }
1434 }
1435 }
[9dbe9864]1436 $waitall(opsToWaitOn, numOps);
1437
1438 return cudaSuccess;
[9cabba4]1439}
1440*/
1441
[a67c590]1442//////////////////////////////////
1443// Generated code from kernel_1 //
1444//////////////////////////////////
1445
1446typedef struct {
1447 dim3 gridDim;
1448 dim3 blockDim;
1449 size_t $cudaMemSize;
1450 cudaStream_t $cudaStream;
[9cabba4]1451 float* A;
[a67c590]1452 const float* B;
1453 float* C;
1454 int numElements;
[9dbe9864]1455} $cuda_kernel_1_params;
[a67c590]1456
/* Convert kernel_1's pointer arguments from their host-visible hidden
 * form (created with $hide at allocation) back into real device pointers.
 */
void $cuda_reveal_kernel_1_args($cuda_kernel_1_params* args) {
    args->A = $reveal(args->A);
    args->B = $reveal(args->B);
    args->C = $reveal(args->C);
}
1462
[a67c590]1463void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
1464 size_t $cudaMemSize, cudaStream_t $cudaStream,
[9cabba4]1465 float* A, const float* B, float* C, int numElements) {
[9dbe9864]1466 $cuda_kernel_1_params args;
[0ffc6c8]1467 args.gridDim = gridDim;
1468 args.blockDim = blockDim;
1469 args.$cudaMemSize = $cudaMemSize;
1470 args.$cudaStream = $cudaStream;
1471 args.A = A;
1472 args.B = B;
1473 args.C = C;
1474 args.numElements = numElements;
1475
[9dbe9864]1476 $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_params)));
[0ffc6c8]1477 $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
[109d05e]1478}
1479
/* Transformed body of kernel_1: a warp-shuffle sum reduction of A into C.
 * Structure: one process per block ($cuda_block), each spawning one
 * process per thread ($cuda_thread) via $cuda_run_and_wait_on_procs.
 * Thread (0,0) of block 0 zeroes C first (cudaMemset stand-in); C[j] then
 * doubles as a per-block "done" flag (incremented by each block's thread
 * 0) before thread (0,0) of block 0 combines the per-block partial sums
 * into C[0].  Two earlier kernel variants are retained in comments.
 */
void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
        float *A, const float *B, float *C, int numElements) {
    $cuda_kernel_data_t $kernel = $create_cuda_kernel_data($here, gridDim, blockDim);
    void $cuda_block(uint3 blockIdx) {
        void $cuda_thread(uint3 threadIdx) {
            $local_start();
            // cudaMemSet currently not supported so this is small hack to initialize C ahead of time
            if (blockIdx.x == 0 && threadIdx.x == 0) {
                for (int i = 0; i < gridDim.x; i++) {
                    C[i] = 0;
                }
            }
            //$clear_mem_sets($kernel, _cuda_kid);
            int $cuda_tid = $dim3_index(blockDim, threadIdx);
            int $cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
            // Per-thread CIVL data: block id, warp id within block, lane id within warp.
            $cuda_thread_data_t $thread = $create_cuda_thread_data($here, $kernel, $cuda_kid/(blockDim.x * blockDim.y * blockDim.z), $cuda_tid/warpSize, $cuda_tid % warpSize);

            // Kernel REDUCTION start
            /*
            int lane = threadIdx.x % warpSize;
            int thisWarpSize = warpSize;
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;
            int warpStart = i - lane;
            //printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
            int remainingElements = numElements;

            while (remainingElements > 1) {
                //printf("%d,%d - remainingElements: %d - numElements: %d\n", blockIdx.x, threadIdx.x, remainingElements, numElements);
                if (remainingElements < numElements) {
                    // __syncThreads()
                    //printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x);

                    $cuda__syncthreads($thread, 0);
                    //printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x);
                }

                if (warpStart + 1 < remainingElements) {
                    float val = i < numElements ? A[i] : 0;
                    //printf("%d,%d - val: %d\n", blockIdx.x, threadIdx.x, val);

                    for (int offset = warpSize/2; offset > 0; offset /= 2) {
                        //__syncwarp();
                        float tmp = __shfl_down_sync(0, val, offset);
                        if (lane + offset < thisWarpSize) {
                            val += tmp;
                        }
                        //printf("%d,%d - offset: %d - val: %d\n", blockIdx.x, threadIdx.x, offset, val);
                    }

                    if (i < numElements) {
                        A[i] = val;
                    }
                }

                i *= warpSize;
                //warpStart *= warpSize;
                remainingElements = ((remainingElements - 1) / warpSize) + 1;
            }

            if (i == 0) {
                *C = A[0];
            }
            */
            // Kernel REDUCTION end

            // Kernel REDUCTION 2 start

            int lane = threadIdx.x % warpSize;
            // Size of this thread's warp (the last warp of a block may be partial).
            int thisWarpSize = warpSize;
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;
            int warpStart = i - lane;

            // Warp-level tree sum via shuffles: lane 0 of each warp ends up
            // holding the warp's partial sum in A[warpStart].
            if (warpStart + 1 < numElements) {
                float val = i < numElements ? A[i] : 0;
                for (int offset = warpSize/2; offset > 0; offset /= 2) {
                    float tmp = __shfl_down_sync(0, val, offset);
                    //float tmp = i + offset < numElements ? A[i + offset] : 0;
                    if (lane + offset < thisWarpSize) {
                        val += tmp;
                    }
                }

                if (i < numElements) {
                    A[i] = val;
                }
            }

            $cuda__syncthreads($thread, 0);
            // Thread 0 of each block folds its block's per-warp partials into
            // A[blockDim.x * blockIdx.x], then raises the block's done flag in
            // C[blockIdx.x].
            if (threadIdx.x == 0) {
                int blockEnd = blockDim.x * (blockIdx.x + 1);
                if (blockEnd > numElements) {
                    blockEnd = numElements;
                }
                for (int j = i + warpSize; j < blockEnd; j += warpSize) {
                    A[i] += A[j];
                }
                atomicAdd(C + blockIdx.x, 1);
            }

            // Thread (0,0) spins on each block's flag (atomicAdd of 0 is an
            // atomic read), then accumulates the per-block partials into C[0].
            if (i == 0) {
                C[0] = A[0];
                for (int j = 1; j < gridDim.x; j++) {
                    while(atomicAdd(C+j,0) == 0) {}
                    C[0] += A[j * blockDim.x];
                }
            }
            // Kernel REDUCTION 2 end

            // Kernel BALLOT TEST start

            /*
            int i = threadIdx.x;
            if (i < numElements) {
                int result = __ballot_sync(~0, A[i] > 0);
                if (i == 0) {
                    printf("Result: %d\n", result);
                    *C = 0;
                    while(result > 0) {
                        if (result % 2)
                            *C += 1;
                        result /= 2;
                    }
                    printf("done calculating result\n");
                }
            }
            */
            // Kernel BALLOT TEST end
            //$check_data_race($kernel, _cuda_kid);
            //$read_set_pop();
            //$write_set_pop();
            $destroy_cuda_thread_data($thread);
            $local_end();
        }
        $cuda_run_and_wait_on_procs(blockDim, $cuda_thread);
    }
    $cuda_run_and_wait_on_procs(gridDim, $cuda_block);
    $destroy_cuda_kernel_data($kernel);
}
1626
/* Stream op wrapping a kernel_1 launch: wait until this op reaches the
 * head of its stream, unpack and reveal the kernel arguments, run the
 * kernel, then pop this op off the stream.
 */
void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
    $when(opState->start);

    $cuda_kernel_1_params args;
    $message_unpack(request, &args, sizeof($cuda_kernel_1_params));
    $cuda_reveal_kernel_1_args(&args);

    $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
    $stream_dequeue(cudaStream);
}
1637
[6dd0322]1638/////////////////
1639// CUDA "file" //
1640/////////////////
1641
/* Device process: owns the device scope, the device comm endpoint, and the
 * stream list (with the default stream).  Runs a request/reply loop that
 * services host messages until a TEARDOWN request arrives, at which point
 * it destroys the default stream, waits for reclamation, and exits without
 * replying.
 */
void $cuda_main() {

    // Device Variables

    $scope $cuda_scope = $here;

    $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
    $cuda_context $cuda_global_context;
    cudaStream_t $cuda_default_stream;

    // Helper function to get the default stream if passed NULL, and just returns stream otherwise
    // Currently unused until we support streams other than the default one.
    cudaStream_t $default_stream_if_null(cudaStream_t stream) {
        return stream == NULL ? $cuda_default_stream : stream;
    }

    // Device Logic

    $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
    $cuda_default_stream = defaultStreamNode->stream;

    $cuda_global_context.head = defaultStreamNode;
    $cuda_global_context.numStreams = 1;

    // Request/reply server loop: one reply is enqueued per request,
    // except for TEARDOWN which terminates the loop.
    while (true) {
        $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
        $message response;
        const int tag = $message_tag(request);

        switch(tag) {
        case $CUDA_TAG_SCOPE_REQUEST :
            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
            break;
        case $CUDA_TAG_cudaFree :
            response = $cuda_free(request);
            break;
        case $CUDA_TAG_cudaMemcpy :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
            break;
        case $CUDA_TAG_cudaMemcpyAsync :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
            break;
        case $CUDA_TAG_LAUNCH_kernel_1 :
            // Kernel launches are acknowledged immediately; the kernel runs
            // as a stream op.
            $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);

            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
            break;
        case $CUDA_TAG_TEARDOWN : {
            $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
            $wait(destructor);
            $comm_destroy($cuda_device_comm);
            return;
        }
        default :
            $assert(false, "Unknown CUDA request");
        }

        $comm_enqueue($cuda_device_comm, response);
    }
}
1702
1703///////////////
1704// Host file //
1705///////////////
1706
// Verification inputs: problem size N, threads per block (default: about
// half of N), and the symbolic input arrays.
$input int N = 16;
$input int threadsPerBlock = N%2 == 0 ? N/2 : (N+1)/2;
//$input int threadsPerBlock = N;
$input float A[N];
// Currently unused but left in to save time
$input float B[N];
1713
/* Host process: allocates device buffers (cudaMalloc stand-in: request the
 * device scope, $malloc there, and $hide the pointer), copies the inputs
 * over, launches kernel_1, and checks that the reduction result equals the
 * sequential sum of A.
 */
void $host_main() {
    int size = N * sizeof(float);
    int numBlocks = (N-1)/threadsPerBlock + 1;   // ceiling division
    //int numBlocks = 1;

    float* cuda_A;
    // cudaMalloc((void **)&cuda_A, size);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_A = $hide((float*)$malloc(deviceScope, size));
    }
    cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);

    float* cuda_B;
    // cudaMalloc((void **)&cuda_B, size);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_B = $hide((float*)$malloc(deviceScope, size));
    }
    cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);

    float* cuda_C;
    // cudaMalloc((void **)&cuda_C, sizeof(float) * numBlocks);
    {
        $scope deviceScope = $cuda_host_request_device_scope();
        cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float) * numBlocks));
    }

    dim3 gridDim = {numBlocks, 1, 1};
    dim3 blockDim = {threadsPerBlock, 1, 1};
    // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
    $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);

    // Checking correctness
    float* C = (float *)malloc(size);

    // Only C[0] (the total) is checked, so one float is copied back.
    cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);

    // REDUCTION ASSERTION

    float sum = 0;
    for(int i = 0; i < N; i++)
        sum += A[i];

    $assert(*C == sum);

    // BALLOT ASSERTION
    /*
    float count = 0;
    for (int i = 0; i < N; i++) {
        if (A[i] > 0)
            count++;
    }
    $assert(*C == count);
    */

    free(C);

    cudaFree(cuda_A);
    cudaFree(cuda_B);
    cudaFree(cuda_C);

}
1777
/* Entry point: run the host program and the device process concurrently.
 * After the host finishes, send TEARDOWN to the device, tear down the
 * host comm, wait for the device to exit, and destroy the global comm.
 */
int main() {
    $proc host = $spawn $host_main();
    $proc cuda = $spawn $cuda_main();
    $wait(host);
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
    $comm_destroy($cuda_host_comm);
    $wait(cuda);
    $gcomm_destroy($cuda_gcomm, NULL);
}
Note: See TracBrowser for help on using the repository browser.