| 1 | #include <concurrency.cvh>
|
|---|
| 2 | #include <comm.cvh>
|
|---|
| 3 | #include <stdlib.h>
|
|---|
| 4 | #include <stdio.h>
|
|---|
| 5 | #include <stdbool.h>
|
|---|
| 6 | #include <string.h>
|
|---|
| 7 | #include <mem.cvh>
|
|---|
| 8 | #pragma CIVL ACSL
|
|---|
| 9 |
|
|---|
| 10 | ///////////
|
|---|
| 11 | // Types //
|
|---|
| 12 | ///////////
|
|---|
| 13 |
|
|---|
/* CUDA runtime error codes.  Only cudaSuccess is modeled here; the
 * verification model assumes all API calls succeed. */
enum cudaError {
  cudaSuccess
};
typedef enum cudaError cudaError_t;

//typedef $cuda_memcpy_kind cudaMemcpyKind;

/* Three unsigned components; used for grid and block dimensions
 * (gridDim / blockDim). */
typedef struct {
  unsigned int x, y, z;
} dim3;

/* Used to represent a location in a three dimensional grid
 * (threadIdx / blockIdx). */
typedef struct {
  unsigned int x, y, z;
} uint3;

/* Direction of a cudaMemcpy transfer, mirroring the CUDA runtime
 * enumeration of the same name. */
typedef enum {
  cudaMemcpyHostToHost,
  cudaMemcpyHostToDevice,
  cudaMemcpyDeviceToHost,
  cudaMemcpyDeviceToDevice,
  cudaMemcpyDefault
} cudaMemcpyKind;
|
|---|
| 38 |
|
|---|
/* State of one asynchronous CUDA operation: `start` records whether
 * the operation has begun executing and `op` is the CIVL process
 * running it. */
typedef struct $cuda_op_state* $cuda_op_state_t;
typedef struct $cuda_op_state {
  _Bool start;
  $proc op;
} $cuda_op_state;

/* Singly-linked list node holding one operation state; streams queue
 * their operations with these nodes. */
typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
typedef struct $cuda_op_state_node {
  $cuda_op_state_t opState;
  $cuda_op_state_node_t next;
} $cuda_op_state_node;

/* A CUDA stream: a FIFO queue of operations from `head` to `tail`,
 * a count of queued operations, a back-pointer to the list node that
 * owns this stream inside its context, and a liveness flag. */
typedef struct $cuda_stream_node* $cuda_stream_node_t;
typedef struct $cuda_stream* $cuda_stream_t;
typedef $cuda_stream_t cudaStream_t;
typedef struct $cuda_stream {
  $cuda_op_state_node_t head;
  $cuda_op_state_node_t tail;
  int numOps;
  $cuda_stream_node_t containingNode;
  _Bool alive;
} $cuda_stream;

/* Doubly-linked list node in a context's list of streams. */
typedef struct $cuda_stream_node{
  cudaStream_t stream;
  $cuda_stream_node_t prev;
  $cuda_stream_node_t next;
} $cuda_stream_node;

/* A CUDA context: the list of streams it owns plus their count. */
typedef struct $cuda_context* $cuda_context_t;
typedef struct $cuda_context {
  $cuda_stream_node_t head;
  int numStreams;
} $cuda_context;

/* The sets of memory locations read and written by one thread during
 * the current synchronization region; the basic unit of the data-race
 * checks below. */
typedef struct $cuda_mem_set* $cuda_mem_set_t;
typedef struct $cuda_mem_set {
  $mem reads;
  $mem writes;
} $cuda_mem_set;
|
|---|
| 79 |
|
|---|
| 80 | void $cuda_mem_set_clear($cuda_mem_set_t memSet) {
|
|---|
| 81 | $mem emptyset = $mem_empty();
|
|---|
| 82 | memSet->reads = emptyset;
|
|---|
| 83 | memSet->writes = emptyset;
|
|---|
| 84 | }
|
|---|
| 85 |
|
|---|
| 86 | $cuda_mem_set_t $cuda_mem_set_create($scope scope) {
|
|---|
| 87 | $cuda_mem_set_t newMemSet = ($cuda_mem_set_t) $malloc(scope, sizeof($cuda_mem_set));
|
|---|
| 88 |
|
|---|
| 89 | $cuda_mem_set_clear(newMemSet);
|
|---|
| 90 |
|
|---|
| 91 | return newMemSet;
|
|---|
| 92 | }
|
|---|
| 93 |
|
|---|
/* Deallocates a mem set previously created by $cuda_mem_set_create. */
void $cuda_mem_set_destroy($cuda_mem_set_t memSet) {
  free(memSet);
}
|
|---|
| 97 |
|
|---|
| 98 | void $cuda_mem_set_add($cuda_mem_set_t memSet, $mem reads, $mem writes) {
|
|---|
| 99 | memSet->reads = $mem_union(memSet->reads, reads);
|
|---|
| 100 | memSet->writes = $mem_union(memSet->writes, writes);
|
|---|
| 101 | }
|
|---|
| 102 |
|
|---|
/* Asserts that the memory footprints of two threads are race-free:
 * m1's reads must not intersect m2's writes, m1's writes must not
 * intersect m2's reads, and the two write sets must be disjoint.
 * Read/read overlap is permitted.  The (b,w,l) triples identify each
 * thread as <block, warp, lane> in the diagnostic; on a violation,
 * $mem_no_intersect stores a witness location from each set into
 * out1/out2 for the message. */
void $cuda_check_mems_disjoint($cuda_mem_set_t m1, int b1, int w1, int l1,
                               $cuda_mem_set_t m2, int b2, int w2, int l2) {
  $mem out1 = $mem_empty();
  $mem out2 = $mem_empty();

  // read/write conflict
  $assert($mem_no_intersect(m1->reads, m2->writes, &out1, &out2),
      "Data-race detected: %p read by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
      out1, b1, w1, l1, out2, b2, w2, l2);
  // write/read conflict
  $assert($mem_no_intersect(m1->writes, m2->reads, &out1, &out2),
      "Data-race detected: %p written by thread <%d, %d, %d> intersects %p read by thread <%d, %d, %d>\n",
      out1, b1, w1, l1, out2, b2, w2, l2);
  // write/write conflict
  $assert($mem_no_intersect(m1->writes, m2->writes, &out1, &out2),
      "Data-race detected: %p written by thread <%d, %d, %d> intersects %p written by thread <%d, %d, %d>\n",
      out1, b1, w1, l1, out2, b2, w2, l2);
}
|
|---|
| 118 |
|
|---|
/* Identifies which warp-level collective operation the warp is
 * currently executing; $CUDA_WARP_TAG_EMPTY means none.  Used to
 * detect divergent collectives within one warp. */
typedef enum {
  $CUDA_WARP_TAG_EMPTY,
  $CUDA_WARP_TAG_warpsync,
  $CUDA_WARP_TAG_shfl_sync,
  $CUDA_WARP_TAG_shfl_up_sync,
  $CUDA_WARP_TAG_shfl_down_sync,
  $CUDA_WARP_TAG_shfl_xor_sync,
  $CUDA_WARP_TAG_all_sync,
  $CUDA_WARP_TAG_any_sync,
  $CUDA_WARP_TAG_ballot_sync
} $cuda_warp_tag;

// Symbolic model parameter: number of lanes per warp (default 32).
$input int warpSize = 32;
|
|---|
| 132 |
|
|---|
/* Shared bookkeeping for one warp of `size` lanes.
 *
 * memSets is a size x size matrix: memSets[i][i] is lane i's own
 * current read/write footprint; memSets[i][j] (i != j) accumulates
 * lane j's footprint on behalf of lane i while lane i is "patching"
 * (see $cuda_update_patches / $cuda_end_mem_patching).
 *
 * NOTE(review): the incomplete array fields (e.g. `_Bool alive[]`)
 * appear to be a CIVL-C extension, assigned whole-array values with
 * $lambda in $create_cuda_warp_data — not legal ISO C; confirm
 * against the CIVL language manual. */
typedef struct $cuda_warp_data* $cuda_warp_data_t;
typedef struct $cuda_warp_data {
  int size;
  $cuda_mem_set_t** memSets;
  _Bool patching[];          // per-lane: lane is collecting patches
  int num_alive;             // lanes not yet terminated
  _Bool alive[];
  int num_in_barrier;        // lanes currently inside the warp barrier
  _Bool in_barrier[];
  $cuda_warp_tag currOp;     // collective in progress, or EMPTY
  int reductionLane;
  $gcomm gcomm;              // global comm used by warp shuffles
} $cuda_warp_data;
|
|---|
| 146 |
|
|---|
/*@ depends_on \access(warp);
  @ executes_when \true;
  @ */
/* Releases the warp barrier once every live lane has entered it:
 * resets the barrier counters, clears the current collective tag, and
 * empties each lane's own (diagonal) mem set.  A no-op until
 * num_in_barrier reaches num_alive, so dead lanes never block the
 * warp.  Runs atomically. */
$atomic_f void $cuda_warp_barrier_update($cuda_warp_data_t warp) {
  if (warp->num_in_barrier == warp->num_alive) {
    warp->num_in_barrier = 0;
    warp->currOp = $CUDA_WARP_TAG_EMPTY;
    for (int i = 0; i < warp->size; i++) {
      warp->in_barrier[i] = $false;
      $cuda_mem_set_clear(warp->memSets[i][i]);
    }
  }
}
|
|---|
| 160 |
|
|---|
| 161 | $cuda_warp_data_t $create_cuda_warp_data($scope warpScope, int size) {
|
|---|
| 162 | $cuda_warp_data_t newWarp = ($cuda_warp_data_t) malloc(sizeof($cuda_warp_data));
|
|---|
| 163 | newWarp->size = size;
|
|---|
| 164 | newWarp->memSets = ($cuda_mem_set_t**) $malloc(warpScope, size * sizeof($cuda_mem_set_t*));
|
|---|
| 165 | for (int i = 0; i < size; i++) {
|
|---|
| 166 | newWarp->memSets[i] = ($cuda_mem_set_t*) $malloc(warpScope, size * sizeof($cuda_mem_set_t));
|
|---|
| 167 | for (int j = 0; j < size; j++) {
|
|---|
| 168 | newWarp->memSets[i][j] = $cuda_mem_set_create(warpScope);
|
|---|
| 169 | }
|
|---|
| 170 | }
|
|---|
| 171 | newWarp->patching = (_Bool[size])$lambda(int i) $false;
|
|---|
| 172 | newWarp->num_alive = size;
|
|---|
| 173 | newWarp->alive = (_Bool[size])$lambda(int i) $true;
|
|---|
| 174 | newWarp->num_in_barrier = 0;
|
|---|
| 175 | newWarp->in_barrier = (_Bool[size])$lambda(int i) $false;
|
|---|
| 176 | newWarp->currOp = $CUDA_WARP_TAG_EMPTY;
|
|---|
| 177 | newWarp->reductionLane = -1;
|
|---|
| 178 | newWarp->gcomm = $gcomm_create(warpScope, size);
|
|---|
| 179 |
|
|---|
| 180 | return newWarp;
|
|---|
| 181 | }
|
|---|
| 182 |
|
|---|
/* Destroys a warp created by $create_cuda_warp_data: frees every
 * entry of the memSets matrix, its rows, the matrix itself, the
 * warp's gcomm, and finally the warp struct. */
void $destroy_cuda_warp_data($cuda_warp_data_t warp) {
  $assert(warp != NULL, "Attempt to destroy a NULL warp");

  for (int i = 0; i < warp->size; i++) {
    for (int j = 0; j < warp->size; j++) {
      $cuda_mem_set_destroy(warp->memSets[i][j]);
    }
    free(warp->memSets[i]);
  }
  free(warp->memSets);
  // NULL: no junk-message handler needed on destruction
  $gcomm_destroy(warp->gcomm, NULL);
  free(warp);
}
|
|---|
| 196 |
|
|---|
/* Shared bookkeeping for one thread block of `size` threads.
 *
 * memSets[t] is thread t's footprint at block scope (t = warpID *
 * warpSize + laneID).  predArray holds per-thread predicate values
 * for the __syncthreads_{count,or,and} variants; slot 0 also carries
 * the combined result back.  currBarrierID is the call-site index of
 * the barrier in progress (-1 when none), used to detect divergent
 * __syncthreads calls. */
typedef struct $cuda_block_data* $cuda_block_data_t;
typedef struct $cuda_block_data {
  int size;
  $cuda_mem_set_t* memSets;
  int predArray[];
  $gbarrier gbarrier;        // global barrier shared by all block threads
  int currBarrierID;
  int numWarps;
  $cuda_warp_data_t* warps;
} $cuda_block_data;
|
|---|
| 207 |
|
|---|
/* Creates the bookkeeping data for one block of `size` threads,
 * allocated in `scope`: a per-thread mem set, the block barrier, and
 * one warp-data object per warp.  All warps have warpSize lanes
 * except the last, which holds the remaining ((size-1) % warpSize)+1
 * threads. */
$cuda_block_data_t $create_cuda_block_data($scope scope, int size) {
  $cuda_block_data_t newBlock = ($cuda_block_data_t) $malloc(scope, sizeof($cuda_block_data));

  newBlock->size = size;
  // ceiling division: number of warps needed to cover `size` threads
  newBlock->numWarps = (size - 1)/warpSize + 1;

  newBlock->predArray = (int[size]) $lambda(int i) 0;

  newBlock->gbarrier = $gbarrier_create(scope, size);
  newBlock->currBarrierID = -1;    // no barrier in progress

  newBlock->memSets = ($cuda_mem_set_t*) $malloc(scope, size * sizeof($cuda_mem_set_t));
  for (int i = 0; i < size; i++) {
    newBlock->memSets[i] = $cuda_mem_set_create(scope);
  }
  newBlock->warps = ($cuda_warp_data_t*) $malloc(scope, newBlock->numWarps * sizeof($cuda_warp_data_t));
  for (int i = 0; i < newBlock->numWarps - 1; i++) {
    newBlock->warps[i] = $create_cuda_warp_data(scope, warpSize);
  }
  // the final warp may be partial
  int lastIndex = newBlock->numWarps - 1;
  newBlock->warps[lastIndex] = $create_cuda_warp_data(scope, ((size - 1) % warpSize) + 1);

  return newBlock;
}
|
|---|
| 232 |
|
|---|
| 233 | void $destroy_cuda_block_data($cuda_block_data_t block) {
|
|---|
| 234 | for (int i = 0; i < block->size; i++) {
|
|---|
| 235 | $cuda_mem_set_destroy(block->memSets[i]);
|
|---|
| 236 | }
|
|---|
| 237 | free(block->memSets);
|
|---|
| 238 |
|
|---|
| 239 | for (int i = 0; i < block->numWarps; i++) {
|
|---|
| 240 | $destroy_cuda_warp_data(block->warps[i]);
|
|---|
| 241 | }
|
|---|
| 242 | free(block->warps);
|
|---|
| 243 |
|
|---|
| 244 | $gbarrier_destroy(block->gbarrier);
|
|---|
| 245 |
|
|---|
| 246 | free(block);
|
|---|
| 247 | }
|
|---|
| 248 |
|
|---|
/* Shared bookkeeping for one kernel launch: `size` is the total
 * thread count over all blocks, memSets[t] is thread t's footprint at
 * kernel (grid) scope, and blocks holds one data object per block. */
typedef struct $cuda_kernel_data* $cuda_kernel_data_t;
typedef struct $cuda_kernel_data {
  int size;
  $cuda_mem_set_t* memSets;
  int numBlocks;
  $cuda_block_data_t* blocks;
} $cuda_kernel_data;
|
|---|
| 256 |
|
|---|
| 257 | $cuda_kernel_data_t $create_cuda_kernel_data($scope scope, dim3 gridDim, dim3 blockDim){
|
|---|
| 258 | int numBlocks = (gridDim.x * gridDim.y) * gridDim.z;
|
|---|
| 259 | int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z;
|
|---|
| 260 |
|
|---|
| 261 | $cuda_kernel_data_t newKernel = ($cuda_kernel_data_t)$malloc(scope, sizeof($cuda_kernel_data));
|
|---|
| 262 |
|
|---|
| 263 | newKernel->size = threadsPerBlock * numBlocks;
|
|---|
| 264 | newKernel->numBlocks = numBlocks;
|
|---|
| 265 |
|
|---|
| 266 | newKernel->memSets = ($cuda_mem_set_t*) $malloc(scope, newKernel->size * sizeof($cuda_mem_set_t));
|
|---|
| 267 | for (int i = 0; i < newKernel->size; i++) {
|
|---|
| 268 | newKernel->memSets[i] = $cuda_mem_set_create(scope);
|
|---|
| 269 | }
|
|---|
| 270 |
|
|---|
| 271 | newKernel->blocks = ($cuda_block_data_t*) $malloc(scope, newKernel->numBlocks * sizeof($cuda_block_data_t));
|
|---|
| 272 | for (int i = 0; i < newKernel->numBlocks; i++) {
|
|---|
| 273 | newKernel->blocks[i] = $create_cuda_block_data(scope, threadsPerBlock);
|
|---|
| 274 | }
|
|---|
| 275 |
|
|---|
| 276 | return newKernel;
|
|---|
| 277 | }
|
|---|
| 278 |
|
|---|
| 279 | void $destroy_cuda_kernel_data($cuda_kernel_data_t kernel){
|
|---|
| 280 | for (int i = 0; i < kernel->size; i++) {
|
|---|
| 281 | $cuda_mem_set_destroy(kernel->memSets[i]);
|
|---|
| 282 | }
|
|---|
| 283 | free(kernel->memSets);
|
|---|
| 284 |
|
|---|
| 285 | for (int i = 0; i < kernel->numBlocks; i++) {
|
|---|
| 286 | $destroy_cuda_block_data(kernel->blocks[i]);
|
|---|
| 287 | }
|
|---|
| 288 | free(kernel->blocks);
|
|---|
| 289 | free(kernel);
|
|---|
| 290 | }
|
|---|
| 291 |
|
|---|
/* Per-thread handle: back-pointers to the enclosing kernel, block and
 * warp data, the thread's <blockID, warpID, laneID> coordinates, the
 * comm endpoint used for warp shuffles, and the thread's endpoint of
 * the block barrier. */
typedef struct $cuda_thread_data* $cuda_thread_data_t;
typedef struct $cuda_thread_data {
  $cuda_kernel_data_t kernel;
  $cuda_block_data_t block;
  $cuda_warp_data_t warp;
  int blockID;
  int warpID;
  int laneID;
  $comm lane_comm;
  $barrier block_barrier;
} $cuda_thread_data;
|
|---|
| 303 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Checks this lane's footprint (the diagonal entry of the warp's
 * memSets matrix) for conflicts against every other lane's diagonal
 * entry.  Runs atomically. */
$atomic_f void $cuda_check_warp_data_race($cuda_thread_data_t thread) {
  int blockID = thread->blockID;
  int warpID = thread->warpID;
  int laneID = thread->laneID;
  $cuda_mem_set_t** warpMems = thread->warp->memSets;

  for (int i = 0; i < thread->warp->size; i++) {
    if (i != laneID) {
      $cuda_check_mems_disjoint(warpMems[laneID][laneID], blockID, warpID, laneID,
                                warpMems[i][i], blockID, warpID, i);
    }
  }
}
|
|---|
| 320 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Checks this thread's block-level footprint against every thread in
 * every OTHER warp of the same block (same-warp conflicts are handled
 * by $cuda_check_warp_data_race).  Runs atomically. */
$atomic_f void $cuda_check_block_data_race($cuda_thread_data_t thread) {
  int blockID = thread->blockID;
  int warpID = thread->warpID;
  int laneID = thread->laneID;
  int indexInBlock = warpID * warpSize + laneID;
  $cuda_mem_set_t* blockMems = thread->block->memSets;

  for (int i = 0; i < thread->block->numWarps; i++) {
    if (i != warpID) {
      // the last warp may be partial, so use its actual size
      int currWarpSize = thread->block->warps[i]->size;
      for (int j = 0; j < currWarpSize; j++) {
        $cuda_check_mems_disjoint(blockMems[indexInBlock], blockID, warpID, laneID,
                                  blockMems[i * warpSize + j], blockID, i, j);
      }
    }
  }
}
|
|---|
| 341 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Checks this thread's kernel-level footprint against every thread in
 * every OTHER block of the grid (same-block conflicts are handled by
 * $cuda_check_block_data_race).  Runs atomically. */
$atomic_f void $cuda_check_kernel_data_race($cuda_thread_data_t thread) {
  int blockID = thread->blockID;
  int warpID = thread->warpID;
  int laneID = thread->laneID;
  int threadsPerBlock = thread->block->size;
  // flat index of this thread within the whole grid
  int indexInKernel = blockID * threadsPerBlock + warpID * warpSize + laneID;
  $cuda_mem_set_t* kernelMems = thread->kernel->memSets;
  //printf("%d, %d, %d - Checking kernel data race. km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, indexInKernel);

  for (int i = 0; i < thread->kernel->numBlocks; i++) {
    if (i != blockID) {
      $cuda_block_data_t block = thread->kernel->blocks[i];
      for (int j = 0; j < block->numWarps; j++) {
        int currWarpSize = block->warps[j]->size;
        for (int k = 0; k < currWarpSize; k++) {
          //printf("%d, %d, %d - Checking kdr against <%d,%d,%d> with km index %d.\n\n", thread->blockID, thread->warpID, thread->laneID, i,j,k, i * threadsPerBlock + j * warpSize + k);
          $cuda_check_mems_disjoint(kernelMems[indexInKernel], blockID, warpID, laneID,
                                    kernelMems[i * threadsPerBlock + j * warpSize + k], i, j, k);
        }
      }
    }
  }
}
|
|---|
| 368 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Marks this lane as "patching": until $cuda_end_mem_patching, other
 * lanes will mirror their footprints into this lane's row of the
 * warp's memSets matrix (see $cuda_update_patches).  Runs atomically. */
$atomic_f void $cuda_start_mem_patching($cuda_thread_data_t thread) {
  thread->warp->patching[thread->laneID] = $true;
}
|
|---|
| 375 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* For each lane i currently patching, copies this lane's own
 * footprint (memSets[lane][lane]) into the patch slot memSets[i][lane]
 * that lane i will later check in $cuda_end_mem_patching.  Runs
 * atomically. */
$atomic_f void $cuda_update_patches($cuda_thread_data_t thread) {
  $cuda_warp_data_t warp = thread->warp;
  int lane = thread->laneID;

  for (int i = 0; i < warp->size; i++) {
    if (warp->patching[i]) {
      $cuda_mem_set_add(warp->memSets[i][lane], warp->memSets[lane][lane]->reads, warp->memSets[lane][lane]->writes);
    }
  }
}
|
|---|
| 389 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Ends this lane's patching phase (no-op if not patching): clears the
 * patching flag, then checks the lane's own footprint against each
 * patched footprint collected from the other lanes in this lane's row
 * of the memSets matrix, clearing each patch slot after the check.
 * Runs atomically. */
$atomic_f void $cuda_end_mem_patching($cuda_thread_data_t thread) {
  if (thread->warp->patching[thread->laneID]) {
    int size = thread->warp->size;
    int blockID = thread->blockID;
    int warpID = thread->warpID;
    int laneID = thread->laneID;
    // this lane's row: [laneID] is its own set, [i] is lane i's patch
    $cuda_mem_set_t* threadMems = thread->warp->memSets[laneID];

    thread->warp->patching[laneID] = $false;

    for (int i = 0; i < size; i++) {
      if (i != laneID) {
        $cuda_check_mems_disjoint(threadMems[laneID], blockID, warpID, laneID,
                                  threadMems[i], blockID, warpID, i);
        $cuda_mem_set_clear(threadMems[i]);
      }
    }
  }
}
|
|---|
| 412 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Begins a new synchronization region for this thread: first donates
 * the just-finished footprint to any patching lanes, then clears the
 * thread's warp-, block- and kernel-level mem sets and pushes fresh
 * read/write tracking sets.  Runs atomically. */
$atomic_f void $cuda_start_kernel_region($cuda_thread_data_t thread) {
  int lane = thread->laneID;
  //printf("%d, %d, %d - Starting region.\n\n", thread->blockID, thread->warpID, thread->laneID);
  $cuda_update_patches(thread);
  $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
  $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + thread->laneID]);
  $cuda_mem_set_clear(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID]);
  //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
  //printf("%d, %d, %d - kernel mem set after clear:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
  $read_set_push();
  $write_set_push();
}
|
|---|
| 428 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Completes a warp-level region: pops the read/write sets tracked
 * since the region began, folds them into this lane's warp-, block-
 * and kernel-level mem sets, finishes any pending patching phase, and
 * checks for intra-warp races.  Runs atomically. */
$atomic_f void $cuda_complete_warp_region($cuda_thread_data_t thread) {
  $mem reads = $read_set_pop();
  $mem writes = $write_set_pop();
  int lane = thread->laneID;
  $cuda_mem_set_t laneMem = thread->warp->memSets[lane][lane];
  //printf("%d, %d, %d - completing warp region.\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, reads, writes);

  $cuda_mem_set_add(laneMem, reads, writes);
  $cuda_mem_set_add(thread->block->memSets[thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
  $cuda_mem_set_add(thread->kernel->memSets[thread->blockID * thread->block->size + thread->warpID * warpSize + thread->laneID], laneMem->reads, laneMem->writes);
  //$cuda_mem_set_t km = thread->kernel->memSets[thread->blockID];
  //printf("%d, %d, %d - kernel mem set after add:\n\tReads: %s\n\tWrites: %s\n\n", thread->blockID, thread->warpID, thread->laneID, km->reads, km->writes);
  $cuda_end_mem_patching(thread);
  $cuda_check_warp_data_race(thread);
}
|
|---|
| 447 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Completes a block-level region: finishes the warp-level region,
 * then checks for races across the other warps of the block.  Runs
 * atomically. */
$atomic_f void $cuda_complete_block_region($cuda_thread_data_t thread) {
  //printf("%d, %d, %d - completing block region\n\n", thread->blockID, thread->warpID, thread->laneID);
  $cuda_complete_warp_region(thread);
  $cuda_check_block_data_race(thread);
}
|
|---|
| 456 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @*/
/* Completes a kernel-level region: finishes the block-level region,
 * then checks for races against every other block of the grid.  Runs
 * atomically. */
$atomic_f void $cuda_complete_kernel_region($cuda_thread_data_t thread) {
  //printf("%d, %d, %d - completing kernel region\n\n", thread->blockID, thread->warpID, thread->laneID);
  $cuda_complete_block_region(thread);
  $cuda_check_kernel_data_race(thread);
}
|
|---|
| 465 |
|
|---|
/* Creates the per-thread handle for the thread at
 * <blockID, warpID, laneID>, allocated in `scope`: wires up the
 * back-pointers into the kernel's data, creates the thread's warp
 * comm endpoint and block barrier endpoint, and pushes the initial
 * read/write tracking sets so the first region starts recording
 * immediately. */
$cuda_thread_data_t $create_cuda_thread_data($scope scope, $cuda_kernel_data_t kernel, int blockID, int warpID, int laneID) {
  $cuda_thread_data_t newThread = ($cuda_thread_data_t) $malloc(scope, sizeof($cuda_thread_data));

  newThread->kernel = kernel;
  newThread->block = kernel->blocks[blockID];
  newThread->warp = newThread->block->warps[warpID];
  newThread->blockID = blockID;
  newThread->warpID = warpID;
  newThread->laneID = laneID;
  newThread->lane_comm = $comm_create(scope, kernel->blocks[blockID]->warps[warpID]->gcomm, laneID);
  newThread->block_barrier = $barrier_create(scope, kernel->blocks[blockID]->gbarrier, warpID * warpSize + laneID);

  $read_set_push();
  $write_set_push();

  return newThread;
}
|
|---|
| 483 |
|
|---|
/*@ depends_on \access(thread);
  @ executes_when \true;
  @ */
/* Tears down a thread at kernel exit: completes its final region
 * (performing the last race checks), marks its lane dead and runs the
 * warp barrier update so surviving lanes are not blocked waiting for
 * it, then destroys its barrier and comm endpoints and frees the
 * handle.  Runs atomically. */
$atomic_f void $destroy_cuda_thread_data($cuda_thread_data_t thread) {
  $assert(thread != NULL, "Attempt to destroy NULL cuda thread");
  $cuda_complete_kernel_region(thread);

  $cuda_warp_data_t warp = thread->warp;
  warp->alive[thread->laneID] = $false;
  warp->num_alive--;
  // release lanes already waiting in the warp barrier, if any
  $cuda_warp_barrier_update(warp);

  $barrier_destroy(thread->block_barrier);

  $comm_destroy(thread->lane_comm);
  free(thread);
}
|
|---|
| 501 |
|
|---|
| 502 | /*@ depends_on \access(thread);
|
|---|
| 503 | @ executes_when \true;
|
|---|
| 504 | @ */
|
|---|
| 505 | $atomic_f void $cuda_warp_barrier_enter($cuda_thread_data_t thread, $cuda_warp_tag tag) {
|
|---|
| 506 | $cuda_warp_data_t warp = thread->warp;
|
|---|
| 507 | $assert(!warp->in_barrier[thread->laneID]);
|
|---|
| 508 | $assert(warp->currOp == tag || warp->currOp == $CUDA_WARP_TAG_EMPTY);
|
|---|
| 509 |
|
|---|
| 510 | warp->in_barrier[thread->laneID] = $true;
|
|---|
| 511 | warp->currOp = tag;
|
|---|
| 512 | warp->num_in_barrier++;
|
|---|
| 513 | $cuda_warp_barrier_update(warp);
|
|---|
| 514 | }
|
|---|
| 515 |
|
|---|
// Seems this needs to be atomic to work. Why?
/*@ depends_on \access(thread);
  @*/
/* Blocks until this lane's in_barrier flag is cleared, which happens
 * in $cuda_warp_barrier_update once every live lane has entered. */
$atomic_f void $cuda_warp_barrier_exit($cuda_thread_data_t thread) {
  $when(!thread->warp->in_barrier[thread->laneID]);
}
|
|---|
| 522 |
|
|---|
/* Full warp barrier for collective `tag`: enter, then wait until all
 * live lanes of the warp have entered. */
void $cuda_warp_barrier_call($cuda_thread_data_t thread, $cuda_warp_tag tag) {
  $cuda_warp_barrier_enter(thread, tag);
  $cuda_warp_barrier_exit(thread);
}
|
|---|
| 527 |
|
|---|
/* Model of CUDA __syncthreads for the thread described by `thread`.
 * barrierIndex identifies the call site; all threads of the block
 * must hit the same call site, otherwise the divergence assertion
 * fires.  Completes the current block region (performing race
 * checks), synchronizes on the block barrier, clears the warp- and
 * block-level mem sets for the next region, and pushes fresh
 * read/write tracking sets.  The second barrier ensures the
 * currBarrierID reset by thread 0 is not observed early. */
void $cuda__syncthreads($cuda_thread_data_t thread, int barrierIndex) {
  $cuda_complete_block_region(thread);
  int indexInBlock = thread->warpID * warpSize + thread->laneID;

  // first arrival claims the barrier slot; later arrivals must match
  if (thread->block->currBarrierID == -1) {
    thread->block->currBarrierID = barrierIndex;
  }
  $assert(thread->block->currBarrierID == barrierIndex,
      "Divergent calls to __syncthreads.");

  $local_end();
  $barrier_call(thread->block_barrier);
  $local_start();
  int lane = thread->laneID;

  $cuda_mem_set_clear(thread->warp->memSets[lane][lane]);
  $cuda_mem_set_clear(thread->block->memSets[thread->warpID * warpSize + lane]);

  // thread 0 resets the barrier slot for the next __syncthreads
  if (indexInBlock == 0) {
    thread->block->currBarrierID = -1;
  }

  $local_end();
  $barrier_call(thread->block_barrier);
  $local_start();

  $read_set_push();
  $write_set_push();
}
|
|---|
| 557 |
|
|---|
/* Generates a __syncthreads variant NAME(predicate, thread,
 * barrierIndex) that synchronizes the block like $cuda__syncthreads
 * and additionally reduces the per-thread predicate values: each
 * thread stores its predicate in predArray, thread 0 folds the array
 * into `result` starting from INIT using the REDUCTION statement
 * (with loop index i), publishes it through predArray[0], and every
 * thread returns the combined value.  Three barriers separate the
 * publish / combine / read phases.  No comments inside the macro
 * body: its lines are backslash-continued. */
#define $CUDA_DEFINE_SYNCTHREADS_VARIANT(NAME, INIT, REDUCTION) \
int NAME(int predicate, $cuda_thread_data_t thread, int barrierIndex) { \
  $cuda_complete_block_region(thread); \
  int indexInBlock = thread->warpID * warpSize + thread->laneID; \
  \
  if (thread->block->currBarrierID == -1) { \
    thread->block->currBarrierID = barrierIndex; \
  } \
  $assert(thread->block->currBarrierID == barrierIndex, \
      "Divergent calls to NAME."); \
  \
  if (predicate) { \
    thread->block->predArray[indexInBlock] = 1; \
  } else { \
    thread->block->predArray[indexInBlock] = 0; \
  } \
  \
  $local_end(); \
  $barrier_call(thread->block_barrier); \
  $local_start(); \
  int lane = thread->laneID; \
  \
  $cuda_mem_set_clear(thread->warp->memSets[lane][lane]); \
  $cuda_mem_set_clear(thread->block->memSets[indexInBlock]); \
  \
  if (indexInBlock == 0) { \
    int result = INIT; \
    for (int i = 0; i < thread->block->size; i++) { \
      REDUCTION; \
    } \
    thread->block->predArray[0] = result; \
    thread->block->currBarrierID = -1; \
  } \
  \
  $local_end(); \
  $barrier_call(thread->block_barrier); \
  $local_start(); \
  int result = thread->block->predArray[0]; \
  \
  $local_end(); \
  $barrier_call(thread->block_barrier); \
  $local_start(); \
  \
  $read_set_push(); \
  $write_set_push(); \
  return result; \
}
|
|---|
| 605 |
|
|---|
/* Instantiate the three predicate-reducing __syncthreads variants:
 * _count sums the predicates; _or is 1 if any predicate is nonzero;
 * _and is 0 if any predicate is zero.  (In _or/_and the `continue`
 * merely skips to the next iteration once the result is decided.) */
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_count, 0, result += thread->block->predArray[i])
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_or, 0, if (thread->block->predArray[i]) {
    result = 1;
    continue;
  })
$CUDA_DEFINE_SYNCTHREADS_VARIANT($cuda__syncthreads_and, 1, if (!thread->block->predArray[i]) {
    result = 0;
    continue;
  })
|
|---|
| 615 |
|
|---|
/* Variadic-argument selectors used to supply default parameters to
 * the shuffle macros below. */
#define $GET_ARG_1(_1, ...) _1
#define $GET_ARG_2(_1, _2, ...) _2

/* Model of CUDA __syncwarp: completes the warp-level region (with
 * race checks), runs the warp barrier tagged as warpsync, then starts
 * fresh read/write tracking sets. */
#define __syncwarp() $cuda__syncwarp($thread)
void $cuda__syncwarp($cuda_thread_data_t thread) {
  $cuda_complete_warp_region(thread);
  $local_end();
  $cuda_warp_barrier_call(thread, $CUDA_WARP_TAG_warpsync);
  $local_start();
  $read_set_push();
  $write_set_push();
}
|
|---|
| 628 |
|
|---|
/* Expands the trailing shuffle arguments to exactly two values: the
 * lane parameter (srcLane/delta/laneMask) and the subwarp width,
 * defaulting the width to warpSize when omitted. */
#define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0)

/* The four warp shuffle intrinsics.  Each dispatches on the static
 * type of `var` to a type-specific model function, appends the
 * (laneParam, width) pair with defaults applied, and passes the
 * implicit $thread handle. */
#define __shfl_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_sync_int, \
    unsigned int: $cuda__shfl_sync_uint, \
    long: $cuda__shfl_sync_long, \
    unsigned long: $cuda__shfl_sync_ulong, \
    long long: $cuda__shfl_sync_ll, \
    unsigned long long: $cuda__shfl_sync_ull, \
    float: $cuda__shfl_sync_float, \
    double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_up_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_up_sync_int, \
    unsigned int: $cuda__shfl_up_sync_uint, \
    long: $cuda__shfl_up_sync_long, \
    unsigned long: $cuda__shfl_up_sync_ulong, \
    long long: $cuda__shfl_up_sync_ll, \
    unsigned long long: $cuda__shfl_up_sync_ull, \
    float: $cuda__shfl_up_sync_float, \
    double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_down_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_down_sync_int, \
    unsigned int: $cuda__shfl_down_sync_uint, \
    long: $cuda__shfl_down_sync_long, \
    unsigned long: $cuda__shfl_down_sync_ulong, \
    long long: $cuda__shfl_down_sync_ll, \
    unsigned long long: $cuda__shfl_down_sync_ull, \
    float: $cuda__shfl_down_sync_float, \
    double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)

#define __shfl_xor_sync(mask, var, ...) \
  _Generic(var, \
    default: $cuda__shfl_xor_sync_int, \
    unsigned int: $cuda__shfl_xor_sync_uint, \
    long: $cuda__shfl_xor_sync_long, \
    unsigned long: $cuda__shfl_xor_sync_ulong, \
    long long: $cuda__shfl_xor_sync_ll, \
    unsigned long long: $cuda__shfl_xor_sync_ull, \
    float: $cuda__shfl_xor_sync_float, \
    double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $thread)
|
|---|
| 674 |
|
|---|
/* Bracketing protocol for warp intrinsics: PRE pops the read/write
 * sets tracked so far, folds them into this lane's own mem set, and
 * marks the lane as patching so other lanes mirror their footprints
 * to it; POST starts fresh tracking sets for the code that follows. */
#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE() \
  $mem writes = $write_set_pop(); \
  $mem reads = $read_set_pop(); \
  $cuda_mem_set_add(thread->warp->memSets[thread->laneID][thread->laneID], \
      reads, writes); \
  thread->warp->patching[thread->laneID] = $true;

#define $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST() \
  $read_set_push(); \
  $write_set_push();
|
|---|
| 685 |
|
|---|
/* Shared body of all __shfl_*_sync model implementations.  Textually
 * expanded inside $CUDA_DEFINE_SHFL, so it relies on names in the caller's
 * scope: mask, var, laneParam, width, thread (parameters) and resultVal,
 * typeSize, tag (locals).  Protocol over the warp's lane communicator:
 *   1. each lane computes its source lane and posts a request to it;
 *   2. warp barrier; each lane records which lanes requested its value;
 *   3. warp barrier; each lane replies to its requesters with `var`;
 *   4. warp barrier; each lane unpacks its reply into resultVal.
 * A lane whose computed source lane is out of range receives a havoced
 * (arbitrary) value, modeling CUDA's unspecified result in that case. */
#define $CUDA_GENERIC_SHFL_BODY() \
    $assert (width <= warpSize); \
    /* width must be a power of two */ \
    for (int v = width; v > 1; v /= 2) { \
        $assert(v % 2 == 0); \
    } \
    \
    /* map (tag, laneParam) to the requested source lane */ \
    int requestLane; \
    switch(tag) { \
    case $CUDA_WARP_TAG_shfl_sync: \
        /* NOTE(review): CUDA's subsection start is (laneID/width)*width; */ \
        /* here the un-scaled quotient is added — confirm intended for    */ \
        /* width < warpSize (it coincides when width == warpSize).        */ \
        requestLane = thread->laneID/width + laneParam % width; \
        break; \
    case $CUDA_WARP_TAG_shfl_up_sync: \
        requestLane = thread->laneID - laneParam; \
        break; \
    case $CUDA_WARP_TAG_shfl_down_sync: \
        requestLane = thread->laneID + laneParam; \
        break; \
    case $CUDA_WARP_TAG_shfl_xor_sync: \
        requestLane = thread->laneID ^ laneParam; \
        break; \
    } \
    $cuda_warp_data_t warp = thread->warp; \
    _Bool validSrcLane = requestLane >= 0 && requestLane < warp->size; \
    if (validSrcLane) { \
        /* ask the source lane for its value (empty payload) */ \
        $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, requestLane, tag, NULL, 0)); \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    \
    /* record which lanes asked this lane for its value */ \
    _Bool requested[width] = (_Bool[width])$lambda(int i) $false; \
    /* NOTE(review): like requestLane above, subWarpStart = laneID/width  */ \
    /* is a subsection index, not its first lane — consistent with the   */ \
    /* request computation but likewise suspect for width < warpSize.    */ \
    int subWarpStart = thread->laneID/width; \
    while ($comm_probe(thread->lane_comm, $COMM_ANY_SOURCE, tag)) { \
        $message request = $comm_dequeue(thread->lane_comm, $COMM_ANY_SOURCE, tag); \
        requested[$message_source(request) - subWarpStart] = $true; \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    /* reply to every requester with this lane's var */ \
    for(int i = 0; i < width; i++) { \
        if (requested[i]) { \
            $comm_enqueue(thread->lane_comm, $message_pack(thread->laneID, i + subWarpStart, tag, &var, typeSize)); \
        } \
    } \
    \
    $local_end(); \
    $cuda_warp_barrier_call(thread, tag); \
    $local_start(); \
    \
    if (validSrcLane) { \
        $message result = $comm_dequeue(thread->lane_comm, requestLane, tag); \
        $message_unpack(result, &resultVal, typeSize); \
    } else { \
        /* out-of-range source: result is unspecified */ \
        $havoc(&resultVal); \
    }
|
|---|
| 745 |
|
|---|
/* Defines one monomorphic shuffle implementation named NAME for element
 * type T and warp-operation tag TAG: the generic shuffle body wrapped in
 * the race-check PRE/POST protocol.  Returns the shuffled value. */
#define $CUDA_DEFINE_SHFL(NAME, T, TAG) \
    T NAME(unsigned mask, T var, int laneParam, int width, $cuda_thread_data_t thread) { \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE(); \
        T resultVal; \
        int typeSize = sizeof(T); \
        $cuda_warp_tag tag = TAG; \
        \
        $CUDA_GENERIC_SHFL_BODY(); \
        \
        $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST(); \
        return resultVal; \
    }
|
|---|
| 758 |
|
|---|
/* Monomorphic instantiations of the shuffle implementations — one per
 * (element type, shuffle variant) pair targeted by the _Generic dispatch
 * macros above. */
$CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync)

$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync)
$CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 794 |
|
|---|
/* Warp-vote intrinsics: forward to the model implementations below,
 * supplying the implicit per-thread handle $thread. */
#define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, $thread)
#define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $thread)
#define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $thread)
|
|---|
| 798 |
|
|---|
/* Shared body of the warp-vote reductions (__all/__any/__ballot models).
 * Textually expanded, so it relies on the caller's scope: value, thread
 * (parameters) and tag, typeSize, initialValue, result, operand (locals).
 *
 * The first lane to arrive (warp->reductionLane == -1) elects itself the
 * reducer.  It folds one contribution per lane into `result` — dead lanes
 * contribute initialValue — each step evaluating
 * COND ? T_REDUCTION : F_REDUCTION with the lane's value in `operand`,
 * then re-opens the election and broadcasts `result` to every other live
 * lane.  Every other lane sends `value` to the reducer and blocks until
 * the broadcast arrives. */
#define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \
    $cuda_warp_data_t warp = thread->warp; \
    int laneID = thread->laneID; \
    $comm comm = thread->lane_comm; \
    if (warp->reductionLane == -1) { \
        /* this lane is the reducer */ \
        warp->reductionLane = laneID; \
        \
        result = initialValue; \
        for (int i = 0; i < warp->size; i++) { \
            if (i == laneID) { \
                operand = value; \
            } else { \
                /* wait until lane i has either died or sent its value */ \
                $local_end(); \
                $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start(); \
                \
                if (!warp->alive[i]) { \
                    operand = initialValue; \
                } else { \
                    $local_end(); \
                    $message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \
                    $local_start(); \
                } \
            } \
            \
            if (COND) { \
                result = T_REDUCTION; \
            } else { \
                result = F_REDUCTION; \
            } \
        } \
        \
        /* re-open the election for the next reduction */ \
        warp->reductionLane = -1; \
        \
        /* broadcast the folded result to all other live lanes */ \
        for (int i = 0; i< warp->size; i++) { \
            if (i != laneID && warp->alive[i]) { \
                $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \
            } \
        } \
    } else { \
        /* another lane is reducing: contribute, then await the result */ \
        int reductionLane = warp->reductionLane; \
        $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \
        $local_end(); \
        $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \
        $local_start(); \
    }

/* Unconditional variant: always applies REDUCTION. */
#define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result)
|
|---|
| 846 |
|
|---|
/* Model of __all_sync(): returns 1 iff `value` is non-zero on every live
 * lane of the warp.  NOTE(review): the participation mask is accepted but
 * not consulted — all live lanes take part regardless; confirm this
 * over-approximation is intended. */
int $cuda__all_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync;
    int typeSize = sizeof(int);
    int initialValue = 1;   // identity of logical AND
    int result, operand;

    // fold: result := (result && operand) over all lanes
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
|
|---|
| 861 |
|
|---|
/* Model of __any_sync(): returns 1 iff `value` is non-zero on at least one
 * live lane of the warp.  NOTE(review): as with $cuda__all_sync, the mask
 * argument is not consulted — confirm intended. */
int $cuda__any_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync;
    int typeSize = sizeof(int);
    int initialValue = 0;   // identity of logical OR
    int result, operand;

    // fold: result := (result || operand) over all lanes
    $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 || operand != 0, 1, 0);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
|
|---|
| 876 |
|
|---|
/* Model of __ballot_sync(): builds a bitmask from the lanes' predicates.
 * Each fold step shifts `result` left by one bit and appends 1 when the
 * lane's value is non-zero, 0 otherwise.
 * NOTE(review): lanes are folded in ascending order, so lane 0 ends up in
 * the MOST significant position of the accumulated word, whereas CUDA's
 * __ballot_sync places lane i at bit i — confirm intended. */
unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_thread_data_t thread) {
    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_PRE();

    $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
    int initialValue = 0;
    int typeSize = sizeof(int);
    unsigned result;
    int operand;

    // shift-and-append: 0 bit when operand == 0, else 1 bit
    $CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1);

    $CUDA_INTRINSIC_RACE_CHECK_PROTOCOL_POST();

    return result;
}
|
|---|
| 892 |
|
|---|
/////////////////////////////////////////////////////////////////////////
// Atomic operations: _Generic dispatch on the pointer argument to the //
// typed model functions below; each receives the implicit $thread.    //
// `default` selects the int* overload.                                //
/////////////////////////////////////////////////////////////////////////

/* atomicAdd()
 * Reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old + val), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicAdd(X,Y) _Generic(X, \
    default : $cuda_atomicAdd_int, \
    unsigned int* : $cuda_atomicAdd_uint, \
    unsigned long long int* : $cuda_atomicAdd_ullint, \
    float* : $cuda_atomicAdd_float, \
    double* : $cuda_atomicAdd_double) (X, Y, $thread)

/* atomicSub()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (old - val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction. The
 * function returns old.
 */
#define atomicSub(X,Y) _Generic(X, \
    default : $cuda_atomicSub_int, \
    unsigned int* : $cuda_atomicSub_uint) (X, Y, $thread)

/* atomicExch()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory and stores val back to memory at the same address. These two
 * operations are performed in one atomic transaction. The function returns old.
 */
#define atomicExch(X,Y) _Generic(X, \
    default : $cuda_atomicExch_int, \
    unsigned int* : $cuda_atomicExch_uint, \
    unsigned long long int* : $cuda_atomicExch_ullint, \
    float* : $cuda_atomicExch_float) (X, Y, $thread)

/* atomicMin()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the minimum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMin(X,Y) _Generic(X, \
    default : $cuda_atomicMin_int, \
    unsigned int* : $cuda_atomicMin_uint, \
    unsigned long long int* : $cuda_atomicMin_ullint) (X, Y, $thread)

/* atomicMax()
 * reads the 32-bit or 64-bit word old located at the address address in global
 * or shared memory, computes the maximum of old and val, and stores the result
 * back to memory at the same address. These three operations are performed in one
 * atomic transaction. The function returns old.
 */
#define atomicMax(X,Y) _Generic(X, \
    default : $cuda_atomicMax_int, \
    unsigned int* : $cuda_atomicMax_uint, \
    unsigned long long int* : $cuda_atomicMax_ullint) (X, Y, $thread)

/* atomicInc()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes ((old >= val) ? 0 : (old+1)), and stores the result back to
 * memory at the same address. These three operations are performed in one atomic
 * transaction. The function returns old.
 */
#define atomicInc(address, val) $cuda_atomicInc(address, val, $thread)

/* atomicDec()
 * reads the 32-bit word old located at the address address in global or shared
 * memory, computes (((old == 0) || (old > val)) ? val : (old-1) ), and stores
 * the result back to memory at the same address. These three operations are
 * performed in one atomic transaction. The function returns old.
 */
#define atomicDec(address, val) $cuda_atomicDec(address, val, $thread)

/* atomicCAS()
 * reads the 16-bit, 32-bit or 64-bit word old located at the address address in
 * global or shared memory, computes (old == compare ? val : old) , and stores the
 * result back to memory at the same address. These three operations are performed
 * in one atomic transaction. The function returns old (Compare And Swap).
 */
#define atomicCAS(address, compare, val) _Generic(address, \
    default : $cuda_atomicCAS_int, \
    unsigned int* : $cuda_atomicCAS_uint, \
    unsigned long long int* : $cuda_atomicCAS_ullint, \
    unsigned short int* : $cuda_atomicCAS_usint) (address, compare, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old & val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicAnd(address, val) _Generic(address, \
    default : $cuda_atomicAnd_int, \
    unsigned int* : $cuda_atomicAnd_uint, \
    unsigned long long int* : $cuda_atomicAnd_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old | val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicOr(address, val) _Generic(address, \
    default : $cuda_atomicOr_int, \
    unsigned int* : $cuda_atomicOr_uint, \
    unsigned long long int* : $cuda_atomicOr_ullint) (address, val, $thread)

/*
 * reads the 32-bit or 64-bit word old located at the address address
 * in global or shared memory, computes (old ^ val), and stores the
 * result back to memory at the same address. These three operations
 * are performed in one atomic transaction. The function returns old.
 */
#define atomicXor(address, val) _Generic(address, \
    default : $cuda_atomicXor_int, \
    unsigned int* : $cuda_atomicXor_uint, \
    unsigned long long int* : $cuda_atomicXor_ullint) (address, val, $thread)
|
|---|
| 1009 |
|
|---|
/* Prologue shared by all atomic models: close the current kernel region
 * and yield (letting other threads interleave before the atomic takes
 * effect), open a new region, and snapshot the pre-update value into a
 * local `old` of type T.  Relies on `address` and `thread` being in the
 * caller's scope. */
#define $CUDA_ATOMIC_PRE_ACTION(T) \
    $cuda_complete_kernel_region(thread); \
    $yield(); \
    $cuda_start_kernel_region(thread); \
    T old = *address;

/* Epilogue shared by all atomic models: close the region containing the
 * atomic update, open the next one, and return the pre-update value. */
#define $CUDA_ATOMIC_POST_ACTION() \
    $cuda_complete_kernel_region(thread); \
    $cuda_start_kernel_region(thread); \
    return old;
|
|---|
| 1020 |
|
|---|
/* $CUDA_DEFINE_ATOMIC_ADD(NAME, T): defines model function NAME that
 * atomically performs *address += val and returns the old value.  All the
 * definition macros below follow the same shape: PRE_ACTION yields and
 * snapshots `old`, the body applies the update, POST_ACTION returns `old`. */
#define $CUDA_DEFINE_ATOMIC_ADD(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address += val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_int, int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_float, float)
$CUDA_DEFINE_ATOMIC_ADD($cuda_atomicAdd_double, double)

/* Atomic subtract: *address -= val; returns old. */
#define $CUDA_DEFINE_ATOMIC_SUB(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address -= val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_int, int)
$CUDA_DEFINE_ATOMIC_SUB($cuda_atomicSub_uint, unsigned int)

/* Atomic exchange: *address = val; returns old. */
#define $CUDA_DEFINE_ATOMIC_EXCH(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_int, int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_EXCH($cuda_atomicExch_float, float)

/* Atomic minimum: *address = min(old, val); returns old.
 * NOTE(review): the unchanged value is stored back in the old <= val
 * branch — presumably so the write is always visible to the race
 * checker; confirm. */
#define $CUDA_DEFINE_ATOMIC_MIN(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old <= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_int, int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MIN($cuda_atomicMin_ullint, unsigned long long int)

/* Atomic maximum: *address = max(old, val); returns old. */
#define $CUDA_DEFINE_ATOMIC_MAX(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old >= val) *address = old; \
        else *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_int, int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_MAX($cuda_atomicMax_ullint, unsigned long long int)
|
|---|
| 1079 |
|
|---|
| 1080 | unsigned int $cuda_atomicInc(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
|
|---|
| 1081 | $CUDA_ATOMIC_PRE_ACTION(unsigned int);
|
|---|
| 1082 | if (old >= val) *address = 0;
|
|---|
| 1083 | else *address = old + 1;
|
|---|
| 1084 | $CUDA_ATOMIC_POST_ACTION()
|
|---|
| 1085 | }
|
|---|
| 1086 |
|
|---|
| 1087 | unsigned int $cuda_atomicDec(unsigned int* address, unsigned int val, $cuda_thread_data_t thread) {
|
|---|
| 1088 | $CUDA_ATOMIC_PRE_ACTION(unsigned int);
|
|---|
| 1089 | if (old == 0 || old > val) *address = val;
|
|---|
| 1090 | else *address = old - 1;
|
|---|
| 1091 | $CUDA_ATOMIC_POST_ACTION()
|
|---|
| 1092 | }
|
|---|
| 1093 |
|
|---|
/* Atomic compare-and-swap: stores val only when old == compare; always
 * returns old so callers can detect success. */
#define $CUDA_DEFINE_ATOMIC_CAS(NAME, T) \
    T NAME(T* address, T compare, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        if (old == compare) *address = val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_int, int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_ullint, unsigned long long int)
$CUDA_DEFINE_ATOMIC_CAS($cuda_atomicCAS_usint, unsigned short int)

/* Atomic bitwise AND: *address = old & val; returns old. */
#define $CUDA_DEFINE_ATOMIC_AND(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old & val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_int, int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_AND($cuda_atomicAnd_ullint, unsigned long long int)

/* Atomic bitwise OR: *address = old | val; returns old. */
#define $CUDA_DEFINE_ATOMIC_OR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old | val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_int, int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_OR($cuda_atomicOr_ullint, unsigned long long int)

/* Atomic bitwise XOR: *address = old ^ val; returns old. */
#define $CUDA_DEFINE_ATOMIC_XOR(NAME, T) \
    T NAME(T* address, T val, $cuda_thread_data_t thread) { \
        $CUDA_ATOMIC_PRE_ACTION(T) \
        *address = old ^ val; \
        $CUDA_ATOMIC_POST_ACTION() \
    }

$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_int, int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_uint, unsigned int)
$CUDA_DEFINE_ATOMIC_XOR($cuda_atomicXor_ullint, unsigned long long int)
|
|---|
| 1138 |
|
|---|
//////////////////////
// Global Variables //
//////////////////////

// Two-place global communicator connecting the host (place 0) and the
// device (place 1), plus the host's endpoint into it.
$gcomm $cuda_gcomm = $gcomm_create($here, 2);
const int $CUDA_PLACE_HOST = 0;
const int $CUDA_PLACE_DEVICE = 1;
$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
|
|---|
| 1147 |
|
|---|
/**
 * Tags used for message-passing between host and device.  Each CUDA API
 * request sent over $cuda_gcomm carries one of these tags, and the
 * device's acknowledgement reuses the same tag.
 */
enum $cuda_tag {
    // Predefined tags
    $CUDA_TAG_TEARDOWN,
    $CUDA_TAG_SCOPE_REQUEST,
    $CUDA_TAG_cudaFree,
    $CUDA_TAG_cudaMemcpy,
    $CUDA_TAG_cudaMemcpyAsync,
    // Generated tags (by transformer) — one launch tag per kernel in the
    // verified program.
    $CUDA_TAG_LAUNCH_kernel_1
};
|
|---|
| 1161 |
|
|---|
| 1162 | ///////////////////
|
|---|
| 1163 | // CIVL-CUDA API //
|
|---|
| 1164 | ///////////////////
|
|---|
| 1165 |
|
|---|
/* Asks the device place for the CUDA scope: sends a SCOPE_REQUEST message
 * (empty payload), blocks for the reply, and unpacks the $scope it
 * carries. */
$scope $cuda_host_request_device_scope() {
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
    $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
    $scope result;
    $message_unpack(response, &result, sizeof($scope));

    return result;
}
|
|---|
| 1174 |
|
|---|
/* Descriptor for one cudaMemcpy request, marshalled between the host and
 * device places. */
typedef struct $cuda_memcpy_data {
    void* dst;            // destination pointer
    const void* src;      // source pointer
    size_t count;         // number of bytes to copy
    cudaMemcpyKind kind;  // direction (host/device combinations)
} $cuda_memcpy_data;
|
|---|
| 1181 |
|
|---|
/* Host side of cudaMemcpy/cudaMemcpyAsync.  A host-to-host copy is a plain
 * memcpy; every other kind is marshalled into a $cuda_memcpy_data record
 * and round-tripped through the host<->device communicator.  The trailing
 * dequeue blocks until the device's acknowledgement arrives, so even the
 * async flavor waits here for the device to accept the request. */
void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
    if (kind == cudaMemcpyHostToHost) {
        memcpy(dst, src, count);
    } else {
        $cuda_memcpy_data args;
        args.dst = dst;
        args.src = src;
        args.count = count;
        args.kind = kind;

        // the tag pairs this request with the matching acknowledgement
        int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

        $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
        $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
    }
}
|
|---|
| 1198 |
|
|---|
/* Allocates a fresh, empty, live stream in the CUDA scope and wraps it in
 * a new list node (prev/next NULL); the stream keeps a back-pointer to its
 * containing node so it can later be unlinked.  Returns the new node. */
$cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
    cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream));
    newStream->head = NULL;
    newStream->tail = NULL;
    newStream->numOps = 0;
    newStream->alive = true;

    $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node));
    newHead->stream = newStream;
    newStream->containingNode = newHead;
    newHead->prev = NULL;
    newHead->next = NULL;

    return newHead;
}
|
|---|
| 1214 |
|
|---|
/* Atomically unlinks `node` from the stream list, frees the node, and
 * marks its stream dead so no further ops can be enqueued.  The stream
 * struct itself is freed by a spawned helper only after the stream's last
 * pending operation (if any) has terminated; that helper's $proc is
 * returned so the caller can wait for teardown to finish. */
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
$atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
    $proc lastOpProc = $proc_null;
    cudaStream_t stream = node->stream;

    // unlink from the doubly-linked stream list
    if (node->prev != NULL) {
        node->prev->next = node->next;
    }
    if (node->next != NULL) {
        node->next->prev = node->prev;
    }
    free(node);

    stream->alive = false;
    // if ops are still queued, the tail op is the last one to finish
    if(stream->tail != NULL)
        lastOpProc = stream->tail->opState->op;

    // nested function: free the stream once its final op has completed
    void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
        $wait(lastOpProc);
        free(stream);
    }

    return $spawn destroyStreamWhenComplete(lastOpProc, stream);
}
|
|---|
| 1242 |
|
|---|
/* Atomically appends a new operation to `stream`: spawns opProc with the
 * request message and a fresh op-state, links the state at the tail of the
 * stream's FIFO, and — when the stream was empty — starts the op at once
 * by setting its start flag.  Returns the spawned $proc so callers can
 * wait on the operation.  Asserts the stream is still alive. */
/*@ depends_on \access(stream);
  @ assigns stream;
  @ reads \nothing;
  @*/
$atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
    $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");

    // op starts suspended; the spawned proc blocks on opState->start
    $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state));
    newOpState->start = false;
    newOpState->op = $spawn opProc(opParams, newOpState, stream);

    $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node));
    newOpStateNode->opState = newOpState;
    newOpStateNode->next = NULL;

    if (stream->tail == NULL) {
        // empty stream: this op is both head and tail, and may run now
        stream->head = newOpStateNode;
        stream->tail = newOpStateNode;
        newOpState->start = true;
    } else {
        stream->tail->next = newOpStateNode;
        stream->tail = newOpStateNode;
    }
    stream->numOps++;

    return newOpState->op;
}
|
|---|
| 1270 |
|
|---|
/* Atomically removes the completed head operation of `stream`, starts the
 * next queued op (if any) by setting its start flag, and frees the removed
 * node and its op-state.  Called by each op proc as its final action.
 * NOTE(review): the contract claims `assigns \nothing` although the stream
 * is mutated — confirm this is the intended CIVL annotation. */
/*@ depends_on \nothing;
  @ assigns \nothing;
  @ reads \nothing;
  @*/
$atomic_f void $stream_dequeue(cudaStream_t stream) {
    $assert(stream->head != NULL, "Attempt to dequeue an empty stream");

    // removing the only element empties the tail too
    if (stream->head == stream->tail) {
        stream->tail = NULL;
    }

    $cuda_op_state_node_t oldHead = stream->head;
    stream->head = oldHead->next;
    if (stream->head != NULL) {
        // release the next operation in FIFO order
        stream->head->opState->start = true;
    }

    stream->numOps--;
    free(oldHead->opState);
    free(oldHead);
}
|
|---|
| 1292 |
|
|---|
| 1293 | // Helper function
|
|---|
| 1294 | int $dim3_index(dim3 size, uint3 location) {
|
|---|
| 1295 | return location.x + size.x * (location.y + size.y * location.z);
|
|---|
| 1296 | }
|
|---|
| 1297 |
|
|---|
| 1298 | // Helper function
|
|---|
| 1299 | int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
|
|---|
| 1300 | return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
|
|---|
| 1301 | }
|
|---|
| 1302 |
|
|---|
/* Spawns dim.x*dim.y*dim.z instances of spawningFunction — one per point
 * of the 3-D domain, each receiving its own uint3 coordinate — then waits
 * for all of them to terminate.  Spawning happens inside a
 * $local_start/$local_end section. */
void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
    //TODO: calculate length and index, replace this function in the kernel
    $local_start();
    int length = dim.x * dim.y * dim.z;
    $proc procArray[length];
    $range rx = 0 .. dim.x - 1;
    $range ry = 0 .. dim.y - 1;
    $range rz = 0 .. dim.z - 1;
    $domain(3) dom = ($domain(3)){rx, ry, rz};
    $for(int x,y,z : dom){
        uint3 id = { x, y, z };
        int index = $dim3_index(dim, id);
        procArray[index] = $spawn spawningFunction(id);
    }
    $local_end();
    $waitall(procArray,length);
}
|
|---|
| 1320 |
|
|---|
| 1321 |
|
|---|
| 1322 | // CUDA Ops //
|
|---|
| 1323 |
|
|---|
| 1324 | void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
|
|---|
| 1325 |
|
|---|
| 1326 | $when(opState->start);
|
|---|
| 1327 | $cuda_memcpy_data args;
|
|---|
| 1328 | $message_unpack(m, &args, sizeof($cuda_memcpy_data));
|
|---|
| 1329 |
|
|---|
| 1330 | if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
|
|---|
| 1331 | args.dst = $reveal(args.dst);
|
|---|
| 1332 | }
|
|---|
| 1333 | if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
|
|---|
| 1334 | args.src = $reveal(args.src);
|
|---|
| 1335 | }
|
|---|
| 1336 | memcpy(args.dst, args.src, args.count);
|
|---|
| 1337 |
|
|---|
| 1338 | $stream_dequeue(stream);
|
|---|
| 1339 | }
|
|---|
| 1340 |
|
|---|
// Device-side handler for a cudaMemcpy/cudaMemcpyAsync request: enqueues a
// $cuda_memcpy_proc op on the given stream and, for the synchronous
// variant, waits for it to complete before acknowledging the host.
$message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
    $cuda_memcpy_data args;
    // Only args.kind is consulted here; the op process re-unpacks the rest.
    $message_unpack(request, &args, sizeof($cuda_memcpy_data));

    $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);

    // NOTE(review): device-to-device copies are never waited on, even for
    // the synchronous API — presumably because the host cannot observe
    // them directly; confirm this matches the intended CUDA semantics.
    if (!async && args.kind != cudaMemcpyDeviceToDevice) {
        $wait(memcpyProc);
    }
    int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;

    return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
}
|
|---|
| 1354 |
|
|---|
| 1355 | $message $cuda_free($message request) {
|
|---|
| 1356 | void* devPtr;
|
|---|
| 1357 | $message_unpack(request, &devPtr, sizeof(void*));
|
|---|
| 1358 | free($reveal(devPtr));
|
|---|
| 1359 |
|
|---|
| 1360 | return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
|
|---|
| 1361 | }
|
|---|
| 1362 |
|
|---|
| 1363 |
|
|---|
| 1364 |
|
|---|
| 1365 | ////////////////////////////////////////////
|
|---|
| 1366 | // CUDA API Functions (For Host-use Only) //
|
|---|
| 1367 | ////////////////////////////////////////////
|
|---|
| 1368 |
|
|---|
| 1369 | cudaError_t cudaFree(void* devPtr) {
|
|---|
| 1370 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
|
|---|
| 1371 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
|
|---|
| 1372 |
|
|---|
| 1373 | return cudaSuccess;
|
|---|
| 1374 | }
|
|---|
| 1375 |
|
|---|
| 1376 | cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
|
|---|
| 1377 | $cuda_host_memcpy(dst, src, count, kind, false);
|
|---|
| 1378 | return cudaSuccess;
|
|---|
| 1379 | }
|
|---|
| 1380 |
|
|---|
// Host-facing asynchronous cudaMemcpy.
// NOTE(review): the stream argument is currently ignored — elsewhere the
// model states only the default stream is supported — confirm when adding
// multi-stream support.
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
        cudaMemcpyKind kind, cudaStream_t stream) {
    $cuda_host_memcpy(dst, src, count, kind, true);
    return cudaSuccess;
}
|
|---|
| 1386 |
|
|---|
| 1387 | /*
|
|---|
| 1388 | cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
|
|---|
| 1389 | // Create new stream node in linked list
|
|---|
| 1390 | $cuda_stream_node_t newHead = $create_new_stream_node();
|
|---|
| 1391 | newHead->next = $cuda_global_context.head;
|
|---|
| 1392 | $cuda_global_context.head->prev = newHead;
|
|---|
| 1393 |
|
|---|
| 1394 | // Update cuda context's head to be the new node we created
|
|---|
| 1395 | $cuda_global_context.head = newHead;
|
|---|
| 1396 | $cuda_global_context.numStreams++;
|
|---|
| 1397 |
|
|---|
| 1398 | return cudaSuccess;
|
|---|
| 1399 | }
|
|---|
| 1400 | */
|
|---|
| 1401 |
|
|---|
| 1402 | /*
|
|---|
| 1403 | cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
|---|
| 1404 | stream = $default_stream_if_null(stream);
|
|---|
| 1405 | $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
|
|---|
| 1406 | $when(stream->head == NULL) return cudaSuccess;
|
|---|
| 1407 | }
|
|---|
| 1408 | */
|
|---|
| 1409 |
|
|---|
| 1410 | /*
|
|---|
| 1411 | cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
|---|
| 1412 | $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
|
|---|
| 1413 | $assert(stream->alive, "Attempt to destroy an already destroyed stream");
|
|---|
| 1414 | $destroy_stream_node(stream->containingNode);
|
|---|
| 1415 | return cudaSuccess;
|
|---|
| 1416 | }
|
|---|
| 1417 | */
|
|---|
| 1418 |
|
|---|
| 1419 | /*
|
|---|
| 1420 | cudaError_t cudaDeviceSynchronize() {
|
|---|
| 1421 | $proc* opsToWaitOn;
|
|---|
| 1422 | int numOps = 0;
|
|---|
| 1423 |
|
|---|
| 1424 | $atomic {
|
|---|
| 1425 | opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
|
|---|
| 1426 |
|
|---|
| 1427 | for ($cuda_stream_node_t node = $cuda_global_context.head;
|
|---|
| 1428 | node != NULL;
|
|---|
| 1429 | node = node->next) {
|
|---|
| 1430 | if (node->stream->tail != NULL) {
|
|---|
| 1431 | opsToWaitOn[numOps] = node->stream->tail->opState->op;
|
|---|
| 1432 | numOps++;
|
|---|
| 1433 | }
|
|---|
| 1434 | }
|
|---|
| 1435 | }
|
|---|
| 1436 | $waitall(opsToWaitOn, numOps);
|
|---|
| 1437 |
|
|---|
| 1438 | return cudaSuccess;
|
|---|
| 1439 | }
|
|---|
| 1440 | */
|
|---|
| 1441 |
|
|---|
| 1442 | //////////////////////////////////
|
|---|
| 1443 | // Generated code from kernel_1 //
|
|---|
| 1444 | //////////////////////////////////
|
|---|
| 1445 |
|
|---|
// Argument bundle marshalled from host to device for a kernel_1 launch.
typedef struct {
    dim3 gridDim;               // launch grid dimensions
    dim3 blockDim;              // threads per block
    size_t $cudaMemSize;        // shared-memory size param (not used by kernel_1)
    cudaStream_t $cudaStream;   // target stream (only default stream supported)
    float* A;                   // input/scratch vector (hidden device pointer)
    const float* B;             // second input (not referenced by kernel_1)
    float* C;                   // per-block output slots (hidden device pointer)
    int numElements;            // logical length of A
} $cuda_kernel_1_params;
|
|---|
| 1456 |
|
|---|
// Converts the hidden device pointers in a marshalled kernel_1 argument
// bundle back into directly usable pointers ($reveal is the counterpart of
// the host's $hide).
void $cuda_reveal_kernel_1_args($cuda_kernel_1_params* args) {
    args->A = $reveal(args->A);
    args->B = $reveal(args->B);
    args->C = $reveal(args->C);
}
|
|---|
| 1462 |
|
|---|
| 1463 | void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
|
|---|
| 1464 | size_t $cudaMemSize, cudaStream_t $cudaStream,
|
|---|
| 1465 | float* A, const float* B, float* C, int numElements) {
|
|---|
| 1466 | $cuda_kernel_1_params args;
|
|---|
| 1467 | args.gridDim = gridDim;
|
|---|
| 1468 | args.blockDim = blockDim;
|
|---|
| 1469 | args.$cudaMemSize = $cudaMemSize;
|
|---|
| 1470 | args.$cudaStream = $cudaStream;
|
|---|
| 1471 | args.A = A;
|
|---|
| 1472 | args.B = B;
|
|---|
| 1473 | args.C = C;
|
|---|
| 1474 | args.numElements = numElements;
|
|---|
| 1475 |
|
|---|
| 1476 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_params)));
|
|---|
| 1477 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
|
|---|
| 1478 | }
|
|---|
| 1479 |
|
|---|
// CIVL model of kernel_1: a parallel sum reduction over A whose total is
// left in C[0]. Blocks and threads are modeled as nested spawned
// processes; only the "REDUCTION 2" section is active, the rest is kept
// commented out for reference.
void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
        float *A, const float *B, float *C, int numElements) {
    $cuda_kernel_data_t $kernel = $create_cuda_kernel_data($here, gridDim, blockDim);
    void $cuda_block(uint3 blockIdx) {
        void $cuda_thread(uint3 threadIdx) {
            $local_start();
            // cudaMemset currently not supported so this is small hack to initialize C ahead of time
            // NOTE(review): this zeroing by block 0 / thread 0 is not
            // synchronized against other blocks' later atomicAdd on C —
            // confirm the model's scheduling makes this safe.
            if (blockIdx.x == 0 && threadIdx.x == 0) {
                for (int i = 0; i < gridDim.x; i++) {
                    C[i] = 0;
                }
            }
            //$clear_mem_sets($kernel, _cuda_kid);
            // Index within this block, and kernel-wide unique index.
            int $cuda_tid = $dim3_index(blockDim, threadIdx);
            int $cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
            $cuda_thread_data_t $thread = $create_cuda_thread_data($here, $kernel, $cuda_kid/(blockDim.x * blockDim.y * blockDim.z), $cuda_tid/warpSize, $cuda_tid % warpSize);

            // Kernel REDUCTION start
            /*
            int lane = threadIdx.x % warpSize;
            int thisWarpSize = warpSize;
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;
            int warpStart = i - lane;
            //printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
            int remainingElements = numElements;

            while (remainingElements > 1) {
                //printf("%d,%d - remainingElements: %d - numElements: %d\n", blockIdx.x, threadIdx.x, remainingElements, numElements);
                if (remainingElements < numElements) {
                    // __syncThreads()
                    //printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x);

                    $cuda__syncthreads($thread, 0);
                    //printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x);
                }

                if (warpStart + 1 < remainingElements) {
                    float val = i < numElements ? A[i] : 0;
                    //printf("%d,%d - val: %d\n", blockIdx.x, threadIdx.x, val);

                    for (int offset = warpSize/2; offset > 0; offset /= 2) {
                        //__syncwarp();
                        float tmp = __shfl_down_sync(0, val, offset);
                        if (lane + offset < thisWarpSize) {
                            val += tmp;
                        }
                        //printf("%d,%d - offset: %d - val: %d\n", blockIdx.x, threadIdx.x, offset, val);
                    }

                    if (i < numElements) {
                        A[i] = val;
                    }
                }

                i *= warpSize;
                //warpStart *= warpSize;
                remainingElements = ((remainingElements - 1) / warpSize) + 1;
            }

            if (i == 0) {
                *C = A[0];
            }
            */
            // Kernel REDUCTION end

            // Kernel REDUCTION 2 start

            int lane = threadIdx.x % warpSize;   // lane index within the warp
            int thisWarpSize = warpSize;
            // The block's last warp may be partial; shrink its lane count.
            if (threadIdx.x - lane + warpSize > blockDim.x) {
                thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
            }

            int i = blockDim.x * blockIdx.x + threadIdx.x;   // global element index
            int warpStart = i - lane;

            // Phase 1: warp-level shuffle reduction; each warp's partial sum
            // ends up in A[warpStart].
            // NOTE(review): the mask argument to __shfl_down_sync is 0, which
            // would select no lanes in real CUDA — presumably the CIVL model
            // ignores the mask; confirm against the model's shuffle semantics.
            if (warpStart + 1 < numElements) {
                float val = i < numElements ? A[i] : 0;
                for (int offset = warpSize/2; offset > 0; offset /= 2) {
                    float tmp = __shfl_down_sync(0, val, offset);
                    //float tmp = i + offset < numElements ? A[i + offset] : 0;
                    if (lane + offset < thisWarpSize) {
                        val += tmp;
                    }
                }

                if (i < numElements) {
                    A[i] = val;
                }
            }

            $cuda__syncthreads($thread, 0);
            // Phase 2: thread 0 of each block folds that block's per-warp
            // partials together, then flags completion in C[blockIdx.x].
            if (threadIdx.x == 0) {
                int blockEnd = blockDim.x * (blockIdx.x + 1);
                if (blockEnd > numElements) {
                    blockEnd = numElements;
                }
                for (int j = i + warpSize; j < blockEnd; j += warpSize) {
                    A[i] += A[j];
                }
                // NOTE(review): int literal 1 added to a float slot — works as
                // a nonzero "done" flag (C was zeroed above), but 1.0f would
                // be cleaner.
                atomicAdd(C + blockIdx.x, 1);
            }

            // Phase 3: global thread 0 spin-waits on every other block's flag,
            // then accumulates all block sums into C[0]. (Block 0's own flag
            // in C[0] is overwritten here, but no one reads it.)
            if (i == 0) {
                C[0] = A[0];
                for (int j = 1; j < gridDim.x; j++) {
                    while(atomicAdd(C+j,0) == 0) {}   // atomicAdd(x, 0) == atomic read
                    C[0] += A[j * blockDim.x];
                }
            }
            // Kernel REDUCTION 2 end

            // Kernel BALLOT TEST start

            /*
            int i = threadIdx.x;
            if (i < numElements) {
                int result = __ballot_sync(~0, A[i] > 0);
                if (i == 0) {
                    printf("Result: %d\n", result);
                    *C = 0;
                    while(result > 0) {
                        if (result % 2)
                            *C += 1;
                        result /= 2;
                    }
                    printf("done calculating result\n");
                }
            }
            */
            // Kernel BALLOT TEST end
            //$check_data_race($kernel, _cuda_kid);
            //$read_set_pop();
            //$write_set_pop();
            $destroy_cuda_thread_data($thread);
            $local_end();
        }
        $cuda_run_and_wait_on_procs(blockDim, $cuda_thread);
    }
    $cuda_run_and_wait_on_procs(gridDim, $cuda_block);
    $destroy_cuda_kernel_data($kernel);
}
|
|---|
| 1626 |
|
|---|
// Stream-op body for a kernel_1 launch: waits for its turn in the stream,
// unpacks and reveals the kernel arguments, runs the kernel, then pops
// itself off the stream so the next op may start.
void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
    // Blocks until the preceding op's $stream_dequeue sets start = true.
    $when(opState->start);

    $cuda_kernel_1_params args;
    $message_unpack(request, &args, sizeof($cuda_kernel_1_params));
    $cuda_reveal_kernel_1_args(&args);

    $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
    $stream_dequeue(cudaStream);
}
|
|---|
| 1637 |
|
|---|
| 1638 | /////////////////
|
|---|
| 1639 | // CUDA "file" //
|
|---|
| 1640 | /////////////////
|
|---|
| 1641 |
|
|---|
// The device process: owns the device scope and the default stream, and
// services host requests (scope queries, memcpy, free, kernel launches)
// over $cuda_device_comm until a TEARDOWN message arrives.
void $cuda_main() {

    // Device Variables

    $scope $cuda_scope = $here;   // scope that device allocations live in

    $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
    $cuda_context $cuda_global_context;   // linked list of streams
    cudaStream_t $cuda_default_stream;

    // Helper function to get the default stream if passed NULL, and just returns stream otherwise
    // Currently unused until we support streams other than the default one.
    cudaStream_t $default_stream_if_null(cudaStream_t stream) {
        return stream == NULL ? $cuda_default_stream : stream;
    }

    // Device Logic

    // Set up the context with just the default stream.
    $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
    $cuda_default_stream = defaultStreamNode->stream;

    $cuda_global_context.head = defaultStreamNode;
    $cuda_global_context.numStreams = 1;

    // Request-service loop: each host request gets exactly one response,
    // except TEARDOWN which exits without replying.
    while (true) {
        $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
        $message response;
        const int tag = $message_tag(request);

        switch(tag) {
        case $CUDA_TAG_SCOPE_REQUEST :
            // Host needs the device scope to emulate cudaMalloc.
            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
            break;
        case $CUDA_TAG_cudaFree :
            response = $cuda_free(request);
            break;
        case $CUDA_TAG_cudaMemcpy :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
            break;
        case $CUDA_TAG_cudaMemcpyAsync :
            response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
            break;
        case $CUDA_TAG_LAUNCH_kernel_1 :
            // Launch is asynchronous from the host's perspective: the op is
            // queued on the default stream and the host is acked immediately.
            $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);

            response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
            break;
        case $CUDA_TAG_TEARDOWN : {
            // Drain the default stream, then shut the device down.
            $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
            $wait(destructor);
            $comm_destroy($cuda_device_comm);
            return;
        }
        default :
            $assert(false, "Unknown CUDA request");
        }

        $comm_enqueue($cuda_device_comm, response);
    }
}
|
|---|
| 1702 |
|
|---|
| 1703 | ///////////////
|
|---|
| 1704 | // Host file //
|
|---|
| 1705 | ///////////////
|
|---|
| 1706 |
|
|---|
// Symbolic verification inputs for the host program.
$input int N = 16;   // vector length
// ceil(N/2) threads per block, so the launch uses at least two blocks' worth
// of geometry when N > 2.
$input int threadsPerBlock = N%2 == 0 ? N/2 : (N+1)/2;
//$input int threadsPerBlock = N;
$input float A[N];   // data summed by the reduction kernel
// Currently unused but left in to save time
$input float B[N];
|
|---|
| 1713 |
|
|---|
| 1714 | void $host_main() {
|
|---|
| 1715 | int size = N * sizeof(float);
|
|---|
| 1716 | int numBlocks = (N-1)/threadsPerBlock + 1;
|
|---|
| 1717 | //int numBlocks = 1;
|
|---|
| 1718 |
|
|---|
| 1719 | float* cuda_A;
|
|---|
| 1720 | // cudaMalloc((void **)&cuda_A, size);
|
|---|
| 1721 | {
|
|---|
| 1722 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1723 | cuda_A = $hide((float*)$malloc(deviceScope, size));
|
|---|
| 1724 | }
|
|---|
| 1725 | cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
|
|---|
| 1726 |
|
|---|
| 1727 | float* cuda_B;
|
|---|
| 1728 | // cudaMalloc((void **)&cuda_B, size);
|
|---|
| 1729 | {
|
|---|
| 1730 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1731 | cuda_B = $hide((float*)$malloc(deviceScope, size));
|
|---|
| 1732 | }
|
|---|
| 1733 | cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
|
|---|
| 1734 |
|
|---|
| 1735 | float* cuda_C;
|
|---|
| 1736 | // cudaMalloc((void **)&cuda_C, sizeof(float) * numBlocks);
|
|---|
| 1737 | {
|
|---|
| 1738 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1739 | cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float) * numBlocks));
|
|---|
| 1740 | }
|
|---|
| 1741 |
|
|---|
| 1742 | dim3 gridDim = {numBlocks, 1, 1};
|
|---|
| 1743 | dim3 blockDim = {threadsPerBlock, 1, 1};
|
|---|
| 1744 | // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
|
|---|
| 1745 | $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
|
|---|
| 1746 |
|
|---|
| 1747 | // Checking correctness
|
|---|
| 1748 | float* C = (float *)malloc(size);
|
|---|
| 1749 |
|
|---|
| 1750 | cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
|
|---|
| 1751 |
|
|---|
| 1752 | // REDUCTION ASSERTION
|
|---|
| 1753 |
|
|---|
| 1754 | float sum = 0;
|
|---|
| 1755 | for(int i = 0; i < N; i++)
|
|---|
| 1756 | sum += A[i];
|
|---|
| 1757 |
|
|---|
| 1758 | $assert(*C == sum);
|
|---|
| 1759 |
|
|---|
| 1760 | // BALLOT ASSERTION
|
|---|
| 1761 | /*
|
|---|
| 1762 | float count = 0;
|
|---|
| 1763 | for (int i = 0; i < N; i++) {
|
|---|
| 1764 | if (A[i] > 0)
|
|---|
| 1765 | count++;
|
|---|
| 1766 | }
|
|---|
| 1767 | $assert(*C == count);
|
|---|
| 1768 | */
|
|---|
| 1769 |
|
|---|
| 1770 | free(C);
|
|---|
| 1771 |
|
|---|
| 1772 | cudaFree(cuda_A);
|
|---|
| 1773 | cudaFree(cuda_B);
|
|---|
| 1774 | cudaFree(cuda_C);
|
|---|
| 1775 |
|
|---|
| 1776 | }
|
|---|
| 1777 |
|
|---|
// Entry point: runs the host program and the device service loop as
// concurrent processes, then tears everything down in order.
int main() {
    $proc host = $spawn $host_main();
    $proc cuda = $spawn $cuda_main();
    $wait(host);
    // Tell the device loop to shut down once the host program has finished.
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
    // NOTE(review): the host comm is destroyed before waiting on the device
    // process — presumably safe because after TEARDOWN the device only
    // touches its own comm handle, but worth confirming.
    $comm_destroy($cuda_host_comm);
    $wait(cuda);
    $gcomm_destroy($cuda_gcomm, NULL);
}
|
|---|