| 1 | #include <concurrency.cvh>
|
|---|
| 2 | #include <comm.cvh>
|
|---|
| 3 | #include <stdlib.h>
|
|---|
| 4 | #include <stdio.h>
|
|---|
| 5 | #include <stdbool.h>
|
|---|
| 6 | #include <string.h>
|
|---|
| 7 | #include <mem.cvh>
|
|---|
| 8 | #pragma CIVL ACSL
|
|---|
| 9 |
|
|---|
| 10 | ///////////
|
|---|
| 11 | // Types //
|
|---|
| 12 | ///////////
|
|---|
| 13 |
|
|---|
| 14 | enum cudaError {
|
|---|
| 15 | cudaSuccess
|
|---|
| 16 | };
|
|---|
| 17 | typedef enum cudaError cudaError_t;
|
|---|
| 18 |
|
|---|
| 19 | //typedef $cuda_memcpy_kind cudaMemcpyKind;
|
|---|
| 20 |
|
|---|
| 21 | typedef struct {
|
|---|
| 22 | unsigned int x, y, z;
|
|---|
| 23 | } dim3;
|
|---|
| 24 |
|
|---|
| 25 | /* used to represent a location in a three dimensional grid
|
|---|
| 26 | */
|
|---|
| 27 | typedef struct {
|
|---|
| 28 | unsigned int x, y, z;
|
|---|
| 29 | } uint3;
|
|---|
| 30 |
|
|---|
| 31 | typedef enum {
|
|---|
| 32 | cudaMemcpyHostToHost,
|
|---|
| 33 | cudaMemcpyHostToDevice,
|
|---|
| 34 | cudaMemcpyDeviceToHost,
|
|---|
| 35 | cudaMemcpyDeviceToDevice,
|
|---|
| 36 | cudaMemcpyDefault
|
|---|
| 37 | } cudaMemcpyKind;
|
|---|
| 38 |
|
|---|
| 39 | typedef struct $cuda_op_state* $cuda_op_state_t;
|
|---|
| 40 | typedef struct $cuda_op_state {
|
|---|
| 41 | _Bool start;
|
|---|
| 42 | $proc op;
|
|---|
| 43 | } $cuda_op_state;
|
|---|
| 44 |
|
|---|
| 45 | typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
|
|---|
| 46 | typedef struct $cuda_op_state_node {
|
|---|
| 47 | $cuda_op_state_t opState;
|
|---|
| 48 | $cuda_op_state_node_t next;
|
|---|
| 49 | } $cuda_op_state_node;
|
|---|
| 50 |
|
|---|
| 51 | typedef struct $cuda_stream_node* $cuda_stream_node_t;
|
|---|
| 52 | typedef struct $cuda_stream* $cuda_stream_t;
|
|---|
| 53 | typedef $cuda_stream_t cudaStream_t;
|
|---|
| 54 | typedef struct $cuda_stream {
|
|---|
| 55 | $cuda_op_state_node_t head;
|
|---|
| 56 | $cuda_op_state_node_t tail;
|
|---|
| 57 | int numOps;
|
|---|
| 58 | $cuda_stream_node_t containingNode;
|
|---|
| 59 | _Bool alive;
|
|---|
| 60 | } $cuda_stream;
|
|---|
| 61 |
|
|---|
| 62 | typedef struct $cuda_stream_node{
|
|---|
| 63 | cudaStream_t stream;
|
|---|
| 64 | $cuda_stream_node_t prev;
|
|---|
| 65 | $cuda_stream_node_t next;
|
|---|
| 66 | } $cuda_stream_node;
|
|---|
| 67 |
|
|---|
| 68 | typedef struct $cuda_context* $cuda_context_t;
|
|---|
| 69 | typedef struct $cuda_context {
|
|---|
| 70 | $cuda_stream_node_t head;
|
|---|
| 71 | int numStreams;
|
|---|
| 72 | } $cuda_context;
|
|---|
| 73 |
|
|---|
| 74 | typedef struct $cuda_kernel_instance* $cuda_kernel_instance_t;
|
|---|
| 75 | typedef struct $cuda_kernel_instance {
|
|---|
| 76 | $mem* readSets;
|
|---|
| 77 | $mem* writeSets;
|
|---|
| 78 | int size;
|
|---|
| 79 | } $cuda_kernel_instance;
|
|---|
| 80 |
|
|---|
| 81 | typedef struct $cuda_memcpy_data {
|
|---|
| 82 | void* dst;
|
|---|
| 83 | const void* src;
|
|---|
| 84 | size_t count;
|
|---|
| 85 | cudaMemcpyKind kind;
|
|---|
| 86 | } $cuda_memcpy_data;
|
|---|
| 87 |
|
|---|
| 88 | //////////////////////
|
|---|
| 89 | // Global Variables //
|
|---|
| 90 | //////////////////////
|
|---|
| 91 |
|
|---|
| 92 | $gcomm $cuda_gcomm = $gcomm_create($here, 2);
|
|---|
| 93 | const int $CUDA_PLACE_HOST = 0;
|
|---|
| 94 | const int $CUDA_PLACE_DEVICE = 1;
|
|---|
| 95 | $comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);
|
|---|
| 96 |
|
|---|
| 97 | /**
|
|---|
| 98 | * Tags used for message-passing between host and device
|
|---|
| 99 | */
|
|---|
| 100 | enum $cuda_tag {
|
|---|
| 101 | // Predefined tags
|
|---|
| 102 | $CUDA_TAG_TEARDOWN,
|
|---|
| 103 | $CUDA_TAG_SCOPE_REQUEST,
|
|---|
| 104 | $CUDA_TAG_cudaFree,
|
|---|
| 105 | $CUDA_TAG_cudaMemcpy,
|
|---|
| 106 | $CUDA_TAG_cudaMemcpyAsync,
|
|---|
| 107 | // Generated tags (by transformer)
|
|---|
| 108 | $CUDA_TAG_LAUNCH_kernel_1
|
|---|
| 109 | };
|
|---|
| 110 |
|
|---|
| 111 | ///////////////////
|
|---|
| 112 | // CIVL-CUDA API //
|
|---|
| 113 | ///////////////////
|
|---|
| 114 |
|
|---|
| 115 | $scope $cuda_host_request_device_scope() {
|
|---|
| 116 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
|
|---|
| 117 | $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
|
|---|
| 118 | $scope result;
|
|---|
| 119 | $message_unpack(response, &result, sizeof($scope));
|
|---|
| 120 |
|
|---|
| 121 | return result;
|
|---|
| 122 | }
|
|---|
| 123 |
|
|---|
| 124 | void $cuda_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
|
|---|
| 125 | if (kind == cudaMemcpyHostToHost) {
|
|---|
| 126 | memcpy(dst, src, count);
|
|---|
| 127 | } else {
|
|---|
| 128 | $cuda_memcpy_data args;
|
|---|
| 129 | args.dst = dst;
|
|---|
| 130 | args.src = src;
|
|---|
| 131 | args.count = count;
|
|---|
| 132 | args.kind = kind;
|
|---|
| 133 |
|
|---|
| 134 | int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
|
|---|
| 135 |
|
|---|
| 136 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
|
|---|
| 137 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
|
|---|
| 138 | }
|
|---|
| 139 | }
|
|---|
| 140 |
|
|---|
| 141 | $cuda_stream_node_t $create_new_stream_node($scope cudaScope) {
|
|---|
| 142 | cudaStream_t newStream = (cudaStream_t) $malloc(cudaScope, sizeof($cuda_stream));
|
|---|
| 143 | newStream->head = NULL;
|
|---|
| 144 | newStream->tail = NULL;
|
|---|
| 145 | newStream->numOps = 0;
|
|---|
| 146 | newStream->alive = true;
|
|---|
| 147 |
|
|---|
| 148 | $cuda_stream_node_t newHead = ($cuda_stream_node_t) $malloc(cudaScope, sizeof($cuda_stream_node));
|
|---|
| 149 | newHead->stream = newStream;
|
|---|
| 150 | newStream->containingNode = newHead;
|
|---|
| 151 | newHead->prev = NULL;
|
|---|
| 152 | newHead->next = NULL;
|
|---|
| 153 |
|
|---|
| 154 | return newHead;
|
|---|
| 155 | }
|
|---|
| 156 |
|
|---|
| 157 | /*@ depends_on \nothing;
|
|---|
| 158 | @ assigns \nothing;
|
|---|
| 159 | @ reads \nothing;
|
|---|
| 160 | @*/
|
|---|
| 161 | $atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
|
|---|
| 162 | $proc lastOpProc = $proc_null;
|
|---|
| 163 | cudaStream_t stream = node->stream;
|
|---|
| 164 |
|
|---|
| 165 | if (node->prev != NULL) {
|
|---|
| 166 | node->prev->next = node->next;
|
|---|
| 167 | }
|
|---|
| 168 | if (node->next != NULL) {
|
|---|
| 169 | node->next->prev = node->prev;
|
|---|
| 170 | }
|
|---|
| 171 | free(node);
|
|---|
| 172 |
|
|---|
| 173 | stream->alive = false;
|
|---|
| 174 | if(stream->tail != NULL)
|
|---|
| 175 | lastOpProc = stream->tail->opState->op;
|
|---|
| 176 |
|
|---|
| 177 | void destroyStreamWhenComplete($proc lastOpProc, cudaStream_t stream) {
|
|---|
| 178 | $wait(lastOpProc);
|
|---|
| 179 | free(stream);
|
|---|
| 180 | }
|
|---|
| 181 |
|
|---|
| 182 | return $spawn destroyStreamWhenComplete(lastOpProc, stream);
|
|---|
| 183 | }
|
|---|
| 184 |
|
|---|
| 185 | /*@ depends_on \access(stream);
|
|---|
| 186 | @ assigns stream;
|
|---|
| 187 | @ reads \nothing;
|
|---|
| 188 | @*/
|
|---|
| 189 | $atomic_f $proc $stream_enqueue($scope cudaScope, cudaStream_t stream, $message opParams, void(*opProc)($message, $cuda_op_state_t, cudaStream_t)) {
|
|---|
| 190 | $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
|
|---|
| 191 |
|
|---|
| 192 | $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc(cudaScope, sizeof($cuda_op_state));
|
|---|
| 193 | newOpState->start = false;
|
|---|
| 194 | newOpState->op = $spawn opProc(opParams, newOpState, stream);
|
|---|
| 195 |
|
|---|
| 196 | $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc(cudaScope, sizeof($cuda_op_state_node));
|
|---|
| 197 | newOpStateNode->opState = newOpState;
|
|---|
| 198 | newOpStateNode->next = NULL;
|
|---|
| 199 |
|
|---|
| 200 | if (stream->tail == NULL) {
|
|---|
| 201 | stream->head = newOpStateNode;
|
|---|
| 202 | stream->tail = newOpStateNode;
|
|---|
| 203 | newOpState->start = true;
|
|---|
| 204 | } else {
|
|---|
| 205 | stream->tail->next = newOpStateNode;
|
|---|
| 206 | stream->tail = newOpStateNode;
|
|---|
| 207 | }
|
|---|
| 208 | stream->numOps++;
|
|---|
| 209 |
|
|---|
| 210 | return newOpState->op;
|
|---|
| 211 | }
|
|---|
| 212 |
|
|---|
| 213 | /*@ depends_on \nothing;
|
|---|
| 214 | @ assigns \nothing;
|
|---|
| 215 | @ reads \nothing;
|
|---|
| 216 | @*/
|
|---|
| 217 | $atomic_f void $stream_dequeue(cudaStream_t stream) {
|
|---|
| 218 | $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
|
|---|
| 219 |
|
|---|
| 220 | if (stream->head == stream->tail) {
|
|---|
| 221 | stream->tail = NULL;
|
|---|
| 222 | }
|
|---|
| 223 |
|
|---|
| 224 | $cuda_op_state_node_t oldHead = stream->head;
|
|---|
| 225 | stream->head = oldHead->next;
|
|---|
| 226 | if (stream->head != NULL) {
|
|---|
| 227 | stream->head->opState->start = true;
|
|---|
| 228 | }
|
|---|
| 229 |
|
|---|
| 230 | stream->numOps--;
|
|---|
| 231 | free(oldHead->opState);
|
|---|
| 232 | free(oldHead);
|
|---|
| 233 | }
|
|---|
| 234 |
|
|---|
| 235 | // Helper function
|
|---|
| 236 | int $dim3_index(dim3 size, uint3 location) {
|
|---|
| 237 | return location.x + size.x * (location.y + size.y * location.z);
|
|---|
| 238 | }
|
|---|
| 239 |
|
|---|
| 240 | // Helper function
|
|---|
| 241 | int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
|
|---|
| 242 | return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx);
|
|---|
| 243 | }
|
|---|
| 244 |
|
|---|
| 245 | void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) {
|
|---|
| 246 | //TODO: calculate length and index, replace this function in the kernel
|
|---|
| 247 | $local_start();
|
|---|
| 248 | int length = dim.x * dim.y * dim.z;
|
|---|
| 249 | $proc procArray[length];
|
|---|
| 250 | $range rx = 0 .. dim.x - 1;
|
|---|
| 251 | $range ry = 0 .. dim.y - 1;
|
|---|
| 252 | $range rz = 0 .. dim.z - 1;
|
|---|
| 253 | $domain(3) dom = ($domain(3)){rx, ry, rz};
|
|---|
| 254 | $for(int x,y,z : dom){
|
|---|
| 255 | uint3 id = { x, y, z };
|
|---|
| 256 | int index = $dim3_index(dim, id);
|
|---|
| 257 | procArray[index] = $spawn spawningFunction(id);
|
|---|
| 258 | }
|
|---|
| 259 | $local_end();
|
|---|
| 260 | $waitall(procArray,length);
|
|---|
| 261 | }
|
|---|
| 262 |
|
|---|
| 263 |
|
|---|
| 264 | // CUDA Ops //
|
|---|
| 265 |
|
|---|
| 266 | void $cuda_memcpy_proc($message m, $cuda_op_state_t opState, cudaStream_t stream) {
|
|---|
| 267 |
|
|---|
| 268 | $when(opState->start);
|
|---|
| 269 | $cuda_memcpy_data args;
|
|---|
| 270 | $message_unpack(m, &args, sizeof($cuda_memcpy_data));
|
|---|
| 271 |
|
|---|
| 272 | if (args.kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) {
|
|---|
| 273 | args.dst = $reveal(args.dst);
|
|---|
| 274 | }
|
|---|
| 275 | if (args.kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) {
|
|---|
| 276 | args.src = $reveal(args.src);
|
|---|
| 277 | }
|
|---|
| 278 | memcpy(args.dst, args.src, args.count);
|
|---|
| 279 |
|
|---|
| 280 | $stream_dequeue(stream);
|
|---|
| 281 | }
|
|---|
| 282 |
|
|---|
| 283 | $message $cuda_memcpy($scope cudaScope, cudaStream_t stream, $message request, _Bool async) {
|
|---|
| 284 | $cuda_memcpy_data args;
|
|---|
| 285 | $message_unpack(request, &args, sizeof($cuda_memcpy_data));
|
|---|
| 286 |
|
|---|
| 287 | $proc memcpyProc = $stream_enqueue(cudaScope, stream, request, $cuda_memcpy_proc);
|
|---|
| 288 |
|
|---|
| 289 | if (!async && args.kind != cudaMemcpyDeviceToDevice) {
|
|---|
| 290 | $wait(memcpyProc);
|
|---|
| 291 | }
|
|---|
| 292 | int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
|
|---|
| 293 |
|
|---|
| 294 | return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
|
|---|
| 295 | }
|
|---|
| 296 |
|
|---|
| 297 | $message $cuda_free($message request) {
|
|---|
| 298 | void* devPtr;
|
|---|
| 299 | $message_unpack(request, &devPtr, sizeof(void*));
|
|---|
| 300 | free($reveal(devPtr));
|
|---|
| 301 |
|
|---|
| 302 | return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0);
|
|---|
| 303 | }
|
|---|
| 304 |
|
|---|
| 305 | $cuda_kernel_instance_t $create_kernel_instance($scope cudaScope, dim3 gridDim, dim3 blockDim){
|
|---|
| 306 | int threadsPerBlock = (blockDim.x * blockDim.y) * blockDim.z;
|
|---|
| 307 | int numBlocks = (gridDim.x * gridDim.y) * gridDim.z;
|
|---|
| 308 | int numThreads = threadsPerBlock * numBlocks;
|
|---|
| 309 |
|
|---|
| 310 | $cuda_kernel_instance_t $kernel = ($cuda_kernel_instance_t)$malloc(cudaScope, sizeof($cuda_kernel_instance));
|
|---|
| 311 | $mem* readSets = ($mem*)$malloc(cudaScope, sizeof($mem) * numThreads);
|
|---|
| 312 | $mem* writeSets = ($mem*)$malloc(cudaScope, sizeof($mem) * numThreads);
|
|---|
| 313 | //Is this meant to be the number of threads in the grid?
|
|---|
| 314 | int size = numThreads;
|
|---|
| 315 |
|
|---|
| 316 | $kernel->readSets = readSets;
|
|---|
| 317 | $kernel->writeSets = writeSets;
|
|---|
| 318 | $kernel->size = size;
|
|---|
| 319 |
|
|---|
| 320 | return $kernel;
|
|---|
| 321 | }
|
|---|
| 322 |
|
|---|
| 323 | void $destroy_kernel_instance($cuda_kernel_instance_t $kernel){
|
|---|
| 324 | free($kernel->readSets);
|
|---|
| 325 | free($kernel->writeSets);
|
|---|
| 326 | free($kernel);
|
|---|
| 327 | return;
|
|---|
| 328 | }
|
|---|
| 329 |
|
|---|
| 330 | void $clear_mem_sets($cuda_kernel_instance_t k, int cur_tid) {
|
|---|
| 331 | k->writeSets[cur_tid] = $mem_empty();
|
|---|
| 332 | k->readSets[cur_tid] = $mem_empty();
|
|---|
| 333 | }
|
|---|
| 334 |
|
|---|
| 335 | void $clear_all_mem_sets($cuda_kernel_instance_t k){
|
|---|
| 336 | for(int i = 0; i < k->size; i++)
|
|---|
| 337 | $clear_mem_sets(k, i);
|
|---|
| 338 | }
|
|---|
| 339 |
|
|---|
| 340 | $atomic_f void $check_data_race($cuda_kernel_instance_t k, int cur_tid) {
|
|---|
| 341 | //printf("Current id: %d\n", cur_tid);
|
|---|
| 342 | $mem out_s0 = $mem_empty();
|
|---|
| 343 | $mem out_s1 = $mem_empty();
|
|---|
| 344 | $mem cur_mw = $write_set_pop();
|
|---|
| 345 | $mem cur_mr = $read_set_pop();
|
|---|
| 346 |
|
|---|
| 347 | // Update current R/W sets
|
|---|
| 348 | k->writeSets[cur_tid] = cur_mw;
|
|---|
| 349 | k->readSets[cur_tid] = cur_mr;
|
|---|
| 350 |
|
|---|
| 351 | /*
|
|---|
| 352 | printf("CHECKING DATA RACE %d [\n", cur_tid);
|
|---|
| 353 | for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
|
|---|
| 354 | printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
|
|---|
| 355 | printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
|
|---|
| 356 | }
|
|---|
| 357 | printf("]\n");
|
|---|
| 358 | */
|
|---|
| 359 |
|
|---|
| 360 | // Check data race
|
|---|
| 361 | for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
|
|---|
| 362 | if (tmp_tid == cur_tid) continue;
|
|---|
| 363 |
|
|---|
| 364 | $mem tmp_mr = k->readSets[tmp_tid];
|
|---|
| 365 | $mem tmp_mw = k->writeSets[tmp_tid];
|
|---|
| 366 |
|
|---|
| 367 | $assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1),
|
|---|
| 368 | "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
|
|---|
| 369 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 370 | $assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1),
|
|---|
| 371 | "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
|
|---|
| 372 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 373 | $assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1),
|
|---|
| 374 | "Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
|
|---|
| 375 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 376 | }
|
|---|
| 377 | // Update current R/W sets
|
|---|
| 378 | //k->writeSets[cur_tid] = $mem_empty();
|
|---|
| 379 | //k->readSets[cur_tid] = $mem_empty();
|
|---|
| 380 | $read_set_push();
|
|---|
| 381 | $write_set_push();
|
|---|
| 382 | }
|
|---|
| 383 |
|
|---|
| 384 | void $cuda_barrier($cuda_kernel_instance_t k, int kernel_id, $barrier g) {
|
|---|
| 385 | /*$check_data_race(k, kernel_id);
|
|---|
| 386 | We have to push a new read and write set before the barrier call to ignore it's reads and writes
|
|---|
| 387 | $read_set_push();
|
|---|
| 388 | $write_set_push();
|
|---|
| 389 | void captured_clear_mems(){
|
|---|
| 390 | $clear_all_mem_sets(k);
|
|---|
| 391 | }
|
|---|
| 392 | */
|
|---|
| 393 | //$barrier_call_execute(g, captured_clear_mems);
|
|---|
| 394 | $local_end();
|
|---|
| 395 | $barrier_call(g);
|
|---|
| 396 | $local_start();
|
|---|
| 397 | //$read_set_pop();
|
|---|
| 398 | //$write_set_pop();
|
|---|
| 399 | }
|
|---|
| 400 |
|
|---|
| 401 |
|
|---|
| 402 |
|
|---|
| 403 | ////////////////////////////////////////////
|
|---|
| 404 | // CUDA API Functions (For Host-use Only) //
|
|---|
| 405 | ////////////////////////////////////////////
|
|---|
| 406 |
|
|---|
| 407 | cudaError_t cudaFree(void* devPtr) {
|
|---|
| 408 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
|
|---|
| 409 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
|
|---|
| 410 |
|
|---|
| 411 | return cudaSuccess;
|
|---|
| 412 | }
|
|---|
| 413 |
|
|---|
| 414 | cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
|
|---|
| 415 | $cuda_host_memcpy(dst, src, count, kind, false);
|
|---|
| 416 | return cudaSuccess;
|
|---|
| 417 | }
|
|---|
| 418 |
|
|---|
| 419 | cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
|
|---|
| 420 | cudaMemcpyKind kind, cudaStream_t stream) {
|
|---|
| 421 | $cuda_host_memcpy(dst, src, count, kind, true);
|
|---|
| 422 | return cudaSuccess;
|
|---|
| 423 | }
|
|---|
| 424 |
|
|---|
| 425 | /*
|
|---|
| 426 | cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
|
|---|
| 427 | // Create new stream node in linked list
|
|---|
| 428 | $cuda_stream_node_t newHead = $create_new_stream_node();
|
|---|
| 429 | newHead->next = $cuda_global_context.head;
|
|---|
| 430 | $cuda_global_context.head->prev = newHead;
|
|---|
| 431 |
|
|---|
| 432 | // Update cuda context's head to be the new node we created
|
|---|
| 433 | $cuda_global_context.head = newHead;
|
|---|
| 434 | $cuda_global_context.numStreams++;
|
|---|
| 435 |
|
|---|
| 436 | return cudaSuccess;
|
|---|
| 437 | }
|
|---|
| 438 | */
|
|---|
| 439 |
|
|---|
| 440 | /*
|
|---|
| 441 | cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
|---|
| 442 | stream = $default_stream_if_null(stream);
|
|---|
| 443 | $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
|
|---|
| 444 | $when(stream->head == NULL) return cudaSuccess;
|
|---|
| 445 | }
|
|---|
| 446 | */
|
|---|
| 447 |
|
|---|
| 448 | /*
|
|---|
| 449 | cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
|---|
| 450 | $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
|
|---|
| 451 | $assert(stream->alive, "Attempt to destroy an already destroyed stream");
|
|---|
| 452 | $destroy_stream_node(stream->containingNode);
|
|---|
| 453 | return cudaSuccess;
|
|---|
| 454 | }
|
|---|
| 455 | */
|
|---|
| 456 |
|
|---|
| 457 | /*
|
|---|
| 458 | cudaError_t cudaDeviceSynchronize() {
|
|---|
| 459 | $proc* opsToWaitOn;
|
|---|
| 460 | int numOps = 0;
|
|---|
| 461 |
|
|---|
| 462 | $atomic {
|
|---|
| 463 | opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
|
|---|
| 464 |
|
|---|
| 465 | for ($cuda_stream_node_t node = $cuda_global_context.head;
|
|---|
| 466 | node != NULL;
|
|---|
| 467 | node = node->next) {
|
|---|
| 468 | if (node->stream->tail != NULL) {
|
|---|
| 469 | opsToWaitOn[numOps] = node->stream->tail->opState->op;
|
|---|
| 470 | numOps++;
|
|---|
| 471 | }
|
|---|
| 472 | }
|
|---|
| 473 | }
|
|---|
| 474 | $waitall(opsToWaitOn, numOps);
|
|---|
| 475 |
|
|---|
| 476 | return cudaSuccess;
|
|---|
| 477 | }
|
|---|
| 478 | */
|
|---|
| 479 |
|
|---|
| 480 | ////////////////
|
|---|
| 481 | // Warp stuff //
|
|---|
| 482 | ////////////////
|
|---|
| 483 |
|
|---|
| 484 | // TODO: Handle thread termination!
|
|---|
| 485 |
|
|---|
| 486 | $input int warpSize = 32;
|
|---|
| 487 |
|
|---|
| 488 | typedef struct $cuda_warp* $cuda_warp_t;
|
|---|
| 489 | typedef struct $cuda_warp {
|
|---|
| 490 | int size;
|
|---|
| 491 | int num_alive;
|
|---|
| 492 | _Bool alive[];
|
|---|
| 493 | int num_in_barrier;
|
|---|
| 494 | _Bool in_barrier[];
|
|---|
| 495 | int reductionLane;
|
|---|
| 496 | $gcomm warp_gcomm;
|
|---|
| 497 | } $cuda_warp;
|
|---|
| 498 |
|
|---|
| 499 | $cuda_warp_t $create_cuda_warp($scope warpScope, int size) {
|
|---|
| 500 | $cuda_warp_t newWarp = ($cuda_warp_t) malloc(sizeof($cuda_warp));
|
|---|
| 501 | newWarp->size = size;
|
|---|
| 502 | newWarp->num_alive = size;
|
|---|
| 503 | newWarp->alive = (_Bool[size])$lambda(int i) $true;
|
|---|
| 504 | newWarp->num_in_barrier = 0;
|
|---|
| 505 | newWarp->in_barrier = (_Bool[size])$lambda(int i) $false;
|
|---|
| 506 | newWarp->reductionLane = -1;
|
|---|
| 507 | newWarp->warp_gcomm = $gcomm_create(warpScope, size);
|
|---|
| 508 |
|
|---|
| 509 | return newWarp;
|
|---|
| 510 | }
|
|---|
| 511 |
|
|---|
| 512 | void $destroy_cuda_warp($cuda_warp_t warp) {
|
|---|
| 513 | $assert(warp != NULL, "Attempt to destroy a NULL warp");
|
|---|
| 514 |
|
|---|
| 515 | $gcomm_destroy(warp->warp_gcomm, NULL);
|
|---|
| 516 | free(warp);
|
|---|
| 517 | }
|
|---|
| 518 |
|
|---|
| 519 | /*@ depends_on \access(warp);
|
|---|
| 520 | @ executes_when \true;
|
|---|
| 521 | @ */
|
|---|
| 522 | $atomic_f void $cuda_warp_barrier_update($cuda_warp_t warp) {
|
|---|
| 523 | if (warp->num_in_barrier == warp->num_alive) {
|
|---|
| 524 | warp->num_in_barrier = 0;
|
|---|
| 525 | for (int i = 0; i < warp->size; i++) {
|
|---|
| 526 | warp->in_barrier[i] = $false;
|
|---|
| 527 | }
|
|---|
| 528 | }
|
|---|
| 529 | }
|
|---|
| 530 |
|
|---|
| 531 | typedef struct $cuda_lane* $cuda_lane_t;
|
|---|
| 532 | typedef struct $cuda_lane {
|
|---|
| 533 | int laneID;
|
|---|
| 534 | $cuda_warp_t warp;
|
|---|
| 535 | $comm lane_comm;
|
|---|
| 536 | } $cuda_lane;
|
|---|
| 537 |
|
|---|
| 538 | $cuda_lane_t $create_cuda_lane($scope laneScope, $cuda_warp_t warp, int laneID) {
|
|---|
| 539 | $assert(warp != NULL, "Attempt to create lane from NULL warp");
|
|---|
| 540 |
|
|---|
| 541 | $cuda_lane_t newLane = ($cuda_lane_t) malloc(sizeof($cuda_lane));
|
|---|
| 542 | newLane->laneID = laneID;
|
|---|
| 543 | newLane->warp = warp;
|
|---|
| 544 | newLane->lane_comm = $comm_create(laneScope, warp->warp_gcomm, laneID);
|
|---|
| 545 |
|
|---|
| 546 | return newLane;
|
|---|
| 547 | };
|
|---|
| 548 |
|
|---|
| 549 | /*@ depends_on \access(lane);
|
|---|
| 550 | @ executes_when \true;
|
|---|
| 551 | @ */
|
|---|
| 552 | $atomic_f void $destroy_cuda_lane($cuda_lane_t lane) {
|
|---|
| 553 | $assert(lane != NULL, "Attempt to destroy NULL lane");
|
|---|
| 554 |
|
|---|
| 555 | lane->warp->alive[lane->laneID] = $false;
|
|---|
| 556 | lane->warp->num_alive--;
|
|---|
| 557 | $cuda_warp_barrier_update(lane->warp);
|
|---|
| 558 | $comm_destroy(lane->lane_comm);
|
|---|
| 559 | free(lane);
|
|---|
| 560 | }
|
|---|
| 561 |
|
|---|
| 562 | /*@ depends_on \access(lane);
|
|---|
| 563 | @ executes_when \true;
|
|---|
| 564 | @ */
|
|---|
| 565 | $atomic_f void $cuda_warp_barrier_enter($cuda_lane_t lane) {
|
|---|
| 566 | $cuda_warp_t warp = lane->warp;
|
|---|
| 567 | int place = lane->laneID;
|
|---|
| 568 | $assert(!warp->in_barrier[place]);
|
|---|
| 569 |
|
|---|
| 570 | warp->in_barrier[place] = $true;
|
|---|
| 571 | warp->num_in_barrier++;
|
|---|
| 572 | $cuda_warp_barrier_update(warp);
|
|---|
| 573 | }
|
|---|
| 574 |
|
|---|
| 575 | /*@ depends_on \access(lane);
|
|---|
| 576 | @ */
|
|---|
| 577 | $atomic_f void $cuda_warp_barrier_exit($cuda_lane_t lane) {
|
|---|
| 578 | $when(!lane->warp->in_barrier[lane->laneID]);
|
|---|
| 579 | }
|
|---|
| 580 |
|
|---|
| 581 | void $cuda_warp_barrier_call($cuda_lane_t lane) {
|
|---|
| 582 | $cuda_warp_barrier_enter(lane);
|
|---|
| 583 | $cuda_warp_barrier_exit(lane);
|
|---|
| 584 | }
|
|---|
| 585 |
|
|---|
| 586 | typedef enum {
|
|---|
| 587 | $CUDA_WARP_TAG_shfl_sync,
|
|---|
| 588 | $CUDA_WARP_TAG_shfl_up_sync,
|
|---|
| 589 | $CUDA_WARP_TAG_shfl_down_sync,
|
|---|
| 590 | $CUDA_WARP_TAG_shfl_xor_sync,
|
|---|
| 591 | $CUDA_WARP_TAG_all_sync,
|
|---|
| 592 | $CUDA_WARP_TAG_any_sync,
|
|---|
| 593 | $CUDA_WARP_TAG_ballot_sync
|
|---|
| 594 | } $cuda_warp_tag;
|
|---|
| 595 |
|
|---|
| 596 | #define $GET_ARG_1(_1, ...) _1
|
|---|
| 597 | #define $GET_ARG_2(_1, _2, ...) _2
|
|---|
| 598 | #define $CUDA_SHFL_PARAM_MACRO(...) $GET_ARG_1(__VA_ARGS__, warpSize, 0), $GET_ARG_2(__VA_ARGS__, warpSize, 0)
|
|---|
| 599 |
|
|---|
| 600 | #define __shfl_sync(mask, var, ...) \
|
|---|
| 601 | _Generic(var, \
|
|---|
| 602 | default: $cuda__shfl_sync_int, \
|
|---|
| 603 | unsigned int: $cuda__shfl_sync_uint, \
|
|---|
| 604 | long: $cuda__shfl_sync_long, \
|
|---|
| 605 | unsigned long: $cuda__shfl_sync_ulong, \
|
|---|
| 606 | long long: $cuda__shfl_sync_ll, \
|
|---|
| 607 | unsigned long long: $cuda__shfl_sync_ull, \
|
|---|
| 608 | float: $cuda__shfl_sync_float,\
|
|---|
| 609 | double: $cuda__shfl_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
|
|---|
| 610 |
|
|---|
| 611 | #define __shfl_up_sync(mask, var, ...) \
|
|---|
| 612 | _Generic(var, \
|
|---|
| 613 | default: $cuda__shfl_up_sync_int, \
|
|---|
| 614 | unsigned int: $cuda__shfl_up_sync_uint, \
|
|---|
| 615 | long: $cuda__shfl_up_sync_long, \
|
|---|
| 616 | unsigned long: $cuda__shfl_up_sync_ulong, \
|
|---|
| 617 | long long: $cuda__shfl_up_sync_ll, \
|
|---|
| 618 | unsigned long long: $cuda__shfl_up_sync_ull, \
|
|---|
| 619 | float: $cuda__shfl_up_sync_float, \
|
|---|
| 620 | double: $cuda__shfl_up_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
|
|---|
| 621 |
|
|---|
| 622 | #define __shfl_down_sync(mask, var, ...) \
|
|---|
| 623 | _Generic(var, \
|
|---|
| 624 | default: $cuda__shfl_down_sync_int, \
|
|---|
| 625 | unsigned int: $cuda__shfl_down_sync_uint, \
|
|---|
| 626 | long: $cuda__shfl_down_sync_long, \
|
|---|
| 627 | unsigned long: $cuda__shfl_down_sync_ulong, \
|
|---|
| 628 | long long: $cuda__shfl_down_sync_ll, \
|
|---|
| 629 | unsigned long long: $cuda__shfl_down_sync_ull, \
|
|---|
| 630 | float: $cuda__shfl_down_sync_float, \
|
|---|
| 631 | double: $cuda__shfl_down_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
|
|---|
| 632 |
|
|---|
| 633 | #define __shfl_xor_sync(mask, var, ...) \
|
|---|
| 634 | _Generic(var, \
|
|---|
| 635 | default: $cuda__shfl_xor_sync_int, \
|
|---|
| 636 | unsigned int: $cuda__shfl_xor_sync_uint, \
|
|---|
| 637 | long: $cuda__shfl_xor_sync_long, \
|
|---|
| 638 | unsigned long: $cuda__shfl_xor_sync_ulong, \
|
|---|
| 639 | long long: $cuda__shfl_xor_sync_ll, \
|
|---|
| 640 | unsigned long long: $cuda__shfl_xor_sync_ull, \
|
|---|
| 641 | float: $cuda__shfl_xor_sync_float, \
|
|---|
| 642 | double: $cuda__shfl_xor_sync_double) (mask, var, $CUDA_SHFL_PARAM_MACRO(__VA_ARGS__), $lane)
|
|---|
| 643 |
|
|---|
| 644 | #define $CUDA_GENERIC_SHFL_BODY() \
|
|---|
| 645 | $assert (width <= warpSize); \
|
|---|
| 646 | for (int v = width; v > 1; v /= 2) { \
|
|---|
| 647 | $assert(v % 2 == 0); \
|
|---|
| 648 | } \
|
|---|
| 649 | \
|
|---|
| 650 | int requestLane; \
|
|---|
| 651 | switch(tag) { \
|
|---|
| 652 | case $CUDA_WARP_TAG_shfl_sync: \
|
|---|
| 653 | requestLane = lane->laneID/width + laneParam % width; \
|
|---|
| 654 | break; \
|
|---|
| 655 | case $CUDA_WARP_TAG_shfl_up_sync: \
|
|---|
| 656 | requestLane = lane->laneID - laneParam; \
|
|---|
| 657 | break; \
|
|---|
| 658 | case $CUDA_WARP_TAG_shfl_down_sync: \
|
|---|
| 659 | requestLane = lane->laneID + laneParam; \
|
|---|
| 660 | break; \
|
|---|
| 661 | case $CUDA_WARP_TAG_shfl_xor_sync: \
|
|---|
| 662 | requestLane = lane->laneID ^ laneParam; \
|
|---|
| 663 | break; \
|
|---|
| 664 | } \
|
|---|
| 665 | _Bool validSrcLane = requestLane >= 0 && requestLane < lane->warp->size; \
|
|---|
| 666 | if (validSrcLane) { \
|
|---|
| 667 | $comm_enqueue(lane->lane_comm, $message_pack(lane->laneID, requestLane, tag, NULL, 0)); \
|
|---|
| 668 | } \
|
|---|
| 669 | \
|
|---|
| 670 | $local_end(); \
|
|---|
| 671 | $cuda_warp_barrier_call(lane); \
|
|---|
| 672 | $local_start(); \
|
|---|
| 673 | \
|
|---|
| 674 | while ($comm_probe(lane->lane_comm, $COMM_ANY_SOURCE, tag)) { \
|
|---|
| 675 | $message request = $comm_dequeue(lane->lane_comm, $COMM_ANY_SOURCE, tag); \
|
|---|
| 676 | \
|
|---|
| 677 | $comm_enqueue(lane->lane_comm, $message_pack(lane->laneID, $message_source(request), tag, &var, typeSize)); \
|
|---|
| 678 | } \
|
|---|
| 679 | \
|
|---|
| 680 | $local_end(); \
|
|---|
| 681 | $cuda_warp_barrier_call(lane); \
|
|---|
| 682 | $local_start(); \
|
|---|
| 683 | \
|
|---|
| 684 | if (validSrcLane) { \
|
|---|
| 685 | $message result = $comm_dequeue(lane->lane_comm, requestLane, tag); \
|
|---|
| 686 | $message_unpack(result, &resultVal, typeSize); \
|
|---|
| 687 | } else { \
|
|---|
| 688 | $havoc(&resultVal); \
|
|---|
| 689 | }
|
|---|
| 690 |
|
|---|
| 691 | #define $CUDA_DEFINE_SHFL(NAME, T, TAG) \
|
|---|
| 692 | T NAME(unsigned mask, T var, int laneParam, int width, $cuda_lane_t lane) { \
|
|---|
| 693 | T resultVal; \
|
|---|
| 694 | int typeSize = sizeof(T); \
|
|---|
| 695 | $cuda_warp_tag tag = TAG; \
|
|---|
| 696 | \
|
|---|
| 697 | $CUDA_GENERIC_SHFL_BODY(); \
|
|---|
| 698 | \
|
|---|
| 699 | return resultVal; \
|
|---|
| 700 | }
|
|---|
| 701 |
|
|---|
| 702 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_int, int, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 703 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 704 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_long, long, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 705 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 706 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_ll, long long, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 707 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 708 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_float, float, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 709 | $CUDA_DEFINE_SHFL($cuda__shfl_sync_double, double, $CUDA_WARP_TAG_shfl_sync)
|
|---|
| 710 |
|
|---|
| 711 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_int, int, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 712 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 713 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_long, long, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 714 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 715 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ll, long long, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 716 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 717 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_float, float, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 718 | $CUDA_DEFINE_SHFL($cuda__shfl_up_sync_double, double, $CUDA_WARP_TAG_shfl_up_sync)
|
|---|
| 719 |
|
|---|
| 720 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_int, int, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 721 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 722 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_long, long, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 723 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 724 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ll, long long, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 725 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 726 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_float, float, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 727 | $CUDA_DEFINE_SHFL($cuda__shfl_down_sync_double, double, $CUDA_WARP_TAG_shfl_down_sync)
|
|---|
| 728 |
|
|---|
| 729 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_int, int, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 730 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_uint, unsigned int, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 731 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_long, long, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 732 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ulong, unsigned long, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 733 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ll, long long, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 734 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_ull, unsigned long long, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 735 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_float, float, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 736 | $CUDA_DEFINE_SHFL($cuda__shfl_xor_sync_double, double, $CUDA_WARP_TAG_shfl_xor_sync)
|
|---|
| 737 |
|
|---|
| 738 | #define __ballot_sync(mask, predicate) $cuda__ballot_sync(mask, predicate, $lane)
|
|---|
| 739 | #define __all_sync(mask, predicate) $cuda__all_sync(mask, predicate, $lane)
|
|---|
| 740 | #define __any_sync(mask, predicate) $cuda__any_sync(mask, predicate, $lane)
|
|---|
| 741 |
|
|---|
| 742 | #define $CUDA_GENERIC_COND_REDUCTION_BODY(COND, T_REDUCTION, F_REDUCTION) \
|
|---|
| 743 | $cuda_warp_t warp = lane->warp; \
|
|---|
| 744 | int laneID = lane->laneID; \
|
|---|
| 745 | $comm comm = lane->lane_comm; \
|
|---|
| 746 | if (warp->reductionLane == -1) { \
|
|---|
| 747 | warp->reductionLane = laneID; \
|
|---|
| 748 | \
|
|---|
| 749 | result = initialValue; \
|
|---|
| 750 | for (int i = 0; i < warp->size; i++) { \
|
|---|
| 751 | if (i == laneID) { \
|
|---|
| 752 | operand = value; \
|
|---|
| 753 | } else { \
|
|---|
| 754 | $local_end(); \
|
|---|
| 755 | $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start(); \
|
|---|
| 756 | \
|
|---|
| 757 | if (!warp->alive[i]) { \
|
|---|
| 758 | operand = initialValue; \
|
|---|
| 759 | } else { \
|
|---|
| 760 | $local_end(); \
|
|---|
| 761 | $message_unpack($comm_dequeue(comm, i, tag), &operand, typeSize); \
|
|---|
| 762 | $local_start(); \
|
|---|
| 763 | } \
|
|---|
| 764 | } \
|
|---|
| 765 | \
|
|---|
| 766 | if (COND) { \
|
|---|
| 767 | result = T_REDUCTION; \
|
|---|
| 768 | } else { \
|
|---|
| 769 | result = F_REDUCTION; \
|
|---|
| 770 | } \
|
|---|
| 771 | } \
|
|---|
| 772 | \
|
|---|
| 773 | warp->reductionLane = -1; \
|
|---|
| 774 | \
|
|---|
| 775 | for (int i = 0; i< warp->size; i++) { \
|
|---|
| 776 | if (i != laneID && warp->alive[i]) { \
|
|---|
| 777 | $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, typeSize)); \
|
|---|
| 778 | } \
|
|---|
| 779 | } \
|
|---|
| 780 | } else { \
|
|---|
| 781 | int reductionLane = warp->reductionLane; \
|
|---|
| 782 | $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, typeSize)); \
|
|---|
| 783 | $local_end(); \
|
|---|
| 784 | $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, typeSize); \
|
|---|
| 785 | $local_start(); \
|
|---|
| 786 | }
|
|---|
| 787 |
|
|---|
| 788 | #define $CUDA_GENERIC_REDUCTION_BODY(REDUCTION) $CUDA_GENERIC_COND_REDUCTION_BODY($true, REDUCTION, result)
|
|---|
| 789 |
|
|---|
| 790 | int $cuda__all_sync(unsigned mask, int value, $cuda_lane_t lane) {
|
|---|
| 791 | $cuda_warp_tag tag = $CUDA_WARP_TAG_all_sync;
|
|---|
| 792 | int typeSize = sizeof(int);
|
|---|
| 793 | int initialValue = 1;
|
|---|
| 794 | int result, operand;
|
|---|
| 795 |
|
|---|
| 796 | $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 && operand != 0, 1, 0);
|
|---|
| 797 |
|
|---|
| 798 | return result;
|
|---|
| 799 | }
|
|---|
| 800 |
|
|---|
| 801 | int $cuda__any_sync(unsigned mask, int value, $cuda_lane_t lane) {
|
|---|
| 802 | $cuda_warp_tag tag = $CUDA_WARP_TAG_any_sync;
|
|---|
| 803 | int typeSize = sizeof(int);
|
|---|
| 804 | int initialValue = 0;
|
|---|
| 805 | int result, operand;
|
|---|
| 806 |
|
|---|
| 807 | $CUDA_GENERIC_COND_REDUCTION_BODY(result != 0 || operand != 0, 1, 0);
|
|---|
| 808 |
|
|---|
| 809 | return result;
|
|---|
| 810 | }
|
|---|
| 811 |
|
|---|
| 812 | unsigned $cuda__ballot_sync(unsigned mask, int value, $cuda_lane_t lane) {
|
|---|
| 813 | $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
|
|---|
| 814 | int initialValue = 0;
|
|---|
| 815 | int typeSize = sizeof(int);
|
|---|
| 816 | unsigned result;
|
|---|
| 817 | int operand;
|
|---|
| 818 |
|
|---|
| 819 | $CUDA_GENERIC_COND_REDUCTION_BODY(operand == 0, 2 * result, 2 * result + 1);
|
|---|
| 820 |
|
|---|
| 821 | return result;
|
|---|
| 822 | }
|
|---|
| 823 |
|
|---|
| 824 | /*
|
|---|
| 825 | int $cuda__ballot_sync(unsigned mask, int value, $cuda_lane_t lane) {
|
|---|
| 826 | $cuda_warp_tag tag = $CUDA_WARP_TAG_ballot_sync;
|
|---|
| 827 | int initialValue = 0;
|
|---|
| 828 | int result;
|
|---|
| 829 |
|
|---|
| 830 | if (warp->reductionLane == -1) {
|
|---|
| 831 | warp->reductionLane = laneID;
|
|---|
| 832 |
|
|---|
| 833 | result = initialValue;
|
|---|
| 834 | for (int i = 0; i < warp->size; i++) {
|
|---|
| 835 | int operand;
|
|---|
| 836 | if (i == laneID) {
|
|---|
| 837 | operand = value;
|
|---|
| 838 | } else {
|
|---|
| 839 | $local_end();
|
|---|
| 840 | $when(!warp->alive[i] || $comm_probe(comm, i, tag)) $local_start();
|
|---|
| 841 |
|
|---|
| 842 | if (!warp->alive[i]) {
|
|---|
| 843 | operand = initialValue;
|
|---|
| 844 | } else {
|
|---|
| 845 | $local_end();
|
|---|
| 846 | $message_unpack($comm_dequeue(comm, i, tag), &operand, sizeof(int));
|
|---|
| 847 | $local_start();
|
|---|
| 848 | }
|
|---|
| 849 | }
|
|---|
| 850 |
|
|---|
| 851 | result = 2 * result + (operand == 0 ? 0 : 1);
|
|---|
| 852 | }
|
|---|
| 853 |
|
|---|
| 854 | warp->reductionLane = -1;
|
|---|
| 855 |
|
|---|
| 856 | for (int i = 0; i< warp->size; i++) {
|
|---|
| 857 | if (i != laneID && warp->alive[i]) {
|
|---|
| 858 | $comm_enqueue(comm, $message_pack(laneID, i, tag, &result, sizeof(int)));
|
|---|
| 859 | }
|
|---|
| 860 | }
|
|---|
| 861 | } else {
|
|---|
| 862 | int reductionLane = warp->reductionLane;
|
|---|
| 863 | $comm_enqueue(comm, $message_pack(laneID, reductionLane, tag, &value, sizeof(int)));
|
|---|
| 864 | $local_end();
|
|---|
| 865 | $message_unpack($comm_dequeue(comm, reductionLane, tag), &result, sizeof(int));
|
|---|
| 866 | $local_start();
|
|---|
| 867 | }
|
|---|
| 868 |
|
|---|
| 869 | return result;
|
|---|
| 870 | }
|
|---|
| 871 | */
|
|---|
| 872 |
|
|---|
| 873 | //////////////////////////////////
|
|---|
| 874 | // Generated code from kernel_1 //
|
|---|
| 875 | //////////////////////////////////
|
|---|
| 876 |
|
|---|
| 877 | typedef struct {
|
|---|
| 878 | dim3 gridDim;
|
|---|
| 879 | dim3 blockDim;
|
|---|
| 880 | size_t $cudaMemSize;
|
|---|
| 881 | cudaStream_t $cudaStream;
|
|---|
| 882 | float* A;
|
|---|
| 883 | const float* B;
|
|---|
| 884 | float* C;
|
|---|
| 885 | int numElements;
|
|---|
| 886 | } $cuda_kernel_1_data;
|
|---|
| 887 |
|
|---|
| 888 | void $cuda_reveal_kernel_1_args($cuda_kernel_1_data* args) {
|
|---|
| 889 | args->A = $reveal(args->A);
|
|---|
| 890 | args->B = $reveal(args->B);
|
|---|
| 891 | args->C = $reveal(args->C);
|
|---|
| 892 | }
|
|---|
| 893 |
|
|---|
| 894 | void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim,
|
|---|
| 895 | size_t $cudaMemSize, cudaStream_t $cudaStream,
|
|---|
| 896 | float* A, const float* B, float* C, int numElements) {
|
|---|
| 897 | $cuda_kernel_1_data args;
|
|---|
| 898 | args.gridDim = gridDim;
|
|---|
| 899 | args.blockDim = blockDim;
|
|---|
| 900 | args.$cudaMemSize = $cudaMemSize;
|
|---|
| 901 | args.$cudaStream = $cudaStream;
|
|---|
| 902 | args.A = A;
|
|---|
| 903 | args.B = B;
|
|---|
| 904 | args.C = C;
|
|---|
| 905 | args.numElements = numElements;
|
|---|
| 906 |
|
|---|
| 907 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
|
|---|
| 908 | $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
|
|---|
| 909 | }
|
|---|
| 910 |
|
|---|
| 911 | void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
|
|---|
| 912 | float *A, const float *B, float *C, int numElements) {
|
|---|
| 913 | $cuda_kernel_instance_t $kernel = $create_kernel_instance($here, gridDim, blockDim);
|
|---|
| 914 | void $cuda_block(uint3 blockIdx) {
|
|---|
| 915 | int $numThreads = (blockDim.x * blockDim.y) * blockDim.z;
|
|---|
| 916 | int $numWarps = ($numThreads - 1)/warpSize + 1;
|
|---|
| 917 | $scope $block_root = $here;
|
|---|
| 918 | $gbarrier $cuda_block_barrier = $gbarrier_create($block_root, $numThreads);
|
|---|
| 919 |
|
|---|
| 920 | $cuda_warp_t $warps[$numWarps];
|
|---|
| 921 | for (int i = 0; i < $numWarps - 1; i++) {
|
|---|
| 922 | $warps[i] = $create_cuda_warp($block_root, warpSize);
|
|---|
| 923 | }
|
|---|
| 924 | $warps[$numWarps-1] = $create_cuda_warp($block_root, (($numThreads - 1) % warpSize) + 1);
|
|---|
| 925 |
|
|---|
| 926 | void $cuda_thread(uint3 threadIdx) {
|
|---|
| 927 | $local_start();
|
|---|
| 928 | int _cuda_tid = $dim3_index(blockDim, threadIdx);
|
|---|
| 929 | int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
|
|---|
| 930 | //$clear_mem_sets($kernel, _cuda_kid);
|
|---|
| 931 | $barrier $cuda_thread_barrier = $barrier_create($here, $cuda_block_barrier, _cuda_tid);
|
|---|
| 932 | $cuda_lane_t $lane = $create_cuda_lane($here, $warps[_cuda_tid / warpSize], _cuda_tid % warpSize);
|
|---|
| 933 |
|
|---|
| 934 | //$read_set_push();
|
|---|
| 935 | //$write_set_push();
|
|---|
| 936 |
|
|---|
| 937 | // Kernel REDUCTION start
|
|---|
| 938 | /*
|
|---|
| 939 | int lane = threadIdx.x % warpSize;
|
|---|
| 940 | int thisWarpSize = warpSize;
|
|---|
| 941 | if (threadIdx.x - lane + warpSize > blockDim.x) {
|
|---|
| 942 | thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
|
|---|
| 943 | }
|
|---|
| 944 |
|
|---|
| 945 | int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|---|
| 946 | int warpStart = i - lane;
|
|---|
| 947 | printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
|
|---|
| 948 | int remainingElements = numElements;
|
|---|
| 949 |
|
|---|
| 950 | while (remainingElements > 1) {
|
|---|
| 951 | //printf("%d,%d - remainingElements: %d\n", blockIdx.x, threadIdx.x, remainingElements);
|
|---|
| 952 | if (remainingElements < numElements) {
|
|---|
| 953 | // __syncThreads()
|
|---|
| 954 | //printf("%d,%d - entering barrier\n", blockIdx.x, threadIdx.x);
|
|---|
| 955 |
|
|---|
| 956 | $cuda_barrier($kernel, _cuda_kid, $cuda_thread_barrier);
|
|---|
| 957 | //printf("%d,%d - exiting barrier\n", blockIdx.x, threadIdx.x);
|
|---|
| 958 | }
|
|---|
| 959 |
|
|---|
| 960 | if (warpStart + 1 < remainingElements) {
|
|---|
| 961 | float val = i < numElements ? A[i] : 0;
|
|---|
| 962 |
|
|---|
| 963 | for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
|---|
| 964 | float tmp = __shfl_down_sync(0, val, offset);
|
|---|
| 965 | if (lane + offset < thisWarpSize) {
|
|---|
| 966 | val += tmp;
|
|---|
| 967 | }
|
|---|
| 968 | }
|
|---|
| 969 |
|
|---|
| 970 | if (i < numElements) {
|
|---|
| 971 | A[i] = val;
|
|---|
| 972 | }
|
|---|
| 973 | }
|
|---|
| 974 |
|
|---|
| 975 | i *= warpSize;
|
|---|
| 976 | //warpStart *= warpSize;
|
|---|
| 977 | remainingElements = ((remainingElements - 1) / warpSize) + 1;
|
|---|
| 978 | }
|
|---|
| 979 |
|
|---|
| 980 | if (i == 0) {
|
|---|
| 981 | *C = A[0];
|
|---|
| 982 | }
|
|---|
| 983 | // Kernel REDUCTION end
|
|---|
| 984 | */
|
|---|
| 985 | // Kernel BALLOT TEST start
|
|---|
| 986 |
|
|---|
| 987 | int i = threadIdx.x;
|
|---|
| 988 | if (i < numElements) {
|
|---|
| 989 | int result = __ballot_sync(~0, A[i] > 0);
|
|---|
| 990 | if (i == 0) {
|
|---|
| 991 | printf("Result: %d\n", result);
|
|---|
| 992 | *C = 0;
|
|---|
| 993 | while(result > 0) {
|
|---|
| 994 | if (result % 2)
|
|---|
| 995 | *C += 1;
|
|---|
| 996 | result /= 2;
|
|---|
| 997 | }
|
|---|
| 998 | printf("done calculating result\n");
|
|---|
| 999 | }
|
|---|
| 1000 | }
|
|---|
| 1001 | // Kernel BALLOT TEST end
|
|---|
| 1002 | //$check_data_race($kernel, _cuda_kid);
|
|---|
| 1003 | //$read_set_pop();
|
|---|
| 1004 | //$write_set_pop();
|
|---|
| 1005 | $barrier_destroy($cuda_thread_barrier);
|
|---|
| 1006 | $destroy_cuda_lane($lane);
|
|---|
| 1007 | $local_end();
|
|---|
| 1008 | }
|
|---|
| 1009 | $cuda_run_and_wait_on_procs(blockDim, $cuda_thread);
|
|---|
| 1010 | $gbarrier_destroy($cuda_block_barrier);
|
|---|
| 1011 |
|
|---|
| 1012 | for (int i = 0; i < $numWarps; i++) {
|
|---|
| 1013 | $destroy_cuda_warp($warps[i]);
|
|---|
| 1014 | }
|
|---|
| 1015 | }
|
|---|
| 1016 | $cuda_run_and_wait_on_procs(gridDim, $cuda_block);
|
|---|
| 1017 | $destroy_kernel_instance($kernel);
|
|---|
| 1018 | }
|
|---|
| 1019 |
|
|---|
| 1020 | void $cuda_kernel_1_proc ($message request, $cuda_op_state_t opState, cudaStream_t cudaStream) {
|
|---|
| 1021 | $when(opState->start);
|
|---|
| 1022 |
|
|---|
| 1023 | $cuda_kernel_1_data args;
|
|---|
| 1024 | $message_unpack(request, &args, sizeof($cuda_kernel_1_data));
|
|---|
| 1025 | $cuda_reveal_kernel_1_args(&args);
|
|---|
| 1026 |
|
|---|
| 1027 | $cuda_kernel_1(args.gridDim, args.blockDim, args.$cudaMemSize, args.A, args.B, args.C, args.numElements);
|
|---|
| 1028 | $stream_dequeue(cudaStream);
|
|---|
| 1029 | }
|
|---|
| 1030 |
|
|---|
| 1031 | /////////////////
|
|---|
| 1032 | // CUDA "file" //
|
|---|
| 1033 | /////////////////
|
|---|
| 1034 |
|
|---|
| 1035 | void $cuda_main() {
|
|---|
| 1036 |
|
|---|
| 1037 | // Device Variables
|
|---|
| 1038 |
|
|---|
| 1039 | $scope $cuda_scope = $here;
|
|---|
| 1040 |
|
|---|
| 1041 | $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
|
|---|
| 1042 | $cuda_context $cuda_global_context;
|
|---|
| 1043 | cudaStream_t $cuda_default_stream;
|
|---|
| 1044 |
|
|---|
| 1045 | // Helper function to get the default stream if passed NULL, and just returns stream otherwise
|
|---|
| 1046 | // Currently unused until we support streams other than the default one.
|
|---|
| 1047 | cudaStream_t $default_stream_if_null(cudaStream_t stream) {
|
|---|
| 1048 | return stream == NULL ? $cuda_default_stream : stream;
|
|---|
| 1049 | }
|
|---|
| 1050 |
|
|---|
| 1051 | // Device Logic
|
|---|
| 1052 |
|
|---|
| 1053 | $cuda_stream_node_t defaultStreamNode = $create_new_stream_node($cuda_scope);
|
|---|
| 1054 | $cuda_default_stream = defaultStreamNode->stream;
|
|---|
| 1055 |
|
|---|
| 1056 | $cuda_global_context.head = defaultStreamNode;
|
|---|
| 1057 | $cuda_global_context.numStreams = 1;
|
|---|
| 1058 |
|
|---|
| 1059 | while (true) {
|
|---|
| 1060 | $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
|
|---|
| 1061 | $message response;
|
|---|
| 1062 | const int tag = $message_tag(request);
|
|---|
| 1063 |
|
|---|
| 1064 | switch(tag) {
|
|---|
| 1065 | case $CUDA_TAG_SCOPE_REQUEST :
|
|---|
| 1066 | response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
|
|---|
| 1067 | break;
|
|---|
| 1068 | case $CUDA_TAG_cudaFree :
|
|---|
| 1069 | response = $cuda_free(request);
|
|---|
| 1070 | break;
|
|---|
| 1071 | case $CUDA_TAG_cudaMemcpy :
|
|---|
| 1072 | response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, false);
|
|---|
| 1073 | break;
|
|---|
| 1074 | case $CUDA_TAG_cudaMemcpyAsync :
|
|---|
| 1075 | response = $cuda_memcpy($cuda_scope, $cuda_default_stream, request, true);
|
|---|
| 1076 | break;
|
|---|
| 1077 | case $CUDA_TAG_LAUNCH_kernel_1 :
|
|---|
| 1078 | $stream_enqueue($cuda_scope, $cuda_default_stream, request, $cuda_kernel_1_proc);
|
|---|
| 1079 |
|
|---|
| 1080 | response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0);
|
|---|
| 1081 | break;
|
|---|
| 1082 | case $CUDA_TAG_TEARDOWN : {
|
|---|
| 1083 | $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
|
|---|
| 1084 | $wait(destructor);
|
|---|
| 1085 | $comm_destroy($cuda_device_comm);
|
|---|
| 1086 | return;
|
|---|
| 1087 | }
|
|---|
| 1088 | default :
|
|---|
| 1089 | $assert(false, "Unknown CUDA request");
|
|---|
| 1090 | }
|
|---|
| 1091 |
|
|---|
| 1092 | $comm_enqueue($cuda_device_comm, response);
|
|---|
| 1093 | }
|
|---|
| 1094 | }
|
|---|
| 1095 |
|
|---|
| 1096 | ///////////////
|
|---|
| 1097 | // Host file //
|
|---|
| 1098 | ///////////////
|
|---|
| 1099 |
|
|---|
| 1100 | $input int N;
|
|---|
| 1101 | $assume (N > 0);
|
|---|
| 1102 | $input float A[N];
|
|---|
| 1103 | // Currently unused but left in to save time
|
|---|
| 1104 | $input float B[N];
|
|---|
| 1105 |
|
|---|
| 1106 | void $host_main() {
|
|---|
| 1107 | int size = N * sizeof(float);
|
|---|
| 1108 | int numBlocks = 1;
|
|---|
| 1109 | //int numThreads = N%2 == 0? N/2 : (N+1)/2;
|
|---|
| 1110 | int numThreads = warpSize;
|
|---|
| 1111 |
|
|---|
| 1112 | float* cuda_A;
|
|---|
| 1113 | // cudaMalloc((void **)&cuda_A, size);
|
|---|
| 1114 | {
|
|---|
| 1115 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1116 | cuda_A = $hide((float*)$malloc(deviceScope, size));
|
|---|
| 1117 | }
|
|---|
| 1118 | cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
|
|---|
| 1119 |
|
|---|
| 1120 | float* cuda_B;
|
|---|
| 1121 | // cudaMalloc((void **)&cuda_B, size);
|
|---|
| 1122 | {
|
|---|
| 1123 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1124 | cuda_B = $hide((float*)$malloc(deviceScope, size));
|
|---|
| 1125 | }
|
|---|
| 1126 | cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);
|
|---|
| 1127 |
|
|---|
| 1128 | float* cuda_C;
|
|---|
| 1129 | // cudaMalloc((void **)&cuda_C, sizeof(float));
|
|---|
| 1130 | {
|
|---|
| 1131 | $scope deviceScope = $cuda_host_request_device_scope();
|
|---|
| 1132 | cuda_C = $hide((float*)$malloc(deviceScope, sizeof(float)));
|
|---|
| 1133 | }
|
|---|
| 1134 |
|
|---|
| 1135 | dim3 gridDim = {numBlocks, 1, 1};
|
|---|
| 1136 | dim3 blockDim = {numThreads, 1, 1};
|
|---|
| 1137 | // kernel_1<<<gridDim, blockDim>>>(cuda_A, cuda_B, cuda_C, N);
|
|---|
| 1138 | $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);
|
|---|
| 1139 |
|
|---|
| 1140 | // Checking correctness
|
|---|
| 1141 | float* C = (float *)malloc(size);
|
|---|
| 1142 |
|
|---|
| 1143 | cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
|
|---|
| 1144 |
|
|---|
| 1145 | // REDUCTION ASSERTION
|
|---|
| 1146 | /*
|
|---|
| 1147 | float sum = 0;
|
|---|
| 1148 | for(int i = 0; i < N; i++)
|
|---|
| 1149 | sum += A[i];
|
|---|
| 1150 |
|
|---|
| 1151 | $assert(*C == sum);
|
|---|
| 1152 | */
|
|---|
| 1153 | // BALLOT ASSERTION
|
|---|
| 1154 | float count = 0;
|
|---|
| 1155 | for (int i = 0; i < N; i++) {
|
|---|
| 1156 | if (A[i] > 0)
|
|---|
| 1157 | count++;
|
|---|
| 1158 | }
|
|---|
| 1159 | $assert(*C == count);
|
|---|
| 1160 |
|
|---|
| 1161 | free(C);
|
|---|
| 1162 |
|
|---|
| 1163 | cudaFree(cuda_A);
|
|---|
| 1164 | cudaFree(cuda_B);
|
|---|
| 1165 | cudaFree(cuda_C);
|
|---|
| 1166 |
|
|---|
| 1167 | }
|
|---|
| 1168 |
|
|---|
| 1169 | int main() {
|
|---|
| 1170 | $proc host = $spawn $host_main();
|
|---|
| 1171 | $proc cuda = $spawn $cuda_main();
|
|---|
| 1172 | $wait(host);
|
|---|
| 1173 | $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
|
|---|
| 1174 | $comm_destroy($cuda_host_comm);
|
|---|
| 1175 | $wait(cuda);
|
|---|
| 1176 | $gcomm_destroy($cuda_gcomm, NULL);
|
|---|
| 1177 | }
|
|---|