| 1 | #include <civlc.cvh>
|
|---|
| 2 | /* This header file contains useful helper functions for manipulating
|
|---|
| 3 | * the CIVL versions of various Cuda objects.
|
|---|
| 4 | */
|
|---|
| 5 |
|
|---|
| 6 | #include <string.h>
|
|---|
| 7 | // To print for debugging
|
|---|
| 8 | #include <stdio.h>
|
|---|
| 9 | #include <stdlib.h>
|
|---|
| 10 | #include <concurrency.cvh>
|
|---|
| 11 | #include <seq.cvh>
|
|---|
| 12 | #include <mem.cvh>
|
|---|
| 13 | #include <civl-cuda.cvh>
|
|---|
| 14 | #include <comm.cvh>
|
|---|
| 15 | #pragma CIVL ACSL
|
|---|
| 16 |
|
|---|
/* Type used to represent one instance of a Cuda kernel (queued, running,
 * or finished).
 * TODO: Update with read/write sets
 */
struct $cuda_kernel_instance {
  $proc process;              // the CIVL process executing this kernel; $proc_null until spawned
  $cuda_kernel_status status; // waiting -> running -> finished (see status constants used below)
  // Per-thread read and write sets, indexed by the flattened thread id;
  // consulted by $check_data_race to detect conflicting accesses.
  $mem read_sets[];
  $mem write_sets[];
  // Total number of threads in the kernel (grid cells * threads per block).
  int size;
};
|
|---|
| 29 |
|
|---|
/* A type that wraps a kernel instance for insertion into a singly-linked
 * list; streams keep these newest-first (see _CUstream.mostRecent).
 */
struct $cuda_kernel_instance_node {
  $cuda_kernel_instance_t *instance;  // payload; NULL until assigned by the creator's caller
  $cuda_kernel_instance_node_t *next; // next (older) node in the list, or NULL at the tail
};
|
|---|
| 36 |
|
|---|
| 37 | /* $cuda_kernel_instance_node_t interface
|
|---|
| 38 | */
|
|---|
| 39 | $cuda_kernel_instance_t *$cuda_get_instance($cuda_kernel_instance_node_t *node) {
|
|---|
| 40 | return node->instance;
|
|---|
| 41 | }
|
|---|
| 42 |
|
|---|
/* A type used to represent a Cuda stream: a newest-first linked list of
 * the kernel instances enqueued on it.
 */
struct _CUstream {
  $cuda_kernel_instance_node_t *mostRecent; // list head: the most recently enqueued instance
  _Bool usable;                             // indicates whether or not this stream
                                            // has been marked for deletion
                                            // ($false once marked; checked on enqueue/destroy)
};
|
|---|
| 50 |
|
|---|
| 51 | /* _CUstream interface
|
|---|
| 52 | */
|
|---|
| 53 | $cuda_kernel_instance_node_t *$cuda_get_most_recent(cudaStream_t s) {
|
|---|
| 54 | return s->mostRecent;
|
|---|
| 55 | }
|
|---|
| 56 |
|
|---|
| 57 | _Bool $cuda_is_usable(cudaStream_t s) {
|
|---|
| 58 | return s->usable;
|
|---|
| 59 | }
|
|---|
| 60 |
|
|---|
| 61 | void $cuda_set_usable(cudaStream_t s, _Bool b) {
|
|---|
| 62 | s->usable = b;
|
|---|
| 63 | }
|
|---|
| 64 |
|
|---|
/* A type that wraps a stream for insertion into the context's
 * singly-linked list of streams (see $cuda_context.headNode).
 */
struct $cuda_stream_node {
  cudaStream_t stream;       // the wrapped stream; NULL until assigned
  $cuda_stream_node_t *next; // next node in the context's stream list, or NULL at the tail
};
|
|---|
| 71 |
|
|---|
| 72 | /* $cuda_stream_node_t encapsulation functions
|
|---|
| 73 | */
|
|---|
| 74 | void $cuda_set_stream($cuda_stream_node_t* node, cudaStream_t stream) {
|
|---|
| 75 | node->stream = stream;
|
|---|
| 76 | }
|
|---|
| 77 |
|
|---|
| 78 | void $cuda_set_next($cuda_stream_node_t* node, $cuda_stream_node_t* next) {
|
|---|
| 79 | node->next = next;
|
|---|
| 80 | }
|
|---|
| 81 |
|
|---|
/* A type used to represent a Cuda event: a snapshot of the kernel
 * instances that must finish before the event is complete.
 */
struct _CUevent {
  $cuda_kernel_instance_t **instances; // malloc'ed array of instances waited on; NULL when empty
  int numInstances;                    // number of entries in instances
};
|
|---|
| 88 |
|
|---|
| 89 | /* _CUevent encapsulation functions
|
|---|
| 90 | */
|
|---|
| 91 | $cuda_kernel_instance_t **$cuda_get_instances(cudaEvent_t e) {
|
|---|
| 92 | return e->instances;
|
|---|
| 93 | }
|
|---|
| 94 |
|
|---|
| 95 | void $cuda_set_instances(cudaEvent_t e, $cuda_kernel_instance_t** instances, int numInstances) {
|
|---|
| 96 | e->instances = instances;
|
|---|
| 97 | e->numInstances = numInstances;
|
|---|
| 98 | }
|
|---|
| 99 |
|
|---|
| 100 | int $cuda_get_num_instances(cudaEvent_t e) {
|
|---|
| 101 | return e->numInstances;
|
|---|
| 102 | }
|
|---|
| 103 |
|
|---|
/* A type representing the state of a Cuda device.
 */
struct $cuda_context {
  $cuda_stream_node_t *headNode; // head of the linked list of user-created streams; NULL if none
  cudaStream_t nullStream;       // the default (NULL) stream; created by $cuda_init
  int numStreams;                // number of user-created streams in the list
};
|
|---|
| 111 |
|
|---|
| 112 | /* $cuda_context_t encapsulation functions
|
|---|
| 113 | */
|
|---|
| 114 | int $cuda_get_num_streams($cuda_context_t *_context) {
|
|---|
| 115 | return _context->numStreams;
|
|---|
| 116 | }
|
|---|
| 117 |
|
|---|
| 118 | $cuda_stream_node_t *$cuda_get_head_node($cuda_context_t *_context) {
|
|---|
| 119 | return _context->headNode;
|
|---|
| 120 | }
|
|---|
| 121 |
|
|---|
| 122 | cudaStream_t $cuda_get_null_stream($cuda_context_t *_context) {
|
|---|
| 123 | return _context->nullStream;
|
|---|
| 124 | }
|
|---|
| 125 |
|
|---|
| 126 | void $cuda_add_new_stream($cuda_context_t *_context, $cuda_stream_node_t *newNode) {
|
|---|
| 127 | _context->headNode = newNode;
|
|---|
| 128 | _context->numStreams++;
|
|---|
| 129 | }
|
|---|
| 130 |
|
|---|
| 131 | /* Computes the one dimensional index of a grid cell at a given location
|
|---|
| 132 | * in a three dimensional grid of a given size
|
|---|
| 133 | */
|
|---|
| 134 | int $cuda_index (dim3 size, uint3 location) {
|
|---|
| 135 | return location.x + size.x * (location.y + size.y * location.z);
|
|---|
| 136 | }
|
|---|
| 137 |
|
|---|
| 138 | /* Computes the one dimensional index of a specific thread in the grid given
|
|---|
| 139 | * the grid dimension, block dimension, block index, and thread index
|
|---|
| 140 | */
|
|---|
| 141 | int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
|
|---|
| 142 | return $cuda_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $cuda_index(bDim, tIdx);
|
|---|
| 143 | }
|
|---|
| 144 |
|
|---|
| 145 | /* Lifts a single integer x into a three dimensional vector representing
|
|---|
| 146 | * a one dimensional grid of length x
|
|---|
| 147 | */
|
|---|
| 148 | dim3 $cuda_to_dim3(int x) {
|
|---|
| 149 | dim3 d = { x, 1, 1 };
|
|---|
| 150 |
|
|---|
| 151 | return d;
|
|---|
| 152 | }
|
|---|
| 153 |
|
|---|
/* Given a three dimensional vector representing a grid of size dim,
 * create and destroy a process, in parallel, for each cell in the grid.
 * The location of the cell is passed to the spawning function.
 * A zero extent in any dimension yields an empty range, so no processes
 * are spawned at all.
 */
void $cuda_run_procs(dim3 dim, void spawningFunction(uint3)) {
  // Each range is empty when its extent is 0 (0 .. -1 is the empty $range).
  $range rx = 0 .. (dim.x == 0 ? -1 : dim.x - 1);
  $range ry = 0 .. (dim.y == 0 ? -1 : dim.y - 1);
  $range rz = 0 .. (dim.z == 0 ? -1 : dim.z - 1);
  $domain(3) dom = ($domain){rx, ry, rz};
  // One process per (x, y, z) tuple in the domain, run in parallel.
  $parfor(int x,y,z : dom){
    uint3 id = { x, y, z };
    spawningFunction(id);
  }
}
|
|---|
| 168 |
|
|---|
| 169 | // ------------------------------------------------
|
|---|
| 170 |
|
|---|
| 171 | /* $wait on a given process is it is non-null
|
|---|
| 172 | */
|
|---|
| 173 | void $cuda_try_wait($proc p) {
|
|---|
| 174 | if (p != $proc_null) {
|
|---|
| 175 | $yield();
|
|---|
| 176 | $wait(p);
|
|---|
| 177 | }
|
|---|
| 178 | }
|
|---|
| 179 |
|
|---|
/* The current state of the GPU: no user streams yet; the null stream
 * stays NULL until $cuda_init() creates it.
 */
$cuda_context_t $cuda_current_context = {
  .headNode = NULL,
  .nullStream = NULL,
  .numStreams = 0
};
|
|---|
| 187 |
|
|---|
/* malloc and initialize a new $cuda_kernel_instance_t: no process yet,
 * status "waiting", and one empty read set and write set per thread.
 */

// TODO: Determine if passing parameters is better or not
$cuda_kernel_instance_t *$cuda_kernel_instance_create(dim3 gDim, dim3 bDim) {
  //printf("mallocing kernel instance\n");
  $cuda_kernel_instance_t *i = ($cuda_kernel_instance_t*)malloc(sizeof($cuda_kernel_instance_t));

  i->process = $proc_null; // filled in by $cuda_enqueue_kernel when the kernel is spawned
  i->status = $cuda_kernel_status_waiting;
  // Total thread count: grid cells times threads per block.
  int threadNum = gDim.x * gDim.y * gDim.z * bDim.x * bDim.y * bDim.z;
  i->size = threadNum;
  // TODO: either sequentially or non-sequentially?
  // Initialize both per-thread sequences to threadNum copies of the empty $mem.
  $mem empty = $mem_empty();
  $seq_init(&i->read_sets, threadNum, &empty);
  $seq_init(&i->write_sets, threadNum, &empty);
  return i;
}
|
|---|
| 206 |
|
|---|
| 207 | /* cleanup and free a given $cuda_kernel_instance_t
|
|---|
| 208 | */
|
|---|
| 209 | void $cuda_kernel_instance_destroy($cuda_kernel_instance_t *i) {
|
|---|
| 210 | $cuda_try_wait(i->process);
|
|---|
| 211 | //printf("freeing kernel instance\n");
|
|---|
| 212 | $free(i);
|
|---|
| 213 | }
|
|---|
| 214 |
|
|---|
| 215 | /* malloc and initialize a new $cuda_kernel_instance_node_t
|
|---|
| 216 | */
|
|---|
| 217 | $cuda_kernel_instance_node_t *$cuda_kernel_instance_node_create() {
|
|---|
| 218 | //printf("mallocing kernel instance node\n");
|
|---|
| 219 | $cuda_kernel_instance_node_t *node = ($cuda_kernel_instance_node_t*)malloc(sizeof($cuda_kernel_instance_node_t));
|
|---|
| 220 |
|
|---|
| 221 | node->instance = NULL;
|
|---|
| 222 | node->next = NULL;
|
|---|
| 223 | return node;
|
|---|
| 224 | }
|
|---|
| 225 |
|
|---|
/* cleanup and free a given $cuda_kernel_instance_node_t: destroys the
 * wrapped instance (waiting for its process first), then frees the node.
 * NOTE(review): assumes node->instance is non-NULL; nodes are created
 * with a NULL instance, so callers must have assigned one — confirm.
 */
void $cuda_kernel_instance_node_destroy($cuda_kernel_instance_node_t *node) {
  $cuda_kernel_instance_destroy(node->instance);
  //printf("freeing kernel instance node\n");
  $free(node);
}
|
|---|
| 233 |
|
|---|
| 234 | /* malloc and initialize a new stream
|
|---|
| 235 | */
|
|---|
| 236 | cudaStream_t $cuda_stream_create() {
|
|---|
| 237 | cudaStream_t s;
|
|---|
| 238 |
|
|---|
| 239 | //printf("mallocing cuda stream\n");
|
|---|
| 240 | s = (cudaStream_t)malloc(sizeof(_CUstream));
|
|---|
| 241 | s->mostRecent = $cuda_kernel_instance_node_create();
|
|---|
| 242 | dim3 arbitrary = { 1, 1, 1 };
|
|---|
| 243 | s->mostRecent->instance = $cuda_kernel_instance_create(arbitrary, arbitrary);
|
|---|
| 244 | s->mostRecent->instance->status = $cuda_kernel_status_finished;
|
|---|
| 245 | s->usable = $true;
|
|---|
| 246 | return s;
|
|---|
| 247 | }
|
|---|
| 248 |
|
|---|
/* block until the most recently enqueued process on the given stream
 * has terminated (meaning all kernels in that stream have completed)
 */
void $cuda_stream_wait(cudaStream_t s) {
  // Snapshot the current head instance: kernels enqueued after this
  // call must not extend the wait.
  $cuda_kernel_instance_t *mostRecentInstance = s->mostRecent->instance;
  $yield();
  // Block until that instance reports finished.
  $unidirectional_when(mostRecentInstance->status == $cuda_kernel_status_finished);
}
|
|---|
| 257 |
|
|---|
| 258 | /* block until no more streams have kernels executing
|
|---|
| 259 | */
|
|---|
| 260 | void $cuda_stream_wait_all() {
|
|---|
| 261 | $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
|
|---|
| 262 |
|
|---|
| 263 | while (curNode != NULL) {
|
|---|
| 264 | $cuda_stream_wait(curNode->stream);
|
|---|
| 265 | curNode = curNode->next;
|
|---|
| 266 | }
|
|---|
| 267 | }
|
|---|
| 268 |
|
|---|
| 269 | /* cleanup and free a given stream
|
|---|
| 270 | */
|
|---|
| 271 | void $cuda_stream_destroy(cudaStream_t s) {
|
|---|
| 272 | $cuda_kernel_instance_node_t *curNode = s->mostRecent;
|
|---|
| 273 | $cuda_kernel_instance_node_t *nextNode;
|
|---|
| 274 |
|
|---|
| 275 | while (curNode != NULL) {
|
|---|
| 276 | nextNode = curNode->next;
|
|---|
| 277 | $cuda_kernel_instance_node_destroy(curNode);
|
|---|
| 278 | curNode = nextNode;
|
|---|
| 279 | }
|
|---|
| 280 | //printf("freeing cuda stream\n");
|
|---|
| 281 | $free(s);
|
|---|
| 282 | }
|
|---|
| 283 |
|
|---|
| 284 | /* malloc and initialize a new $cuda_stream_node_t
|
|---|
| 285 | */
|
|---|
| 286 | $cuda_stream_node_t *$cuda_stream_node_create() {
|
|---|
| 287 | //printf("mallocing cuda stream node\n");
|
|---|
| 288 | $cuda_stream_node_t *node = ($cuda_stream_node_t*)malloc(sizeof($cuda_stream_node_t));
|
|---|
| 289 |
|
|---|
| 290 | node->stream = NULL;
|
|---|
| 291 | node->next = NULL;
|
|---|
| 292 | return node;
|
|---|
| 293 | }
|
|---|
| 294 |
|
|---|
| 295 | /* cleanup and free a given $cuda_stream_node_t
|
|---|
| 296 | */
|
|---|
| 297 | void $cuda_stream_node_destroy($cuda_stream_node_t *node) {
|
|---|
| 298 | $assert((!node->stream->usable));
|
|---|
| 299 | $cuda_stream_destroy(node->stream);
|
|---|
| 300 | //printf("freeing cuda stream node\n");
|
|---|
| 301 | $free(node);
|
|---|
| 302 | }
|
|---|
| 303 |
|
|---|
| 304 | /* destroy all stream nodes contained in the context
|
|---|
| 305 | */
|
|---|
| 306 | void $cuda_stream_node_destroy_all() {
|
|---|
| 307 | $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
|
|---|
| 308 | $cuda_stream_node_t *nextNode;
|
|---|
| 309 |
|
|---|
| 310 | while (curNode != NULL) {
|
|---|
| 311 | nextNode = curNode->next;
|
|---|
| 312 | $cuda_stream_node_destroy(curNode);
|
|---|
| 313 | curNode = nextNode;
|
|---|
| 314 | }
|
|---|
| 315 | $cuda_current_context.headNode = NULL;
|
|---|
| 316 | }
|
|---|
| 317 |
|
|---|
| 318 | /* malloc and initialize a new event
|
|---|
| 319 | */
|
|---|
| 320 | cudaEvent_t $cuda_event_create() {
|
|---|
| 321 | //printf("mallocing event\n");
|
|---|
| 322 | cudaEvent_t e = (cudaEvent_t)malloc(sizeof(_CUevent));
|
|---|
| 323 |
|
|---|
| 324 | e->numInstances = 0;
|
|---|
| 325 | e->instances = NULL;
|
|---|
| 326 | return e;
|
|---|
| 327 | }
|
|---|
| 328 |
|
|---|
/* block until all $cuda_kernel_instance_ts contained in this event have
 * completed
 */
/*@ depends_on \access(e);
  @*/
void $cuda_event_wait(cudaEvent_t e) {
  // Wait for each tracked instance in turn; overall the call returns
  // only when every one of them has reached "finished".
  for (int i = 0; i < e->numInstances; i++) {
    $yield();
    $unidirectional_when(e->instances[i]->status == $cuda_kernel_status_finished);
  }
}
|
|---|
| 340 |
|
|---|
| 341 | /* cleanup and free a given event
|
|---|
| 342 | */
|
|---|
| 343 | void $cuda_event_destroy(cudaEvent_t e) {
|
|---|
| 344 | if (e->instances != NULL) {
|
|---|
| 345 | //printf("freeing instance list a\n");
|
|---|
| 346 | $free(e->instances);
|
|---|
| 347 | }
|
|---|
| 348 | //printf("freeing event\n");
|
|---|
| 349 | $free(e);
|
|---|
| 350 | }
|
|---|
| 351 |
|
|---|
/* initialize the cuda context. must be called before any cuda functions.
 * Creates the default (null) stream; user streams are added separately
 * via $cuda_add_new_stream.
 */
void $cuda_init() {
  $cuda_current_context.nullStream = $cuda_stream_create();
}
|
|---|
| 357 |
|
|---|
/* cleanup the cuda context. must be called after all cuda functions.
 * Order matters: every stream (including the null stream) is drained
 * before anything is destroyed.
 */
void $cuda_finalize() {
  $cuda_stream_wait_all();
  $cuda_stream_wait($cuda_current_context.nullStream);
  $cuda_stream_node_destroy_all();
  $cuda_stream_destroy($cuda_current_context.nullStream);
}
|
|---|
| 366 |
|
|---|
| 367 | /* returns an array of pointers to the most recently enqueued kernel
|
|---|
| 368 | * of each stream.
|
|---|
| 369 | */
|
|---|
| 370 | $cuda_kernel_instance_t **$cuda_all_most_recent_kernels() {
|
|---|
| 371 | int n = $cuda_current_context.numStreams + 1;
|
|---|
| 372 | $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
|
|---|
| 373 | //printf("mallocing instance list a\n");
|
|---|
| 374 | $cuda_kernel_instance_t **insts = ($cuda_kernel_instance_t**)malloc(n * sizeof($cuda_kernel_instance_t*)) ;
|
|---|
| 375 |
|
|---|
| 376 | insts[0] = $cuda_current_context.nullStream->mostRecent->instance;
|
|---|
| 377 | for (int i = 1; i < n; i++, curNode = curNode->next) {
|
|---|
| 378 | insts[i] = curNode->stream->mostRecent->instance;
|
|---|
| 379 | }
|
|---|
| 380 | return insts;
|
|---|
| 381 | }
|
|---|
| 382 |
|
|---|
/* create a kernel instance for the given function k, and enqueue it
 * onto the given stream. An event capturing the instances the new
 * kernel must wait for is handed to the spawned kernel process, which
 * presumably releases it via $cuda_wait_in_queue — confirm in callers.
 */
void $cuda_enqueue_kernel(cudaStream_t stream, void (*k)($cuda_kernel_instance_t*, cudaEvent_t), dim3 gDim,
    dim3 bDim) {
  cudaStream_t s;
  cudaEvent_t e = $cuda_event_create();
  $cuda_kernel_instance_node_t *newNode = $cuda_kernel_instance_node_create();

  if (stream == NULL) {
    // Null-stream launch: must wait for the tail kernel of every stream.
    e->numInstances = $cuda_current_context.numStreams + 1;
    e->instances = $cuda_all_most_recent_kernels();
    s = $cuda_current_context.nullStream;
  } else {
    // Regular stream: wait for this stream's tail and the null stream's tail.
    e->numInstances = 2;
    //printf("mallocing instance list b\n");
    e->instances = ($cuda_kernel_instance_t**)malloc(2 * sizeof($cuda_kernel_instance_t*)) ;
    e->instances[0] = stream->mostRecent->instance;
    e->instances[1] = $cuda_current_context.nullStream->mostRecent->instance;
    s = stream;
  }
  // Enqueueing onto a stream marked for deletion is an error.
  $assert((s->usable));
  newNode->instance = $cuda_kernel_instance_create(gDim, bDim);
  // Push the new instance at the head of the stream's list, then spawn
  // the kernel process on it.
  newNode->next = s->mostRecent;
  s->mostRecent = newNode;
  s->mostRecent->instance->process = $spawn k(s->mostRecent->instance, e);
}
|
|---|
| 410 |
|
|---|
/* called by kernel processes. wait on the given event (the predecessor
 * kernels captured at enqueue time), then update the status of the
 * calling kernel to indicate it has finished waiting.
 */
void $cuda_wait_in_queue ($cuda_kernel_instance_t *this, cudaEvent_t e) {
  $cuda_event_wait(e);
  // The event was allocated by $cuda_enqueue_kernel for this kernel
  // alone, so it is released here.
  $cuda_event_destroy(e);
  this->status = $cuda_kernel_status_running;
}
|
|---|
| 419 |
|
|---|
/* called by kernel processes. update the status of the calling kernel
 * to indicate that it has completed execution; $cuda_stream_wait and
 * $cuda_event_wait unblock once this status is observed.
 */
void $cuda_kernel_finish($cuda_kernel_instance_t *k) {
  k->status = $cuda_kernel_status_finished;
}
|
|---|
| 426 |
|
|---|
/* Barrier across the kernel's threads that also performs data-race
 * checking; kernel_id is the calling thread's flat id (the cur_tid
 * index used by $check_data_race).
 * TODO: Finish check_data_race
 */
void $cuda_barrier($cuda_kernel_instance_t *k, int kernel_id, $barrier g) {
  $check_data_race(k, kernel_id);
  // We have to push a new read and write set before the barrier call to ignore its reads and writes
  $read_set_push();
  $write_set_push();
  // Closure run as part of the barrier: resets every thread's published
  // read/write sets so the next epoch starts empty.
  void captured_clear_mems(){
    $clear_all_mem_sets(k);
  }
  $barrier_call_execute(g, captured_clear_mems);
  // Discard the sets accumulated by the barrier machinery itself.
  $read_set_pop();
  $write_set_pop();
}
|
|---|
| 441 |
|
|---|
| 442 | /* TODO: Finish read_write_sets
|
|---|
| 443 | */
|
|---|
| 444 | $atomic_f void $check_data_race($cuda_kernel_instance_t *k, int cur_tid) {
|
|---|
| 445 | //printf("Current id: %d\n", cur_tid);
|
|---|
| 446 | $mem out_s0 = $mem_empty();
|
|---|
| 447 | $mem out_s1 = $mem_empty();
|
|---|
| 448 | $mem cur_mw = $write_set_pop();
|
|---|
| 449 | $mem cur_mr = $read_set_pop();
|
|---|
| 450 |
|
|---|
| 451 | // Update current R/W sets
|
|---|
| 452 | k->write_sets[cur_tid] = cur_mw;
|
|---|
| 453 | k->read_sets[cur_tid] = cur_mr;
|
|---|
| 454 |
|
|---|
| 455 | /*
|
|---|
| 456 | printf("CHECKING DATA RACE %d [\n", cur_tid);
|
|---|
| 457 | for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
|
|---|
| 458 | printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
|
|---|
| 459 | printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
|
|---|
| 460 | }
|
|---|
| 461 | printf("]\n");
|
|---|
| 462 | */
|
|---|
| 463 |
|
|---|
| 464 | // Check data race
|
|---|
| 465 | for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
|
|---|
| 466 | if (tmp_tid == cur_tid) continue;
|
|---|
| 467 |
|
|---|
| 468 | $mem tmp_mr = k->read_sets[tmp_tid];
|
|---|
| 469 | $mem tmp_mw = k->write_sets[tmp_tid];
|
|---|
| 470 |
|
|---|
| 471 | $assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1),
|
|---|
| 472 | "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
|
|---|
| 473 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 474 | $assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1),
|
|---|
| 475 | "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
|
|---|
| 476 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 477 | $assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1),
|
|---|
| 478 | "Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
|
|---|
| 479 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 480 | }
|
|---|
| 481 | // Update current R/W sets
|
|---|
| 482 | //k->write_sets[cur_tid] = $mem_empty();
|
|---|
| 483 | //k->read_sets[cur_tid] = $mem_empty();
|
|---|
| 484 | $read_set_push();
|
|---|
| 485 | $write_set_push();
|
|---|
| 486 | }
|
|---|
| 487 |
|
|---|
| 488 | /* Clears read and write memory sets of the given thread
|
|---|
| 489 | */
|
|---|
| 490 | void $clear_mem_sets($cuda_kernel_instance_t *k, int cur_tid) {
|
|---|
| 491 | k->write_sets[cur_tid] = $mem_empty();
|
|---|
| 492 | k->read_sets[cur_tid] = $mem_empty();
|
|---|
| 493 | }
|
|---|
| 494 |
|
|---|
| 495 | void $clear_all_mem_sets($cuda_kernel_instance_t *k){
|
|---|
| 496 | for(int i = 0; i < k->size; i++)
|
|---|
| 497 | $clear_mem_sets(k, i);
|
|---|
| 498 | }
|
|---|
| 499 |
|
|---|
| 500 | /* Publishes current read a write sets to global arrays. Local sets are not cleared
|
|---|
| 501 | */
|
|---|
| 502 | void $publish($cuda_kernel_instance_t *k, int cur_tid) {
|
|---|
| 503 | k->write_sets[cur_tid] = $write_set_peek();
|
|---|
| 504 | k->read_sets[cur_tid] = $read_set_peek();
|
|---|
| 505 | }
|
|---|
| 506 |
|
|---|
/* Returns 1 iff width is one of the accepted sub-warp widths
 * 2, 4, 8, 16 or 32 (powers of two up to the warp size; note that 1
 * falls outside the checked set), 0 otherwise.
 */
int is_valid_width(int width){
  int candidate = 2;

  while (candidate <= 32) {
    if (width == candidate)
      return 1;
    candidate *= 2;
  }
  return 0;
}
|
|---|
| 515 |
|
|---|
/* Models the value exchange behind the __shfl_*_sync variants for one
 * warp: thread tid requests var from srcLane over the comm channel
 * (requests on tag 0, replies on tag 1); lanes outside mask, or whose
 * source lane is outside mask, receive a havoc'ed (arbitrary) value.
 * warpBarriers holds one $gbarrier per warp of 32 threads.
 */
int exchange_data(unsigned mask, int var, int srcLane, int tid, $comm comm, $gbarrier* warpBarriers){
  // Exclude this function's own accesses from the data-race sets.
  $read_set_push();
  $write_set_push();
  int laneID = tid % 32;
  int warpID = tid / 32;
  int currLaneInMask = (mask >> laneID) & 1;
  int srcLaneInMask = (mask >> srcLane) & 1;
  int dest;
  $gbarrier gbarrier = warpBarriers[warpID];
  // Count the participating lanes (set bits of mask over the barrier's size).
  int numActiveThreads = 0;
  for (int i = 0; i < $get_nprocs(gbarrier); i++){
    numActiveThreads += (mask >> i) & 1;
  }
  if(currLaneInMask){
    $barrier barrier = $barrier_create($here, gbarrier, laneID);
    if(srcLaneInMask){
      // Ask our source lane for its value: tag 0, payload = our tid.
      $message request_message = $message_pack(tid, srcLane, 0, &tid, sizeof(int));
      $comm_enqueue(comm, request_message);
    }
    // Wait until every active lane has posted its request.
    $barrier_call_subset(barrier, numActiveThreads);
    // Answer every request addressed to us: reply with var on tag 1.
    while ($comm_probe(comm, $COMM_ANY_SOURCE, 0)){
      $yield();
      $message recv_request = $comm_dequeue(comm, $COMM_ANY_SOURCE, 0);
      $message_unpack(recv_request, &dest, sizeof(int));
      $message send_message = $message_pack(tid, dest, 1, &var, sizeof(int));
      $comm_enqueue(comm, send_message);
    }
    if(srcLaneInMask){
      // Receive the source lane's value into var.
      $yield();
      $message recv_message = $comm_dequeue(comm, srcLane, 1);
      $message_unpack(recv_message, &var, sizeof(int));
    }
    else{
      // Source lane is outside the mask: the result is arbitrary.
      $havoc(&var);
    }
    $barrier_destroy(barrier);
  }
  else{
    // This lane is outside the mask: arbitrary result unless it is its
    // own source.
    if(laneID != srcLane)
      $havoc(&var);
  }
  $read_set_pop();
  $write_set_pop();
  return var;
}
|
|---|
| 561 |
|
|---|
/* Models __shfl_sync: reads var from lane srcLane, taken modulo width
 * within the caller's sub-warp. width must be one of 2..32 (checked via
 * is_valid_width). A source lane that falls past the last thread is
 * re-wrapped into the partial final sub-warp.
 */
int _cuda__shfl_sync(unsigned mask, int var, int srcLane, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
  $assert(is_valid_width(width));
  int subWarpID = tid / width;
  // Wrap srcLane into this thread's sub-warp.
  srcLane = srcLane % width + subWarpID * width;
  if(srcLane >= numThreads){
    // Source landed past the last thread: wrap within the partial last
    // sub-warp instead.
    // NOTE(review): lastSubWarpSize = numThreads % width; this branch
    // appears unreachable when numThreads is a multiple of width (which
    // would make it 0) — confirm no division by zero is possible.
    int lastSubWarpSize = numThreads % width;
    srcLane = srcLane % lastSubWarpSize + subWarpID * width;
  }
  return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
}
|
|---|
| 572 | int _cuda__shfl_up_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
|
|---|
| 573 | $assert(is_valid_width(width));
|
|---|
| 574 | int subWarpLaneID = tid % width;
|
|---|
| 575 | int srcLane = tid;
|
|---|
| 576 | if (subWarpLaneID - delta >= 0){
|
|---|
| 577 | srcLane = tid - delta;
|
|---|
| 578 | }
|
|---|
| 579 | return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
|
|---|
| 580 | }
|
|---|
| 581 | int _cuda__shfl_down_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
|
|---|
| 582 | $assert(is_valid_width(width));
|
|---|
| 583 | int subWarpLaneID = tid % width;
|
|---|
| 584 | int srcLane = tid;
|
|---|
| 585 | if (subWarpLaneID + delta < width && tid + delta < numThreads) {
|
|---|
| 586 | srcLane = tid + delta;
|
|---|
| 587 | }
|
|---|
| 588 | return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
|
|---|
| 589 | }
|
|---|
| 590 | int _cuda__shfl_xor_sync(unsigned mask, int var, int laneMask, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
|
|---|
| 591 | $assert(is_valid_width(width));
|
|---|
| 592 | int laneID = tid % 32;
|
|---|
| 593 | int warpID = tid / 32;
|
|---|
| 594 | int subWarpID = tid / width;
|
|---|
| 595 | int srcLane = laneID ^ laneMask;
|
|---|
| 596 | if(!(srcLane / 32 == warpID && srcLane / width <= subWarpID && srcLane < numThreads)){
|
|---|
| 597 | srcLane = tid;
|
|---|
| 598 | }
|
|---|
| 599 | return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
|
|---|
| 600 | }
|
|---|
| 601 |
|
|---|