source: CIVL/include/impls/civl-cuda.cvl@ 1aaefd4

main test-branch
Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 17.5 KB
RevLine 
[3ff27cf]1#include <civlc.cvh>
[ef14ce6]2/* This header file contains useful helper functions for manipulating
3 * the CIVL versions of various Cuda objects.
4 */
5
6#include <string.h>
[d811c20]7// To print for debugging
8#include <stdio.h>
[d66b03b]9#include <stdlib.h>
[ef14ce6]10#include <concurrency.cvh>
[d811c20]11#include <seq.cvh>
12#include <mem.cvh>
[3af26ac]13#include <civl-cuda.cvh>
[4149b300]14#include <comm.cvh>
[e93c797]15#pragma CIVL ACSL
[47665c1]16
17/* type used to represent an instance of a Cuda kernel
[d811c20]18 * TODO: Update with read/write sets
[47665c1]19 */
[42ab01f]20struct $cuda_kernel_instance {
[47665c1]21 $proc process; // the actual process executing this kernel
[42ab01f]22 $cuda_kernel_status status; // the kernel status
[d811c20]23 // Read and Write sets
24 $mem read_sets[];
25 $mem write_sets[];
26 // Number of threads in kernel
27 int size;
[47665c1]28};
29
30/* a type that wraps a kernel instance for insertion into a list
31 */
[42ab01f]32struct $cuda_kernel_instance_node {
33 $cuda_kernel_instance_t *instance;
34 $cuda_kernel_instance_node_t *next;
[47665c1]35};
36
[42ab01f]37/* $cuda_kernel_instance_node_t interface
[47665c1]38 */
[42ab01f]39$cuda_kernel_instance_t *$cuda_get_instance($cuda_kernel_instance_node_t *node) {
[47665c1]40 return node->instance;
41}
42
43/* a type used to represent a Cuda stream
44 */
45struct _CUstream {
[42ab01f]46 $cuda_kernel_instance_node_t *mostRecent; // the most recently enqueued instance
[47665c1]47 _Bool usable; // indicates whether or not this stream
48 // has been marked for deletion
49};
50
51/* _CUstream interface
52 */
[42ab01f]53$cuda_kernel_instance_node_t *$cuda_get_most_recent(cudaStream_t s) {
[47665c1]54 return s->mostRecent;
55}
56
[42ab01f]57_Bool $cuda_is_usable(cudaStream_t s) {
[47665c1]58 return s->usable;
59}
60
[42ab01f]61void $cuda_set_usable(cudaStream_t s, _Bool b) {
[47665c1]62 s->usable = b;
63}
64
65/* a type that wraps a stream for insertion into a list
66 */
[42ab01f]67struct $cuda_stream_node {
[47665c1]68 cudaStream_t stream;
[42ab01f]69 $cuda_stream_node_t *next;
[47665c1]70};
71
[42ab01f]72/* $cuda_stream_node_t encapsulation functions
[47665c1]73 */
[42ab01f]74void $cuda_set_stream($cuda_stream_node_t* node, cudaStream_t stream) {
[47665c1]75 node->stream = stream;
76}
77
[42ab01f]78void $cuda_set_next($cuda_stream_node_t* node, $cuda_stream_node_t* next) {
[47665c1]79 node->next = next;
80}
81
82/* a type used to represent a Cuda event
83 */
84struct _CUevent {
[42ab01f]85 $cuda_kernel_instance_t **instances;
[47665c1]86 int numInstances;
87};
88
89/* _CUevent encapsulation functions
90 */
[42ab01f]91$cuda_kernel_instance_t **$cuda_get_instances(cudaEvent_t e) {
[47665c1]92 return e->instances;
93}
94
[42ab01f]95void $cuda_set_instances(cudaEvent_t e, $cuda_kernel_instance_t** instances, int numInstances) {
[47665c1]96 e->instances = instances;
97 e->numInstances = numInstances;
98}
99
[42ab01f]100int $cuda_get_num_instances(cudaEvent_t e) {
[47665c1]101 return e->numInstances;
102}
103
104/* a type representing the state of a Cuda device
105 */
[42ab01f]106struct $cuda_context {
107 $cuda_stream_node_t *headNode;
[47665c1]108 cudaStream_t nullStream;
109 int numStreams;
110};
111
[42ab01f]112/* $cuda_context_t encapsulation functions
[47665c1]113 */
[42ab01f]114int $cuda_get_num_streams($cuda_context_t *_context) {
[47665c1]115 return _context->numStreams;
116}
117
[42ab01f]118$cuda_stream_node_t *$cuda_get_head_node($cuda_context_t *_context) {
[47665c1]119 return _context->headNode;
120}
121
[42ab01f]122cudaStream_t $cuda_get_null_stream($cuda_context_t *_context) {
[47665c1]123 return _context->nullStream;
124}
125
[42ab01f]126void $cuda_add_new_stream($cuda_context_t *_context, $cuda_stream_node_t *newNode) {
[47665c1]127 _context->headNode = newNode;
128 _context->numStreams++;
129}
130
[ef14ce6]131/* Computes the one dimensional index of a grid cell at a given location
132 * in a three dimensional grid of a given size
133 */
[42ab01f]134int $cuda_index (dim3 size, uint3 location) {
[ef14ce6]135 return location.x + size.x * (location.y + size.y * location.z);
136}
137
[d811c20]138/* Computes the one dimensional index of a specific thread in the grid given
139 * the grid dimension, block dimension, block index, and thread index
140 */
141int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
142 return $cuda_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $cuda_index(bDim, tIdx);
143}
144
[ef14ce6]145/* Lifts a single integer x into a three dimensional vector representing
146 * a one dimensional grid of length x
147 */
[42ab01f]148dim3 $cuda_to_dim3(int x) {
[ef14ce6]149 dim3 d = { x, 1, 1 };
150
151 return d;
152}
153
154/* Given a three dimensional vector representing a grid of size dim,
155 * create and destroy a process, in parallel, for each cell in the grid.
156 * The location of the cell is passed to the spawning function.
157 */
[42ab01f]158void $cuda_run_procs(dim3 dim, void spawningFunction(uint3)) {
[e81f0c8]159 $range rx = 0 .. (dim.x == 0 ? -1 : dim.x - 1);
160 $range ry = 0 .. (dim.y == 0 ? -1 : dim.y - 1);
161 $range rz = 0 .. (dim.z == 0 ? -1 : dim.z - 1);
[47665c1]162 $domain(3) dom = ($domain){rx, ry, rz};
163 $parfor(int x,y,z : dom){
164 uint3 id = { x, y, z };
165 spawningFunction(id);
166 }
[ef14ce6]167}
168
169// ------------------------------------------------
170
171/* $wait on a given process is it is non-null
172 */
[42ab01f]173void $cuda_try_wait($proc p) {
[4c2f120]174 if (p != $proc_null) {
175 $yield();
[47665c1]176 $wait(p);
[4c2f120]177 }
[ef14ce6]178}
179
180/* The current state of the GPU
181 */
[42ab01f]182$cuda_context_t $cuda_current_context = {
[ef14ce6]183 .headNode = NULL,
184 .nullStream = NULL,
185 .numStreams = 0
186};
187
[42ab01f]188/* malloc and initialize a new $cuda_kernel_instance_t
[ef14ce6]189 */
[d811c20]190
191 // TODO: Determine if passing parameters is better or not
192$cuda_kernel_instance_t *$cuda_kernel_instance_create(dim3 gDim, dim3 bDim) {
[ef14ce6]193 //printf("mallocing kernel instance\n");
[d66b03b]194 $cuda_kernel_instance_t *i = ($cuda_kernel_instance_t*)malloc(sizeof($cuda_kernel_instance_t));
[ef14ce6]195
196 i->process = $proc_null;
[42ab01f]197 i->status = $cuda_kernel_status_waiting;
[d811c20]198 int threadNum = gDim.x * gDim.y * gDim.z * bDim.x * bDim.y * bDim.z;
199 i->size = threadNum;
200 // TODO: either sequentially or non-sequentially?
201 $mem empty = $mem_empty();
202 $seq_init(&i->read_sets, threadNum, &empty);
203 $seq_init(&i->write_sets, threadNum, &empty);
[ef14ce6]204 return i;
205}
206
[42ab01f]207/* cleanup and free a given $cuda_kernel_instance_t
[ef14ce6]208 */
[42ab01f]209void $cuda_kernel_instance_destroy($cuda_kernel_instance_t *i) {
210 $cuda_try_wait(i->process);
[ef14ce6]211 //printf("freeing kernel instance\n");
212 $free(i);
213}
214
[42ab01f]215/* malloc and initialize a new $cuda_kernel_instance_node_t
[ef14ce6]216 */
[42ab01f]217$cuda_kernel_instance_node_t *$cuda_kernel_instance_node_create() {
[ef14ce6]218 //printf("mallocing kernel instance node\n");
[d66b03b]219 $cuda_kernel_instance_node_t *node = ($cuda_kernel_instance_node_t*)malloc(sizeof($cuda_kernel_instance_node_t));
[ef14ce6]220
221 node->instance = NULL;
222 node->next = NULL;
223 return node;
224}
225
[42ab01f]226/* cleanup and free a given $cuda_kernel_instance_node_t
[ef14ce6]227 */
[42ab01f]228void $cuda_kernel_instance_node_destroy($cuda_kernel_instance_node_t *node) {
229 $cuda_kernel_instance_destroy(node->instance);
[ef14ce6]230 //printf("freeing kernel instance node\n");
231 $free(node);
232}
233
234/* malloc and initialize a new stream
235 */
[42ab01f]236cudaStream_t $cuda_stream_create() {
[ef14ce6]237 cudaStream_t s;
238
239 //printf("mallocing cuda stream\n");
[d66b03b]240 s = (cudaStream_t)malloc(sizeof(_CUstream));
[42ab01f]241 s->mostRecent = $cuda_kernel_instance_node_create();
[d811c20]242 dim3 arbitrary = { 1, 1, 1 };
243 s->mostRecent->instance = $cuda_kernel_instance_create(arbitrary, arbitrary);
[42ab01f]244 s->mostRecent->instance->status = $cuda_kernel_status_finished;
[ef14ce6]245 s->usable = $true;
246 return s;
247}
248
249/* block until the most recently enqueued process on the given stream
250 * has terminated (meaning all kernels in that stream have completed)
251 */
[42ab01f]252void $cuda_stream_wait(cudaStream_t s) {
253 $cuda_kernel_instance_t *mostRecentInstance = s->mostRecent->instance;
[4c2f120]254 $yield();
[e5a03f2]255 $unidirectional_when(mostRecentInstance->status == $cuda_kernel_status_finished);
[ef14ce6]256}
257
258/* block until no more streams have kernels executing
259 */
[42ab01f]260void $cuda_stream_wait_all() {
261 $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
[ef14ce6]262
263 while (curNode != NULL) {
[42ab01f]264 $cuda_stream_wait(curNode->stream);
[ef14ce6]265 curNode = curNode->next;
266 }
267}
268
269/* cleanup and free a given stream
270 */
[42ab01f]271void $cuda_stream_destroy(cudaStream_t s) {
272 $cuda_kernel_instance_node_t *curNode = s->mostRecent;
273 $cuda_kernel_instance_node_t *nextNode;
[ef14ce6]274
275 while (curNode != NULL) {
[47665c1]276 nextNode = curNode->next;
[42ab01f]277 $cuda_kernel_instance_node_destroy(curNode);
[47665c1]278 curNode = nextNode;
[ef14ce6]279 }
280 //printf("freeing cuda stream\n");
281 $free(s);
282}
283
[42ab01f]284/* malloc and initialize a new $cuda_stream_node_t
[ef14ce6]285 */
[42ab01f]286$cuda_stream_node_t *$cuda_stream_node_create() {
[ef14ce6]287 //printf("mallocing cuda stream node\n");
[d66b03b]288 $cuda_stream_node_t *node = ($cuda_stream_node_t*)malloc(sizeof($cuda_stream_node_t));
[ef14ce6]289
290 node->stream = NULL;
291 node->next = NULL;
292 return node;
293}
294
[42ab01f]295/* cleanup and free a given $cuda_stream_node_t
[ef14ce6]296 */
[42ab01f]297void $cuda_stream_node_destroy($cuda_stream_node_t *node) {
[3ff27cf]298 $assert((!node->stream->usable));
[42ab01f]299 $cuda_stream_destroy(node->stream);
[ef14ce6]300 //printf("freeing cuda stream node\n");
301 $free(node);
302}
303
304/* destroy all stream nodes contained in the context
305 */
[42ab01f]306void $cuda_stream_node_destroy_all() {
307 $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
308 $cuda_stream_node_t *nextNode;
[ef14ce6]309
310 while (curNode != NULL) {
[47665c1]311 nextNode = curNode->next;
[42ab01f]312 $cuda_stream_node_destroy(curNode);
[47665c1]313 curNode = nextNode;
[ef14ce6]314 }
[42ab01f]315 $cuda_current_context.headNode = NULL;
[ef14ce6]316}
317
318/* malloc and initialize a new event
319 */
[42ab01f]320cudaEvent_t $cuda_event_create() {
[ef14ce6]321 //printf("mallocing event\n");
[d66b03b]322 cudaEvent_t e = (cudaEvent_t)malloc(sizeof(_CUevent));
[ef14ce6]323
324 e->numInstances = 0;
325 e->instances = NULL;
326 return e;
327}
328
[42ab01f]329/* block until all $cuda_kernel_instance_ts contained in this event have
[ef14ce6]330 * completed
331 */
[e045201d]332/*@ depends_on \access(e);
333 @*/
[42ab01f]334void $cuda_event_wait(cudaEvent_t e) {
[ef14ce6]335 for (int i = 0; i < e->numInstances; i++) {
[4c2f120]336 $yield();
[e5a03f2]337 $unidirectional_when(e->instances[i]->status == $cuda_kernel_status_finished);
[ef14ce6]338 }
339}
340
341/* cleanup and free a given event
342 */
[42ab01f]343void $cuda_event_destroy(cudaEvent_t e) {
[ef14ce6]344 if (e->instances != NULL) {
345 //printf("freeing instance list a\n");
346 $free(e->instances);
347 }
348 //printf("freeing event\n");
349 $free(e);
350}
351
352/* initialize the cuda context. must be called before any cuda functions.
353 */
[42ab01f]354void $cuda_init() {
355 $cuda_current_context.nullStream = $cuda_stream_create();
[ef14ce6]356}
357
358/* cleanup the cuda context. must be called after all cuda functions.
359 */
[42ab01f]360void $cuda_finalize() {
361 $cuda_stream_wait_all();
362 $cuda_stream_wait($cuda_current_context.nullStream);
363 $cuda_stream_node_destroy_all();
364 $cuda_stream_destroy($cuda_current_context.nullStream);
[ef14ce6]365}
366
367/* returns an array of pointers to the most recently enqueued kernel
368 * of each stream.
369 */
[42ab01f]370$cuda_kernel_instance_t **$cuda_all_most_recent_kernels() {
371 int n = $cuda_current_context.numStreams + 1;
372 $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
[ef14ce6]373 //printf("mallocing instance list a\n");
[d66b03b]374 $cuda_kernel_instance_t **insts = ($cuda_kernel_instance_t**)malloc(n * sizeof($cuda_kernel_instance_t*)) ;
[ef14ce6]375
[42ab01f]376 insts[0] = $cuda_current_context.nullStream->mostRecent->instance;
[ef14ce6]377 for (int i = 1; i < n; i++, curNode = curNode->next) {
378 insts[i] = curNode->stream->mostRecent->instance;
379 }
380 return insts;
381}
382
383/* create a kernel instance for the given function k, and enqueue it
384 * onto the given stream.
385 */
[d811c20]386void $cuda_enqueue_kernel(cudaStream_t stream, void (*k)($cuda_kernel_instance_t*, cudaEvent_t), dim3 gDim,
387dim3 bDim) {
[ef14ce6]388 cudaStream_t s;
[42ab01f]389 cudaEvent_t e = $cuda_event_create();
390 $cuda_kernel_instance_node_t *newNode = $cuda_kernel_instance_node_create();
[ef14ce6]391
392 if (stream == NULL) {
[42ab01f]393 e->numInstances = $cuda_current_context.numStreams + 1;
394 e->instances = $cuda_all_most_recent_kernels();
395 s = $cuda_current_context.nullStream;
[ef14ce6]396 } else {
397 e->numInstances = 2;
398 //printf("mallocing instance list b\n");
[d66b03b]399 e->instances = ($cuda_kernel_instance_t**)malloc(2 * sizeof($cuda_kernel_instance_t*)) ;
[ef14ce6]400 e->instances[0] = stream->mostRecent->instance;
[42ab01f]401 e->instances[1] = $cuda_current_context.nullStream->mostRecent->instance;
[ef14ce6]402 s = stream;
403 }
[3ff27cf]404 $assert((s->usable));
[d811c20]405 newNode->instance = $cuda_kernel_instance_create(gDim, bDim);
[ef14ce6]406 newNode->next = s->mostRecent;
407 s->mostRecent = newNode;
408 s->mostRecent->instance->process = $spawn k(s->mostRecent->instance, e);
409}
410
411/* called by kernel processes. wait on the given event, then update
412 * the status of the calling kernel to indicate it has finished waiting
413 */
[42ab01f]414void $cuda_wait_in_queue ($cuda_kernel_instance_t *this, cudaEvent_t e) {
415 $cuda_event_wait(e);
416 $cuda_event_destroy(e);
417 this->status = $cuda_kernel_status_running;
[ef14ce6]418}
419
420/* called by kernel processes. update the status of the calling kernel
421 * to indicate that it has completed execution
422 */
[42ab01f]423void $cuda_kernel_finish($cuda_kernel_instance_t *k) {
424 k->status = $cuda_kernel_status_finished;
[ef14ce6]425}
426
[d811c20]427/* TODO: Finish check_data_race
428 */
429void $cuda_barrier($cuda_kernel_instance_t *k, int kernel_id, $barrier g) {
430 $check_data_race(k, kernel_id);
431 // We have to push a new read and write set before the barrier call to ignore it's reads and writes
432 $read_set_push();
433 $write_set_push();
[4724a68]434 void captured_clear_mems(){
435 $clear_all_mem_sets(k);
436 }
437 $barrier_call_execute(g, captured_clear_mems);
[d811c20]438 $read_set_pop();
439 $write_set_pop();
440}
441
442/* TODO: Finish read_write_sets
443 */
[5229f4f]444$atomic_f void $check_data_race($cuda_kernel_instance_t *k, int cur_tid) {
445 //printf("Current id: %d\n", cur_tid);
[d811c20]446 $mem out_s0 = $mem_empty();
447 $mem out_s1 = $mem_empty();
448 $mem cur_mw = $write_set_pop();
449 $mem cur_mr = $read_set_pop();
450
451 // Update current R/W sets
452 k->write_sets[cur_tid] = cur_mw;
453 k->read_sets[cur_tid] = cur_mr;
[ce6b559]454
455 /*
456 printf("CHECKING DATA RACE %d [\n", cur_tid);
457 for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
458 printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
459 printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
460 }
461 printf("]\n");
462 */
463
[d811c20]464 // Check data race
465 for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
466 if (tmp_tid == cur_tid) continue;
467
468 $mem tmp_mr = k->read_sets[tmp_tid];
469 $mem tmp_mw = k->write_sets[tmp_tid];
470
471 $assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1),
472 "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
473 out_s0, cur_tid, out_s1, tmp_tid);
474 $assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1),
475 "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
476 out_s0, cur_tid, out_s1, tmp_tid);
477 $assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1),
478 "Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
479 out_s0, cur_tid, out_s1, tmp_tid);
480 }
481 // Update current R/W sets
[5229f4f]482 //k->write_sets[cur_tid] = $mem_empty();
483 //k->read_sets[cur_tid] = $mem_empty();
[d811c20]484 $read_set_push();
485 $write_set_push();
486}
[ce6b559]487
488/* Clears read and write memory sets of the given thread
489*/
490void $clear_mem_sets($cuda_kernel_instance_t *k, int cur_tid) {
491 k->write_sets[cur_tid] = $mem_empty();
492 k->read_sets[cur_tid] = $mem_empty();
493}
[4149b300]494
[4724a68]495void $clear_all_mem_sets($cuda_kernel_instance_t *k){
496 for(int i = 0; i < k->size; i++)
497 $clear_mem_sets(k, i);
498}
499
[4149b300]500/* Publishes current read a write sets to global arrays. Local sets are not cleared
501*/
502void $publish($cuda_kernel_instance_t *k, int cur_tid) {
503 k->write_sets[cur_tid] = $write_set_peek();
504 k->read_sets[cur_tid] = $read_set_peek();
505}
506
/* Returns 1 if width is a legal shuffle width, i.e. a power of two in
 * the range [1, 32] (1, 2, 4, 8, 16 or 32), and 0 otherwise.
 */
int is_valid_width(int width){
  // i takes the values 32, 16, 8, 4, 2, 1; the next halving gives 0 and
  // ends the loop
  for(int i = 32; i >= 1; i /= 2){
    if(width == i){
      return 1;
    }
  }
  return 0;
}
515
516int exchange_data(unsigned mask, int var, int srcLane, int tid, $comm comm, $gbarrier* warpBarriers){
517 $read_set_push();
518 $write_set_push();
519 int laneID = tid % 32;
520 int warpID = tid / 32;
[4724a68]521 int currLaneInMask = (mask >> laneID) & 1;
522 int srcLaneInMask = (mask >> srcLane) & 1;
[4149b300]523 int dest;
524 $gbarrier gbarrier = warpBarriers[warpID];
525 int numActiveThreads = 0;
526 for (int i = 0; i < $get_nprocs(gbarrier); i++){
527 numActiveThreads += (mask >> i) & 1;
528 }
[4724a68]529 if(currLaneInMask){
530 $barrier barrier = $barrier_create($here, gbarrier, laneID);
531 if(srcLaneInMask){
532 $message request_message = $message_pack(tid, srcLane, 0, &tid, sizeof(int));
533 $comm_enqueue(comm, request_message);
534 }
[4149b300]535 $barrier_call_subset(barrier, numActiveThreads);
[4724a68]536 while ($comm_probe(comm, $COMM_ANY_SOURCE, 0)){
537 $yield();
538 $message recv_request = $comm_dequeue(comm, $COMM_ANY_SOURCE, 0);
539 $message_unpack(recv_request, &dest, sizeof(int));
540 $message send_message = $message_pack(tid, dest, 1, &var, sizeof(int));
541 $comm_enqueue(comm, send_message);
542 }
543 if(srcLaneInMask){
544 $yield();
545 $message recv_message = $comm_dequeue(comm, srcLane, 1);
546 $message_unpack(recv_message, &var, sizeof(int));
547 }
548 else{
549 $havoc(&var);
550 }
551 $barrier_destroy(barrier);
[4149b300]552 }
[4724a68]553 else{
554 if(laneID != srcLane)
555 $havoc(&var);
556 }
[4149b300]557 $read_set_pop();
558 $write_set_pop();
559 return var;
560}
561
562int _cuda__shfl_sync(unsigned mask, int var, int srcLane, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
563 $assert(is_valid_width(width));
564 int subWarpID = tid / width;
565 srcLane = srcLane % width + subWarpID * width;
566 if(srcLane >= numThreads){
567 int lastSubWarpSize = numThreads % width;
568 srcLane = srcLane % lastSubWarpSize + subWarpID * width;
569 }
570 return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
571}
572int _cuda__shfl_up_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
573 $assert(is_valid_width(width));
574 int subWarpLaneID = tid % width;
575 int srcLane = tid;
576 if (subWarpLaneID - delta >= 0){
577 srcLane = tid - delta;
578 }
579 return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
580}
581int _cuda__shfl_down_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
582 $assert(is_valid_width(width));
583 int subWarpLaneID = tid % width;
584 int srcLane = tid;
585 if (subWarpLaneID + delta < width && tid + delta < numThreads) {
586 srcLane = tid + delta;
587 }
588 return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
589}
590int _cuda__shfl_xor_sync(unsigned mask, int var, int laneMask, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
591 $assert(is_valid_width(width));
592 int laneID = tid % 32;
593 int warpID = tid / 32;
594 int subWarpID = tid / width;
595 int srcLane = laneID ^ laneMask;
596 if(!(srcLane / 32 == warpID && srcLane / width <= subWarpID && srcLane < numThreads)){
597 srcLane = tid;
598 }
599 return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
600}
601
Note: See TracBrowser for help on using the repository browser.