Context Navigation

civl-cuda.cvl@ 1aaefd4

main test-branch

Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 17.5 KB

Rev	Line
[3ff27cf]	1	#include <civlc.cvh>
[ef14ce6]	2	/* This header file contains useful helper functions for manipulating
	3	* the CIVL versions of various Cuda objects.
	4	*/
	5
	6	#include <string.h>
[d811c20]	7	// To print for debugging
	8	#include <stdio.h>
[d66b03b]	9	#include <stdlib.h>
[ef14ce6]	10	#include <concurrency.cvh>
[d811c20]	11	#include <seq.cvh>
	12	#include <mem.cvh>
[3af26ac]	13	#include <civl-cuda.cvh>
[4149b300]	14	#include <comm.cvh>
[e93c797]	15	#pragma CIVL ACSL
[47665c1]	16
	17	/* type used to represent an instance of a Cuda kernel
[d811c20]	18	* TODO: Update with read/write sets
[47665c1]	19	*/
[42ab01f]	20	struct $cuda_kernel_instance {
[47665c1]	21	$proc process; // the actual process executing this kernel
[42ab01f]	22	$cuda_kernel_status status; // the kernel status
[d811c20]	23	// Read and Write sets
	24	$mem read_sets[];
	25	$mem write_sets[];
	26	// Number of threads in kernel
	27	int size;
[47665c1]	28	};
	29
	30	/* a type that wraps a kernel instance for insertion into a list
	31	*/
[42ab01f]	32	struct $cuda_kernel_instance_node {
	33	$cuda_kernel_instance_t *instance;
	34	$cuda_kernel_instance_node_t *next;
[47665c1]	35	};
	36
[42ab01f]	37	/* $cuda_kernel_instance_node_t interface
[47665c1]	38	*/
[42ab01f]	39	$cuda_kernel_instance_t $cuda_get_instance($cuda_kernel_instance_node_t node) {
[47665c1]	40	return node->instance;
	41	}
	42
	43	/* a type used to represent a Cuda stream
	44	*/
	45	struct _CUstream {
[42ab01f]	46	$cuda_kernel_instance_node_t *mostRecent; // the most recently enqueued instance
[47665c1]	47	_Bool usable; // indicates whether or not this stream
	48	// has been marked for deletion
	49	};
	50
	51	/* _CUstream interface
	52	*/
[42ab01f]	53	$cuda_kernel_instance_node_t *$cuda_get_most_recent(cudaStream_t s) {
[47665c1]	54	return s->mostRecent;
	55	}
	56
[42ab01f]	57	_Bool $cuda_is_usable(cudaStream_t s) {
[47665c1]	58	return s->usable;
	59	}
	60
[42ab01f]	61	void $cuda_set_usable(cudaStream_t s, _Bool b) {
[47665c1]	62	s->usable = b;
	63	}
	64
	65	/* a type that wraps a stream for insertion into a list
	66	*/
[42ab01f]	67	struct $cuda_stream_node {
[47665c1]	68	cudaStream_t stream;
[42ab01f]	69	$cuda_stream_node_t *next;
[47665c1]	70	};
	71
[42ab01f]	72	/* $cuda_stream_node_t encapsulation functions
[47665c1]	73	*/
[42ab01f]	74	void $cuda_set_stream($cuda_stream_node_t* node, cudaStream_t stream) {
[47665c1]	75	node->stream = stream;
	76	}
	77
[42ab01f]	78	void $cuda_set_next($cuda_stream_node_t* node, $cuda_stream_node_t* next) {
[47665c1]	79	node->next = next;
	80	}
	81
	82	/* a type used to represent a Cuda event
	83	*/
	84	struct _CUevent {
[42ab01f]	85	$cuda_kernel_instance_t **instances;
[47665c1]	86	int numInstances;
	87	};
	88
	89	/* _CUevent encapsulation functions
	90	*/
[42ab01f]	91	$cuda_kernel_instance_t **$cuda_get_instances(cudaEvent_t e) {
[47665c1]	92	return e->instances;
	93	}
	94
[42ab01f]	95	void $cuda_set_instances(cudaEvent_t e, $cuda_kernel_instance_t** instances, int numInstances) {
[47665c1]	96	e->instances = instances;
	97	e->numInstances = numInstances;
	98	}
	99
[42ab01f]	100	int $cuda_get_num_instances(cudaEvent_t e) {
[47665c1]	101	return e->numInstances;
	102	}
	103
	104	/* a type representing the state of a Cuda device
	105	*/
[42ab01f]	106	struct $cuda_context {
	107	$cuda_stream_node_t *headNode;
[47665c1]	108	cudaStream_t nullStream;
	109	int numStreams;
	110	};
	111
[42ab01f]	112	/* $cuda_context_t encapsulation functions
[47665c1]	113	*/
[42ab01f]	114	int $cuda_get_num_streams($cuda_context_t *_context) {
[47665c1]	115	return _context->numStreams;
	116	}
	117
[42ab01f]	118	$cuda_stream_node_t $cuda_get_head_node($cuda_context_t _context) {
[47665c1]	119	return _context->headNode;
	120	}
	121
[42ab01f]	122	cudaStream_t $cuda_get_null_stream($cuda_context_t *_context) {
[47665c1]	123	return _context->nullStream;
	124	}
	125
[42ab01f]	126	void $cuda_add_new_stream($cuda_context_t _context, $cuda_stream_node_t newNode) {
[47665c1]	127	_context->headNode = newNode;
	128	_context->numStreams++;
	129	}
	130
[ef14ce6]	131	/* Computes the one dimensional index of a grid cell at a given location
	132	* in a three dimensional grid of a given size
	133	*/
[42ab01f]	134	int $cuda_index (dim3 size, uint3 location) {
[ef14ce6]	135	return location.x + size.x * (location.y + size.y * location.z);
	136	}
	137
[d811c20]	138	/* Computes the one dimensional index of a specific thread in the grid given
	139	* the grid dimension, block dimension, block index, and thread index
	140	*/
	141	int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
	142	return $cuda_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $cuda_index(bDim, tIdx);
	143	}
	144
[ef14ce6]	145	/* Lifts a single integer x into a three dimensional vector representing
	146	* a one dimensional grid of length x
	147	*/
[42ab01f]	148	dim3 $cuda_to_dim3(int x) {
[ef14ce6]	149	dim3 d = { x, 1, 1 };
	150
	151	return d;
	152	}
	153
	154	/* Given a three dimensional vector representing a grid of size dim,
	155	* create and destroy a process, in parallel, for each cell in the grid.
	156	* The location of the cell is passed to the spawning function.
	157	*/
[42ab01f]	158	void $cuda_run_procs(dim3 dim, void spawningFunction(uint3)) {
[e81f0c8]	159	$range rx = 0 .. (dim.x == 0 ? -1 : dim.x - 1);
	160	$range ry = 0 .. (dim.y == 0 ? -1 : dim.y - 1);
	161	$range rz = 0 .. (dim.z == 0 ? -1 : dim.z - 1);
[47665c1]	162	$domain(3) dom = ($domain){rx, ry, rz};
	163	$parfor(int x,y,z : dom){
	164	uint3 id = { x, y, z };
	165	spawningFunction(id);
	166	}
[ef14ce6]	167	}
	168
	169	// ------------------------------------------------
	170
	171	/* $wait on a given process is it is non-null
	172	*/
[42ab01f]	173	void $cuda_try_wait($proc p) {
[4c2f120]	174	if (p != $proc_null) {
	175	$yield();
[47665c1]	176	$wait(p);
[4c2f120]	177	}
[ef14ce6]	178	}
	179
	180	/* The current state of the GPU
	181	*/
[42ab01f]	182	$cuda_context_t $cuda_current_context = {
[ef14ce6]	183	.headNode = NULL,
	184	.nullStream = NULL,
	185	.numStreams = 0
	186	};
	187
[42ab01f]	188	/* malloc and initialize a new $cuda_kernel_instance_t
[ef14ce6]	189	*/
[d811c20]	190
	191	// TODO: Determine if passing parameters is better or not
	192	$cuda_kernel_instance_t *$cuda_kernel_instance_create(dim3 gDim, dim3 bDim) {
[ef14ce6]	193	//printf("mallocing kernel instance\n");
[d66b03b]	194	$cuda_kernel_instance_t i = ($cuda_kernel_instance_t)malloc(sizeof($cuda_kernel_instance_t));
[ef14ce6]	195
	196	i->process = $proc_null;
[42ab01f]	197	i->status = $cuda_kernel_status_waiting;
[d811c20]	198	int threadNum = gDim.x * gDim.y * gDim.z * bDim.x * bDim.y * bDim.z;
	199	i->size = threadNum;
	200	// TODO: either sequentially or non-sequentially?
	201	$mem empty = $mem_empty();
	202	$seq_init(&i->read_sets, threadNum, &empty);
	203	$seq_init(&i->write_sets, threadNum, &empty);
[ef14ce6]	204	return i;
	205	}
	206
[42ab01f]	207	/* cleanup and free a given $cuda_kernel_instance_t
[ef14ce6]	208	*/
[42ab01f]	209	void $cuda_kernel_instance_destroy($cuda_kernel_instance_t *i) {
	210	$cuda_try_wait(i->process);
[ef14ce6]	211	//printf("freeing kernel instance\n");
	212	$free(i);
	213	}
	214
[42ab01f]	215	/* malloc and initialize a new $cuda_kernel_instance_node_t
[ef14ce6]	216	*/
[42ab01f]	217	$cuda_kernel_instance_node_t *$cuda_kernel_instance_node_create() {
[ef14ce6]	218	//printf("mallocing kernel instance node\n");
[d66b03b]	219	$cuda_kernel_instance_node_t node = ($cuda_kernel_instance_node_t)malloc(sizeof($cuda_kernel_instance_node_t));
[ef14ce6]	220
	221	node->instance = NULL;
	222	node->next = NULL;
	223	return node;
	224	}
	225
[42ab01f]	226	/* cleanup and free a given $cuda_kernel_instance_node_t
[ef14ce6]	227	*/
[42ab01f]	228	void $cuda_kernel_instance_node_destroy($cuda_kernel_instance_node_t *node) {
	229	$cuda_kernel_instance_destroy(node->instance);
[ef14ce6]	230	//printf("freeing kernel instance node\n");
	231	$free(node);
	232	}
	233
	234	/* malloc and initialize a new stream
	235	*/
[42ab01f]	236	cudaStream_t $cuda_stream_create() {
[ef14ce6]	237	cudaStream_t s;
	238
	239	//printf("mallocing cuda stream\n");
[d66b03b]	240	s = (cudaStream_t)malloc(sizeof(_CUstream));
[42ab01f]	241	s->mostRecent = $cuda_kernel_instance_node_create();
[d811c20]	242	dim3 arbitrary = { 1, 1, 1 };
	243	s->mostRecent->instance = $cuda_kernel_instance_create(arbitrary, arbitrary);
[42ab01f]	244	s->mostRecent->instance->status = $cuda_kernel_status_finished;
[ef14ce6]	245	s->usable = $true;
	246	return s;
	247	}
	248
	249	/* block until the most recently enqueued process on the given stream
	250	* has terminated (meaning all kernels in that stream have completed)
	251	*/
[42ab01f]	252	void $cuda_stream_wait(cudaStream_t s) {
	253	$cuda_kernel_instance_t *mostRecentInstance = s->mostRecent->instance;
[4c2f120]	254	$yield();
[e5a03f2]	255	$unidirectional_when(mostRecentInstance->status == $cuda_kernel_status_finished);
[ef14ce6]	256	}
	257
	258	/* block until no more streams have kernels executing
	259	*/
[42ab01f]	260	void $cuda_stream_wait_all() {
	261	$cuda_stream_node_t *curNode = $cuda_current_context.headNode;
[ef14ce6]	262
	263	while (curNode != NULL) {
[42ab01f]	264	$cuda_stream_wait(curNode->stream);
[ef14ce6]	265	curNode = curNode->next;
	266	}
	267	}
	268
	269	/* cleanup and free a given stream
	270	*/
[42ab01f]	271	void $cuda_stream_destroy(cudaStream_t s) {
	272	$cuda_kernel_instance_node_t *curNode = s->mostRecent;
	273	$cuda_kernel_instance_node_t *nextNode;
[ef14ce6]	274
	275	while (curNode != NULL) {
[47665c1]	276	nextNode = curNode->next;
[42ab01f]	277	$cuda_kernel_instance_node_destroy(curNode);
[47665c1]	278	curNode = nextNode;
[ef14ce6]	279	}
	280	//printf("freeing cuda stream\n");
	281	$free(s);
	282	}
	283
[42ab01f]	284	/* malloc and initialize a new $cuda_stream_node_t
[ef14ce6]	285	*/
[42ab01f]	286	$cuda_stream_node_t *$cuda_stream_node_create() {
[ef14ce6]	287	//printf("mallocing cuda stream node\n");
[d66b03b]	288	$cuda_stream_node_t node = ($cuda_stream_node_t)malloc(sizeof($cuda_stream_node_t));
[ef14ce6]	289
	290	node->stream = NULL;
	291	node->next = NULL;
	292	return node;
	293	}
	294
[42ab01f]	295	/* cleanup and free a given $cuda_stream_node_t
[ef14ce6]	296	*/
[42ab01f]	297	void $cuda_stream_node_destroy($cuda_stream_node_t *node) {
[3ff27cf]	298	$assert((!node->stream->usable));
[42ab01f]	299	$cuda_stream_destroy(node->stream);
[ef14ce6]	300	//printf("freeing cuda stream node\n");
	301	$free(node);
	302	}
	303
	304	/* destroy all stream nodes contained in the context
	305	*/
[42ab01f]	306	void $cuda_stream_node_destroy_all() {
	307	$cuda_stream_node_t *curNode = $cuda_current_context.headNode;
	308	$cuda_stream_node_t *nextNode;
[ef14ce6]	309
	310	while (curNode != NULL) {
[47665c1]	311	nextNode = curNode->next;
[42ab01f]	312	$cuda_stream_node_destroy(curNode);
[47665c1]	313	curNode = nextNode;
[ef14ce6]	314	}
[42ab01f]	315	$cuda_current_context.headNode = NULL;
[ef14ce6]	316	}
	317
	318	/* malloc and initialize a new event
	319	*/
[42ab01f]	320	cudaEvent_t $cuda_event_create() {
[ef14ce6]	321	//printf("mallocing event\n");
[d66b03b]	322	cudaEvent_t e = (cudaEvent_t)malloc(sizeof(_CUevent));
[ef14ce6]	323
	324	e->numInstances = 0;
	325	e->instances = NULL;
	326	return e;
	327	}
	328
[42ab01f]	329	/* block until all $cuda_kernel_instance_ts contained in this event have
[ef14ce6]	330	* completed
	331	*/
[e045201d]	332	/*@ depends_on \access(e);
	333	@*/
[42ab01f]	334	void $cuda_event_wait(cudaEvent_t e) {
[ef14ce6]	335	for (int i = 0; i < e->numInstances; i++) {
[4c2f120]	336	$yield();
[e5a03f2]	337	$unidirectional_when(e->instances[i]->status == $cuda_kernel_status_finished);
[ef14ce6]	338	}
	339	}
	340
	341	/* cleanup and free a given event
	342	*/
[42ab01f]	343	void $cuda_event_destroy(cudaEvent_t e) {
[ef14ce6]	344	if (e->instances != NULL) {
	345	//printf("freeing instance list a\n");
	346	$free(e->instances);
	347	}
	348	//printf("freeing event\n");
	349	$free(e);
	350	}
	351
	352	/* initialize the cuda context. must be called before any cuda functions.
	353	*/
[42ab01f]	354	void $cuda_init() {
	355	$cuda_current_context.nullStream = $cuda_stream_create();
[ef14ce6]	356	}
	357
	358	/* cleanup the cuda context. must be called after all cuda functions.
	359	*/
[42ab01f]	360	void $cuda_finalize() {
	361	$cuda_stream_wait_all();
	362	$cuda_stream_wait($cuda_current_context.nullStream);
	363	$cuda_stream_node_destroy_all();
	364	$cuda_stream_destroy($cuda_current_context.nullStream);
[ef14ce6]	365	}
	366
	367	/* returns an array of pointers to the most recently enqueued kernel
	368	* of each stream.
	369	*/
[42ab01f]	370	$cuda_kernel_instance_t **$cuda_all_most_recent_kernels() {
	371	int n = $cuda_current_context.numStreams + 1;
	372	$cuda_stream_node_t *curNode = $cuda_current_context.headNode;
[ef14ce6]	373	//printf("mallocing instance list a\n");
[d66b03b]	374	$cuda_kernel_instance_t insts = ($cuda_kernel_instance_t)malloc(n * sizeof($cuda_kernel_instance_t*)) ;
[ef14ce6]	375
[42ab01f]	376	insts[0] = $cuda_current_context.nullStream->mostRecent->instance;
[ef14ce6]	377	for (int i = 1; i < n; i++, curNode = curNode->next) {
	378	insts[i] = curNode->stream->mostRecent->instance;
	379	}
	380	return insts;
	381	}
	382
	383	/* create a kernel instance for the given function k, and enqueue it
	384	* onto the given stream.
	385	*/
[d811c20]	386	void $cuda_enqueue_kernel(cudaStream_t stream, void (k)($cuda_kernel_instance_t, cudaEvent_t), dim3 gDim,
	387	dim3 bDim) {
[ef14ce6]	388	cudaStream_t s;
[42ab01f]	389	cudaEvent_t e = $cuda_event_create();
	390	$cuda_kernel_instance_node_t *newNode = $cuda_kernel_instance_node_create();
[ef14ce6]	391
	392	if (stream == NULL) {
[42ab01f]	393	e->numInstances = $cuda_current_context.numStreams + 1;
	394	e->instances = $cuda_all_most_recent_kernels();
	395	s = $cuda_current_context.nullStream;
[ef14ce6]	396	} else {
	397	e->numInstances = 2;
	398	//printf("mallocing instance list b\n");
[d66b03b]	399	e->instances = ($cuda_kernel_instance_t*)malloc(2 sizeof($cuda_kernel_instance_t*)) ;
[ef14ce6]	400	e->instances[0] = stream->mostRecent->instance;
[42ab01f]	401	e->instances[1] = $cuda_current_context.nullStream->mostRecent->instance;
[ef14ce6]	402	s = stream;
	403	}
[3ff27cf]	404	$assert((s->usable));
[d811c20]	405	newNode->instance = $cuda_kernel_instance_create(gDim, bDim);
[ef14ce6]	406	newNode->next = s->mostRecent;
	407	s->mostRecent = newNode;
	408	s->mostRecent->instance->process = $spawn k(s->mostRecent->instance, e);
	409	}
	410
	411	/* called by kernel processes. wait on the given event, then update
	412	* the status of the calling kernel to indicate it has finished waiting
	413	*/
[42ab01f]	414	void $cuda_wait_in_queue ($cuda_kernel_instance_t *this, cudaEvent_t e) {
	415	$cuda_event_wait(e);
	416	$cuda_event_destroy(e);
	417	this->status = $cuda_kernel_status_running;
[ef14ce6]	418	}
	419
	420	/* called by kernel processes. update the status of the calling kernel
	421	* to indicate that it has completed execution
	422	*/
[42ab01f]	423	void $cuda_kernel_finish($cuda_kernel_instance_t *k) {
	424	k->status = $cuda_kernel_status_finished;
[ef14ce6]	425	}
	426
[d811c20]	427	/* TODO: Finish check_data_race
	428	*/
	429	void $cuda_barrier($cuda_kernel_instance_t *k, int kernel_id, $barrier g) {
	430	$check_data_race(k, kernel_id);
	431	// We have to push a new read and write set before the barrier call to ignore it's reads and writes
	432	$read_set_push();
	433	$write_set_push();
[4724a68]	434	void captured_clear_mems(){
	435	$clear_all_mem_sets(k);
	436	}
	437	$barrier_call_execute(g, captured_clear_mems);
[d811c20]	438	$read_set_pop();
	439	$write_set_pop();
	440	}
	441
	442	/* TODO: Finish read_write_sets
	443	*/
[5229f4f]	444	$atomic_f void $check_data_race($cuda_kernel_instance_t *k, int cur_tid) {
	445	//printf("Current id: %d\n", cur_tid);
[d811c20]	446	$mem out_s0 = $mem_empty();
	447	$mem out_s1 = $mem_empty();
	448	$mem cur_mw = $write_set_pop();
	449	$mem cur_mr = $read_set_pop();
	450
	451	// Update current R/W sets
	452	k->write_sets[cur_tid] = cur_mw;
	453	k->read_sets[cur_tid] = cur_mr;
[ce6b559]	454
	455	/*
	456	printf("CHECKING DATA RACE %d [\n", cur_tid);
	457	for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
	458	printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
	459	printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
	460	}
	461	printf("]\n");
	462	*/
	463
[d811c20]	464	// Check data race
	465	for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
	466	if (tmp_tid == cur_tid) continue;
	467
	468	$mem tmp_mr = k->read_sets[tmp_tid];
	469	$mem tmp_mw = k->write_sets[tmp_tid];
	470
	471	$assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1),
	472	"Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
	473	out_s0, cur_tid, out_s1, tmp_tid);
	474	$assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1),
	475	"Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
	476	out_s0, cur_tid, out_s1, tmp_tid);
	477	$assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1),
	478	"Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
	479	out_s0, cur_tid, out_s1, tmp_tid);
	480	}
	481	// Update current R/W sets
[5229f4f]	482	//k->write_sets[cur_tid] = $mem_empty();
	483	//k->read_sets[cur_tid] = $mem_empty();
[d811c20]	484	$read_set_push();
	485	$write_set_push();
	486	}
[ce6b559]	487
	488	/* Clears read and write memory sets of the given thread
	489	*/
	490	void $clear_mem_sets($cuda_kernel_instance_t *k, int cur_tid) {
	491	k->write_sets[cur_tid] = $mem_empty();
	492	k->read_sets[cur_tid] = $mem_empty();
	493	}
[4149b300]	494
[4724a68]	495	void $clear_all_mem_sets($cuda_kernel_instance_t *k){
	496	for(int i = 0; i < k->size; i++)
	497	$clear_mem_sets(k, i);
	498	}
	499
[4149b300]	500	/* Publishes current read a write sets to global arrays. Local sets are not cleared
	501	*/
	502	void $publish($cuda_kernel_instance_t *k, int cur_tid) {
	503	k->write_sets[cur_tid] = $write_set_peek();
	504	k->read_sets[cur_tid] = $read_set_peek();
	505	}
	506
	507	int is_valid_width(int width){
	508	for(int i = 32; i > 1; i /= 2){
	509	if(width == i){
	510	return 1;
	511	}
	512	}
	513	return 0;
	514	}
	515
	516	int exchange_data(unsigned mask, int var, int srcLane, int tid, $comm comm, $gbarrier* warpBarriers){
	517	$read_set_push();
	518	$write_set_push();
	519	int laneID = tid % 32;
	520	int warpID = tid / 32;
[4724a68]	521	int currLaneInMask = (mask >> laneID) & 1;
	522	int srcLaneInMask = (mask >> srcLane) & 1;
[4149b300]	523	int dest;
	524	$gbarrier gbarrier = warpBarriers[warpID];
	525	int numActiveThreads = 0;
	526	for (int i = 0; i < $get_nprocs(gbarrier); i++){
	527	numActiveThreads += (mask >> i) & 1;
	528	}
[4724a68]	529	if(currLaneInMask){
	530	$barrier barrier = $barrier_create($here, gbarrier, laneID);
	531	if(srcLaneInMask){
	532	$message request_message = $message_pack(tid, srcLane, 0, &tid, sizeof(int));
	533	$comm_enqueue(comm, request_message);
	534	}
[4149b300]	535	$barrier_call_subset(barrier, numActiveThreads);
[4724a68]	536	while ($comm_probe(comm, $COMM_ANY_SOURCE, 0)){
	537	$yield();
	538	$message recv_request = $comm_dequeue(comm, $COMM_ANY_SOURCE, 0);
	539	$message_unpack(recv_request, &dest, sizeof(int));
	540	$message send_message = $message_pack(tid, dest, 1, &var, sizeof(int));
	541	$comm_enqueue(comm, send_message);
	542	}
	543	if(srcLaneInMask){
	544	$yield();
	545	$message recv_message = $comm_dequeue(comm, srcLane, 1);
	546	$message_unpack(recv_message, &var, sizeof(int));
	547	}
	548	else{
	549	$havoc(&var);
	550	}
	551	$barrier_destroy(barrier);
[4149b300]	552	}
[4724a68]	553	else{
	554	if(laneID != srcLane)
	555	$havoc(&var);
	556	}
[4149b300]	557	$read_set_pop();
	558	$write_set_pop();
	559	return var;
	560	}
	561
	562	int _cuda__shfl_sync(unsigned mask, int var, int srcLane, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
	563	$assert(is_valid_width(width));
	564	int subWarpID = tid / width;
	565	srcLane = srcLane % width + subWarpID * width;
	566	if(srcLane >= numThreads){
	567	int lastSubWarpSize = numThreads % width;
	568	srcLane = srcLane % lastSubWarpSize + subWarpID * width;
	569	}
	570	return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
	571	}
	572	int _cuda__shfl_up_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
	573	$assert(is_valid_width(width));
	574	int subWarpLaneID = tid % width;
	575	int srcLane = tid;
	576	if (subWarpLaneID - delta >= 0){
	577	srcLane = tid - delta;
	578	}
	579	return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
	580	}
	581	int _cuda__shfl_down_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
	582	$assert(is_valid_width(width));
	583	int subWarpLaneID = tid % width;
	584	int srcLane = tid;
	585	if (subWarpLaneID + delta < width && tid + delta < numThreads) {
	586	srcLane = tid + delta;
	587	}
	588	return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
	589	}
	590	int _cuda__shfl_xor_sync(unsigned mask, int var, int laneMask, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
	591	$assert(is_valid_width(width));
	592	int laneID = tid % 32;
	593	int warpID = tid / 32;
	594	int subWarpID = tid / width;
	595	int srcLane = laneID ^ laneMask;
	596	if(!(srcLane / 32 == warpID && srcLane / width <= subWarpID && srcLane < numThreads)){
	597	srcLane = tid;
	598	}
	599	return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
	600	}
	601

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format