| 1 | #include <civlc.cvh>
|
|---|
| 2 | /* This header file contains useful helper functions for manipulating
|
|---|
| 3 | * the CIVL versions of various Cuda objects.
|
|---|
| 4 | */
|
|---|
| 5 |
|
|---|
| 6 | #include <string.h>
|
|---|
| 7 | // To print for debugging
|
|---|
| 8 | #include <stdio.h>
|
|---|
| 9 | #include <stdlib.h>
|
|---|
| 10 | #include <concurrency.cvh>
|
|---|
| 11 | #include <seq.cvh>
|
|---|
| 12 | #include <mem.cvh>
|
|---|
| 13 | #include <civl-cuda.cvh>
|
|---|
| 14 | #include <comm.cvh>
|
|---|
| 15 | #pragma CIVL ACSL
|
|---|
| 16 |
|
|---|
/* Type used to represent one instance of a Cuda kernel (queued, running,
 * or finished).
 * TODO: Update with read/write sets
 */
struct $cuda_kernel_instance {
  $proc process;              // the CIVL process executing this kernel; $proc_null until spawned
  $cuda_kernel_status status; // waiting -> running -> finished (see status constants used below)
  // Per-thread read and write sets, indexed by the flattened thread id;
  // consulted by $check_data_race to detect conflicting accesses.
  $mem read_sets[];
  $mem write_sets[];
  // Total number of threads in the kernel (grid cells * threads per block).
  int size;
};
|
|---|
| 29 |
|
|---|
/* A type that wraps a kernel instance for insertion into a singly-linked
 * list; streams keep these newest-first (see _CUstream.mostRecent).
 */
struct $cuda_kernel_instance_node {
  $cuda_kernel_instance_t *instance;  // payload; NULL until assigned by the creator's caller
  $cuda_kernel_instance_node_t *next; // next (older) node in the list, or NULL at the tail
};
|
|---|
| 36 |
|
|---|
| 37 | /* $cuda_kernel_instance_node_t interface
|
|---|
| 38 | */
|
|---|
| 39 | $cuda_kernel_instance_t *$cuda_get_instance($cuda_kernel_instance_node_t *node) {
|
|---|
| 40 | return node->instance;
|
|---|
| 41 | }
|
|---|
| 42 |
|
|---|
/* A type used to represent a Cuda stream: a newest-first linked list of
 * the kernel instances enqueued on it.
 */
struct _CUstream {
  $cuda_kernel_instance_node_t *mostRecent; // list head: the most recently enqueued instance
  _Bool usable;                             // indicates whether or not this stream
                                            // has been marked for deletion
                                            // ($false once marked; checked on enqueue/destroy)
};
|
|---|
| 50 |
|
|---|
| 51 | /* _CUstream interface
|
|---|
| 52 | */
|
|---|
| 53 | $cuda_kernel_instance_node_t *$cuda_get_most_recent(cudaStream_t s) {
|
|---|
| 54 | return s->mostRecent;
|
|---|
| 55 | }
|
|---|
| 56 |
|
|---|
| 57 | _Bool $cuda_is_usable(cudaStream_t s) {
|
|---|
| 58 | return s->usable;
|
|---|
| 59 | }
|
|---|
| 60 |
|
|---|
| 61 | void $cuda_set_usable(cudaStream_t s, _Bool b) {
|
|---|
| 62 | s->usable = b;
|
|---|
| 63 | }
|
|---|
| 64 |
|
|---|
/* A type that wraps a stream for insertion into the context's
 * singly-linked list of streams (see $cuda_context.headNode).
 */
struct $cuda_stream_node {
  cudaStream_t stream;       // the wrapped stream; NULL until assigned
  $cuda_stream_node_t *next; // next node in the context's stream list, or NULL at the tail
};
|
|---|
| 71 |
|
|---|
| 72 | /* $cuda_stream_node_t encapsulation functions
|
|---|
| 73 | */
|
|---|
| 74 | void $cuda_set_stream($cuda_stream_node_t* node, cudaStream_t stream) {
|
|---|
| 75 | node->stream = stream;
|
|---|
| 76 | }
|
|---|
| 77 |
|
|---|
| 78 | void $cuda_set_next($cuda_stream_node_t* node, $cuda_stream_node_t* next) {
|
|---|
| 79 | node->next = next;
|
|---|
| 80 | }
|
|---|
| 81 |
|
|---|
/* A type used to represent a Cuda event: a snapshot of the kernel
 * instances that must finish before the event is complete.
 */
struct _CUevent {
  $cuda_kernel_instance_t **instances; // malloc'ed array of instances waited on; NULL when empty
  int numInstances;                    // number of entries in instances
};
|
|---|
| 88 |
|
|---|
| 89 | /* _CUevent encapsulation functions
|
|---|
| 90 | */
|
|---|
| 91 | $cuda_kernel_instance_t **$cuda_get_instances(cudaEvent_t e) {
|
|---|
| 92 | return e->instances;
|
|---|
| 93 | }
|
|---|
| 94 |
|
|---|
| 95 | void $cuda_set_instances(cudaEvent_t e, $cuda_kernel_instance_t** instances, int numInstances) {
|
|---|
| 96 | e->instances = instances;
|
|---|
| 97 | e->numInstances = numInstances;
|
|---|
| 98 | }
|
|---|
| 99 |
|
|---|
| 100 | int $cuda_get_num_instances(cudaEvent_t e) {
|
|---|
| 101 | return e->numInstances;
|
|---|
| 102 | }
|
|---|
| 103 |
|
|---|
/* A type representing the state of a Cuda device.
 */
struct $cuda_context {
  $cuda_stream_node_t *headNode; // head of the linked list of user-created streams; NULL if none
  cudaStream_t nullStream;       // the default (NULL) stream; created by $cuda_init
  int numStreams;                // number of user-created streams in the list
};
|
|---|
| 111 |
|
|---|
| 112 | /* $cuda_context_t encapsulation functions
|
|---|
| 113 | */
|
|---|
| 114 | int $cuda_get_num_streams($cuda_context_t *_context) {
|
|---|
| 115 | return _context->numStreams;
|
|---|
| 116 | }
|
|---|
| 117 |
|
|---|
| 118 | $cuda_stream_node_t *$cuda_get_head_node($cuda_context_t *_context) {
|
|---|
| 119 | return _context->headNode;
|
|---|
| 120 | }
|
|---|
| 121 |
|
|---|
| 122 | cudaStream_t $cuda_get_null_stream($cuda_context_t *_context) {
|
|---|
| 123 | return _context->nullStream;
|
|---|
| 124 | }
|
|---|
| 125 |
|
|---|
| 126 | void $cuda_add_new_stream($cuda_context_t *_context, $cuda_stream_node_t *newNode) {
|
|---|
| 127 | _context->headNode = newNode;
|
|---|
| 128 | _context->numStreams++;
|
|---|
| 129 | }
|
|---|
| 130 |
|
|---|
| 131 | /* Computes the one dimensional index of a grid cell at a given location
|
|---|
| 132 | * in a three dimensional grid of a given size
|
|---|
| 133 | */
|
|---|
| 134 | int $cuda_index (dim3 size, uint3 location) {
|
|---|
| 135 | return location.x + size.x * (location.y + size.y * location.z);
|
|---|
| 136 | }
|
|---|
| 137 |
|
|---|
| 138 | /* Computes the one dimensional index of a specific thread in the grid given
|
|---|
| 139 | * the grid dimension, block dimension, block index, and thread index
|
|---|
| 140 | */
|
|---|
| 141 | int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
|
|---|
| 142 | return $cuda_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $cuda_index(bDim, tIdx);
|
|---|
| 143 | }
|
|---|
| 144 |
|
|---|
| 145 | /* Lifts a single integer x into a three dimensional vector representing
|
|---|
| 146 | * a one dimensional grid of length x
|
|---|
| 147 | */
|
|---|
| 148 | dim3 $cuda_to_dim3(int x) {
|
|---|
| 149 | dim3 d = { x, 1, 1 };
|
|---|
| 150 |
|
|---|
| 151 | return d;
|
|---|
| 152 | }
|
|---|
| 153 |
|
|---|
/* Given a three dimensional vector representing a grid of size dim,
 * create and destroy a process, in parallel, for each cell in the grid.
 * The location of the cell is passed to the spawning function.
 * A zero extent in any dimension yields an empty range, so no processes
 * are spawned at all.
 */
void $cuda_run_procs(dim3 dim, void spawningFunction(uint3)) {
  // Each range is empty when its extent is 0 (0 .. -1 is the empty $range).
  $range rx = 0 .. (dim.x == 0 ? -1 : dim.x - 1);
  $range ry = 0 .. (dim.y == 0 ? -1 : dim.y - 1);
  $range rz = 0 .. (dim.z == 0 ? -1 : dim.z - 1);
  $domain(3) dom = ($domain){rx, ry, rz};
  // One process per (x, y, z) tuple in the domain, run in parallel.
  $parfor(int x,y,z : dom){
    uint3 id = { x, y, z };
    spawningFunction(id);
  }
}
|
|---|
| 168 |
|
|---|
| 169 | // ------------------------------------------------
|
|---|
| 170 |
|
|---|
| 171 | /* $wait on a given process is it is non-null
|
|---|
| 172 | */
|
|---|
| 173 | void $cuda_try_wait($proc p) {
|
|---|
| 174 | if (p != $proc_null) {
|
|---|
| 175 | $yield();
|
|---|
| 176 | $wait(p);
|
|---|
| 177 | }
|
|---|
| 178 | }
|
|---|
| 179 |
|
|---|
/* The current state of the GPU: no user streams yet; the null stream
 * stays NULL until $cuda_init() creates it.
 */
$cuda_context_t $cuda_current_context = {
  .headNode = NULL,
  .nullStream = NULL,
  .numStreams = 0
};
|
|---|
| 187 |
|
|---|
/* malloc and initialize a new $cuda_kernel_instance_t: no process yet,
 * status "waiting", and one empty read set and write set per thread.
 */

// TODO: Determine if passing parameters is better or not
$cuda_kernel_instance_t *$cuda_kernel_instance_create(dim3 gDim, dim3 bDim) {
  //printf("mallocing kernel instance\n");
  $cuda_kernel_instance_t *i = ($cuda_kernel_instance_t*)malloc(sizeof($cuda_kernel_instance_t));

  i->process = $proc_null; // filled in by $cuda_enqueue_kernel when the kernel is spawned
  i->status = $cuda_kernel_status_waiting;
  // Total thread count: grid cells times threads per block.
  int threadNum = gDim.x * gDim.y * gDim.z * bDim.x * bDim.y * bDim.z;
  i->size = threadNum;
  // TODO: either sequentially or non-sequentially?
  // Initialize both per-thread sequences to threadNum copies of the empty $mem.
  $mem empty = $mem_empty();
  $seq_init(&i->read_sets, threadNum, &empty);
  $seq_init(&i->write_sets, threadNum, &empty);
  return i;
}
|
|---|
| 206 |
|
|---|
| 207 | /* cleanup and free a given $cuda_kernel_instance_t
|
|---|
| 208 | */
|
|---|
| 209 | void $cuda_kernel_instance_destroy($cuda_kernel_instance_t *i) {
|
|---|
| 210 | $cuda_try_wait(i->process);
|
|---|
| 211 | //printf("freeing kernel instance\n");
|
|---|
| 212 | $free(i);
|
|---|
| 213 | }
|
|---|
| 214 |
|
|---|
| 215 | /* malloc and initialize a new $cuda_kernel_instance_node_t
|
|---|
| 216 | */
|
|---|
| 217 | $cuda_kernel_instance_node_t *$cuda_kernel_instance_node_create() {
|
|---|
| 218 | //printf("mallocing kernel instance node\n");
|
|---|
| 219 | $cuda_kernel_instance_node_t *node = ($cuda_kernel_instance_node_t*)malloc(sizeof($cuda_kernel_instance_node_t));
|
|---|
| 220 |
|
|---|
| 221 | node->instance = NULL;
|
|---|
| 222 | node->next = NULL;
|
|---|
| 223 | return node;
|
|---|
| 224 | }
|
|---|
| 225 |
|
|---|
/* cleanup and free a given $cuda_kernel_instance_node_t: destroys the
 * wrapped instance (waiting for its process first), then frees the node.
 * NOTE(review): assumes node->instance is non-NULL; nodes are created
 * with a NULL instance, so callers must have assigned one — confirm.
 */
void $cuda_kernel_instance_node_destroy($cuda_kernel_instance_node_t *node) {
  $cuda_kernel_instance_destroy(node->instance);
  //printf("freeing kernel instance node\n");
  $free(node);
}
|
|---|
| 233 |
|
|---|
| 234 | /* malloc and initialize a new stream
|
|---|
| 235 | */
|
|---|
| 236 | cudaStream_t $cuda_stream_create() {
|
|---|
| 237 | cudaStream_t s;
|
|---|
| 238 |
|
|---|
| 239 | //printf("mallocing cuda stream\n");
|
|---|
| 240 | s = (cudaStream_t)malloc(sizeof(_CUstream));
|
|---|
| 241 | s->mostRecent = $cuda_kernel_instance_node_create();
|
|---|
| 242 | dim3 arbitrary = { 1, 1, 1 };
|
|---|
| 243 | s->mostRecent->instance = $cuda_kernel_instance_create(arbitrary, arbitrary);
|
|---|
| 244 | s->mostRecent->instance->status = $cuda_kernel_status_finished;
|
|---|
| 245 | s->usable = $true;
|
|---|
| 246 | return s;
|
|---|
| 247 | }
|
|---|
| 248 |
|
|---|
/* block until the most recently enqueued process on the given stream
 * has terminated (meaning all kernels in that stream have completed)
 */
void $cuda_stream_wait(cudaStream_t s) {
  // Snapshot the current head instance: kernels enqueued after this
  // call must not extend the wait.
  $cuda_kernel_instance_t *mostRecentInstance = s->mostRecent->instance;
  $yield();
  // Block until that instance reports finished.
  $unidirectional_when(mostRecentInstance->status == $cuda_kernel_status_finished);
}
|
|---|
| 257 |
|
|---|
| 258 | /* block until no more streams have kernels executing
|
|---|
| 259 | */
|
|---|
| 260 | void $cuda_stream_wait_all() {
|
|---|
| 261 | $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
|
|---|
| 262 |
|
|---|
| 263 | while (curNode != NULL) {
|
|---|
| 264 | $cuda_stream_wait(curNode->stream);
|
|---|
| 265 | curNode = curNode->next;
|
|---|
| 266 | }
|
|---|
| 267 | }
|
|---|
| 268 |
|
|---|
| 269 | /* cleanup and free a given stream
|
|---|
| 270 | */
|
|---|
| 271 | void $cuda_stream_destroy(cudaStream_t s) {
|
|---|
| 272 | $cuda_kernel_instance_node_t *curNode = s->mostRecent;
|
|---|
| 273 | $cuda_kernel_instance_node_t *nextNode;
|
|---|
| 274 |
|
|---|
| 275 | while (curNode != NULL) {
|
|---|
| 276 | nextNode = curNode->next;
|
|---|
| 277 | $cuda_kernel_instance_node_destroy(curNode);
|
|---|
| 278 | curNode = nextNode;
|
|---|
| 279 | }
|
|---|
| 280 | //printf("freeing cuda stream\n");
|
|---|
| 281 | $free(s);
|
|---|
| 282 | }
|
|---|
| 283 |
|
|---|
| 284 | /* malloc and initialize a new $cuda_stream_node_t
|
|---|
| 285 | */
|
|---|
| 286 | $cuda_stream_node_t *$cuda_stream_node_create() {
|
|---|
| 287 | //printf("mallocing cuda stream node\n");
|
|---|
| 288 | $cuda_stream_node_t *node = ($cuda_stream_node_t*)malloc(sizeof($cuda_stream_node_t));
|
|---|
| 289 |
|
|---|
| 290 | node->stream = NULL;
|
|---|
| 291 | node->next = NULL;
|
|---|
| 292 | return node;
|
|---|
| 293 | }
|
|---|
| 294 |
|
|---|
| 295 | /* cleanup and free a given $cuda_stream_node_t
|
|---|
| 296 | */
|
|---|
| 297 | void $cuda_stream_node_destroy($cuda_stream_node_t *node) {
|
|---|
| 298 | $assert((!node->stream->usable));
|
|---|
| 299 | $cuda_stream_destroy(node->stream);
|
|---|
| 300 | //printf("freeing cuda stream node\n");
|
|---|
| 301 | $free(node);
|
|---|
| 302 | }
|
|---|
| 303 |
|
|---|
| 304 | /* destroy all stream nodes contained in the context
|
|---|
| 305 | */
|
|---|
| 306 | void $cuda_stream_node_destroy_all() {
|
|---|
| 307 | $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
|
|---|
| 308 | $cuda_stream_node_t *nextNode;
|
|---|
| 309 |
|
|---|
| 310 | while (curNode != NULL) {
|
|---|
| 311 | nextNode = curNode->next;
|
|---|
| 312 | $cuda_stream_node_destroy(curNode);
|
|---|
| 313 | curNode = nextNode;
|
|---|
| 314 | }
|
|---|
| 315 | $cuda_current_context.headNode = NULL;
|
|---|
| 316 | }
|
|---|
| 317 |
|
|---|
| 318 | /* malloc and initialize a new event
|
|---|
| 319 | */
|
|---|
| 320 | cudaEvent_t $cuda_event_create() {
|
|---|
| 321 | //printf("mallocing event\n");
|
|---|
| 322 | cudaEvent_t e = (cudaEvent_t)malloc(sizeof(_CUevent));
|
|---|
| 323 |
|
|---|
| 324 | e->numInstances = 0;
|
|---|
| 325 | e->instances = NULL;
|
|---|
| 326 | return e;
|
|---|
| 327 | }
|
|---|
| 328 |
|
|---|
/* block until all $cuda_kernel_instance_ts contained in this event have
 * completed
 */
/*@ depends_on \access(e);
  @*/
void $cuda_event_wait(cudaEvent_t e) {
  // Wait for each tracked instance in turn; overall the call returns
  // only when every one of them has reached "finished".
  for (int i = 0; i < e->numInstances; i++) {
    $yield();
    $unidirectional_when(e->instances[i]->status == $cuda_kernel_status_finished);
  }
}
|
|---|
| 340 |
|
|---|
| 341 | /* cleanup and free a given event
|
|---|
| 342 | */
|
|---|
| 343 | void $cuda_event_destroy(cudaEvent_t e) {
|
|---|
| 344 | if (e->instances != NULL) {
|
|---|
| 345 | //printf("freeing instance list a\n");
|
|---|
| 346 | $free(e->instances);
|
|---|
| 347 | }
|
|---|
| 348 | //printf("freeing event\n");
|
|---|
| 349 | $free(e);
|
|---|
| 350 | }
|
|---|
| 351 |
|
|---|
/* initialize the cuda context. must be called before any cuda functions.
 * Creates the default (null) stream; user streams are added separately
 * via $cuda_add_new_stream.
 */
void $cuda_init() {
  $cuda_current_context.nullStream = $cuda_stream_create();
}
|
|---|
| 357 |
|
|---|
/* cleanup the cuda context. must be called after all cuda functions.
 * Order matters: every stream (including the null stream) is drained
 * before anything is destroyed.
 */
void $cuda_finalize() {
  $cuda_stream_wait_all();
  $cuda_stream_wait($cuda_current_context.nullStream);
  $cuda_stream_node_destroy_all();
  $cuda_stream_destroy($cuda_current_context.nullStream);
}
|
|---|
| 366 |
|
|---|
| 367 | /* returns an array of pointers to the most recently enqueued kernel
|
|---|
| 368 | * of each stream.
|
|---|
| 369 | */
|
|---|
| 370 | $cuda_kernel_instance_t **$cuda_all_most_recent_kernels() {
|
|---|
| 371 | int n = $cuda_current_context.numStreams + 1;
|
|---|
| 372 | $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
|
|---|
| 373 | //printf("mallocing instance list a\n");
|
|---|
| 374 | $cuda_kernel_instance_t **insts = ($cuda_kernel_instance_t**)malloc(n * sizeof($cuda_kernel_instance_t*)) ;
|
|---|
| 375 |
|
|---|
| 376 | insts[0] = $cuda_current_context.nullStream->mostRecent->instance;
|
|---|
| 377 | for (int i = 1; i < n; i++, curNode = curNode->next) {
|
|---|
| 378 | insts[i] = curNode->stream->mostRecent->instance;
|
|---|
| 379 | }
|
|---|
| 380 | return insts;
|
|---|
| 381 | }
|
|---|
| 382 |
|
|---|
/* create a kernel instance for the given function k, and enqueue it
 * onto the given stream. An event capturing the instances the new
 * kernel must wait for is handed to the spawned kernel process, which
 * presumably releases it via $cuda_wait_in_queue — confirm in callers.
 */
void $cuda_enqueue_kernel(cudaStream_t stream, void (*k)($cuda_kernel_instance_t*, cudaEvent_t), dim3 gDim,
    dim3 bDim) {
  cudaStream_t s;
  cudaEvent_t e = $cuda_event_create();
  $cuda_kernel_instance_node_t *newNode = $cuda_kernel_instance_node_create();

  if (stream == NULL) {
    // Null-stream launch: must wait for the tail kernel of every stream.
    e->numInstances = $cuda_current_context.numStreams + 1;
    e->instances = $cuda_all_most_recent_kernels();
    s = $cuda_current_context.nullStream;
  } else {
    // Regular stream: wait for this stream's tail and the null stream's tail.
    e->numInstances = 2;
    //printf("mallocing instance list b\n");
    e->instances = ($cuda_kernel_instance_t**)malloc(2 * sizeof($cuda_kernel_instance_t*)) ;
    e->instances[0] = stream->mostRecent->instance;
    e->instances[1] = $cuda_current_context.nullStream->mostRecent->instance;
    s = stream;
  }
  // Enqueueing onto a stream marked for deletion is an error.
  $assert((s->usable));
  newNode->instance = $cuda_kernel_instance_create(gDim, bDim);
  // Push the new instance at the head of the stream's list, then spawn
  // the kernel process on it.
  newNode->next = s->mostRecent;
  s->mostRecent = newNode;
  s->mostRecent->instance->process = $spawn k(s->mostRecent->instance, e);
}
|
|---|
| 410 |
|
|---|
/* called by kernel processes. wait on the given event (the predecessor
 * kernels captured at enqueue time), then update the status of the
 * calling kernel to indicate it has finished waiting.
 */
void $cuda_wait_in_queue ($cuda_kernel_instance_t *this, cudaEvent_t e) {
  $cuda_event_wait(e);
  // The event was allocated by $cuda_enqueue_kernel for this kernel
  // alone, so it is released here.
  $cuda_event_destroy(e);
  this->status = $cuda_kernel_status_running;
}
|
|---|
| 419 |
|
|---|
/* called by kernel processes. update the status of the calling kernel
 * to indicate that it has completed execution; $cuda_stream_wait and
 * $cuda_event_wait unblock once this status is observed.
 */
void $cuda_kernel_finish($cuda_kernel_instance_t *k) {
  k->status = $cuda_kernel_status_finished;
}
|
|---|
| 426 |
|
|---|
/* Barrier across the kernel's threads that also performs data-race
 * checking; kernel_id is the calling thread's flat id (the cur_tid
 * index used by $check_data_race).
 * TODO: Finish check_data_race
 */
void $cuda_barrier($cuda_kernel_instance_t *k, int kernel_id, $barrier g) {
  $check_data_race(k, kernel_id);
  // We have to push a new read and write set before the barrier call to ignore its reads and writes
  $read_set_push();
  $write_set_push();
  // Closure run as part of the barrier: resets every thread's published
  // read/write sets so the next epoch starts empty.
  void captured_clear_mems(){
    $clear_all_mem_sets(k);
  }
  $barrier_call_execute(g, captured_clear_mems);
  // Discard the sets accumulated by the barrier machinery itself.
  $read_set_pop();
  $write_set_pop();
}
|
|---|
| 441 |
|
|---|
| 442 | /* TODO: Finish read_write_sets
|
|---|
| 443 | */
|
|---|
| 444 | $atomic_f void $check_data_race($cuda_kernel_instance_t *k, int cur_tid) {
|
|---|
| 445 | //printf("Current id: %d\n", cur_tid);
|
|---|
| 446 | $mem out_s0 = $mem_empty();
|
|---|
| 447 | $mem out_s1 = $mem_empty();
|
|---|
| 448 | $mem cur_mw = $write_set_pop();
|
|---|
| 449 | $mem cur_mr = $read_set_pop();
|
|---|
| 450 |
|
|---|
| 451 | // Update current R/W sets
|
|---|
| 452 | k->write_sets[cur_tid] = cur_mw;
|
|---|
| 453 | k->read_sets[cur_tid] = cur_mr;
|
|---|
| 454 |
|
|---|
| 455 | /*
|
|---|
| 456 | printf("CHECKING DATA RACE %d [\n", cur_tid);
|
|---|
| 457 | for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
|
|---|
| 458 | printf(" RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
|
|---|
| 459 | printf(" WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
|
|---|
| 460 | }
|
|---|
| 461 | printf("]\n");
|
|---|
| 462 | */
|
|---|
| 463 |
|
|---|
| 464 | // Check data race
|
|---|
| 465 | for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
|
|---|
| 466 | if (tmp_tid == cur_tid) continue;
|
|---|
| 467 |
|
|---|
| 468 | $mem tmp_mr = k->read_sets[tmp_tid];
|
|---|
| 469 | $mem tmp_mw = k->write_sets[tmp_tid];
|
|---|
| 470 |
|
|---|
| 471 | $assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1),
|
|---|
| 472 | "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
|
|---|
| 473 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 474 | $assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1),
|
|---|
| 475 | "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
|
|---|
| 476 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 477 | $assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1),
|
|---|
| 478 | "Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
|
|---|
| 479 | out_s0, cur_tid, out_s1, tmp_tid);
|
|---|
| 480 | }
|
|---|
| 481 | // Update current R/W sets
|
|---|
| 482 | //k->write_sets[cur_tid] = $mem_empty();
|
|---|
| 483 | //k->read_sets[cur_tid] = $mem_empty();
|
|---|
| 484 | $read_set_push();
|
|---|
| 485 | $write_set_push();
|
|---|
| 486 | }
|
|---|
| 487 |
|
|---|
| 488 | /* Clears read and write memory sets of the given thread
|
|---|
| 489 | */
|
|---|
| 490 | void $clear_mem_sets($cuda_kernel_instance_t *k, int cur_tid) {
|
|---|
| 491 | k->write_sets[cur_tid] = $mem_empty();
|
|---|
| 492 | k->read_sets[cur_tid] = $mem_empty();
|
|---|
| 493 | }
|
|---|
| 494 |
|
|---|
| 495 | void $clear_all_mem_sets($cuda_kernel_instance_t *k){
|
|---|
| 496 | for(int i = 0; i < k->size; i++)
|
|---|
| 497 | $clear_mem_sets(k, i);
|
|---|
| 498 | }
|
|---|
| 499 |
|
|---|
| 500 | /* Publishes current read a write sets to global arrays. Local sets are not cleared
|
|---|
| 501 | */
|
|---|
| 502 | void $publish($cuda_kernel_instance_t *k, int cur_tid) {
|
|---|
| 503 | k->write_sets[cur_tid] = $write_set_peek();
|
|---|
| 504 | k->read_sets[cur_tid] = $read_set_peek();
|
|---|
| 505 | }
|
|---|
| 506 |
|
|---|
/* Returns 1 iff width is one of the accepted sub-warp widths
 * 2, 4, 8, 16 or 32 (powers of two up to the warp size; note that 1
 * falls outside the checked set), 0 otherwise.
 */
int is_valid_width(int width){
  int candidate = 2;

  while (candidate <= 32) {
    if (width == candidate)
      return 1;
    candidate *= 2;
  }
  return 0;
}
|
|---|
| 515 |
|
|---|
/* Models the value exchange behind the __shfl_*_sync variants for one
 * warp: thread tid requests var from srcLane over the comm channel
 * (requests on tag 0, replies on tag 1); lanes outside mask, or whose
 * source lane is outside mask, receive a havoc'ed (arbitrary) value.
 * warpBarriers holds one $gbarrier per warp of 32 threads.
 */
int exchange_data(unsigned mask, int var, int srcLane, int tid, $comm comm, $gbarrier* warpBarriers){
  // Exclude this function's own accesses from the data-race sets.
  $read_set_push();
  $write_set_push();
  int laneID = tid % 32;
  int warpID = tid / 32;
  int currLaneInMask = (mask >> laneID) & 1;
  int srcLaneInMask = (mask >> srcLane) & 1;
  int dest;
  $gbarrier gbarrier = warpBarriers[warpID];
  // Count the participating lanes (set bits of mask over the barrier's size).
  int numActiveThreads = 0;
  for (int i = 0; i < $get_nprocs(gbarrier); i++){
    numActiveThreads += (mask >> i) & 1;
  }
  if(currLaneInMask){
    $barrier barrier = $barrier_create($here, gbarrier, laneID);
    if(srcLaneInMask){
      // Ask our source lane for its value: tag 0, payload = our tid.
      $message request_message = $message_pack(tid, srcLane, 0, &tid, sizeof(int));
      $comm_enqueue(comm, request_message);
    }
    // Wait until every active lane has posted its request.
    $barrier_call_subset(barrier, numActiveThreads);
    // Answer every request addressed to us: reply with var on tag 1.
    while ($comm_probe(comm, $COMM_ANY_SOURCE, 0)){
      $yield();
      $message recv_request = $comm_dequeue(comm, $COMM_ANY_SOURCE, 0);
      $message_unpack(recv_request, &dest, sizeof(int));
      $message send_message = $message_pack(tid, dest, 1, &var, sizeof(int));
      $comm_enqueue(comm, send_message);
    }
    if(srcLaneInMask){
      // Receive the source lane's value into var.
      $yield();
      $message recv_message = $comm_dequeue(comm, srcLane, 1);
      $message_unpack(recv_message, &var, sizeof(int));
    }
    else{
      // Source lane is outside the mask: the result is arbitrary.
      $havoc(&var);
    }
    $barrier_destroy(barrier);
  }
  else{
    // This lane is outside the mask: arbitrary result unless it is its
    // own source.
    if(laneID != srcLane)
      $havoc(&var);
  }
  $read_set_pop();
  $write_set_pop();
  return var;
}
|
|---|
| 561 |
|
|---|
/* Models __shfl_sync: reads var from lane srcLane, taken modulo width
 * within the caller's sub-warp. width must be one of 2..32 (checked via
 * is_valid_width). A source lane that falls past the last thread is
 * re-wrapped into the partial final sub-warp.
 */
int _cuda__shfl_sync(unsigned mask, int var, int srcLane, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
  $assert(is_valid_width(width));
  int subWarpID = tid / width;
  // Wrap srcLane into this thread's sub-warp.
  srcLane = srcLane % width + subWarpID * width;
  if(srcLane >= numThreads){
    // Source landed past the last thread: wrap within the partial last
    // sub-warp instead.
    // NOTE(review): lastSubWarpSize = numThreads % width; this branch
    // appears unreachable when numThreads is a multiple of width (which
    // would make it 0) — confirm no division by zero is possible.
    int lastSubWarpSize = numThreads % width;
    srcLane = srcLane % lastSubWarpSize + subWarpID * width;
  }
  return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
}
|
|---|
| 572 | int _cuda__shfl_up_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
|
|---|
| 573 | $assert(is_valid_width(width));
|
|---|
| 574 | int subWarpLaneID = tid % width;
|
|---|
| 575 | int srcLane = tid;
|
|---|
| 576 | if (subWarpLaneID - delta >= 0){
|
|---|
| 577 | srcLane = tid - delta;
|
|---|
| 578 | }
|
|---|
| 579 | return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
|
|---|
| 580 | }
|
|---|
| 581 | int _cuda__shfl_down_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
|
|---|
| 582 | $assert(is_valid_width(width));
|
|---|
| 583 | int subWarpLaneID = tid % width;
|
|---|
| 584 | int srcLane = tid;
|
|---|
| 585 | if (subWarpLaneID + delta < width && tid + delta < numThreads) {
|
|---|
| 586 | srcLane = tid + delta;
|
|---|
| 587 | }
|
|---|
| 588 | return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
|
|---|
| 589 | }
|
|---|
| 590 | int _cuda__shfl_xor_sync(unsigned mask, int var, int laneMask, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
|
|---|
| 591 | $assert(is_valid_width(width));
|
|---|
| 592 | int laneID = tid % 32;
|
|---|
| 593 | int warpID = tid / 32;
|
|---|
| 594 | int subWarpID = tid / width;
|
|---|
| 595 | int srcLane = laneID ^ laneMask;
|
|---|
| 596 | if(!(srcLane / 32 == warpID && srcLane / width <= subWarpID && srcLane < numThreads)){
|
|---|
| 597 | srcLane = tid;
|
|---|
| 598 | }
|
|---|
| 599 | return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
|
|---|
| 600 | }
|
|---|
| 601 |
|
|---|