Context Navigation

dot2.cvl@ 764d472

1.23 2.0 acw/focus-triggers main test-branch

Last change on this file since 764d472 was 9dbe6120, checked in by Andre Marianiello <andre.marianiello@…>, 12 years ago

Initial commit of Cuda translation header for CIVL and a non-working translation of dot.cu

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1186 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.1 KB

Line
1	/*
2	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3	*
4	* NVIDIA Corporation and its licensors retain all intellectual property and
5	* proprietary rights in and to this software and related documentation.
6	* Any use, reproduction, disclosure, or distribution of this software
7	* and related documentation without an express license agreement from
8	* NVIDIA Corporation is strictly prohibited.
9	*
10	* Please refer to the applicable NVIDIA end user license agreement (EULA)
11	* associated with this source code for terms and conditions that govern
12	* your use of this NVIDIA software.
13	*
14	* This is a version translated to CIVL.
15	*
16	*/
17
18	#include "civlc.h"
19	#include "civlc-cuda.cvl"
20	#include "stdio.h"
21
22	#define imin(a,b) (a<b?a:b)
23
24	$input int N;
25	$input int B;
26	$assume(0 <= N && N <= B);
27	const int threadsPerBlock = 2;
28	const int blocksPerGrid =
29	imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
30
31	void _kernel_dot(dim3 gridDim, dim3 blockDim, cudaStream_t s, float a, float b, float *c ) {
32
33	void _kernel ($proc prevProc) {
34
35	_tryWait(prevProc);
36
37	void _block (uint3 blockIdx) {
38	int _numThreads = blockDim.x * blockDim.y * blockDim.z;
39	$gbarrier _block_barrier = $gbarrier_create($here, _numThreads);
40
41	float cache[threadsPerBlock];
42
43	void _thread (uint3 threadIdx) {
44
45	int _tid = _index(blockDim, threadIdx);
46	$barrier _b = $barrier_create($here, _block_barrier, _tid);
47
48	int tid = threadIdx.x + blockIdx.x * blockDim.x;
49	int cacheIndex = threadIdx.x;
50
51	float temp = 0;
52	while (tid < N) {
53	temp += a[tid] * b[tid];
54	tid += blockDim.x * gridDim.x;
55	}
56
57	// set the cache values
58	cache[cacheIndex] = temp;
59
60	// synchronize threads in this block
61
62	$barrier_call(_b);
63
64	// for reductions, threadsPerBlock must be a power of 2
65	// because of the following code
66	int i = blockDim.x/2;
67	while (i != 0) {
68	if (cacheIndex < i) {
69	cache[cacheIndex] += cache[cacheIndex + i];
70	}
71	//$barrier_call(_b);
72	i /= 2;
73	}
74
75	if (cacheIndex == 0)
76	c[blockIdx.x] = cache[0];
77
78	$barrier_destroy(_b);
79	}
80	_createProcs(blockDim, _thread);
81	$gbarrier_destroy(_block_barrier);
82	}
83	_createProcs(gridDim, _block);
84	}
85	_initKernel(s, _kernel);
86	}
87
88	int main ( void ) {
89	_host_scope = $here;
90
91	int _main( void ) {
92	float a, b, c, *partial_c;
93	float dev_a, dev_b, *dev_partial_c;
94
95	// allocate memory on the cpu side
96	a = (float)$malloc(_host_scope, Nsizeof(float) );
97	b = (float)$malloc(_host_scope, Nsizeof(float) );
98	partial_c = (float)$malloc(_host_scope, blocksPerGridsizeof(float) );
99
100	// allocate the memory on the GPU
101	dev_a = (float )$malloc( _global_scope, Nsizeof(float) );
102	dev_b = (float )$malloc( _global_scope, Nsizeof(float) );
103	dev_partial_c = (float )$malloc( _global_scope, blocksPerGridsizeof(float) );
104
105	// fill in the host memory with data
106	for (int i=0; i<N; i++) {
107	a[i] = i;
108	b[i] = i*2;
109	}
110
111	// copy the arrays 'a' and 'b' to the GPU
112	cudaMemcpy( dev_a, a, N*sizeof(float),
113	cudaMemcpyHostToDevice ); // civl version of cudaMemcpy will check memory correctness, then call regular memcpy
114	cudaMemcpy( dev_b, b, N*sizeof(float),
115	cudaMemcpyHostToDevice );
116
117	dim3 _t1, _t2;
118	_t1 = _toDim3(blocksPerGrid);
119	_t2 = _toDim3(threadsPerBlock);
120	_kernel_dot(_t1, _t2, dev_a, dev_b,
121	dev_partial_c );
122
123	// copy the array 'c' back from the GPU to the CPU
124	cudaMemcpy( partial_c, dev_partial_c,
125	blocksPerGrid*sizeof(float),
126	cudaMemcpyDeviceToHost );
127
128	// finish up on the CPU side
129	c = 0;
130	for (int i=0; i<blocksPerGrid; i++) {
131	c += partial_c[i];
132	}
133
134	#define sum_squares(x) (x(x+1)(2*x+1)/6)
135	printf( "Does GPU value %.6g = %.6g?\n", c,
136	2 * sum_squares( (float)(N - 1) ) );
137	$assert(c == 2 * sum_squares( (float)(N - 1) ) );
138
139	// free memory on the gpu side
140	cudaFree( dev_a ); // civl versionof cudaFree will check memory correctness, then call regular free
141	cudaFree( dev_b );
142	cudaFree( dev_partial_c );
143
144	// free memory on the cpu side
145	$free( a );
146	$free( b );
147	$free( partial_c );
148	return 0;
149	}
150
151	_main();
152	}
153
154

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format