Context Navigation

dot2.cvl@ ed451d9

1.23 2.0 main test-branch

Last change on this file since ed451d9 was d66b03b, checked in by Ziqing Luo <ziqing@…>, 10 years ago

got rid of $root

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@3026 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.9 KB

Line
1	/*
2	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3	*
4	* NVIDIA Corporation and its licensors retain all intellectual property and
5	* proprietary rights in and to this software and related documentation.
6	* Any use, reproduction, disclosure, or distribution of this software
7	* and related documentation without an express license agreement from
8	* NVIDIA Corporation is strictly prohibited.
9	*
10	* Please refer to the applicable NVIDIA end user license agreement (EULA)
11	* associated with this source code for terms and conditions that govern
12	* your use of this NVIDIA software.
13	*
14	* This is a version translated to CIVL.
15	*
16	*/
17
18	#include <civlc.cvh>
19	#include <stdio.h>
20	#include <cuda.h>
21
22	#define imin(a,b) (a<b?a:b)
23	$scope root = $here;
24
/*
 * Returns true iff x is a positive power of two (1, 2, 4, 8, ...).
 *
 * Zero and negative values yield false. The loop only runs while x > 1, so
 * the function terminates for every input; the earlier recursive form never
 * terminated for x == 0 because 0 % 2 == 0 kept recursing on 0 / 2 == 0.
 */
_Bool isPowerOfTwo(int x) {
  // Strip factors of two; a power of two is reduced to exactly 1.
  while (x > 1 && x % 2 == 0) {
    x /= 2;
  }
  return x == 1;
}
32
33
34	// the length of the vectors to dot product
35	$input int N;
36	// upper bound on N
37	$input int B;
38	$assume((0 <= N && N <= B));
39	$input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
40	$input int THREADS_B;
41	$assume((1 <= THREADS_PER_BLOCK && THREADS_PER_BLOCK <= THREADS_B));
42
43	const int threadsPerBlock = THREADS_PER_BLOCK;
44	// the number of blocks to create
45	const int blocksPerGrid =
46	imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
47
48	void _kernel_dot(dim3 gridDim, dim3 blockDim, cudaStream_t s, float a, float b, float *c ) {
49
50	void _kernel (_kernelInstance *this, cudaEvent_t e) {
51
52	void _block (uint3 blockIdx) {
53	// calculate total number of threads
54	int _numThreads = blockDim.x * blockDim.y * blockDim.z;
55	$gbarrier _block_barrier = $gbarrier_create($here, _numThreads);
56
57	// shared memory for a block
58	float cache[threadsPerBlock];
59
60	void _thread (uint3 threadIdx) {
61
62	// get the one dimensional index of this threads
63	int _tid = _index(blockDim, threadIdx);
64	$barrier _b = $barrier_create($here, _block_barrier, _tid);
65
66	// get this thread's id
67	int tid = threadIdx.x + blockIdx.x * blockDim.x;
68	int cacheIndex = threadIdx.x;
69
70	float temp = 0;
71	int i = blockDim.x/2;
72
73	while (tid < N) {
74	temp += a[tid] * b[tid];
75	tid += blockDim.x * gridDim.x;
76	}
77
78	// set the cache values
79	cache[cacheIndex] = temp;
80
81	// synchronize threads in this block
82
83	$barrier_call(_b);
84
85	// for reductions, threadsPerBlock must be a power of 2
86	// because of the following code
87	while (i != 0) {
88	if (cacheIndex < i) {
89	cache[cacheIndex] += cache[cacheIndex + i];
90	}
91	$barrier_call(_b);
92	i /= 2;
93	}
94
95	if (cacheIndex == 0) {
96	c[blockIdx.x] = cache[0];
97	}
98
99
100	$barrier_destroy(_b);
101	}
102
103	_runProcs(blockDim, _thread);
104	$gbarrier_destroy(_block_barrier);
105	}
106
107	_waitInQueue(this, e);
108	_runProcs(gridDim, _block);
109	_kernelFinish(this);
110	}
111	_enqueueKernel(s, _kernel);
112	}
113
114	int main ( void ) {
115
116	int _main( void ) {
117	float a, b, c, *partial_c;
118	float dev_a, dev_b, *dev_partial_c;
119
120	// allocate memory on the cpu side
121	a = (float)$malloc(root, Nsizeof(float) );
122	b = (float)$malloc(root, Nsizeof(float) );
123	partial_c = (float)$malloc(root, blocksPerGridsizeof(float) );
124
125	// allocate the memory on the GPU
126	dev_a = (float )$malloc(root, Nsizeof(float) );
127	dev_b = (float )$malloc(root, Nsizeof(float) );
128	dev_partial_c = (float )$malloc(root, blocksPerGridsizeof(float) );
129
130	// fill in the host memory with data
131	for (int i=0; i<N; i++) {
132	a[i] = i;
133	b[i] = i*2;
134	}
135
136	// copy the arrays 'a' and 'b' to the GPU
137	cudaMemcpy( dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice );
138	cudaMemcpy( dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice );
139
140	dim3 _t1, _t2;
141	_t1 = _toDim3(blocksPerGrid);
142	_t2 = _toDim3(threadsPerBlock);
143	_kernel_dot(_t1, _t2, 0, dev_a, dev_b, dev_partial_c );
144	// copy the array 'c' back from the GPU to the CPU
145	cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost );
146
147	// finish up on the CPU side
148	c = 0;
149	for (int i=0; i<blocksPerGrid; i++) {
150	c += partial_c[i];
151	}
152
153	#define sum_squares(x) (x(x+1)(2*x+1)/6)
154	printf( "Does GPU value %.6g = %.6g?\n", c,
155	2 * sum_squares( (float)(N - 1) ) );
156	$assert(c == 2 * sum_squares( (float)(N - 1) ) );
157
158	// free memory on the gpu side
159	cudaFree( dev_a ); // civl versionof cudaFree will check memory correctness, then call regular free
160	cudaFree( dev_b );
161	cudaFree( dev_partial_c );
162
163	// free memory on the cpu side
164	$free( a );
165	$free( b );
166	$free( partial_c );
167	return 0;
168	}
169
170	_Bool valid = isPowerOfTwo(THREADS_PER_BLOCK);
171	$assume((valid));
172
173	_cudaInit();
174	_main();
175	_cudaFinalize();
176	}
177
178

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format