source: CIVL/examples/cuda/dot2.cvl@ 77ad37b

1.23 2.0 main test-branch
Last change on this file since 77ad37b was 3ff27cf, checked in by Manchun Zheng <zmanchun@…>, 11 years ago

updated examples since $assert/$assume has been changed to functions; fixed the model builder for the new side-effect remover.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@2085 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 4.9 KB
Line 
1/*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 * This is a version translated to CIVL.
15 *
16 */
17
18#include <civlc.cvh>
19#include <stdio.h>
20#include <cuda.h>
21
22#define imin(a,b) (a<b?a:b)
23
24_Bool isPowerOfTwo(int x) {
25 if (x == 1) {
26 return $true;
27 } else {
28 return x % 2 == 0 && isPowerOfTwo(x / 2);
29 }
30}
31
32
33// the length of the vectors to dot product
34$input int N;
35// upper bound on N
36$input int B;
37$assume((0 <= N && N <= B));
38$input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
39$input int THREADS_B;
40$assume((1 <= THREADS_PER_BLOCK && THREADS_PER_BLOCK <= THREADS_B));
41
42const int threadsPerBlock = THREADS_PER_BLOCK;
43// the number of blocks to create
44const int blocksPerGrid =
45 imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
46
47void _kernel_dot(dim3 gridDim, dim3 blockDim, cudaStream_t s, float *a, float *b, float *c ) {
48
49 void _kernel (_kernelInstance *this, cudaEvent_t e) {
50
51 void _block (uint3 blockIdx) {
52 // calculate total number of threads
53 int _numThreads = blockDim.x * blockDim.y * blockDim.z;
54 $gbarrier _block_barrier = $gbarrier_create($here, _numThreads);
55
56 // shared memory for a block
57 float cache[threadsPerBlock];
58
59 void _thread (uint3 threadIdx) {
60
61 // get the one dimensional index of this threads
62 int _tid = _index(blockDim, threadIdx);
63 $barrier _b = $barrier_create($here, _block_barrier, _tid);
64
65 // get this thread's id
66 int tid = threadIdx.x + blockIdx.x * blockDim.x;
67 int cacheIndex = threadIdx.x;
68
69 float temp = 0;
70 int i = blockDim.x/2;
71
72 while (tid < N) {
73 temp += a[tid] * b[tid];
74 tid += blockDim.x * gridDim.x;
75 }
76
77 // set the cache values
78 cache[cacheIndex] = temp;
79
80 // synchronize threads in this block
81
82 $barrier_call(_b);
83
84 // for reductions, threadsPerBlock must be a power of 2
85 // because of the following code
86 while (i != 0) {
87 if (cacheIndex < i) {
88 cache[cacheIndex] += cache[cacheIndex + i];
89 }
90 $barrier_call(_b);
91 i /= 2;
92 }
93
94 if (cacheIndex == 0) {
95 c[blockIdx.x] = cache[0];
96 }
97
98
99 $barrier_destroy(_b);
100 }
101
102 _runProcs(blockDim, _thread);
103 $gbarrier_destroy(_block_barrier);
104 }
105
106 _waitInQueue(this, e);
107 _runProcs(gridDim, _block);
108 _kernelFinish(this);
109 }
110 _enqueueKernel(s, _kernel);
111}
112
113int main ( void ) {
114
115 int _main( void ) {
116 float *a, *b, c, *partial_c;
117 float *dev_a, *dev_b, *dev_partial_c;
118
119 // allocate memory on the cpu side
120 a = (float*)$malloc($root, N*sizeof(float) );
121 b = (float*)$malloc($root, N*sizeof(float) );
122 partial_c = (float*)$malloc($root, blocksPerGrid*sizeof(float) );
123
124 // allocate the memory on the GPU
125 dev_a = (float *)$malloc($root, N*sizeof(float) );
126 dev_b = (float *)$malloc($root, N*sizeof(float) );
127 dev_partial_c = (float *)$malloc($root, blocksPerGrid*sizeof(float) );
128
129 // fill in the host memory with data
130 for (int i=0; i<N; i++) {
131 a[i] = i;
132 b[i] = i*2;
133 }
134
135 // copy the arrays 'a' and 'b' to the GPU
136 cudaMemcpy( dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice );
137 cudaMemcpy( dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice );
138
139 dim3 _t1, _t2;
140 _t1 = _toDim3(blocksPerGrid);
141 _t2 = _toDim3(threadsPerBlock);
142 _kernel_dot(_t1, _t2, 0, dev_a, dev_b, dev_partial_c );
143// copy the array 'c' back from the GPU to the CPU
144 cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost );
145
146 // finish up on the CPU side
147 c = 0;
148 for (int i=0; i<blocksPerGrid; i++) {
149 c += partial_c[i];
150 }
151
152 #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
153 printf( "Does GPU value %.6g = %.6g?\n", c,
154 2 * sum_squares( (float)(N - 1) ) );
155 $assert((c == 2 * sum_squares( (float)(N - 1) ) ));
156
157 // free memory on the gpu side
158 cudaFree( dev_a ); // civl versionof cudaFree will check memory correctness, then call regular free
159 cudaFree( dev_b );
160 cudaFree( dev_partial_c );
161
162 // free memory on the cpu side
163 $free( a );
164 $free( b );
165 $free( partial_c );
166 return 0;
167 }
168
169 _Bool valid = isPowerOfTwo(THREADS_PER_BLOCK);
170 $assume((valid));
171
172 _cudaInit();
173 _main();
174 _cudaFinalize();
175}
176
177
Note: See TracBrowser for help on using the repository browser.