source: CIVL/examples/translation/cuda/dot2.cvl@ d87ec9c

1.23 2.0 main test-branch
Last change on this file since d87ec9c was fc22fbf, checked in by Andre Marianiello <andre.marianiello@…>, 12 years ago

Fixed the ifdefs that prevent multiple header inclusion in cuda.cvh and cuda-helper.cvh

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1281 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 4.6 KB
Line 
1/*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 * This is a version translated to CIVL.
15 *
16 */
17
18#include "civlc.h"
19#include "cuda.cvh"
20#include "stdio.h"
21
22#define imin(a,b) (a<b?a:b)
23
24// the length of the vectors to dot product
25$input int N;
26// upper bound on N
27$input int B;
28$assume(0 <= N && N <= B);
29$input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
30const int threadsPerBlock = THREADS_PER_BLOCK;
31// the number of blocks to create
32const int blocksPerGrid =
33 imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
34
35void _kernel_dot(dim3 gridDim, dim3 blockDim, cudaStream_t s, float *a, float *b, float *c ) {
36
37 void _kernel (_kernelInstance *this, cudaEvent_t e) {
38
39 void _block (uint3 blockIdx) {
40 // calculate total number of threads
41 int _numThreads = blockDim.x * blockDim.y * blockDim.z;
42 $gbarrier _block_barrier = $gbarrier_create($here, _numThreads);
43
44 // shared memory for a block
45 float cache[threadsPerBlock];
46
47 void _thread (uint3 threadIdx) {
48
49 // get the one dimensional index of this threads
50 int _tid = _index(blockDim, threadIdx);
51 $barrier _b = $barrier_create($here, _block_barrier, _tid);
52
53 // get this thread's id
54 int tid = threadIdx.x + blockIdx.x * blockDim.x;
55 int cacheIndex = threadIdx.x;
56
57 float temp = 0;
58 int i = blockDim.x/2;
59
60 while (tid < N) {
61 temp += a[tid] * b[tid];
62 tid += blockDim.x * gridDim.x;
63 }
64
65 // set the cache values
66 cache[cacheIndex] = temp;
67
68 // synchronize threads in this block
69
70 $barrier_call(_b);
71
72 // for reductions, threadsPerBlock must be a power of 2
73 // because of the following code
74 while (i != 0) {
75 if (cacheIndex < i) {
76 cache[cacheIndex] += cache[cacheIndex + i];
77 }
78 $barrier_call(_b);
79 i /= 2;
80 }
81
82 if (cacheIndex == 0) {
83 c[blockIdx.x] = cache[0];
84 }
85
86
87 $barrier_destroy(_b);
88 }
89
90 _runProcs(blockDim, _thread);
91 $gbarrier_destroy(_block_barrier);
92 }
93
94 _waitInQueue(this, e);
95 _runProcs(gridDim, _block);
96 _kernelFinish(this);
97 }
98 _enqueueKernel(s, _kernel);
99}
100
101int main ( void ) {
102
103 int _main( void ) {
104 float *a, *b, c, *partial_c;
105 float *dev_a, *dev_b, *dev_partial_c;
106
107 // allocate memory on the cpu side
108 a = (float*)$malloc($root, N*sizeof(float) );
109 b = (float*)$malloc($root, N*sizeof(float) );
110 partial_c = (float*)$malloc($root, blocksPerGrid*sizeof(float) );
111
112 // allocate the memory on the GPU
113 dev_a = (float *)$malloc($root, N*sizeof(float) );
114 dev_b = (float *)$malloc($root, N*sizeof(float) );
115 dev_partial_c = (float *)$malloc($root, blocksPerGrid*sizeof(float) );
116
117 // fill in the host memory with data
118 for (int i=0; i<N; i++) {
119 a[i] = i;
120 b[i] = i*2;
121 }
122
123 // copy the arrays 'a' and 'b' to the GPU
124 cudaMemcpy( dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice );
125 cudaMemcpy( dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice );
126
127 dim3 _t1, _t2;
128 _t1 = _toDim3(blocksPerGrid);
129 _t2 = _toDim3(threadsPerBlock);
130 _kernel_dot(_t1, _t2, 0, dev_a, dev_b, dev_partial_c );
131// copy the array 'c' back from the GPU to the CPU
132 cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost );
133
134 // finish up on the CPU side
135 c = 0;
136 for (int i=0; i<blocksPerGrid; i++) {
137 c += partial_c[i];
138 }
139
140 #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
141 printf( "Does GPU value %.6g = %.6g?\n", c,
142 2 * sum_squares( (float)(N - 1) ) );
143 $assert(c == 2 * sum_squares( (float)(N - 1) ) );
144
145 // free memory on the gpu side
146 cudaFree( dev_a ); // civl versionof cudaFree will check memory correctness, then call regular free
147 cudaFree( dev_b );
148 cudaFree( dev_partial_c );
149
150 // free memory on the cpu side
151 $free( a );
152 $free( b );
153 $free( partial_c );
154 return 0;
155 }
156
157 _cudaInit();
158 _main();
159 _cudaFinalize();
160}
161
162
Note: See TracBrowser for help on using the repository browser.