source: CIVL/examples/translation/cuda/dot2.cvl@ 764d472

1.23 2.0 acw/focus-triggers main test-branch
Last change on this file since 764d472 was 9dbe6120, checked in by Andre Marianiello <andre.marianiello@…>, 12 years ago

Initial commit of Cuda translation header for CIVL and a non-working translation of dot.cu

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1186 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 4.1 KB
Line 
1/*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 * This is a version translated to CIVL.
15 *
16 */
17
18#include "civlc.h"
19#include "civlc-cuda.cvl"
20#include "stdio.h"
21
22#define imin(a,b) (a<b?a:b)
23
24$input int N;
25$input int B;
26$assume(0 <= N && N <= B);
27const int threadsPerBlock = 2;
28const int blocksPerGrid =
29 imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
30
31void _kernel_dot(dim3 gridDim, dim3 blockDim, cudaStream_t s, float *a, float *b, float *c ) {
32
33 void _kernel ($proc prevProc) {
34
35 _tryWait(prevProc);
36
37 void _block (uint3 blockIdx) {
38 int _numThreads = blockDim.x * blockDim.y * blockDim.z;
39 $gbarrier _block_barrier = $gbarrier_create($here, _numThreads);
40
41 float cache[threadsPerBlock];
42
43 void _thread (uint3 threadIdx) {
44
45 int _tid = _index(blockDim, threadIdx);
46 $barrier _b = $barrier_create($here, _block_barrier, _tid);
47
48 int tid = threadIdx.x + blockIdx.x * blockDim.x;
49 int cacheIndex = threadIdx.x;
50
51 float temp = 0;
52 while (tid < N) {
53 temp += a[tid] * b[tid];
54 tid += blockDim.x * gridDim.x;
55 }
56
57 // set the cache values
58 cache[cacheIndex] = temp;
59
60 // synchronize threads in this block
61
62 $barrier_call(_b);
63
64 // for reductions, threadsPerBlock must be a power of 2
65 // because of the following code
66 int i = blockDim.x/2;
67 while (i != 0) {
68 if (cacheIndex < i) {
69 cache[cacheIndex] += cache[cacheIndex + i];
70 }
71 //$barrier_call(_b);
72 i /= 2;
73 }
74
75 if (cacheIndex == 0)
76 c[blockIdx.x] = cache[0];
77
78 $barrier_destroy(_b);
79 }
80 _createProcs(blockDim, _thread);
81 $gbarrier_destroy(_block_barrier);
82 }
83 _createProcs(gridDim, _block);
84 }
85 _initKernel(s, _kernel);
86}
87
88int main ( void ) {
89 _host_scope = $here;
90
91 int _main( void ) {
92 float *a, *b, c, *partial_c;
93 float *dev_a, *dev_b, *dev_partial_c;
94
95 // allocate memory on the cpu side
96 a = (float*)$malloc(_host_scope, N*sizeof(float) );
97 b = (float*)$malloc(_host_scope, N*sizeof(float) );
98 partial_c = (float*)$malloc(_host_scope, blocksPerGrid*sizeof(float) );
99
100 // allocate the memory on the GPU
101 dev_a = (float *)$malloc( _global_scope, N*sizeof(float) );
102 dev_b = (float *)$malloc( _global_scope, N*sizeof(float) );
103 dev_partial_c = (float *)$malloc( _global_scope, blocksPerGrid*sizeof(float) );
104
105 // fill in the host memory with data
106 for (int i=0; i<N; i++) {
107 a[i] = i;
108 b[i] = i*2;
109 }
110
111 // copy the arrays 'a' and 'b' to the GPU
112 cudaMemcpy( dev_a, a, N*sizeof(float),
113 cudaMemcpyHostToDevice ); // civl version of cudaMemcpy will check memory correctness, then call regular memcpy
114 cudaMemcpy( dev_b, b, N*sizeof(float),
115 cudaMemcpyHostToDevice );
116
117 dim3 _t1, _t2;
118 _t1 = _toDim3(blocksPerGrid);
119 _t2 = _toDim3(threadsPerBlock);
120 _kernel_dot(_t1, _t2, dev_a, dev_b,
121 dev_partial_c );
122
123 // copy the array 'c' back from the GPU to the CPU
124 cudaMemcpy( partial_c, dev_partial_c,
125 blocksPerGrid*sizeof(float),
126 cudaMemcpyDeviceToHost );
127
128 // finish up on the CPU side
129 c = 0;
130 for (int i=0; i<blocksPerGrid; i++) {
131 c += partial_c[i];
132 }
133
134 #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
135 printf( "Does GPU value %.6g = %.6g?\n", c,
136 2 * sum_squares( (float)(N - 1) ) );
137 $assert(c == 2 * sum_squares( (float)(N - 1) ) );
138
139 // free memory on the gpu side
140 cudaFree( dev_a ); // civl versionof cudaFree will check memory correctness, then call regular free
141 cudaFree( dev_b );
142 cudaFree( dev_partial_c );
143
144 // free memory on the cpu side
145 $free( a );
146 $free( b );
147 $free( partial_c );
148 return 0;
149 }
150
151 _main();
152}
153
154
Note: See TracBrowser for help on using the repository browser.