| 1 | /*
|
|---|
| 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
|---|
| 3 | *
|
|---|
| 4 | * NVIDIA Corporation and its licensors retain all intellectual property and
|
|---|
| 5 | * proprietary rights in and to this software and related documentation.
|
|---|
| 6 | * Any use, reproduction, disclosure, or distribution of this software
|
|---|
| 7 | * and related documentation without an express license agreement from
|
|---|
| 8 | * NVIDIA Corporation is strictly prohibited.
|
|---|
| 9 | *
|
|---|
| 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA)
|
|---|
| 11 | * associated with this source code for terms and conditions that govern
|
|---|
| 12 | * your use of this NVIDIA software.
|
|---|
| 13 | *
|
|---|
| 14 | * This is a version translated to CIVL.
|
|---|
| 15 | *
|
|---|
| 16 | */
|
|---|
| 17 |
|
|---|
| 18 | #include <civlc.cvh>
|
|---|
| 19 | #include "cuda.cvh"
|
|---|
| 20 | #include <stdio.h>
|
|---|
| 21 |
|
|---|
| 22 | #define imin(a,b) (a<b?a:b)
|
|---|
| 23 |
|
|---|
| 24 | _Bool isPowerOfTwo(int x) {
|
|---|
| 25 | if (x == 1) {
|
|---|
| 26 | return $true;
|
|---|
| 27 | } else {
|
|---|
| 28 | return x % 2 == 0 && isPowerOfTwo(x / 2);
|
|---|
| 29 | }
|
|---|
| 30 | }
|
|---|
| 31 |
|
|---|
| 32 |
|
|---|
| 33 | // the length of the vectors to dot product
|
|---|
| 34 | $input int N;
|
|---|
| 35 | // upper bound on N
|
|---|
| 36 | $input int B;
|
|---|
| 37 | $assume(0 <= N && N <= B);
|
|---|
| 38 | $input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
|
|---|
| 39 | $input int THREADS_B;
|
|---|
| 40 | $assume(1 <= THREADS_PER_BLOCK && THREADS_PER_BLOCK <= THREADS_B);
|
|---|
| 41 |
|
|---|
| 42 | const int threadsPerBlock = THREADS_PER_BLOCK;
|
|---|
| 43 | // the number of blocks to create
|
|---|
| 44 | const int blocksPerGrid =
|
|---|
| 45 | imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
|
|---|
| 46 |
|
|---|
| 47 | void _kernel_dot(dim3 gridDim, dim3 blockDim, cudaStream_t s, float *a, float *b, float *c ) {
|
|---|
| 48 |
|
|---|
| 49 | void _kernel (_kernelInstance *this, cudaEvent_t e) {
|
|---|
| 50 |
|
|---|
| 51 | void _block (uint3 blockIdx) {
|
|---|
| 52 | // calculate total number of threads
|
|---|
| 53 | int _numThreads = blockDim.x * blockDim.y * blockDim.z;
|
|---|
| 54 | $gbarrier _block_barrier = $gbarrier_create($here, _numThreads);
|
|---|
| 55 |
|
|---|
| 56 | // shared memory for a block
|
|---|
| 57 | float cache[threadsPerBlock];
|
|---|
| 58 |
|
|---|
| 59 | void _thread (uint3 threadIdx) {
|
|---|
| 60 |
|
|---|
| 61 | // get the one dimensional index of this threads
|
|---|
| 62 | int _tid = _index(blockDim, threadIdx);
|
|---|
| 63 | $barrier _b = $barrier_create($here, _block_barrier, _tid);
|
|---|
| 64 |
|
|---|
| 65 | // get this thread's id
|
|---|
| 66 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
|---|
| 67 | int cacheIndex = threadIdx.x;
|
|---|
| 68 |
|
|---|
| 69 | float temp = 0;
|
|---|
| 70 | int i = blockDim.x/2;
|
|---|
| 71 |
|
|---|
| 72 | while (tid < N) {
|
|---|
| 73 | temp += a[tid] * b[tid];
|
|---|
| 74 | tid += blockDim.x * gridDim.x;
|
|---|
| 75 | }
|
|---|
| 76 |
|
|---|
| 77 | // set the cache values
|
|---|
| 78 | cache[cacheIndex] = temp;
|
|---|
| 79 |
|
|---|
| 80 | // synchronize threads in this block
|
|---|
| 81 |
|
|---|
| 82 | $barrier_call(_b);
|
|---|
| 83 |
|
|---|
| 84 | // for reductions, threadsPerBlock must be a power of 2
|
|---|
| 85 | // because of the following code
|
|---|
| 86 | while (i != 0) {
|
|---|
| 87 | if (cacheIndex < i) {
|
|---|
| 88 | cache[cacheIndex] += cache[cacheIndex + i];
|
|---|
| 89 | }
|
|---|
| 90 | $barrier_call(_b);
|
|---|
| 91 | i /= 2;
|
|---|
| 92 | }
|
|---|
| 93 |
|
|---|
| 94 | if (cacheIndex == 0) {
|
|---|
| 95 | c[blockIdx.x] = cache[0];
|
|---|
| 96 | }
|
|---|
| 97 |
|
|---|
| 98 |
|
|---|
| 99 | $barrier_destroy(_b);
|
|---|
| 100 | }
|
|---|
| 101 |
|
|---|
| 102 | _runProcs(blockDim, _thread);
|
|---|
| 103 | $gbarrier_destroy(_block_barrier);
|
|---|
| 104 | }
|
|---|
| 105 |
|
|---|
| 106 | _waitInQueue(this, e);
|
|---|
| 107 | _runProcs(gridDim, _block);
|
|---|
| 108 | _kernelFinish(this);
|
|---|
| 109 | }
|
|---|
| 110 | _enqueueKernel(s, _kernel);
|
|---|
| 111 | }
|
|---|
| 112 |
|
|---|
| 113 | int main ( void ) {
|
|---|
| 114 |
|
|---|
| 115 | int _main( void ) {
|
|---|
| 116 | float *a, *b, c, *partial_c;
|
|---|
| 117 | float *dev_a, *dev_b, *dev_partial_c;
|
|---|
| 118 |
|
|---|
| 119 | // allocate memory on the cpu side
|
|---|
| 120 | a = (float*)$malloc($root, N*sizeof(float) );
|
|---|
| 121 | b = (float*)$malloc($root, N*sizeof(float) );
|
|---|
| 122 | partial_c = (float*)$malloc($root, blocksPerGrid*sizeof(float) );
|
|---|
| 123 |
|
|---|
| 124 | // allocate the memory on the GPU
|
|---|
| 125 | dev_a = (float *)$malloc($root, N*sizeof(float) );
|
|---|
| 126 | dev_b = (float *)$malloc($root, N*sizeof(float) );
|
|---|
| 127 | dev_partial_c = (float *)$malloc($root, blocksPerGrid*sizeof(float) );
|
|---|
| 128 |
|
|---|
| 129 | // fill in the host memory with data
|
|---|
| 130 | for (int i=0; i<N; i++) {
|
|---|
| 131 | a[i] = i;
|
|---|
| 132 | b[i] = i*2;
|
|---|
| 133 | }
|
|---|
| 134 |
|
|---|
| 135 | // copy the arrays 'a' and 'b' to the GPU
|
|---|
| 136 | cudaMemcpy( dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice );
|
|---|
| 137 | cudaMemcpy( dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice );
|
|---|
| 138 |
|
|---|
| 139 | dim3 _t1, _t2;
|
|---|
| 140 | _t1 = _toDim3(blocksPerGrid);
|
|---|
| 141 | _t2 = _toDim3(threadsPerBlock);
|
|---|
| 142 | _kernel_dot(_t1, _t2, 0, dev_a, dev_b, dev_partial_c );
|
|---|
| 143 | // copy the array 'c' back from the GPU to the CPU
|
|---|
| 144 | cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost );
|
|---|
| 145 |
|
|---|
| 146 | // finish up on the CPU side
|
|---|
| 147 | c = 0;
|
|---|
| 148 | for (int i=0; i<blocksPerGrid; i++) {
|
|---|
| 149 | c += partial_c[i];
|
|---|
| 150 | }
|
|---|
| 151 |
|
|---|
| 152 | #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
|
|---|
| 153 | printf( "Does GPU value %.6g = %.6g?\n", c,
|
|---|
| 154 | 2 * sum_squares( (float)(N - 1) ) );
|
|---|
| 155 | $assert(c == 2 * sum_squares( (float)(N - 1) ) );
|
|---|
| 156 |
|
|---|
| 157 | // free memory on the gpu side
|
|---|
| 158 | cudaFree( dev_a ); // civl versionof cudaFree will check memory correctness, then call regular free
|
|---|
| 159 | cudaFree( dev_b );
|
|---|
| 160 | cudaFree( dev_partial_c );
|
|---|
| 161 |
|
|---|
| 162 | // free memory on the cpu side
|
|---|
| 163 | $free( a );
|
|---|
| 164 | $free( b );
|
|---|
| 165 | $free( partial_c );
|
|---|
| 166 | return 0;
|
|---|
| 167 | }
|
|---|
| 168 |
|
|---|
| 169 | _Bool valid = isPowerOfTwo(THREADS_PER_BLOCK);
|
|---|
| 170 | $assume(valid);
|
|---|
| 171 |
|
|---|
| 172 | _cudaInit();
|
|---|
| 173 | _main();
|
|---|
| 174 | _cudaFinalize();
|
|---|
| 175 | }
|
|---|
| 176 |
|
|---|
| 177 |
|
|---|