| 1 | /*
|
|---|
| 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
|---|
| 3 | *
|
|---|
| 4 | * NVIDIA Corporation and its licensors retain all intellectual property and
|
|---|
| 5 | * proprietary rights in and to this software and related documentation.
|
|---|
| 6 | * Any use, reproduction, disclosure, or distribution of this software
|
|---|
| 7 | * and related documentation without an express license agreement from
|
|---|
| 8 | * NVIDIA Corporation is strictly prohibited.
|
|---|
| 9 | *
|
|---|
| 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA)
|
|---|
| 11 | * associated with this source code for terms and conditions that govern
|
|---|
| 12 | * your use of this NVIDIA software.
|
|---|
| 13 | *
|
|---|
| 14 | */
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 | //#include "../common/book.h"
|
|---|
| 18 |
|
|---|
| 19 | #define imin(a,b) (a<b?a:b)
|
|---|
| 20 |
|
|---|
| 21 | const int N = 33 * 1024;
|
|---|
| 22 | const int threadsPerBlock = 256;
|
|---|
| 23 | const int blocksPerGrid =
|
|---|
| 24 | imin(32, (N+threadsPerBlock-1) / threadsPerBlock );
|
|---|
| 25 |
|
|---|
| 26 | __global__ void dot( float *a, float *b, float *c ) {
|
|---|
| 27 | __shared__ float cache[threadsPerBlock];
|
|---|
| 28 | int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
|---|
| 29 | int cacheIndex = threadIdx.x;
|
|---|
| 30 | float temp = 0;
|
|---|
| 31 |
|
|---|
| 32 | while (tid < N) {
|
|---|
| 33 | temp += a[tid] * b[tid];
|
|---|
| 34 | tid += blockDim.x * gridDim.x;
|
|---|
| 35 | }
|
|---|
| 36 | // set the cache values
|
|---|
| 37 | cache[cacheIndex] = temp;
|
|---|
| 38 | // synchronize threads in this block
|
|---|
| 39 | __syncthreads();
|
|---|
| 40 | // for reductions, threadsPerBlock must be a power of 2
|
|---|
| 41 | // because of the following code
|
|---|
| 42 | int i = blockDim.x/2;
|
|---|
| 43 | while (i != 0) {
|
|---|
| 44 | if (cacheIndex < i)
|
|---|
| 45 | cache[cacheIndex] += cache[cacheIndex + i];
|
|---|
| 46 | __syncthreads();
|
|---|
| 47 | i /= 2;
|
|---|
| 48 | }
|
|---|
| 49 |
|
|---|
| 50 | if (cacheIndex == 0)
|
|---|
| 51 | c[blockIdx.x] = cache[0];
|
|---|
| 52 | }
|
|---|
| 53 |
|
|---|
| 54 |
|
|---|
| 55 | int main( void ) {
|
|---|
| 56 | float *a, *b, c, *partial_c;
|
|---|
| 57 | float *dev_a, *dev_b, *dev_partial_c;
|
|---|
| 58 |
|
|---|
| 59 | // allocate memory on the cpu side
|
|---|
| 60 | a = (float*)malloc( N*sizeof(float) );
|
|---|
| 61 | b = (float*)malloc( N*sizeof(float) );
|
|---|
| 62 | partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
|
|---|
| 63 |
|
|---|
| 64 | // allocate the memory on the GPU
|
|---|
| 65 | HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
|
|---|
| 66 | N*sizeof(float) ) );
|
|---|
| 67 | HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
|
|---|
| 68 | N*sizeof(float) ) );
|
|---|
| 69 | HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
|
|---|
| 70 | blocksPerGrid*sizeof(float) ) );
|
|---|
| 71 |
|
|---|
| 72 | // fill in the host memory with data
|
|---|
| 73 | for (int i=0; i<N; i++) {
|
|---|
| 74 | a[i] = i;
|
|---|
| 75 | b[i] = i*2;
|
|---|
| 76 | }
|
|---|
| 77 |
|
|---|
| 78 | // copy the arrays 'a' and 'b' to the GPU
|
|---|
| 79 | HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
|
|---|
| 80 | cudaMemcpyHostToDevice ) );
|
|---|
| 81 | HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
|
|---|
| 82 | cudaMemcpyHostToDevice ) );
|
|---|
| 83 |
|
|---|
| 84 | dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
|
|---|
| 85 | dev_partial_c );
|
|---|
| 86 |
|
|---|
| 87 | // copy the array 'c' back from the GPU to the CPU
|
|---|
| 88 | HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
|
|---|
| 89 | blocksPerGrid*sizeof(float),
|
|---|
| 90 | cudaMemcpyDeviceToHost ) );
|
|---|
| 91 |
|
|---|
| 92 | // finish up on the CPU side
|
|---|
| 93 | c = 0;
|
|---|
| 94 | for (int i=0; i<blocksPerGrid; i++) {
|
|---|
| 95 | c += partial_c[i];
|
|---|
| 96 | }
|
|---|
| 97 |
|
|---|
| 98 | #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
|
|---|
| 99 | printf( "Does GPU value %.6g = %.6g?\n", c,
|
|---|
| 100 | 2 * sum_squares( (float)(N - 1) ) );
|
|---|
| 101 |
|
|---|
| 102 | // free memory on the gpu side
|
|---|
| 103 | HANDLE_ERROR( cudaFree( dev_a ) );
|
|---|
| 104 | HANDLE_ERROR( cudaFree( dev_b ) );
|
|---|
| 105 | HANDLE_ERROR( cudaFree( dev_partial_c ) );
|
|---|
| 106 |
|
|---|
| 107 | // free memory on the cpu side
|
|---|
| 108 | free( a );
|
|---|
| 109 | free( b );
|
|---|
| 110 | free( partial_c );
|
|---|
| 111 | }
|
|---|