source: CIVL/examples/cuda/dot_orig.cu

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 3.4 KB
RevLine 
/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */

//#include "../common/book.h"

// Minimum of two values.  Arguments and body are fully parenthesized for
// macro hygiene, so expressions such as imin(x+1, y) expand correctly.
#define imin(a,b) ((a)<(b)?(a):(b))

const int N = 33 * 1024;          // total number of vector elements
const int threadsPerBlock = 256;  // must be a power of 2: the reduction in
                                  // dot() halves the active range each pass
// Enough blocks to cover N elements, capped at 32; the kernel's grid-stride
// loop picks up any elements beyond blocksPerGrid*threadsPerBlock.
const int blocksPerGrid =
            imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
/*
 * dot: per-block partial dot product of a and b.
 *
 * Each thread accumulates a private partial sum over a grid-stride loop,
 * the block then reduces those sums pairwise in shared memory, and thread 0
 * writes the block's result to c[blockIdx.x].  The host sums the per-block
 * results to finish the reduction.
 *
 * Launch expectations: blockDim.x == threadsPerBlock (shared-memory size),
 * and threadsPerBlock must be a power of 2 — required by the tree
 * reduction below.
 */
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int lane = threadIdx.x;
    float sum = 0.0f;

    // grid-stride loop: covers all of [0, N) regardless of grid size
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
         idx < N;
         idx += blockDim.x * gridDim.x) {
        sum += a[idx] * b[idx];
    }

    // publish this thread's partial sum, then wait for the whole block
    cache[lane] = sum;
    __syncthreads();

    // pairwise tree reduction in shared memory (power-of-2 blockDim only)
    for (int half = blockDim.x / 2; half > 0; half /= 2) {
        if (lane < half)
            cache[lane] += cache[lane + half];
        __syncthreads();   // outside the divergent if: every thread reaches it
    }

    // one result per block
    if (lane == 0)
        c[blockIdx.x] = cache[0];
}
53
54
/*
 * Host driver: computes the dot product of two N-element vectors
 * (a[i] = i, b[i] = 2i) on the GPU via per-block partial sums, finishes
 * the reduction on the CPU, and prints the result next to the closed-form
 * expected value 2 * sum_{i=0}^{N-1} i^2.
 *
 * Returns 0 on success, 1 if host memory allocation fails; CUDA errors
 * are handled by the HANDLE_ERROR macro (from the book's common header).
 */
int main( void ) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side; malloc can fail, so check each result
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf( stderr, "host memory allocation failed\n" );
        free( a );
        free( b );
        free( partial_c );
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );
    // kernel launches are asynchronous and return no status; surface
    // launch-configuration errors explicitly before continuing
    HANDLE_ERROR( cudaGetLastError() );

    // copy the partial sums back from the GPU to the CPU
    // (a blocking cudaMemcpy also synchronizes with the kernel)
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side: sum the per-block partial results
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // closed form: sum i*2i = 2 * n(n+1)(2n+1)/6 with n = N-1
    // (arguments parenthesized for macro hygiene)
    #define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );
    return 0;
}
Note: See TracBrowser for help on using the repository browser.