source: CIVL/examples/cuda/dot.cu

main
Last change on this file was 9cabba4, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5867 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 4.2 KB
RevLine 
[3ff27cf]1#ifdef _CIVL
2#include <civlc.cvh>
3#endif
[54d543c]4/*
5 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
6 *
7 * NVIDIA Corporation and its licensors retain all intellectual property and
8 * proprietary rights in and to this software and related documentation.
9 * Any use, reproduction, disclosure, or distribution of this software
10 * and related documentation without an express license agreement from
11 * NVIDIA Corporation is strictly prohibited.
12 *
13 * Please refer to the applicable NVIDIA end user license agreement (EULA)
14 * associated with this source code for terms and conditions that govern
15 * your use of this NVIDIA software.
16 *
17 */
18
19
[0f059298]20//#include "../common/book.h"
[1e59dce]21#include <stdio.h>
22#include <stdlib.h>
23#include <cuda.h>
24
25#define HANDLE_ERROR(x) x
[54d543c]26
27#define imin(a,b) (a<b?a:b)
28
[354d5fa]29#ifdef _CIVL
[e75355b]30_Bool isPowerOfTwo(int x) {
31 if (x == 1) {
32 return $true;
33 } else {
34 return x % 2 == 0 && isPowerOfTwo(x / 2);
35 }
36}
37
38
39// the length of the vectors to dot product
[354d5fa]40$input int N;
41// upper bound on N
42$input int N_B;
[3ff27cf]43$assume((0 <= N && N <= N_B));
[354d5fa]44$input int threadsPerBlock; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
[9cabba4]45$input int threadsPerBlock_B = N;
[3ff27cf]46$assume((1 <= threadsPerBlock && threadsPerBlock <= threadsPerBlock_B));
[354d5fa]47#else
48const int N = 33 * 1024;
49const int threadsPerBlock = 256;
50#endif
[441e680]51const int blocksPerGrid =
[354d5fa]52 imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
[54d543c]53
54__global__ void dot( float *a, float *b, float *c ) {
55 __shared__ float cache[threadsPerBlock];
56 int tid = threadIdx.x + blockIdx.x * blockDim.x;
57 int cacheIndex = threadIdx.x;
58 float temp = 0;
59
60 while (tid < N) {
61 temp += a[tid] * b[tid];
62 tid += blockDim.x * gridDim.x;
63 }
64 // set the cache values
65 cache[cacheIndex] = temp;
66 // synchronize threads in this block
67 __syncthreads();
68 // for reductions, threadsPerBlock must be a power of 2
69 // because of the following code
70 int i = blockDim.x/2;
71 while (i != 0) {
72 if (cacheIndex < i)
73 cache[cacheIndex] += cache[cacheIndex + i];
74 __syncthreads();
75 i /= 2;
76 }
77
78 if (cacheIndex == 0)
79 c[blockIdx.x] = cache[0];
80}
81
82
[a52ef0f]83int main() {
[354d5fa]84#ifdef _CIVL
[3ff27cf]85 $assume((isPowerOfTwo(threadsPerBlock)));
[354d5fa]86#endif
87
[54d543c]88 float *a, *b, c, *partial_c;
89 float *dev_a, *dev_b, *dev_partial_c;
90
91 // allocate memory on the cpu side
92 a = (float*)malloc( N*sizeof(float) );
93 b = (float*)malloc( N*sizeof(float) );
94 partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
95
96 // allocate the memory on the GPU
97 HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
98 N*sizeof(float) ) );
99 HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
100 N*sizeof(float) ) );
101 HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
102 blocksPerGrid*sizeof(float) ) );
103
104 // fill in the host memory with data
105 for (int i=0; i<N; i++) {
106 a[i] = i;
107 b[i] = i*2;
108 }
109
110 // copy the arrays 'a' and 'b' to the GPU
111 HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
112 cudaMemcpyHostToDevice ) );
113 HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
114 cudaMemcpyHostToDevice ) );
115
116 dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
117 dev_partial_c );
118
119 // copy the array 'c' back from the GPU to the CPU
120 HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
121 blocksPerGrid*sizeof(float),
122 cudaMemcpyDeviceToHost ) );
123
124 // finish up on the CPU side
125 c = 0;
126 for (int i=0; i<blocksPerGrid; i++) {
127 c += partial_c[i];
128 }
129
130 #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
131 printf( "Does GPU value %.6g = %.6g?\n", c,
132 2 * sum_squares( (float)(N - 1) ) );
[354d5fa]133#ifdef _CIVL
[d980649]134 $assert(c == 2 * sum_squares( (float)(N - 1) ) );
[354d5fa]135#endif
[54d543c]136
137 // free memory on the gpu side
138 HANDLE_ERROR( cudaFree( dev_a ) );
139 HANDLE_ERROR( cudaFree( dev_b ) );
140 HANDLE_ERROR( cudaFree( dev_partial_c ) );
141
142 // free memory on the cpu side
143 free( a );
144 free( b );
145 free( partial_c );
146}
Note: See TracBrowser for help on using the repository browser.