source: CIVL/examples/cuda/dot.cu@ 83af34d

1.23 2.0 main test-branch
Last change on this file since 83af34d was e75355b, checked in by Andre Marianiello <andre.marianiello@…>, 11 years ago

Updated Cuda2CIVLTransformTest with a few new tests that are not yet working correctly.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1868 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 4.3 KB
Line 
1/*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 */
15
16
17//#include "../common/book.h"
18#include <stdio.h>
19#include <stdlib.h>
20#include <cuda.h>
21
22#define HANDLE_ERROR(x) x
23
24#define imin(a,b) (a<b?a:b)
25
26_Bool isPowerOfTwo(int x) {
27 if (x == 1) {
28 return $true;
29 } else {
30 return x % 2 == 0 && isPowerOfTwo(x / 2);
31 }
32}
33
34
35// the length of the vectors to dot product
36$input int LENGTH;
37// upper bound on LENGTH
38$input int B;
39//$assume(0 <= LENGTH && LENGTH <= B);
40$assume(0 < LENGTH && LENGTH <= B);
41$input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
42$input int THREADS_B;
43$assume(1 <= THREADS_PER_BLOCK && THREADS_PER_BLOCK <= THREADS_B);
44
45
46
47const int N = LENGTH;
48const int threadsPerBlock = THREADS_PER_BLOCK;
49//const int blocksPerGrid =
50// imin(32, (N+threadsPerBlock-1) / threadsPerBlock );
51const int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
52
53__global__ void dot( float *a, float *b, float *c ) {
54 __shared__ float cache[threadsPerBlock];
55 int tid = threadIdx.x + blockIdx.x * blockDim.x;
56 int cacheIndex = threadIdx.x;
57 float temp = 0;
58
59 while (tid < N) {
60 temp += a[tid] * b[tid];
61 tid += blockDim.x * gridDim.x;
62 }
63 // set the cache values
64 cache[cacheIndex] = temp;
65 // synchronize threads in this block
66 __syncthreads();
67 // for reductions, threadsPerBlock must be a power of 2
68 // because of the following code
69 int i = blockDim.x/2;
70 while (i != 0) {
71 if (cacheIndex < i)
72 cache[cacheIndex] += cache[cacheIndex + i];
73 __syncthreads();
74 i /= 2;
75 }
76
77 if (cacheIndex == 0)
78 c[blockIdx.x] = cache[0];
79}
80
81
82int main( void ) {
83 elaborate(N);
84 //_Bool validThreadsPerBlock = isPowerOfTwo(THREADS_PER_BLOCK);
85 //$assume(validThreadsPerBlock);
86 //$assume(isPowerOfTwo(THREADS_PER_BLOCK));
87 float *a, *b, c, *partial_c;
88 float *dev_a, *dev_b, *dev_partial_c;
89
90 // allocate memory on the cpu side
91 a = (float*)malloc( N*sizeof(float) );
92 b = (float*)malloc( N*sizeof(float) );
93 partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
94
95 // allocate the memory on the GPU
96 HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
97 N*sizeof(float) ) );
98 HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
99 N*sizeof(float) ) );
100 HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
101 blocksPerGrid*sizeof(float) ) );
102
103 // fill in the host memory with data
104 for (int i=0; i<N; i++) {
105 a[i] = i;
106 b[i] = i*2;
107 }
108
109 // copy the arrays 'a' and 'b' to the GPU
110 HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
111 cudaMemcpyHostToDevice ) );
112 HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
113 cudaMemcpyHostToDevice ) );
114
115 dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
116 dev_partial_c );
117
118 // copy the array 'c' back from the GPU to the CPU
119 HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
120 blocksPerGrid*sizeof(float),
121 cudaMemcpyDeviceToHost ) );
122
123 // finish up on the CPU side
124 c = 0;
125 for (int i=0; i<blocksPerGrid; i++) {
126 c += partial_c[i];
127 }
128
129 #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
130 printf( "Does GPU value %.6g = %.6g?\n", c,
131 2 * sum_squares( (float)(N - 1) ) );
132 $assert(c == 2 * sum_squares( (float)(N - 1) ) );
133
134 // free memory on the gpu side
135 HANDLE_ERROR( cudaFree( dev_a ) );
136 HANDLE_ERROR( cudaFree( dev_b ) );
137 HANDLE_ERROR( cudaFree( dev_partial_c ) );
138
139 // free memory on the cpu side
140 free( a );
141 free( b );
142 free( partial_c );
143}
Note: See TracBrowser for help on using the repository browser.