Context Navigation

dot.cu@ 83af34d

1.23 2.0 main test-branch

Last change on this file since 83af34d was e75355b, checked in by Andre Marianiello <andre.marianiello@…>, 11 years ago

Updated Cuda2CIVLTransformTest with a few new tests that are not yet working correctly.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1868 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.3 KB

Line
1	/*
2	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3	*
4	* NVIDIA Corporation and its licensors retain all intellectual property and
5	* proprietary rights in and to this software and related documentation.
6	* Any use, reproduction, disclosure, or distribution of this software
7	* and related documentation without an express license agreement from
8	* NVIDIA Corporation is strictly prohibited.
9	*
10	* Please refer to the applicable NVIDIA end user license agreement (EULA)
11	* associated with this source code for terms and conditions that govern
12	* your use of this NVIDIA software.
13	*
14	*/
15
16
17	//#include "../common/book.h"
18	#include <stdio.h>
19	#include <stdlib.h>
20	#include <cuda.h>
21
22	#define HANDLE_ERROR(x) x
23
24	#define imin(a,b) (a<b?a:b)
25
26	_Bool isPowerOfTwo(int x) {
27	if (x == 1) {
28	return $true;
29	} else {
30	return x % 2 == 0 && isPowerOfTwo(x / 2);
31	}
32	}
33
34
35	// the length of the vectors to dot product
36	$input int LENGTH;
37	// upper bound on LENGTH
38	$input int B;
39	//$assume(0 <= LENGTH && LENGTH <= B);
40	$assume(0 < LENGTH && LENGTH <= B);
41	$input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
42	$input int THREADS_B;
43	$assume(1 <= THREADS_PER_BLOCK && THREADS_PER_BLOCK <= THREADS_B);
44
45
46
47	const int N = LENGTH;
48	const int threadsPerBlock = THREADS_PER_BLOCK;
49	//const int blocksPerGrid =
50	// imin(32, (N+threadsPerBlock-1) / threadsPerBlock );
51	const int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
52
53	__global__ void dot( float a, float b, float *c ) {
54	__shared__ float cache[threadsPerBlock];
55	int tid = threadIdx.x + blockIdx.x * blockDim.x;
56	int cacheIndex = threadIdx.x;
57	float temp = 0;
58
59	while (tid < N) {
60	temp += a[tid] * b[tid];
61	tid += blockDim.x * gridDim.x;
62	}
63	// set the cache values
64	cache[cacheIndex] = temp;
65	// synchronize threads in this block
66	__syncthreads();
67	// for reductions, threadsPerBlock must be a power of 2
68	// because of the following code
69	int i = blockDim.x/2;
70	while (i != 0) {
71	if (cacheIndex < i)
72	cache[cacheIndex] += cache[cacheIndex + i];
73	__syncthreads();
74	i /= 2;
75	}
76
77	if (cacheIndex == 0)
78	c[blockIdx.x] = cache[0];
79	}
80
81
82	int main( void ) {
83	elaborate(N);
84	//_Bool validThreadsPerBlock = isPowerOfTwo(THREADS_PER_BLOCK);
85	//$assume(validThreadsPerBlock);
86	//$assume(isPowerOfTwo(THREADS_PER_BLOCK));
87	float a, b, c, *partial_c;
88	float dev_a, dev_b, *dev_partial_c;
89
90	// allocate memory on the cpu side
91	a = (float)malloc( Nsizeof(float) );
92	b = (float)malloc( Nsizeof(float) );
93	partial_c = (float)malloc( blocksPerGridsizeof(float) );
94
95	// allocate the memory on the GPU
96	HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
97	N*sizeof(float) ) );
98	HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
99	N*sizeof(float) ) );
100	HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
101	blocksPerGrid*sizeof(float) ) );
102
103	// fill in the host memory with data
104	for (int i=0; i<N; i++) {
105	a[i] = i;
106	b[i] = i*2;
107	}
108
109	// copy the arrays 'a' and 'b' to the GPU
110	HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
111	cudaMemcpyHostToDevice ) );
112	HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
113	cudaMemcpyHostToDevice ) );
114
115	dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
116	dev_partial_c );
117
118	// copy the array 'c' back from the GPU to the CPU
119	HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
120	blocksPerGrid*sizeof(float),
121	cudaMemcpyDeviceToHost ) );
122
123	// finish up on the CPU side
124	c = 0;
125	for (int i=0; i<blocksPerGrid; i++) {
126	c += partial_c[i];
127	}
128
129	#define sum_squares(x) (x(x+1)(2*x+1)/6)
130	printf( "Does GPU value %.6g = %.6g?\n", c,
131	2 * sum_squares( (float)(N - 1) ) );
132	$assert(c == 2 * sum_squares( (float)(N - 1) ) );
133
134	// free memory on the gpu side
135	HANDLE_ERROR( cudaFree( dev_a ) );
136	HANDLE_ERROR( cudaFree( dev_b ) );
137	HANDLE_ERROR( cudaFree( dev_partial_c ) );
138
139	// free memory on the cpu side
140	free( a );
141	free( b );
142	free( partial_c );
143	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/dot.cu@ 83af34d

Download in other formats: