Context Navigation

dot_orig.cu

main

Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 3.4 KB

Line
1	/*
2	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3	*
4	* NVIDIA Corporation and its licensors retain all intellectual property and
5	* proprietary rights in and to this software and related documentation.
6	* Any use, reproduction, disclosure, or distribution of this software
7	* and related documentation without an express license agreement from
8	* NVIDIA Corporation is strictly prohibited.
9	*
10	* Please refer to the applicable NVIDIA end user license agreement (EULA)
11	* associated with this source code for terms and conditions that govern
12	* your use of this NVIDIA software.
13	*
14	*/
15
16
//#include "../common/book.h"
#include <stdio.h>
#include <stdlib.h>
18
// Minimum of two values. Both arguments are fully parenthesized so that
// operands containing low-precedence operators (e.g. `x|1`) are compared
// as a whole. Each argument may be evaluated twice, so avoid side effects.
#define imin(a,b) ((a)<(b)?(a):(b))

// Number of elements in each input vector.
const int N = 33 * 1024;
// Threads per block; also the size of the kernel's shared-memory cache.
// Must be a power of 2 for the in-kernel tree reduction.
const int threadsPerBlock = 256;
// Blocks launched: enough to cover N with one element per thread,
// capped at 32 (the kernel loops to cover the remaining elements).
const int blocksPerGrid =
    imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
25
/*
 * Computes per-block partial sums of the dot product of a and b.
 *
 * a, b : device arrays of at least N floats (read-only).
 * c    : device array of at least gridDim.x floats; c[blockIdx.x] receives
 *        the partial sum produced by that block.
 *
 * Launch requirements:
 *   - 1D grid and 1D blocks.
 *   - blockDim.x must be a power of 2 (the tree reduction halves the active
 *     range each step) and must not exceed threadsPerBlock, which is the
 *     size of the shared cache.
 *
 * The caller is expected to sum c[0..gridDim.x-1] to obtain the final value.
 */
__global__ void dot( const float *a, const float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    float temp = 0;

    // grid-stride loop: each thread accumulates every
    // (blockDim.x * gridDim.x)-th product starting at its global index
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block so every cache slot is written
    // before any thread starts reading its neighbours' slots
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x/2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // barrier is outside the branch so all threads in the block reach it
        __syncthreads();
        i /= 2;
    }

    // thread 0 of each block publishes the block's partial sum
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
53
54
// The "../common/book.h" include that normally supplies HANDLE_ERROR is
// commented out in this file, so provide a fallback unless something else
// has already defined it. On failure it reports the CUDA error string with
// file/line and terminates the program.
#ifndef HANDLE_ERROR
#define HANDLE_ERROR( call )                                               \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf( stderr, "%s in %s at line %d\n",                      \
                     cudaGetErrorString( err_ ), __FILE__, __LINE__ );     \
            exit( EXIT_FAILURE );                                          \
        }                                                                  \
    } while (0)
#endif

/*
 * Host driver: fills a[i] = i and b[i] = 2*i for i in [0, N), computes the
 * dot product on the GPU via the `dot` kernel (one partial sum per block),
 * finishes the sum on the CPU, and prints it next to the closed-form value
 * 2 * sum_{k=0}^{N-1} k^2 for comparison.
 *
 * Returns 0 on success, 1 if a host allocation fails. CUDA failures are
 * handled by HANDLE_ERROR.
 */
int main( void ) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf( stderr, "host memory allocation failed\n" );
        // free(NULL) is a no-op, so this is safe for whichever ones succeeded
        free( a );
        free( b );
        free( partial_c );
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );
    // a kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError()
    HANDLE_ERROR( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    // (this blocking copy also surfaces any error raised during the kernel)
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // closed form of 0^2 + 1^2 + ... + x^2
    #define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/dot_orig.cu

Download in other formats: