Context Navigation

dot.cu@ 139c8d5

1.23 2.0 main test-branch

Last change on this file since 139c8d5 was 441e680, checked in by Andre Marianiello <andre.marianiello@…>, 11 years ago

Minor changes to dot.cu example cuda program.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1908 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.1 KB

Line
1	/*
2	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3	*
4	* NVIDIA Corporation and its licensors retain all intellectual property and
5	* proprietary rights in and to this software and related documentation.
6	* Any use, reproduction, disclosure, or distribution of this software
7	* and related documentation without an express license agreement from
8	* NVIDIA Corporation is strictly prohibited.
9	*
10	* Please refer to the applicable NVIDIA end user license agreement (EULA)
11	* associated with this source code for terms and conditions that govern
12	* your use of this NVIDIA software.
13	*
14	*/
15
16
17	//#include "../common/book.h"
18	#include <stdio.h>
19	#include <stdlib.h>
20	#include <cuda.h>
21
// Pass-through stub: expands to the wrapped call itself and performs no
// error checking on its result.
#define HANDLE_ERROR(x) x

// Minimum of two values. Both arguments are parenthesized so that operands
// containing low-precedence operators (e.g. |, ?:) expand correctly.
// Each argument may be evaluated twice, so avoid side effects in them.
#define imin(a,b) ((a) < (b) ? (a) : (b))
25
/*
 * Returns true iff x is a positive power of two (1, 2, 4, 8, ...).
 *
 * Non-positive values yield false. The explicit x > 0 guard matters for
 * x == 0: 0 is even and 0 / 2 == 0, so without the guard the recursion
 * would never terminate.
 */
_Bool isPowerOfTwo(int x) {
    return x > 0 && (x == 1 || (x % 2 == 0 && isPowerOfTwo(x / 2)));
}
33
34
// Number of elements in each of the two vectors being multiplied.
$input int LENGTH;
// Upper bound on LENGTH.
$input int B;
$assume(0 <= LENGTH && LENGTH <= B);

// Number of threads per block. Must be a power of 2 because of the
// halving reduction loop at the end of dot().
$input int THREADS_PER_BLOCK;
// Upper bound on THREADS_PER_BLOCK.
$input int THREADS_B;
$assume(1 <= THREADS_PER_BLOCK && THREADS_PER_BLOCK <= THREADS_B);

const int N = LENGTH;
const int threadsPerBlock = THREADS_PER_BLOCK;
// ceil(N / threadsPerBlock) blocks, capped at 32.
const int blocksPerGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
50
/*
 * Computes per-block partial sums of the dot product of a and b.
 *
 * a, b: input vectors; elements with index in [0, N) are read.
 * c:    output; the thread with threadIdx.x == 0 in each block stores that
 *       block's partial sum in c[blockIdx.x].
 *
 * Only the x dimension of the grid and block is used. blockDim.x must be a
 * power of 2 (the halving reduction below skips elements otherwise) and
 * must not exceed threadsPerBlock, the size of the shared cache.
 */
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    float temp = 0;

    // each thread accumulates the products at tid, tid + stride, ...
    // where stride is the total number of threads in the grid
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x/2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // outside the if: every thread of the block reaches the barrier
        __syncthreads();
        i /= 2;
    }

    // cache[0] now holds the sum of this block's cache entries
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
78
79
/*
 * Host driver: fills a[i] = i and b[i] = 2*i for i in [0, N), runs the dot
 * kernel to obtain one partial sum per block, adds the partial sums on the
 * host, and asserts that the total equals 2 * (0^2 + 1^2 + ... + (N-1)^2).
 */
int main( void ) {
    // NOTE(review): elaborate() is not defined in this file; presumably a
    // CIVL primitive that makes THREADS_PER_BLOCK concrete -- confirm.
    elaborate(THREADS_PER_BLOCK);
    $assume(isPowerOfTwo(THREADS_PER_BLOCK));
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );

    // copy the per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // closed form for 0^2 + 1^2 + ... + x^2; since a[i]*b[i] == 2*i*i the
    // expected dot product is 2 * sum_squares(N - 1)
    #define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );
    $assert(c == 2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/dot.cu@ 139c8d5

Download in other formats: