Context Navigation

dot.cu

main

Last change on this file was 9cabba4, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5867 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.2 KB

Line
1	#ifdef _CIVL
2	#include <civlc.cvh>
3	#endif
4	/*
5	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
6	*
7	* NVIDIA Corporation and its licensors retain all intellectual property and
8	* proprietary rights in and to this software and related documentation.
9	* Any use, reproduction, disclosure, or distribution of this software
10	* and related documentation without an express license agreement from
11	* NVIDIA Corporation is strictly prohibited.
12	*
13	* Please refer to the applicable NVIDIA end user license agreement (EULA)
14	* associated with this source code for terms and conditions that govern
15	* your use of this NVIDIA software.
16	*
17	*/
18
19
20	//#include "../common/book.h"
21	#include <stdio.h>
22	#include <stdlib.h>
23	#include <cuda.h>
24
25	#define HANDLE_ERROR(x) x
26
27	#define imin(a,b) (a<b?a:b)
28
29	#ifdef _CIVL
30	_Bool isPowerOfTwo(int x) {
31	if (x == 1) {
32	return $true;
33	} else {
34	return x % 2 == 0 && isPowerOfTwo(x / 2);
35	}
36	}
37
38
39	// the length of the vectors to dot product
40	$input int N;
41	// upper bound on N
42	$input int N_B;
43	$assume((0 <= N && N <= N_B));
44	$input int threadsPerBlock; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
45	$input int threadsPerBlock_B = N;
46	$assume((1 <= threadsPerBlock && threadsPerBlock <= threadsPerBlock_B));
47	#else
48	const int N = 33 * 1024;
49	const int threadsPerBlock = 256;
50	#endif
51	const int blocksPerGrid =
52	imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
53
54	__global__ void dot( float a, float b, float *c ) {
55	__shared__ float cache[threadsPerBlock];
56	int tid = threadIdx.x + blockIdx.x * blockDim.x;
57	int cacheIndex = threadIdx.x;
58	float temp = 0;
59
60	while (tid < N) {
61	temp += a[tid] * b[tid];
62	tid += blockDim.x * gridDim.x;
63	}
64	// set the cache values
65	cache[cacheIndex] = temp;
66	// synchronize threads in this block
67	__syncthreads();
68	// for reductions, threadsPerBlock must be a power of 2
69	// because of the following code
70	int i = blockDim.x/2;
71	while (i != 0) {
72	if (cacheIndex < i)
73	cache[cacheIndex] += cache[cacheIndex + i];
74	__syncthreads();
75	i /= 2;
76	}
77
78	if (cacheIndex == 0)
79	c[blockIdx.x] = cache[0];
80	}
81
82
83	int main() {
84	#ifdef _CIVL
85	$assume((isPowerOfTwo(threadsPerBlock)));
86	#endif
87
88	float a, b, c, *partial_c;
89	float dev_a, dev_b, *dev_partial_c;
90
91	// allocate memory on the cpu side
92	a = (float)malloc( Nsizeof(float) );
93	b = (float)malloc( Nsizeof(float) );
94	partial_c = (float)malloc( blocksPerGridsizeof(float) );
95
96	// allocate the memory on the GPU
97	HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
98	N*sizeof(float) ) );
99	HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
100	N*sizeof(float) ) );
101	HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
102	blocksPerGrid*sizeof(float) ) );
103
104	// fill in the host memory with data
105	for (int i=0; i<N; i++) {
106	a[i] = i;
107	b[i] = i*2;
108	}
109
110	// copy the arrays 'a' and 'b' to the GPU
111	HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
112	cudaMemcpyHostToDevice ) );
113	HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
114	cudaMemcpyHostToDevice ) );
115
116	dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
117	dev_partial_c );
118
119	// copy the array 'c' back from the GPU to the CPU
120	HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
121	blocksPerGrid*sizeof(float),
122	cudaMemcpyDeviceToHost ) );
123
124	// finish up on the CPU side
125	c = 0;
126	for (int i=0; i<blocksPerGrid; i++) {
127	c += partial_c[i];
128	}
129
130	#define sum_squares(x) (x(x+1)(2*x+1)/6)
131	printf( "Does GPU value %.6g = %.6g?\n", c,
132	2 * sum_squares( (float)(N - 1) ) );
133	#ifdef _CIVL
134	$assert(c == 2 * sum_squares( (float)(N - 1) ) );
135	#endif
136
137	// free memory on the gpu side
138	HANDLE_ERROR( cudaFree( dev_a ) );
139	HANDLE_ERROR( cudaFree( dev_b ) );
140	HANDLE_ERROR( cudaFree( dev_partial_c ) );
141
142	// free memory on the cpu side
143	free( a );
144	free( b );
145	free( partial_c );
146	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/dot.cu

Download in other formats: