Context Navigation

dot.cu@ f3527dd

1.23 2.0 main test-branch

Last change on this file since f3527dd was 2321281, checked in by Manchun Zheng <zmanchun@…>, 11 years ago

fixed $elaborate for examples.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@2355 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.2 KB

Line
1	#ifdef _CIVL
2	#include <civlc.cvh>
3	#endif
4	/*
5	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
6	*
7	* NVIDIA Corporation and its licensors retain all intellectual property and
8	* proprietary rights in and to this software and related documentation.
9	* Any use, reproduction, disclosure, or distribution of this software
10	* and related documentation without an express license agreement from
11	* NVIDIA Corporation is strictly prohibited.
12	*
13	* Please refer to the applicable NVIDIA end user license agreement (EULA)
14	* associated with this source code for terms and conditions that govern
15	* your use of this NVIDIA software.
16	*
17	*/
18
19
20	//#include "../common/book.h"
21	#include <stdio.h>
22	#include <stdlib.h>
23	#include <cuda.h>
24
25	#define HANDLE_ERROR(x) x
26
// Minimum of two values. Both arguments and the whole expansion are
// parenthesized so that operands containing low-precedence operators
// (e.g. `x & y`, `p ? q : r`) group as intended.
// Note: each argument may be evaluated twice; avoid side effects in arguments.
#define imin(a,b) (((a) < (b)) ? (a) : (b))
28
29	#ifdef _CIVL
/*
 * Returns true iff x is a positive power of two (1, 2, 4, 8, ...).
 *
 * Zero and negative values yield false. The previous recursive form never
 * terminated for x == 0, because 0 is even and 0 / 2 is again 0.
 */
_Bool isPowerOfTwo(int x) {
    // Strip factors of two; the x > 1 guard stops at 1 and also keeps
    // zero and negative inputs out of the loop entirely.
    while (x > 1 && x % 2 == 0) {
        x /= 2;
    }
    // Only an exact power of two reduces all the way down to 1.
    return x == 1;
}
37
38
39	// length of each of the two vectors whose dot product is computed
40	$input int N;
41	// upper bound on N
42	$input int N_B;
43	$assume((0 <= N && N <= N_B)); // N is constrained to 0..N_B inclusive
44	$input int threadsPerBlock; // number of threads per block: must be a power of 2 because of the halving while loop at the end of the dot() kernel
45	$input int threadsPerBlock_B; // upper bound on threadsPerBlock
46	$assume((1 <= threadsPerBlock && threadsPerBlock <= threadsPerBlock_B)); // threadsPerBlock is constrained to 1..threadsPerBlock_B inclusive
47	#else
48	const int N = 33 * 1024; // number of elements in each input vector
49	const int threadsPerBlock = 256; // threads per block; a power of two, as the reduction loop in dot() requires
50	#endif
51	const int blocksPerGrid = // blocks to launch: N / threadsPerBlock rounded up, capped at 32
52	imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
53
/*
 * Computes per-block partial sums of the dot product of a and b.
 *
 * a, b : input vectors; elements 0..N-1 are read.
 * c    : output; c[blockIdx.x] receives the partial sum produced by that
 *        block, so it needs one element per launched block.
 *
 * Launch requirements: only the x dimension of the grid and blocks is used.
 * blockDim.x must not exceed threadsPerBlock (the size of the shared cache)
 * and must be a power of two, otherwise the halving reduction below skips
 * some cache entries.
 */
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    float temp = 0;

    // grid-stride loop: each thread accumulates the products for the
    // indices tid, tid + (total threads in the grid), ... below N
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block so every cache entry is written
    // before any thread starts reading other threads' entries
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x/2;
    while (i != 0) {
        // fold the upper half of the live range onto the lower half
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // barrier is outside the if so that all threads of the block reach it
        __syncthreads();
        i /= 2;
    }

    // thread 0 publishes this block's partial sum
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
81
82
/*
 * Host driver: fills two vectors with a[i] = i and b[i] = 2*i, computes their
 * dot product with the dot() kernel (one partial sum per block), finishes the
 * sum on the CPU, and prints it next to the closed-form value
 * 2 * sum_{i=0}^{N-1} i^2. Under _CIVL the equality is also asserted.
 */
int main( void ) {
#ifdef _CIVL
    // make threadsPerBlock concrete, then restrict it to powers of two,
    // which the reduction loop in dot() requires
    $elaborate(threadsPerBlock);
    $assume((isPowerOfTwo(threadsPerBlock)));
#endif

    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );

    // copy the per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // closed form for 0^2 + 1^2 + ... + x^2; the argument is parenthesized
    // so that any expression can be passed safely
    #define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );
#ifdef _CIVL
    $assert((c == 2 * sum_squares( (float)(N - 1) ) ));
#endif

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/dot.cu@ f3527dd

Download in other formats: