Context Navigation

dot.cu@ e35f210

1.23 2.0 main test-branch

Last change on this file since e35f210 was 354d5fa, checked in by Andre Marianiello <andre.marianiello@…>, 11 years ago

Updated cuda examples with #ifdef _CIVL in appropriate places.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1981 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.1 KB

Line
1	/*
2	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3	*
4	* NVIDIA Corporation and its licensors retain all intellectual property and
5	* proprietary rights in and to this software and related documentation.
6	* Any use, reproduction, disclosure, or distribution of this software
7	* and related documentation without an express license agreement from
8	* NVIDIA Corporation is strictly prohibited.
9	*
10	* Please refer to the applicable NVIDIA end user license agreement (EULA)
11	* associated with this source code for terms and conditions that govern
12	* your use of this NVIDIA software.
13	*
14	*/
15
16
17	//#include "../common/book.h"
18	#include <stdio.h>
19	#include <stdlib.h>
20	#include <cuda.h>
21
/*
 * HANDLE_ERROR(x): wraps a CUDA runtime call x, evaluating it exactly once.
 *
 * Under _CIVL the call is passed through unchanged. In a regular build
 * the returned status is compared against cudaSuccess; on failure the CUDA
 * error string plus the file and line of the call site are printed to stderr
 * and the program exits with EXIT_FAILURE, so a failed allocation or copy
 * cannot silently corrupt the result.
 */
#ifdef _CIVL
#define HANDLE_ERROR(x) (x)
#else
#define HANDLE_ERROR(x)                                                   \
    do {                                                                  \
        cudaError_t handle_error_status_ = (x);                           \
        if (handle_error_status_ != cudaSuccess) {                        \
            fprintf( stderr, "%s in %s at line %d\n",                     \
                     cudaGetErrorString( handle_error_status_ ),          \
                     __FILE__, __LINE__ );                                \
            exit( EXIT_FAILURE );                                         \
        }                                                                 \
    } while (0)
#endif

/*
 * imin(a,b): the smaller of a and b.
 *
 * Every use of an argument is parenthesised so that operands containing
 * low-precedence operators (|, &, ?:, ...) group as the caller wrote them.
 * Being a macro, the selected argument is still evaluated twice: do not pass
 * expressions with side effects.
 */
#define imin(a,b) ((a) < (b) ? (a) : (b))
25
26	#ifdef _CIVL
/*
 * Returns true exactly when x is a positive power of two (1, 2, 4, 8, ...).
 *
 * Zero and negative values yield false. The positive-value guard matters:
 * 0 is even and 0 / 2 == 0, so halving without it would never terminate.
 */
_Bool isPowerOfTwo(int x) {
  if (x > 0) {
    /* strip every factor of two; only a power of two collapses to 1 */
    while (x % 2 == 0) {
      x /= 2;
    }
  }
  return x == 1;
}
34
35
36	// number of elements in each of the two vectors whose dot product is computed
37	$input int N;
38	// upper bound on N
39	$input int N_B;
40	$assume(0 <= N && N <= N_B);
41	$input int threadsPerBlock; // threads per block: must be a power of 2, because the reduction loop at the end of the dot() kernel halves its stride on every pass
42	$input int threadsPerBlock_B; // upper bound on threadsPerBlock
43	$assume(1 <= threadsPerBlock && threadsPerBlock <= threadsPerBlock_B);
44	#else
45	const int N = 33 * 1024;
46	const int threadsPerBlock = 256;
47	#endif
48	const int blocksPerGrid = // ceil(N / threadsPerBlock), capped at 32 blocks
49	imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
50
/*
 * dot: each block writes one partial sum of the element-wise products
 * a[i] * b[i], for i in [0, N), into c[blockIdx.x].
 *
 *   a, b : input vectors, read at indices below N
 *   c    : output array, one float per block
 *
 * Launch requirements: 1-D grid and 1-D block. blockDim.x must be a power
 * of two (the halving reduction below drops elements otherwise) and must not
 * exceed threadsPerBlock, the size of the shared cache.
 */
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    float temp = 0;

    // each thread accumulates the products at tid, tid + (threads in the
    // grid), ... so the whole vector is covered whatever the grid size
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block: every slot must be written before
    // any thread starts reading its neighbour's slot
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x/2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // the barrier sits outside the if so that all threads reach it
        __syncthreads();
        i /= 2;
    }

    // thread 0 of the block publishes the block's total
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
78
79
/*
 * Host driver: fills a[i] = i and b[i] = 2*i for i in [0, N), launches the
 * dot kernel to obtain blocksPerGrid partial sums, adds them up on the CPU
 * and prints the result next to the closed-form value.
 *
 * Since a[i] * b[i] = 2*i*i, the expected dot product is
 * 2 * (0^2 + 1^2 + ... + (N-1)^2) = 2 * sum_squares(N - 1).
 */
int main( void ) {
#ifdef _CIVL
    elaborate(threadsPerBlock);
    // the kernel's halving reduction is only correct for power-of-two blocks
    $assume(isPowerOfTwo(threadsPerBlock));
#endif

    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );

    // copy the array of per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // closed form of 0^2 + 1^2 + ... + x^2
#define sum_squares(x)  ((x)*((x)+1)*(2*(x)+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );
#ifdef _CIVL
    $assert(c == 2 * sum_squares( (float)(N - 1) ) );
#endif

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );

    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/dot.cu@ e35f210

Download in other formats: