Context Navigation

dot2.cvl@ d87ec9c

1.23 2.0 main test-branch

Last change on this file since d87ec9c was fc22fbf, checked in by Andre Marianiello <andre.marianiello@…>, 12 years ago

Fixed the ifdefs that prevent multiple header inclusion in cuda.cvh and cuda-helper.cvh

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1281 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.6 KB

Line
1	/*
2	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3	*
4	* NVIDIA Corporation and its licensors retain all intellectual property and
5	* proprietary rights in and to this software and related documentation.
6	* Any use, reproduction, disclosure, or distribution of this software
7	* and related documentation without an express license agreement from
8	* NVIDIA Corporation is strictly prohibited.
9	*
10	* Please refer to the applicable NVIDIA end user license agreement (EULA)
11	* associated with this source code for terms and conditions that govern
12	* your use of this NVIDIA software.
13	*
14	* This is a version translated to CIVL.
15	*
16	*/
17
18	#include "civlc.h"
19	#include "cuda.cvh"
20	#include "stdio.h"
21
22	#define imin(a,b) (a<b?a:b)
23
24	// the length of the vectors to dot product
25	$input int N;
26	// upper bound on N
27	$input int B;
28	$assume(0 <= N && N <= B);
29	$input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
30	const int threadsPerBlock = THREADS_PER_BLOCK;
31	// the number of blocks to create
32	const int blocksPerGrid =
33	imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
34
35	void _kernel_dot(dim3 gridDim, dim3 blockDim, cudaStream_t s, float a, float b, float *c ) {
36
37	void _kernel (_kernelInstance *this, cudaEvent_t e) {
38
39	void _block (uint3 blockIdx) {
40	// calculate total number of threads
41	int _numThreads = blockDim.x * blockDim.y * blockDim.z;
42	$gbarrier _block_barrier = $gbarrier_create($here, _numThreads);
43
44	// shared memory for a block
45	float cache[threadsPerBlock];
46
47	void _thread (uint3 threadIdx) {
48
49	// get the one dimensional index of this threads
50	int _tid = _index(blockDim, threadIdx);
51	$barrier _b = $barrier_create($here, _block_barrier, _tid);
52
53	// get this thread's id
54	int tid = threadIdx.x + blockIdx.x * blockDim.x;
55	int cacheIndex = threadIdx.x;
56
57	float temp = 0;
58	int i = blockDim.x/2;
59
60	while (tid < N) {
61	temp += a[tid] * b[tid];
62	tid += blockDim.x * gridDim.x;
63	}
64
65	// set the cache values
66	cache[cacheIndex] = temp;
67
68	// synchronize threads in this block
69
70	$barrier_call(_b);
71
72	// for reductions, threadsPerBlock must be a power of 2
73	// because of the following code
74	while (i != 0) {
75	if (cacheIndex < i) {
76	cache[cacheIndex] += cache[cacheIndex + i];
77	}
78	$barrier_call(_b);
79	i /= 2;
80	}
81
82	if (cacheIndex == 0) {
83	c[blockIdx.x] = cache[0];
84	}
85
86
87	$barrier_destroy(_b);
88	}
89
90	_runProcs(blockDim, _thread);
91	$gbarrier_destroy(_block_barrier);
92	}
93
94	_waitInQueue(this, e);
95	_runProcs(gridDim, _block);
96	_kernelFinish(this);
97	}
98	_enqueueKernel(s, _kernel);
99	}
100
101	int main ( void ) {
102
103	int _main( void ) {
104	float a, b, c, *partial_c;
105	float dev_a, dev_b, *dev_partial_c;
106
107	// allocate memory on the cpu side
108	a = (float)$malloc($root, Nsizeof(float) );
109	b = (float)$malloc($root, Nsizeof(float) );
110	partial_c = (float)$malloc($root, blocksPerGridsizeof(float) );
111
112	// allocate the memory on the GPU
113	dev_a = (float )$malloc($root, Nsizeof(float) );
114	dev_b = (float )$malloc($root, Nsizeof(float) );
115	dev_partial_c = (float )$malloc($root, blocksPerGridsizeof(float) );
116
117	// fill in the host memory with data
118	for (int i=0; i<N; i++) {
119	a[i] = i;
120	b[i] = i*2;
121	}
122
123	// copy the arrays 'a' and 'b' to the GPU
124	cudaMemcpy( dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice );
125	cudaMemcpy( dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice );
126
127	dim3 _t1, _t2;
128	_t1 = _toDim3(blocksPerGrid);
129	_t2 = _toDim3(threadsPerBlock);
130	_kernel_dot(_t1, _t2, 0, dev_a, dev_b, dev_partial_c );
131	// copy the array 'c' back from the GPU to the CPU
132	cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost );
133
134	// finish up on the CPU side
135	c = 0;
136	for (int i=0; i<blocksPerGrid; i++) {
137	c += partial_c[i];
138	}
139
140	#define sum_squares(x) (x(x+1)(2*x+1)/6)
141	printf( "Does GPU value %.6g = %.6g?\n", c,
142	2 * sum_squares( (float)(N - 1) ) );
143	$assert(c == 2 * sum_squares( (float)(N - 1) ) );
144
145	// free memory on the gpu side
146	cudaFree( dev_a ); // civl versionof cudaFree will check memory correctness, then call regular free
147	cudaFree( dev_b );
148	cudaFree( dev_partial_c );
149
150	// free memory on the cpu side
151	$free( a );
152	$free( b );
153	$free( partial_c );
154	return 0;
155	}
156
157	_cudaInit();
158	_main();
159	_cudaFinalize();
160	}
161
162

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format