source: CIVL/examples/cuda/dot_orig.cu

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 3.4 KB
RevLine 
/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */

//#include "../common/book.h"

// Minimum of two values.  Arguments and body are fully parenthesized for
// macro hygiene, so expressions such as imin(x+1, y) expand correctly.
#define imin(a,b) ((a)<(b)?(a):(b))

const int N = 33 * 1024;          // total number of vector elements
const int threadsPerBlock = 256;  // must be a power of 2: the reduction in
                                  // dot() halves the active range each pass
// Enough blocks to cover N elements, capped at 32; the kernel's grid-stride
// loop picks up any elements beyond blocksPerGrid*threadsPerBlock.
const int blocksPerGrid =
            imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
/*
 * dot: per-block partial dot product of a and b.
 *
 * Each thread accumulates a private partial sum over a grid-stride loop,
 * the block then reduces those sums pairwise in shared memory, and thread 0
 * writes the block's result to c[blockIdx.x].  The host sums the per-block
 * results to finish the reduction.
 *
 * Launch expectations: blockDim.x == threadsPerBlock (shared-memory size),
 * and threadsPerBlock must be a power of 2 — required by the tree
 * reduction below.
 */
__global__ void dot( float *a, float *b, float *c ) {
    __shared__ float cache[threadsPerBlock];
    int lane = threadIdx.x;
    float sum = 0.0f;

    // grid-stride loop: covers all of [0, N) regardless of grid size
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
         idx < N;
         idx += blockDim.x * gridDim.x) {
        sum += a[idx] * b[idx];
    }

    // publish this thread's partial sum, then wait for the whole block
    cache[lane] = sum;
    __syncthreads();

    // pairwise tree reduction in shared memory (power-of-2 blockDim only)
    for (int half = blockDim.x / 2; half > 0; half /= 2) {
        if (lane < half)
            cache[lane] += cache[lane + half];
        __syncthreads();   // outside the divergent if: every thread reaches it
    }

    // one result per block
    if (lane == 0)
        c[blockIdx.x] = cache[0];
}
53
54
/*
 * Host driver: computes the dot product of two N-element vectors
 * (a[i] = i, b[i] = 2i) on the GPU via per-block partial sums, finishes
 * the reduction on the CPU, and prints the result next to the closed-form
 * expected value 2 * sum_{i=0}^{N-1} i^2.
 *
 * Returns 0 on success, 1 if host memory allocation fails; CUDA errors
 * are handled by the HANDLE_ERROR macro (from the book's common header).
 */
int main( void ) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side; malloc can fail, so check each result
    a = (float*)malloc( N*sizeof(float) );
    b = (float*)malloc( N*sizeof(float) );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf( stderr, "host memory allocation failed\n" );
        free( a );
        free( b );
        free( partial_c );
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
                                            dev_partial_c );
    // kernel launches are asynchronous and return no status; surface
    // launch-configuration errors explicitly before continuing
    HANDLE_ERROR( cudaGetLastError() );

    // copy the partial sums back from the GPU to the CPU
    // (a blocking cudaMemcpy also synchronizes with the kernel)
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side: sum the per-block partial results
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    // closed form: sum i*2i = 2 * n(n+1)(2n+1)/6 with n = N-1
    // (arguments parenthesized for macro hygiene)
    #define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c,
            2 * sum_squares( (float)(N - 1) ) );

    // free memory on the gpu side
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the cpu side
    free( a );
    free( b );
    free( partial_c );
    return 0;
}
Note: See TracBrowser for help on using the repository browser.