Context Navigation

dot.cu

main

Last change on this file was 9cabba4, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5867 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.2 KB

Rev	Line
[3ff27cf]	1	#ifdef _CIVL
	2	#include <civlc.cvh>
	3	#endif
[54d543c]	4	/*
	5	* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
	6	*
	7	* NVIDIA Corporation and its licensors retain all intellectual property and
	8	* proprietary rights in and to this software and related documentation.
	9	* Any use, reproduction, disclosure, or distribution of this software
	10	* and related documentation without an express license agreement from
	11	* NVIDIA Corporation is strictly prohibited.
	12	*
	13	* Please refer to the applicable NVIDIA end user license agreement (EULA)
	14	* associated with this source code for terms and conditions that govern
	15	* your use of this NVIDIA software.
	16	*
	17	*/
	18
	19
[0f059298]	20	//#include "../common/book.h"
[1e59dce]	21	#include <stdio.h>
	22	#include <stdlib.h>
	23	#include <cuda.h>
	24
	25	#define HANDLE_ERROR(x) x
[54d543c]	26
	27	#define imin(a,b) (a<b?a:b)
	28
[354d5fa]	29	#ifdef _CIVL
[e75355b]	30	_Bool isPowerOfTwo(int x) {
	31	if (x == 1) {
	32	return $true;
	33	} else {
	34	return x % 2 == 0 && isPowerOfTwo(x / 2);
	35	}
	36	}
	37
	38
	39	// the length of the vectors to dot product
[354d5fa]	40	$input int N;
	41	// upper bound on N
	42	$input int N_B;
[3ff27cf]	43	$assume((0 <= N && N <= N_B));
[354d5fa]	44	$input int threadsPerBlock; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
[9cabba4]	45	$input int threadsPerBlock_B = N;
[3ff27cf]	46	$assume((1 <= threadsPerBlock && threadsPerBlock <= threadsPerBlock_B));
[354d5fa]	47	#else
	48	const int N = 33 * 1024;
	49	const int threadsPerBlock = 256;
	50	#endif
[441e680]	51	const int blocksPerGrid =
[354d5fa]	52	imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
[54d543c]	53
	54	__global__ void dot( float a, float b, float *c ) {
	55	__shared__ float cache[threadsPerBlock];
	56	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	57	int cacheIndex = threadIdx.x;
	58	float temp = 0;
	59
	60	while (tid < N) {
	61	temp += a[tid] * b[tid];
	62	tid += blockDim.x * gridDim.x;
	63	}
	64	// set the cache values
	65	cache[cacheIndex] = temp;
	66	// synchronize threads in this block
	67	__syncthreads();
	68	// for reductions, threadsPerBlock must be a power of 2
	69	// because of the following code
	70	int i = blockDim.x/2;
	71	while (i != 0) {
	72	if (cacheIndex < i)
	73	cache[cacheIndex] += cache[cacheIndex + i];
	74	__syncthreads();
	75	i /= 2;
	76	}
	77
	78	if (cacheIndex == 0)
	79	c[blockIdx.x] = cache[0];
	80	}
	81
	82
[a52ef0f]	83	int main() {
[354d5fa]	84	#ifdef _CIVL
[3ff27cf]	85	$assume((isPowerOfTwo(threadsPerBlock)));
[354d5fa]	86	#endif
	87
[54d543c]	88	float a, b, c, *partial_c;
	89	float dev_a, dev_b, *dev_partial_c;
	90
	91	// allocate memory on the cpu side
	92	a = (float)malloc( Nsizeof(float) );
	93	b = (float)malloc( Nsizeof(float) );
	94	partial_c = (float)malloc( blocksPerGridsizeof(float) );
	95
	96	// allocate the memory on the GPU
	97	HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
	98	N*sizeof(float) ) );
	99	HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
	100	N*sizeof(float) ) );
	101	HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
	102	blocksPerGrid*sizeof(float) ) );
	103
	104	// fill in the host memory with data
	105	for (int i=0; i<N; i++) {
	106	a[i] = i;
	107	b[i] = i*2;
	108	}
	109
	110	// copy the arrays 'a' and 'b' to the GPU
	111	HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
	112	cudaMemcpyHostToDevice ) );
	113	HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
	114	cudaMemcpyHostToDevice ) );
	115
	116	dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
	117	dev_partial_c );
	118
	119	// copy the array 'c' back from the GPU to the CPU
	120	HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
	121	blocksPerGrid*sizeof(float),
	122	cudaMemcpyDeviceToHost ) );
	123
	124	// finish up on the CPU side
	125	c = 0;
	126	for (int i=0; i<blocksPerGrid; i++) {
	127	c += partial_c[i];
	128	}
	129
	130	#define sum_squares(x) (x(x+1)(2*x+1)/6)
	131	printf( "Does GPU value %.6g = %.6g?\n", c,
	132	2 * sum_squares( (float)(N - 1) ) );
[354d5fa]	133	#ifdef _CIVL
[d980649]	134	$assert(c == 2 * sum_squares( (float)(N - 1) ) );
[354d5fa]	135	#endif
[54d543c]	136
	137	// free memory on the gpu side
	138	HANDLE_ERROR( cudaFree( dev_a ) );
	139	HANDLE_ERROR( cudaFree( dev_b ) );
	140	HANDLE_ERROR( cudaFree( dev_partial_c ) );
	141
	142	// free memory on the cpu side
	143	free( a );
	144	free( b );
	145	free( partial_c );
	146	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/dot.cu

Download in other formats: