Context Navigation

mockupAlg.cu

main

Last change on this file was 9dbe9864, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5890 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 1.9 KB

Rev	Line
[9cabba4]	1	#include <cuda.h>
	2	#include <stdlib.h>
	3	#include <stdio.h>
	4	#include <assert.h>
	5
	6	__global__ void kernel_1(float* A, float* C, int numElements) {
	7	int lane = threadIdx.x % warpSize;
	8	int thisWarpSize = warpSize;
	9	if (threadIdx.x - lane + warpSize > blockDim.x) {
	10	thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
	11	}
	12
	13	int i = blockDim.x * blockIdx.x + threadIdx.x;
	14	int warpStart = i - lane;
	15	//printf("%d,%d - i: %d, warpStart: %d, thisWarpSize: %d\n", blockIdx.x, threadIdx.x,i, warpStart, thisWarpSize);
	16	int remainingElements = numElements;
	17
	18	while (remainingElements > 1) {
	19	if (remainingElements < numElements) {
	20	__syncthreads();
	21	}
[9dbe9864]	22
[9cabba4]	23	if (warpStart + 1 < remainingElements) {
	24	float val = i < numElements ? A[i] : 0;
	25
	26	for (int offset = warpSize/2; offset > 0; offset /= 2) {
	27	//float tmp = $cuda__shfl_down_sync(val, offset, $lane);
	28	float tmp = __shfl_down_sync(0xFFFFFFFF, val, offset);
	29	if (lane + offset < thisWarpSize) {
	30	val += tmp;
	31	}
	32	}
	33
	34	if (i < numElements) {
	35	A[i] = val;
	36	//printf("%d,%d - writing A[%d]: %f\n", blockIdx.x, threadIdx.x, i, val);
	37	}
	38	}
	39
	40	i *= warpSize;
	41	//warpStart *= warpSize;
	42	remainingElements = ((remainingElements - 1) / warpSize) + 1;
	43	}
[9dbe9864]	44
[9cabba4]	45	if (i == 0) {
	46	*C = A[0];
	47	}
	48	}
	49
	50	$input int N = 64;
[9dbe9864]	51	$input float A[N];
[9cabba4]	52
[9dbe9864]	53	int threadsPerBlock = N;
	54	int numBlocks = 1;
[9cabba4]	55
/*
 * Host driver: copies the input array A to the device, launches kernel_1 to
 * sum it in place, copies the single result back, and checks it against a
 * sequential sum computed on the host.
 *
 * The host copy of A is never modified (the kernel only overwrites the device
 * copy), so it serves as the reference for the check.
 *
 * NOTE(review): the exact float comparison in $assert presumably relies on
 * how the CIVL verifier models floating-point values. On real hardware the
 * different summation order would call for a tolerance — confirm intent.
 */
int main() {
    int size = N * sizeof(float);

    // Device copy of the input; kernel_1 overwrites it with partial sums.
    float* cuda_A;
    cudaMalloc((void **)&cuda_A, size);
    cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);

    // Device result buffer, sized at one float per block.
    // cudaMalloc takes a void** and a byte count (numBlocks * sizeof(float)).
    float* cuda_C;
    cudaMalloc((void **)&cuda_C, numBlocks * sizeof(float));

    kernel_1<<<numBlocks, threadsPerBlock>>>(cuda_A, cuda_C, N);

    // Only the first result slot is read back. This copy is issued after the
    // launch, so it returns the kernel's output.
    float* C = (float *)malloc(sizeof(float));
    cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);

    // Sequential reference sum over the untouched host array.
    float sum = 0;
    for (int i = 0; i < N; i++) {
        sum += A[i];
    }

    $assert(*C == sum);

    free(C);

    cudaFree(cuda_A);
    cudaFree(cuda_C);

    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/mockupAlg.cu

Download in other formats: