source: CIVL/examples/cuda/mockupAlg.cu

main
Last change on this file was 9dbe9864, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5890 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 1.9 KB
RevLine 
[9cabba4]1#include <cuda.h>
2#include <stdlib.h>
3#include <stdio.h>
4#include <assert.h>
5
/*
 * kernel_1 -- sums A[0 .. numElements-1] in place using repeated warp-level
 * shuffle reductions, then publishes the total through C.
 *
 * In every pass each participating warp folds the values held by its lanes
 * with __shfl_down_sync, and each thread whose index is still inside the
 * array stores its running value back to A[index]. The number of live
 * values then shrinks from n to ((n - 1) / warpSize) + 1 and each thread's
 * index is multiplied by warpSize before the next pass. Passes stop once a
 * single live value remains.
 *
 * Parameters:
 *   A            in/out -- values to add up; overwritten with partial sums.
 *   C            out    -- receives A[0], written by the thread whose index
 *                          is 0 after the last pass.
 *   numElements  number of entries of A that take part in the sum.
 *
 * NOTE(review): consecutive passes are separated only by __syncthreads(),
 * which is a block-level barrier, so this looks intended for a single-block
 * launch (the host code in this file uses numBlocks = 1) -- confirm before
 * launching with more blocks.
 */
__global__ void kernel_1(float* A, float* C, int numElements) {
  const int laneId = threadIdx.x % warpSize;

  // Lane count of this thread's warp: the last warp of the block is shorter
  // when blockDim.x is not a multiple of warpSize.
  const int lanesInWarp =
      (threadIdx.x - laneId + warpSize > blockDim.x)
          ? ((blockDim.x - 1) % warpSize) + 1
          : warpSize;

  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  // Index of lane 0 of this warp; deliberately NOT rescaled between passes.
  const int warpBase = idx - laneId;

  for (int live = numElements; live > 1; live = ((live - 1) / warpSize) + 1) {
    // Every pass after the first reads values stored by the previous pass,
    // so wait for those stores. Both operands are identical in all threads,
    // so the whole block takes the same branch.
    if (numElements > live) {
      __syncthreads();
    }

    // Only warps holding at least two live values have something to fold.
    // warpBase is the same for all lanes of a warp, so the warp enters or
    // skips this section as a unit.
    if (warpBase + 1 < live) {
      float acc = 0;
      if (idx < numElements) {
        acc = A[idx];
      }

      for (int delta = warpSize / 2; delta > 0; delta = delta / 2) {
        // The shuffle is issued unconditionally by every lane; only the
        // accumulation is guarded, so values shuffled from beyond the end of
        // a short warp are never added.
        const float fromBelow = __shfl_down_sync(0xFFFFFFFF, acc, delta);
        if (laneId + delta < lanesInWarp) {
          acc += fromBelow;
        }
      }

      if (idx < numElements) {
        A[idx] = acc;
      }
    }

    // Thread t handles slot t * warpSize^k in pass k.
    idx *= warpSize;
  }

  if (idx == 0) {
    *C = A[0];
  }
}
49
// Problem size, declared as a CIVL $input with initializer 64.
$input int N = 64;
// The N values to be summed; also a CIVL $input, so no contents are fixed here.
$input float A[N];

// Launch configuration used by main(): a single block with one thread per
// element of A.
int numBlocks = 1;
int threadsPerBlock = N;
[9cabba4]55
/*
 * Host driver.
 *
 * Copies the N-element input A to the device, launches kernel_1 on it with
 * the file-level launch configuration, copies the single result value back,
 * and asserts that it equals the sequential sum of A computed on the host.
 *
 * Every CUDA runtime call and the host allocation are checked with $assert,
 * so a failed allocation or copy is reported at the call that failed instead
 * of surfacing later as a wrong sum.
 *
 * Returns 0 when all checks pass.
 */
int main() {
  // Byte count of the input vector; size_t avoids narrowing the
  // N * sizeof(float) product into an int.
  size_t size = N * sizeof(float);
  cudaError_t status;

  // Device copy of the input (the kernel overwrites it with partial sums).
  float* cuda_A;
  status = cudaMalloc((void **)&cuda_A, size);
  $assert(status == cudaSuccess);
  status = cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
  $assert(status == cudaSuccess);

  // Device result buffer, one float per block.
  float* cuda_C;
  status = cudaMalloc((void **)&cuda_C, numBlocks * sizeof(float));
  $assert(status == cudaSuccess);

  kernel_1<<<numBlocks, threadsPerBlock>>>(cuda_A, cuda_C, N);

  float* C = (float *)malloc(sizeof(float));
  $assert(C != NULL);
  // The status of this copy is the first checked CUDA call after the launch.
  status = cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
  $assert(status == cudaSuccess);

  // Sequential reference sum over the (unmodified) host copy of A.
  float sum = 0;
  for (int i = 0; i < N; i++)
    sum += A[i];

  // Exact equality is kept from the original check.
  // NOTE(review): presumably relies on the verifier treating float
  // arithmetic exactly -- confirm before running on real hardware.
  $assert(*C == sum);

  free(C);

  status = cudaFree(cuda_A);
  $assert(status == cudaSuccess);
  status = cudaFree(cuda_C);
  $assert(status == cudaSuccess);

  return 0;
}
Note: See TracBrowser for help on using the repository browser.