source: CIVL/examples/cuda/mockupAlg2.cu

main
Last change on this file was 9dbe9864, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5890 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 2.0 KB
Line 
1#include <cuda.h>
2#include <stdlib.h>
3#include <stdio.h>
4#include <assert.h>
5
6__global__ void kernel_1(float* A, float* C, int numElements) {
7 int lane = threadIdx.x % warpSize;
8 int thisWarpSize = warpSize;
9 if (threadIdx.x - lane + warpSize > blockDim.x) {
10 thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
11 }
12
13 int i = blockDim.x * blockIdx.x + threadIdx.x;
14 int warpStart = i - lane;
15
16 if (warpStart + 1 < numElements) {
17 float val = i < numElements ? A[i] : 0;
18 for (int offset = warpSize/2; offset > 0; offset /= 2) {
19 float tmp = __shfl_down_sync(0, val, offset);
20 //float tmp = i + offset < numElements ? A[i + offset] : 0;
21 if (lane + offset < thisWarpSize) {
22 val += tmp;
23 }
24 }
25
26 if (i < numElements) {
27 A[i] = val;
28 }
29 }
30
31 __syncthreads();
32 if (threadIdx.x == 0) {
33 int blockEnd = blockDim.x * (blockIdx.x + 1);
34 if (blockEnd > numElements) {
35 blockEnd = numElements;
36 }
37 for (int j = i + warpSize; j < blockEnd; j += warpSize) {
38 A[i] += A[j];
39 }
40 atomicAdd(C + blockIdx.x, 1);
41 }
42
43 if (i == 0) {
44 C[0] = A[0];
45 for (int j = 1; j < gridDim.x; j++) {
46 while(atomicAdd(C+j,0) == 0) {}
47 C[0] += A[j * blockDim.x];
48 }
49 }
50}
51
52$input int N = 64;
53$input float A[N];
54
55$input int threadsPerBlock = N%2 == 0 ? N/2 : (N+1)/2;
56int numBlocks = (N-1)/threadsPerBlock + 1;
57
58int main() {
59 int size = N * sizeof(float);
60
61 float* cuda_A;
62 cudaMalloc((void **)&cuda_A, size);
63 cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
64
65 float C[numBlocks];
66 for (int i = 0; i < numBlocks; i++) {
67 C[i] = 0;
68 }
69
70 float* cuda_C;
71 cudaMalloc((void **)&cuda_C, numBlocks * sizeof(float));
72 cudaMemcpy(cuda_C, C, numBlocks * sizeof(float), cudaMemcpyHostToDevice);
73
74 kernel_1<<<numBlocks, threadsPerBlock>>>(cuda_A, cuda_C, N);
75
76 cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
77
78 float sum = 0;
79 for(int i = 0; i < N; i++)
80 sum += A[i];
81
82 $assert(*C == sum);
83
84 cudaFree(cuda_A);
85 cudaFree(cuda_C);
86}
Note: See TracBrowser for help on using the repository browser.