| 1 | #include <cuda.h>
|
|---|
| 2 | #include <stdlib.h>
|
|---|
| 3 | #include <stdio.h>
|
|---|
| 4 | #include <assert.h>
|
|---|
| 5 |
|
|---|
| 6 | __global__ void kernel_1(float* A, float* C, int numElements) {
|
|---|
| 7 | int lane = threadIdx.x % warpSize;
|
|---|
| 8 | int thisWarpSize = warpSize;
|
|---|
| 9 | if (threadIdx.x - lane + warpSize > blockDim.x) {
|
|---|
| 10 | thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
|
|---|
| 11 | }
|
|---|
| 12 |
|
|---|
| 13 | int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|---|
| 14 | int warpStart = i - lane;
|
|---|
| 15 |
|
|---|
| 16 | if (warpStart + 1 < numElements) {
|
|---|
| 17 | float val = i < numElements ? A[i] : 0;
|
|---|
| 18 | for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
|---|
| 19 | float tmp = __shfl_down_sync(0, val, offset);
|
|---|
| 20 | //float tmp = i + offset < numElements ? A[i + offset] : 0;
|
|---|
| 21 | if (lane + offset < thisWarpSize) {
|
|---|
| 22 | val += tmp;
|
|---|
| 23 | }
|
|---|
| 24 | }
|
|---|
| 25 |
|
|---|
| 26 | if (i < numElements) {
|
|---|
| 27 | A[i] = val;
|
|---|
| 28 | }
|
|---|
| 29 | }
|
|---|
| 30 |
|
|---|
| 31 | __syncthreads();
|
|---|
| 32 | if (threadIdx.x == 0) {
|
|---|
| 33 | int blockEnd = blockDim.x * (blockIdx.x + 1);
|
|---|
| 34 | if (blockEnd > numElements) {
|
|---|
| 35 | blockEnd = numElements;
|
|---|
| 36 | }
|
|---|
| 37 | for (int j = i + warpSize; j < blockEnd; j += warpSize) {
|
|---|
| 38 | A[i] += A[j];
|
|---|
| 39 | }
|
|---|
| 40 | atomicAdd(C + blockIdx.x, 1);
|
|---|
| 41 | }
|
|---|
| 42 |
|
|---|
| 43 | if (i == 0) {
|
|---|
| 44 | C[0] = A[0];
|
|---|
| 45 | for (int j = 1; j < gridDim.x; j++) {
|
|---|
| 46 | while(atomicAdd(C+j,0) == 0) {}
|
|---|
| 47 | C[0] += A[j * blockDim.x];
|
|---|
| 48 | }
|
|---|
| 49 | }
|
|---|
| 50 | }
|
|---|
| 51 |
|
|---|
| 52 | $input int N = 64;
|
|---|
| 53 | $input float A[N];
|
|---|
| 54 |
|
|---|
| 55 | $input int threadsPerBlock = N%2 == 0 ? N/2 : (N+1)/2;
|
|---|
| 56 | int numBlocks = (N-1)/threadsPerBlock + 1;
|
|---|
| 57 |
|
|---|
| 58 | int main() {
|
|---|
| 59 | int size = N * sizeof(float);
|
|---|
| 60 |
|
|---|
| 61 | float* cuda_A;
|
|---|
| 62 | cudaMalloc((void **)&cuda_A, size);
|
|---|
| 63 | cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);
|
|---|
| 64 |
|
|---|
| 65 | float C[numBlocks];
|
|---|
| 66 | for (int i = 0; i < numBlocks; i++) {
|
|---|
| 67 | C[i] = 0;
|
|---|
| 68 | }
|
|---|
| 69 |
|
|---|
| 70 | float* cuda_C;
|
|---|
| 71 | cudaMalloc((void **)&cuda_C, numBlocks * sizeof(float));
|
|---|
| 72 | cudaMemcpy(cuda_C, C, numBlocks * sizeof(float), cudaMemcpyHostToDevice);
|
|---|
| 73 |
|
|---|
| 74 | kernel_1<<<numBlocks, threadsPerBlock>>>(cuda_A, cuda_C, N);
|
|---|
| 75 |
|
|---|
| 76 | cudaMemcpy(C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost);
|
|---|
| 77 |
|
|---|
| 78 | float sum = 0;
|
|---|
| 79 | for(int i = 0; i < N; i++)
|
|---|
| 80 | sum += A[i];
|
|---|
| 81 |
|
|---|
| 82 | $assert(*C == sum);
|
|---|
| 83 |
|
|---|
| 84 | cudaFree(cuda_A);
|
|---|
| 85 | cudaFree(cuda_C);
|
|---|
| 86 | } |
|---|