| [9cabba4] | 1 | #include <cuda.h>
|
|---|
| 2 | #include <stdlib.h>
|
|---|
| 3 | #include <stdio.h>
|
|---|
| 4 | #include <assert.h>
|
|---|
| 5 |
|
|---|
// kernel_1: sums A[0 .. numElements) in place with warp shuffles and stores
// the total in *C.
//
// Parameters:
//   A            - device array of numElements floats; it is OVERWRITTEN with
//                  intermediate partial sums.
//   C            - device pointer that receives the total (only C[0] is
//                  written, by the thread whose global index is 0).
//   numElements  - number of valid entries in A. For numElements <= 0 the
//                  result is 0.0f and A is never dereferenced.
//
// Launch requirements (not enforced here):
//   * A single block: levels of the reduction are separated only by
//     __syncthreads(), which does not order threads of different blocks.
//   * At least one thread per element (blockDim.x >= numElements); elements
//     without a thread are never loaded.
//   * No shared memory is used.
//
// How it works: on every level the thread with global index t handles the
// element at t * warpSize^level. Each warp folds its lanes' values with
// __shfl_down_sync so that lane 0 accumulates the warp's sum and stores it
// back to its own slot, which is exactly the slot read on the next level.
__global__ void kernel_1(float* A, float* C, int numElements) {
    const int lane = threadIdx.x % warpSize;

    // Number of threads that really exist in this thread's warp. The last
    // warp of the block is partial when blockDim.x is not a multiple of
    // warpSize, and values shuffled in from missing lanes must be ignored.
    int thisWarpSize = warpSize;
    if (threadIdx.x - lane + warpSize > blockDim.x) {
        thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
    }

    // The unscaled global thread index decides which warps take part in a
    // level and which single thread publishes the result.
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    const int warpStart = tid - lane;

    // Index of the element this thread handles on the current level. Kept in
    // 64 bits and frozen once it leaves the array (see the end of the loop)
    // so that repeated scaling can never overflow and alias a valid slot.
    long long i = tid;
    int remainingElements = numElements;

    while (remainingElements > 1) {
        // Every level after the first reads partial sums written by other
        // warps on the previous level. The condition is the same for every
        // thread of the block, so the barrier is not divergent.
        if (remainingElements < numElements) {
            __syncthreads();
        }

        // warpStart is identical for all lanes of a warp, so a warp enters
        // this branch as a whole and the full-mask shuffle below is safe.
        // Warps that would hold at most one live element have nothing to add
        // and are skipped.
        if (warpStart + 1 < remainingElements) {
            float val = i < numElements ? A[i] : 0.0f;

            for (int offset = warpSize / 2; offset > 0; offset /= 2) {
                float tmp = __shfl_down_sync(0xFFFFFFFF, val, offset);
                // Only accumulate values that came from an existing lane.
                if (lane + offset < thisWarpSize) {
                    val += tmp;
                }
            }

            if (i < numElements) {
                A[i] = val;
            }
        }

        // Move to the next level's slot. An index already past the end only
        // grows, so it is left unchanged; every access above is guarded by
        // i < numElements, hence the visible behaviour is the same.
        if (i < numElements) {
            i *= warpSize;
        }
        remainingElements = ((remainingElements - 1) / warpSize) + 1;
    }

    // Thread 0 wrote A[0] itself on the last level, so no barrier is needed
    // before reading it back. An empty input yields the additive identity
    // instead of reading A[0] out of bounds.
    if (tid == 0) {
        *C = numElements > 0 ? A[0] : 0.0f;
    }
}
|
|---|
| 49 |
|
|---|
// Problem size and input data for the reduction check in main().
// NOTE(review): `$input` is not standard C/CUDA; presumably the CIVL-C
// qualifier for values supplied by the verifier - confirm against the
// toolchain this file is run with.
$input int N = 64;
$input float A[N];

// Launch shape passed to kernel_1 by main(): one block, one thread per element.
int numBlocks = 1;
int threadsPerBlock = N;
|
|---|
| [9cabba4] | 55 |
|
|---|
// Host driver: copies A to the device, runs kernel_1 with the file-scope
// launch shape, and asserts that the value the kernel stored equals the sum
// of A computed on the host in index order.
int main() {
    const int bytes = N * sizeof(float);

    // Host-side reference value. The host copy of A is never modified (the
    // kernel only overwrites the device copy), so it can be summed up front.
    float expected = 0;
    int k = 0;
    while (k < N) {
        expected += A[k];
        k++;
    }

    // Host slot that receives the single float produced by the kernel.
    float* result = (float *)malloc(sizeof(float));

    // Device buffers: the input array and one output float per block.
    float* devIn;
    float* devOut;
    cudaMalloc((void **)&devIn, bytes);
    cudaMalloc((void **)&devOut, numBlocks * sizeof(float));
    cudaMemcpy(devIn, A, bytes, cudaMemcpyHostToDevice);

    kernel_1<<<numBlocks, threadsPerBlock>>>(devIn, devOut, N);

    // Only the first output float is copied back and checked.
    cudaMemcpy(result, devOut, sizeof(float), cudaMemcpyDeviceToHost);

    $assert(*result == expected);

    cudaFree(devIn);
    cudaFree(devOut);
    free(result);
}
|---|