| 1 | #include <cuda.h>
|
|---|
| 2 | #include <stdlib.h>
|
|---|
| 3 | #include <stdio.h>
|
|---|
| 4 | #include <assert.h>
|
|---|
| 5 |
|
|---|
/**
 * Sums the first numElements floats of A and stores the total in *C.
 *
 * The reduction runs in rounds. In each round every participating warp
 * combines its values with __shfl_down_sync and writes the per-lane partial
 * sums back to A, so the slot owned by lane 0 holds the warp total. Thread t
 * then re-targets slot t * warpSize, and the next round reduces those warp
 * totals. Rounds repeat until a single value is left in A[0].
 *
 * Launch requirements:
 *  - 1-D launch with a single block. Rounds are separated only by
 *    __syncthreads(), which is a block-level barrier, so partial sums written
 *    by other blocks are not synchronized with the threads that read them.
 *  - blockDim.x does not have to be a multiple of warpSize; a short last warp
 *    is handled through thisWarpSize.
 *
 * @param A            device array of at least numElements floats; used as
 *                     scratch space and overwritten with partial sums
 * @param C            device pointer that receives the total
 * @param numElements  number of elements to sum; values <= 0 produce 0, and
 *                     values larger than the launched thread count are
 *                     folded onto the launched threads first
 */
__global__ void kernel_1(float* A, float* C, int numElements) {
    const int lane = threadIdx.x % warpSize;

    // Number of live lanes in this thread's warp: the last warp of the block
    // is short when blockDim.x is not a multiple of warpSize.
    int thisWarpSize = warpSize;
    if (threadIdx.x - lane + warpSize > blockDim.x) {
        thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
    }

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int warpStart = i - lane;

    // Empty input: never touch A. numElements is identical for every thread,
    // so this early exit is uniform and cannot strand a later barrier.
    if (numElements <= 0) {
        if (i == 0) {
            *C = 0.0f;
        }
        return;
    }

    // Fold elements that lie beyond the launched thread count onto the slot of
    // the thread with the same index modulo the thread count. Each thread only
    // touches indices congruent to its own, so no barrier is needed here.
    const long long totalThreads = (long long)blockDim.x * gridDim.x;
    if ((long long)i + totalThreads < numElements) {
        float folded = A[i];
        for (long long j = (long long)i + totalThreads; j < numElements; j += totalThreads) {
            folded += A[j];
        }
        A[i] = folded;
    }

    // After folding, only the first activeElements slots carry data.
    const int activeElements =
        numElements < totalThreads ? numElements : (int)totalThreads;

    int remainingElements = activeElements;

    while (remainingElements > 1) {
        // Every round after the first reads warp totals written by other warps
        // in the previous round. The condition is uniform across the block.
        if (remainingElements < activeElements) {
            __syncthreads();
        }

        // warpStart is warp-uniform, so a warp enters this branch as a whole.
        // Warps that own at most one remaining value have nothing to combine.
        if (warpStart + 1 < remainingElements) {
            float val = i < activeElements ? A[i] : 0.0f;

            for (int offset = warpSize / 2; offset > 0; offset /= 2) {
                float tmp = __shfl_down_sync(0xFFFFFFFF, val, offset);
                // Discard values shuffled in from lanes beyond the live part
                // of a short warp.
                if (lane + offset < thisWarpSize) {
                    val += tmp;
                }
            }

            if (i < activeElements) {
                A[i] = val;
            }
        }

        // Re-target this thread at the slot holding the total of warp number
        // (previous i), and shrink the problem to one value per warp.
        i *= warpSize;
        remainingElements = ((remainingElements - 1) / warpSize) + 1;
    }

    // Only global thread 0 still has i == 0 after the multiplications above.
    if (i == 0) {
        *C = A[0];
    }
}
| 53 |
|
|---|
/*
 * Disabled serial reference version: the thread with global index 0 alone
 * zeroes *C and adds A[0..numElements-1] to it one element at a time.
 *
__global__ void kernel_1(float* A, float* C, int numElements) {
    if (blockDim.x * blockIdx.x + threadIdx.x == 0) {
        *C = 0;
        for (int i = 0; i < numElements; i++) {
            *C += A[i];
        }
    }
}
*/
// Number of floats to sum; also used as the thread count of the single block.
// NOTE(review): `$input` is not standard CUDA C++ — it looks like CIVL-C
// verifier syntax for an overridable input variable; confirm the toolchain.
$input int N = 64;

/* Prints a diagnostic and terminates the program if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills A with 0..N-1, sums it on the host, runs kernel_1 on the device with
 * one block of N threads, and asserts that both sums agree.
 */
int main() {
    // A zero or negative N would make the array below invalid.
    if (N <= 0) {
        fprintf(stderr, "N must be positive, got %d\n", N);
        return EXIT_FAILURE;
    }

    float A[N];
    float sum = 0;

    for (int i = 0; i < N; i++) {
        A[i] = i;
        sum += A[i];
        printf("sum: %f, A[i]: %f\n", sum, A[i]);
    }

    size_t size = (size_t)N * sizeof(float);
    int numBlocks = 1;
    // One thread per element. If N exceeds the device's per-block thread
    // limit the launch fails, which the cudaGetLastError check below reports.
    int numThreads = N;

    float* cuda_A = NULL;
    checkCuda(cudaMalloc((void **)&cuda_A, size), "cudaMalloc(cuda_A)");
    checkCuda(cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> cuda_A)");

    float* cuda_C = NULL;
    checkCuda(cudaMalloc((void **)&cuda_C, sizeof(float)), "cudaMalloc(cuda_C)");

    kernel_1<<<numBlocks, numThreads>>>(cuda_A, cuda_C, N);
    // A kernel launch returns no status itself; launch-configuration errors
    // are only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel_1 launch");

    // Checking correctness. The blocking copy also waits for the kernel and
    // reports any error raised while it ran.
    float C = 0.0f;
    checkCuda(cudaMemcpy(&C, cuda_C, sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy(cuda_C -> host)");

    printf("sum: %f, C: %f", sum, C);
    // Flush so the line above is not lost if the assertion aborts.
    fflush(stdout);

    // Exact comparison is intentional: the inputs are small integers, so both
    // sums are exactly representable as float regardless of summation order
    // for the N values a single block can handle.
    assert(C == sum);

    checkCuda(cudaFree(cuda_A), "cudaFree(cuda_A)");
    checkCuda(cudaFree(cuda_C), "cudaFree(cuda_C)");

    return 0;
}