source: CIVL/examples/cuda/mockupAlg.cu@ 7ffcb1b

main test-branch
Last change on this file since 7ffcb1b was 9cabba4, checked in by Alex Wilton <awilton@…>, 2 years ago

Merged CUDA branch into trunk.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5867 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 2.6 KB
Line 
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
5
/*
 * Sums the first numElements entries of A and stores the total in *C.
 *
 * The reduction runs in place, level by level:
 *   - at each level every participating warp adds up the values held by its
 *     lanes with __shfl_down_sync, so lane 0 ends up with the warp's sum;
 *   - each thread writes its (partial) value back to the slot it loaded from,
 *     then multiplies its index by warpSize, so that at level k thread t
 *     addresses A[t * warpSize^k];
 *   - the element count shrinks to ceil(remaining / warpSize) per level until
 *     a single value is left in A[0].
 *
 * Parameters:
 *   A            device array with at least numElements floats; it is
 *                overwritten with partial sums.
 *   C            device pointer that receives the total.
 *   numElements  number of elements to sum; for numElements <= 0 the result
 *                stored in *C is 0.
 *
 * Launch assumptions (not checked here):
 *   - one thread per element: a thread only ever loads A[i] for its own i,
 *     so elements without a matching thread are never read;
 *   - a single block, as in main(): the only synchronisation between levels
 *     is __syncthreads(), which does not order writes across blocks.
 */
__global__ void kernel_1(float* A, float* C, int numElements) {
    const int lane = threadIdx.x % warpSize;

    // Number of threads that really exist in this warp: the last warp of the
    // block is partial when blockDim.x is not a multiple of warpSize.
    int thisWarpSize = warpSize;
    if (threadIdx.x - lane + warpSize > blockDim.x) {
        thisWarpSize = ((blockDim.x - 1) % warpSize) + 1;
    }

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int warpStart = i - lane;

    // Latch "am I global thread 0" before i gets scaled below, so the final
    // store cannot be affected by i wrapping around after repeated scaling.
    const int isRoot = (i == 0);

    int remainingElements = numElements;

    while (remainingElements > 1) {
        // Every level after the first consumes partial sums written by other
        // warps, so the block has to synchronise first. The condition depends
        // only on numElements and warpSize, hence all threads of the block
        // agree on whether the barrier is executed.
        if (remainingElements < numElements) {
            __syncthreads();
            // CIVL model of the barrier:
            //$cuda_barrier($kernel, _cuda_kid, _cuda_thread_barrier);
        }

        // Only warps covering at least two of the remaining elements have
        // anything to add up. warpStart is intentionally left unscaled
        // (the original keeps "warpStart *= warpSize" commented out).
        if (warpStart + 1 < remainingElements) {
            float val = i < numElements ? A[i] : 0.0f;

            for (int offset = warpSize / 2; offset > 0; offset /= 2) {
                // CIVL model of the shuffle:
                //float tmp = $cuda__shfl_down_sync(val, offset, $lane);
                float tmp = __shfl_down_sync(0xFFFFFFFF, val, offset);
                // Ignore values shuffled in from lanes beyond the warp's
                // real size.
                if (lane + offset < thisWarpSize) {
                    val += tmp;
                }
            }

            if (i < numElements) {
                A[i] = val;
            }
        }

        // Move to the next level: this thread now stands for slot
        // i * warpSize, and one element remains per group of warpSize.
        i *= warpSize;
        remainingElements = ((remainingElements - 1) / warpSize) + 1;
    }

    if (isRoot) {
        // An empty input has sum 0; do not touch A in that case.
        *C = numElements > 0 ? A[0] : 0.0f;
    }
}
53
/*
 * Disabled sequential reference version of kernel_1: global thread 0
 * accumulates every element of A into *C.
 *
__global__ void kernel_1(float* A, float* C, int numElements) {
    if (blockDim.x * blockIdx.x + threadIdx.x == 0) {
        *C = 0;
        for (int i = 0; i < numElements; i++) {
            *C += A[i];
        }
    }
}
*/
// Number of elements to sum. "$input" is a CIVL-C qualifier (not standard
// CUDA); 64 is the value given in this file.
$input int N = 64;
65
/* Stops the program with a diagnostic when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure in %s (error code %d)\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

/*
 * Test driver: fills A with 0, 1, ..., N-1, computes the sum on the host,
 * runs kernel_1 on the device with one block of N threads and asserts that
 * both results are equal.
 */
int main() {
    float A[N];
    float sum = 0;

    // Host-side reference sum.
    for (int i = 0; i < N; i++) {
        A[i] = i;
        sum += A[i];
        printf("sum: %f, A[i]: %f\n", sum, A[i]);
    }

    size_t size = N * sizeof(float);
    int numBlocks = 1;
    // One thread per element; kernel_1 never reads elements that have no
    // matching thread.
    int numThreads = N;

    float* cuda_A;
    checkCuda(cudaMalloc((void **)&cuda_A, size), "cudaMalloc(cuda_A)");
    checkCuda(cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> device)");

    float* cuda_C;
    checkCuda(cudaMalloc((void **)&cuda_C, sizeof(float)), "cudaMalloc(cuda_C)");

    kernel_1<<<numBlocks, numThreads>>>(cuda_A, cuda_C, N);

    // Checking correctness. The copy below is a blocking cudaMemcpy, so the
    // kernel has finished by the time the result is read on the host.
    float result = 0;
    checkCuda(cudaMemcpy(&result, cuda_C, sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy(device -> host)");

    printf("sum: %f, C: %f\n", sum, result);

    // Exact comparison is intended: the inputs are small integers, so for
    // N = 64 every partial sum is exactly representable as a float and the
    // order of the additions does not matter.
    assert(result == sum);

    checkCuda(cudaFree(cuda_A), "cudaFree(cuda_A)");
    checkCuda(cudaFree(cuda_C), "cudaFree(cuda_C)");

    return 0;
}
Note: See TracBrowser for help on using the repository browser.