/** * This is an example from the paper "Formal Semantics of Heterogeneous CUDA-C: * A Modular Approach with Applications" by Chris Hathhorn et al. */ #include #include #ifdef _CIVL $input int N; $input int N_B; $assume(1 <= N && N <= N_B); $input int NBLOCKS; $input int NBLOCKS_B; $assume(1 <= NBLOCKS && NBLOCKS <= NBLOCKS_B); $assume(NBLOCKS <= N); $assume(N % NBLOCKS == 0); $assume(N % 2 == 0); $assume(NBLOCKS % 2 == 0); #else #define N 8 #define NBLOCKS 4 #endif #define NTHREADS (N/NBLOCKS) __global__ void sum(int* in, int* out) { //extern __shared__ int shared[]; __shared__ int shared[blockDim.x]; // commenting out the extern qualifier since // it doen't have the definition, // need to figure out if this is something special for cuda int i, tid = threadIdx.x, bid = blockIdx.x, bdim = blockDim.x; shared[tid] = in[bid * bdim + tid]; __syncthreads(); if(tid < bdim/2) { shared[tid] += shared[bdim/2 + tid]; } __syncthreads(); if(tid == 0) { for (i = 1; i != (bdim/2) + (bdim%2); ++i) { shared[0] += shared[i]; } out[bid] = shared[0]; } } int main() { int i, *dev_in, *dev_out, host[N]; #ifdef _CIVL int seqSum = 0; #endif printf("INPUT: "); for(i = 0; i != N; ++i) { host[i] = (21*i + 29) % 100; #ifdef _CIVL seqSum += host[i]; #endif printf(" %d ", host[i]); } printf("\n"); cudaMalloc(&dev_in, N * sizeof(int)); cudaMalloc(&dev_out, NBLOCKS * sizeof(int)); cudaMemcpy(dev_in, host, N * sizeof(int), cudaMemcpyHostToDevice); sum<<>>( dev_in, dev_out); sum<<<1, NBLOCKS, NBLOCKS * sizeof(int)>>>( dev_out, dev_out); cudaMemcpy(host, dev_out, sizeof(int), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); printf("OUTPUT: %u\n", *host); #ifdef _CIVL $assert(*host == seqSum); #endif cudaFree(dev_in); cudaFree(dev_out); return 0; }