/**
 * This is an example from the paper "Formal Semantics of Heterogeneous CUDA-C:
 * A Modular Approach with Applications" by Chris Hathhorn et al.
 *
 * CIVL-C model of a two-stage parallel sum: the first launch reduces N inputs
 * to NBLOCKS per-block partial sums, the second launch reduces those partial
 * sums to a single value stored in element 0 of the output buffer.
 */
/* NOTE(review): the header names on the three #include lines were missing in
 * the reviewed text.  <stdio.h> and <stdlib.h> are required by printf and
 * malloc/free; <civlc.cvh> is assumed for $proc/$spawn/$wait -- confirm. */
#include <stdio.h>
#include <stdlib.h>
#include <civlc.cvh>

#define N 8
#define NBLOCKS 4
#define NTHREADS (N/NBLOCKS)

/**
 * Models one kernel launch with nb blocks of nt threads each.
 *
 * Every block and every thread is spawned as its own process; the function
 * returns only after all of them have been waited for.
 *
 * nb:  number of blocks to spawn.
 * nt:  number of threads per block (the block dimension).
 * in:  input array; block bid reads in[bid * nt .. bid * nt + nt - 1].
 * out: output array; thread 0 of block bid writes out[bid].
 *      May alias in: all reads of in happen before the first barrier and
 *      the write to out happens after the second one.
 */
void gpu(int nb, int nt, int *in, int *out) {

  /* Body of one block: owns the per-block scratch array and barrier state. */
  void gpuBlock(int bid) {
    /* One scratch slot per thread of the block.  The original declared this
     * without a size; nt matches the indices used below (0 .. nt - 1). */
    int shared[nt];
    /* Barrier bookkeeping; these are declared before "cuda.cvh" is included,
     * presumably because the header's barrier helpers refer to them by
     * name -- verify against cuda.cvh. */
    int num_in_barrier = 0;
    int barrier_size = 0;
    int in_barrier[nt];
#include "cuda.cvh"

    /* Body of one thread of this block. */
    void gpuThread(int tid) {
      int i;
      /* Block dimension of THIS launch.  It must follow nt rather than the
       * NTHREADS macro: the second launch in main runs NBLOCKS threads, and
       * a fixed NTHREADS would fold only the first NTHREADS partial sums. */
      int bdim = nt;

      /* Each thread loads one input element into the scratch array. */
      shared[tid] = in[bid * bdim + tid];
      __syncthreads(in_barrier, tid);

      /* Fold the upper half of the scratch array onto the lower half. */
      if (tid < bdim/2) {
        shared[tid] += shared[bdim/2 + tid];
      }
      __syncthreads(in_barrier, tid);

      /* Thread 0 adds up what is left of the lower half and publishes it.
       * NOTE(review): as in the paper's kernel, this covers every element
       * only when bdim is even (or 1); both launches in main use an even
       * bdim. */
      if (tid == 0) {
        for (i = 1; i != (bdim/2) + (bdim%2); ++i) {
          shared[0] += shared[i];
        }
        out[bid] = shared[0];
      }
    }

    $proc threads[nt];

    /* The barrier is initialised for nt participants before any thread
     * is spawned. */
    __sync_init(in_barrier, nt);
    for (int i = 0; i < nt; i++) {
      threads[i] = $spawn gpuThread(i);
    }
    for (int i = 0; i < nt; i++) {
      $wait(threads[i]);
    }
  }

  $proc blocks[nb];

  for (int i = 0; i < nb; i++) {
    blocks[i] = $spawn gpuBlock(i);
  }
  for (int i = 0; i < nb; i++) {
    $wait(blocks[i]);
  }
}

/**
 * Fills host[] with (21*i + 29) % 100 for i in 0..N-1, prints it, runs the
 * two reduction launches and prints the final sum.  With N = 8 the inputs
 * are 29 50 71 92 13 34 55 76, whose sum is 420.
 */
void main() {
  int i, *dev_out, host[N];

  printf("INPUT: ");
  for (i = 0; i != N; ++i) {
    host[i] = (21*i + 29) % 100;
    printf(" %d ", host[i]);
  }
  printf("\n");

  dev_out = (int *) malloc(NBLOCKS * sizeof(int));

  /* Stage 1: NBLOCKS blocks of NTHREADS threads -> NBLOCKS partial sums. */
  gpu(NBLOCKS, NTHREADS, host, dev_out);
  /* Stage 2: one block of NBLOCKS threads reduces the partial sums in place;
   * the total ends up in dev_out[0]. */
  gpu(1, NBLOCKS, dev_out, dev_out);

  printf("OUTPUT: %d\n", *dev_out);
  free(dev_out);
}