/** * This is an example from the paper "Formal Semantics of Heterogeneous CUDA-C: * A Modular Approach with Applications" by Chris Hathhorn et al. */ #include #include #define N 18 #define NBLOCKS 2 #define NTHREADS (N/NBLOCKS) void gpu(int nb, int nt, int *in, int *out){ void gpuBlock(int bid){ int shared[]; int num_in_barrier =0; int barrier_size = 0; int in_barrier[nt]; #include "cuda.cvh" void gpuThread(int tid){ int i; int bdim = NTHREADS; shared[tid] = in[bid * bdim + tid]; __syncthreads(in_barrier, tid); if(tid < bdim/2) { shared[tid] += shared[bdim/2 + tid]; } __syncthreads(in_barrier, tid); if(tid == 0) { for (i = 1; i != (bdim/2) + (bdim%2); ++i) { shared[0] += shared[i]; } out[bid] = shared[0]; } } $proc threads[nt]; __sync_init(in_barrier, nt); for(int i = 0; i < nt; i++) { threads[i] = $spawn gpuThread(i); } for(int i = 0; i < nt; i++) { $wait threads[i]; } } $proc blocks[nb]; for(int i = 0; i < nb; i++) { blocks[i] = $spawn gpuBlock(i); } for(int i = 0; i < nb; i++) { $wait blocks[i]; } } int main(void) { int i, *dev_out, host[N]; $heap h; printf("INPUT: "); for(i = 0; i != N; ++i) { host[i] = (21*i + 29) % 100; printf(" %d ", host[i]); } printf("\n"); //dev_in = (int *) $malloc(&h, N * sizeof(int)); dev_out = (int *) $malloc(&h, NBLOCKS * sizeof(int)); //memcpy(dev_in, &host[0], N * sizeof(int)); gpu(NBLOCKS, NTHREADS, host, dev_out); gpu(1, NBLOCKS, dev_out, dev_out); //memcpy(&host[0], dev_out, sizeof(int)); //cudaDeviceSynchronize(); printf("OUTPUT: %u\n", *dev_out); //$free(&h, dev_in); $free(&h, dev_out); return 0; }