/** * This is an example from the paper "Formal Semantics of Heterogeneous CUDA-C: * A Modular Approach with Applications" by Chris Hathhorn et al. */ #include #include #include #define N 8 #define NBLOCKS 4 #define NTHREADS (N/NBLOCKS) void gpu(int nb, int nt, int *in, int *out){ void gpuBlock(int bid){ int shared[]; $gbarrier gbarrier = $gbarrier_create($here, NTHREADS); void gpuThread(int tid){ int i; int bdim = NTHREADS; $barrier barrier = $barrier_create($here, gbarrier, tid); shared[tid] = in[bid * bdim + tid]; //synchronize threads $barrier_call(barrier); if(tid < bdim/2) { shared[tid] += shared[bdim/2 + tid]; } //synchronize threads $barrier_call(barrier); if(tid == 0) { for (i = 1; i != (bdim/2) + (bdim%2); ++i) { shared[0] += shared[i]; } out[bid] = shared[0]; } $barrier_destroy(barrier); } $proc threads[nt]; for(int i = 0; i < nt; i++) { threads[i] = $spawn gpuThread(i); } for(int i = 0; i < nt; i++) { $wait(threads[i]); } $gbarrier_destroy(gbarrier); } $proc blocks[nb]; for(int i = 0; i < nb; i++) { blocks[i] = $spawn gpuBlock(i); } for(int i = 0; i < nb; i++) { $wait(blocks[i]); } } void main() { int i, *dev_out, host[N]; printf("INPUT: "); for(i = 0; i != N; ++i) { host[i] = (21*i + 29) % 100; printf(" %d ", host[i]); } printf("\n"); dev_out = (int *) malloc(NBLOCKS * sizeof(int)); gpu(NBLOCKS, NTHREADS, host, dev_out); gpu(1, NBLOCKS, dev_out, dev_out); printf("OUTPUT: %d\n", *dev_out); free(dev_out); }