/**
* This is an example from the paper "Formal Semantics of Heterogeneous CUDA-C: 
* A Modular Approach with Applications" by Chris Hathhorn et al. 
*/

#include <stdio.h>
#include <stdlib.h>
#include <civlc.cvh>

#define N 8
#define NBLOCKS 4
#define NTHREADS (N/NBLOCKS)

void gpu(int nb, int nt, int *in, int *out){
  void gpuBlock(int bid){
    int shared[];
    $gbarrier gbarrier = $gbarrier_create($here, NTHREADS);
    
    void gpuThread(int tid){
      int i;
      int bdim = NTHREADS;
      $barrier barrier = $barrier_create($here, gbarrier, tid);
      
      shared[tid] = in[bid * bdim + tid];
      //synchronize threads
      $barrier_call(barrier);
      if(tid < bdim/2) {
        shared[tid] += shared[bdim/2 + tid];
      }
       //synchronize threads
      $barrier_call(barrier);
      if(tid == 0) {
        for (i = 1; i != (bdim/2) + (bdim%2); ++i) {
          shared[0] += shared[i];
        }
        out[bid] = shared[0];
      }
      $barrier_destroy(barrier);
    }
    
    $proc threads[nt];
    for(int i = 0; i < nt; i++) {
      threads[i] = $spawn gpuThread(i);
    }
    for(int i = 0; i < nt; i++) {
      $wait(threads[i]);
    }
    $gbarrier_destroy(gbarrier);
  }
  
  $proc blocks[nb];
  for(int i = 0; i < nb; i++) {
     blocks[i] = $spawn gpuBlock(i);
  }
  for(int i = 0; i < nb; i++) {
    $wait(blocks[i]);
  }
}

void main() {
  int i, *dev_out, host[N];
  
  printf("INPUT: ");
  for(i = 0; i != N; ++i) {
    host[i] = (21*i + 29) % 100;
    printf(" %d ", host[i]);
  }
  printf("\n");
  
  dev_out = (int *) malloc(NBLOCKS * sizeof(int));
  
  gpu(NBLOCKS, NTHREADS, host, dev_out);
  gpu(1, NBLOCKS, dev_out, dev_out);
  printf("OUTPUT: %d\n", *dev_out);
  free(dev_out);
}