#include <civlc.cvh>
/* This header file contains useful helper functions for manipulating
 * the CIVL versions of various Cuda objects.
 */

#include <string.h>
// To print for debugging
#include <stdio.h>
#include <stdlib.h>
#include <concurrency.cvh>
#include <seq.cvh>
#include <mem.cvh>
#include <civl-cuda.cvh>
#include <comm.cvh>
#pragma CIVL ACSL

/* type used to represent an instance of a Cuda kernel
 * TODO: Update with read/write sets
 */
struct $cuda_kernel_instance {
  $proc process;        // the actual process executing this kernel
  $cuda_kernel_status status; // the kernel status
  // Read and Write sets
  $mem read_sets[];
  $mem write_sets[];
  // Number of threads in kernel
  int size;
};

/* a type that wraps a kernel instance for insertion into a list
 */
struct $cuda_kernel_instance_node {
  $cuda_kernel_instance_t *instance;
  $cuda_kernel_instance_node_t  *next;
};

/* $cuda_kernel_instance_node_t interface
 */
$cuda_kernel_instance_t *$cuda_get_instance($cuda_kernel_instance_node_t *node) {
  return node->instance;
}

/* a type used to represent a Cuda stream
 */
struct _CUstream {
    $cuda_kernel_instance_node_t *mostRecent;  // the most recently enqueued instance
    _Bool usable;                     // indicates whether or not this stream
                                      // has been marked for deletion
};

/* _CUstream interface
 */
$cuda_kernel_instance_node_t *$cuda_get_most_recent(cudaStream_t s) {
  return s->mostRecent;
}

_Bool $cuda_is_usable(cudaStream_t s) {
  return s->usable;
}

void $cuda_set_usable(cudaStream_t s, _Bool b) {
  s->usable = b;
}

/* a type that wraps a stream for insertion into a list
 */
struct $cuda_stream_node {
  cudaStream_t stream;
  $cuda_stream_node_t *next;
};

/* $cuda_stream_node_t encapsulation functions
 */
void $cuda_set_stream($cuda_stream_node_t* node, cudaStream_t stream) {
  node->stream = stream;
}

void $cuda_set_next($cuda_stream_node_t* node, $cuda_stream_node_t* next) {
  node->next = next;
}

/* a type used to represent a Cuda event
 */
struct _CUevent {
  $cuda_kernel_instance_t **instances;
  int numInstances;
};

/* _CUevent encapsulation functions
 */
$cuda_kernel_instance_t **$cuda_get_instances(cudaEvent_t e) {
  return e->instances;
}

void $cuda_set_instances(cudaEvent_t e, $cuda_kernel_instance_t** instances, int numInstances) {
  e->instances = instances;
  e->numInstances = numInstances;
}

int $cuda_get_num_instances(cudaEvent_t e) {
  return e->numInstances;
}

/* a type representing the state of a Cuda device
 */
struct $cuda_context {
  $cuda_stream_node_t *headNode;
  cudaStream_t nullStream;
  int numStreams;
};

/* $cuda_context_t encapsulation functions
 */
int $cuda_get_num_streams($cuda_context_t *_context) {
  return _context->numStreams;
}

$cuda_stream_node_t *$cuda_get_head_node($cuda_context_t *_context) {
  return _context->headNode;
}

cudaStream_t $cuda_get_null_stream($cuda_context_t *_context) {
  return _context->nullStream;
}

void $cuda_add_new_stream($cuda_context_t *_context, $cuda_stream_node_t *newNode) {
  _context->headNode = newNode;
  _context->numStreams++;
}

/* Computes the one dimensional index of a grid cell at a given location
 * in a three dimensional grid of a given size
 */
int $cuda_index (dim3 size, uint3 location) {
  return location.x + size.x * (location.y + size.y * location.z);
}

/* Computes the one dimensional index of a specific thread in the grid given
 * the grid dimension, block dimension, block index, and thread index
 */
int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) {
  return $cuda_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $cuda_index(bDim, tIdx);
}

/* Lifts a single integer x into a three dimensional vector representing
 * a one dimensional grid of length x
 */
dim3 $cuda_to_dim3(int x) {
  dim3 d = { x, 1, 1 };

  return d;
}

/* Given a three dimensional vector representing a grid of size dim,
 * create and destroy a process, in parallel, for each cell in the grid. 
 * The location of the cell is passed to the spawning function.
 */
void $cuda_run_procs(dim3 dim, void spawningFunction(uint3)) {
  $range rx = 0 .. (dim.x == 0 ? -1 : dim.x - 1);
  $range ry = 0 .. (dim.y == 0 ? -1 : dim.y - 1);
  $range rz = 0 .. (dim.z == 0 ? -1 : dim.z - 1);
  $domain(3) dom = ($domain){rx, ry, rz};
  $parfor(int x,y,z : dom){
    uint3 id = { x, y, z };
    spawningFunction(id);
  }
}

// ------------------------------------------------

/* $wait on a given process is it is non-null
 */
void $cuda_try_wait($proc p) {
  if (p != $proc_null) {
    $yield();
    $wait(p);
  }
}

/* The current state of the GPU
 */
$cuda_context_t $cuda_current_context = { 
  .headNode = NULL, 
  .nullStream = NULL,
  .numStreams = 0
};

/* malloc and initialize a new $cuda_kernel_instance_t
 */
 
 // TODO: Determine if passing parameters is better or not
$cuda_kernel_instance_t *$cuda_kernel_instance_create(dim3 gDim, dim3 bDim) {
  //printf("mallocing kernel instance\n");
  $cuda_kernel_instance_t *i = ($cuda_kernel_instance_t*)malloc(sizeof($cuda_kernel_instance_t));

  i->process = $proc_null;
  i->status = $cuda_kernel_status_waiting;
  int threadNum = gDim.x * gDim.y * gDim.z * bDim.x * bDim.y * bDim.z;
  i->size = threadNum;
  // TODO: either sequentially or non-sequentially?
  $mem empty = $mem_empty();
  $seq_init(&i->read_sets, threadNum, &empty);
  $seq_init(&i->write_sets, threadNum, &empty);
  return i;
}

/* cleanup and free a given $cuda_kernel_instance_t
 */
void $cuda_kernel_instance_destroy($cuda_kernel_instance_t *i) {
  $cuda_try_wait(i->process);
  //printf("freeing kernel instance\n");
  $free(i); 
}

/* malloc and initialize a new $cuda_kernel_instance_node_t
 */
$cuda_kernel_instance_node_t *$cuda_kernel_instance_node_create() {
  //printf("mallocing kernel instance node\n");
  $cuda_kernel_instance_node_t *node = ($cuda_kernel_instance_node_t*)malloc(sizeof($cuda_kernel_instance_node_t));

  node->instance = NULL;
  node->next = NULL;
  return node;
}

/* cleanup and free a given $cuda_kernel_instance_node_t
 */
void $cuda_kernel_instance_node_destroy($cuda_kernel_instance_node_t *node) {
  $cuda_kernel_instance_destroy(node->instance);
  //printf("freeing kernel instance node\n");
  $free(node); 
}

/* malloc and initialize a new stream
 */
cudaStream_t $cuda_stream_create() {
  cudaStream_t s;

  //printf("mallocing cuda stream\n");
  s = (cudaStream_t)malloc(sizeof(_CUstream));
  s->mostRecent = $cuda_kernel_instance_node_create();
  dim3 arbitrary = { 1, 1, 1 };
  s->mostRecent->instance = $cuda_kernel_instance_create(arbitrary, arbitrary);
  s->mostRecent->instance->status = $cuda_kernel_status_finished;
  s->usable = $true;
  return s;
}

/* block until the most recently enqueued process on the given stream
 * has terminated (meaning all kernels in that stream have completed)
 */
void $cuda_stream_wait(cudaStream_t s) {
  $cuda_kernel_instance_t *mostRecentInstance = s->mostRecent->instance;
  $yield();
  $unidirectional_when(mostRecentInstance->status == $cuda_kernel_status_finished);
}

/* block until no more streams have kernels executing
 */
void $cuda_stream_wait_all() {
  $cuda_stream_node_t *curNode = $cuda_current_context.headNode;

  while (curNode != NULL) {
      $cuda_stream_wait(curNode->stream);
      curNode = curNode->next;
  }
}

/* cleanup and free a given stream
 */
void $cuda_stream_destroy(cudaStream_t s) {
  $cuda_kernel_instance_node_t *curNode = s->mostRecent;
  $cuda_kernel_instance_node_t *nextNode;

  while (curNode != NULL) {
    nextNode = curNode->next;
    $cuda_kernel_instance_node_destroy(curNode);
    curNode = nextNode;
  }
  //printf("freeing cuda stream\n");
  $free(s);
}

/* malloc and initialize a new $cuda_stream_node_t
 */
$cuda_stream_node_t *$cuda_stream_node_create() {
  //printf("mallocing cuda stream node\n");
  $cuda_stream_node_t *node = ($cuda_stream_node_t*)malloc(sizeof($cuda_stream_node_t));

  node->stream = NULL;
  node->next = NULL;
  return node;
}

/* cleanup and free a given $cuda_stream_node_t
 */
void $cuda_stream_node_destroy($cuda_stream_node_t *node) {
  $assert((!node->stream->usable));
  $cuda_stream_destroy(node->stream);
  //printf("freeing cuda stream node\n");
  $free(node);
}

/* destroy all stream nodes contained in the context
 */
void $cuda_stream_node_destroy_all() {
  $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
  $cuda_stream_node_t *nextNode;

  while (curNode != NULL) {
    nextNode = curNode->next;
    $cuda_stream_node_destroy(curNode);
    curNode = nextNode;
  }
  $cuda_current_context.headNode = NULL;
}

/* malloc and initialize a new event
 */
cudaEvent_t $cuda_event_create() {
  //printf("mallocing event\n");
  cudaEvent_t e = (cudaEvent_t)malloc(sizeof(_CUevent));

  e->numInstances = 0;
  e->instances = NULL;
  return e;
}

/* block until all $cuda_kernel_instance_ts contained in this event have
 * completed
 */
/*@ depends_on \access(e);
  @*/	
void $cuda_event_wait(cudaEvent_t e) {
  for (int i = 0; i < e->numInstances; i++) {
    $yield();
    $unidirectional_when(e->instances[i]->status == $cuda_kernel_status_finished);
  }
}

/* cleanup and free a given event
 */
void $cuda_event_destroy(cudaEvent_t e) {
  if (e->instances != NULL) { 
    //printf("freeing instance list a\n");
    $free(e->instances);
  }
  //printf("freeing event\n");
  $free(e);
}

/* initialize the cuda context. must be called before any cuda functions.
 */
void $cuda_init() {
  $cuda_current_context.nullStream = $cuda_stream_create();
}

/* cleanup the cuda context. must be called after all cuda functions.
 */
void $cuda_finalize() {
  $cuda_stream_wait_all();
  $cuda_stream_wait($cuda_current_context.nullStream);
  $cuda_stream_node_destroy_all();
  $cuda_stream_destroy($cuda_current_context.nullStream);
}

/* returns an array of pointers to the most recently enqueued kernel
 * of each stream.
 */
$cuda_kernel_instance_t **$cuda_all_most_recent_kernels() {
  int n = $cuda_current_context.numStreams + 1;
  $cuda_stream_node_t *curNode = $cuda_current_context.headNode;
  //printf("mallocing instance list a\n");
  $cuda_kernel_instance_t **insts = ($cuda_kernel_instance_t**)malloc(n * sizeof($cuda_kernel_instance_t*)) ;

  insts[0] = $cuda_current_context.nullStream->mostRecent->instance;
  for (int i = 1; i < n; i++, curNode = curNode->next) {
    insts[i] = curNode->stream->mostRecent->instance;
  }
  return insts;
}

/* create a kernel instance for the given function k, and enqueue it
 * onto the given stream.
 */
void $cuda_enqueue_kernel(cudaStream_t stream, void (*k)($cuda_kernel_instance_t*, cudaEvent_t), dim3 gDim, 
dim3 bDim) {
  cudaStream_t s;
  cudaEvent_t e = $cuda_event_create();
  $cuda_kernel_instance_node_t *newNode = $cuda_kernel_instance_node_create();

  if (stream == NULL) {
    e->numInstances = $cuda_current_context.numStreams + 1;
    e->instances = $cuda_all_most_recent_kernels();
    s = $cuda_current_context.nullStream;
  } else {
    e->numInstances = 2;
    //printf("mallocing instance list b\n");
    e->instances = ($cuda_kernel_instance_t**)malloc(2 * sizeof($cuda_kernel_instance_t*)) ;
    e->instances[0] = stream->mostRecent->instance;
    e->instances[1] = $cuda_current_context.nullStream->mostRecent->instance;
    s = stream;
  }
  $assert((s->usable));
  newNode->instance = $cuda_kernel_instance_create(gDim, bDim);
  newNode->next = s->mostRecent;
  s->mostRecent = newNode;
  s->mostRecent->instance->process = $spawn k(s->mostRecent->instance, e);
}

/* called by kernel processes. wait on the given event, then update
 * the status of the calling kernel to indicate it has finished waiting
 */
void $cuda_wait_in_queue ($cuda_kernel_instance_t *this, cudaEvent_t e) {
  $cuda_event_wait(e);
  $cuda_event_destroy(e);
  this->status = $cuda_kernel_status_running;
}

/* called by kernel processes. update the status of the calling kernel
 * to indicate that it has completed execution
 */
void $cuda_kernel_finish($cuda_kernel_instance_t *k) {
  k->status = $cuda_kernel_status_finished;
}

/* TODO: Finish check_data_race
 */
void $cuda_barrier($cuda_kernel_instance_t *k, int kernel_id, $barrier g) {
  $check_data_race(k, kernel_id);
  // We have to push a new read and write set before the barrier call to ignore it's reads and writes
  $read_set_push();
  $write_set_push();
  void captured_clear_mems(){
    $clear_all_mem_sets(k);
  }  
  $barrier_call_execute(g, captured_clear_mems);
  $read_set_pop();
  $write_set_pop();
}

/* TODO: Finish read_write_sets
 */
$atomic_f void $check_data_race($cuda_kernel_instance_t *k, int cur_tid) {
  //printf("Current id: %d\n", cur_tid);
  $mem out_s0 = $mem_empty();
  $mem out_s1 = $mem_empty();
  $mem cur_mw = $write_set_pop();
  $mem cur_mr = $read_set_pop();
  
  // Update current R/W sets
  k->write_sets[cur_tid] = cur_mw;
  k->read_sets[cur_tid] = cur_mr;
  
  /*
  printf("CHECKING DATA RACE %d [\n", cur_tid);
  for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
    printf("  RS %d: %s\n", tmp_tid, k->read_sets[tmp_tid]);
    printf("  WS %d: %s\n", tmp_tid, k->write_sets[tmp_tid]);
  }
  printf("]\n");
  */
  
  // Check data race
  for (int tmp_tid = 0; tmp_tid < k->size; tmp_tid++) {
    if (tmp_tid == cur_tid) continue;
    
    $mem tmp_mr = k->read_sets[tmp_tid];
    $mem tmp_mw = k->write_sets[tmp_tid];
    
    $assert($mem_no_intersect(cur_mr, tmp_mw, &out_s0, &out_s1), 
              "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
	          out_s0, cur_tid, out_s1, tmp_tid);
    $assert($mem_no_intersect(cur_mw, tmp_mr, &out_s0, &out_s1), 
              "Data-race detected: %p read by thread %d intersects %p written by thread %d\n",
	          out_s0, cur_tid, out_s1, tmp_tid);
    $assert($mem_no_intersect(cur_mw, tmp_mw, &out_s0, &out_s1), 
              "Data-race detected: %p written by thread %d intersects %p written by thread %d\n",
	          out_s0, cur_tid, out_s1, tmp_tid);
  }
  // Update current R/W sets
  //k->write_sets[cur_tid] = $mem_empty();
  //k->read_sets[cur_tid] = $mem_empty();
  $read_set_push();
  $write_set_push();
}

/* Clears read and write memory sets of the given thread
*/
void $clear_mem_sets($cuda_kernel_instance_t *k, int cur_tid) {
	k->write_sets[cur_tid] = $mem_empty();
    k->read_sets[cur_tid] = $mem_empty();
}

void $clear_all_mem_sets($cuda_kernel_instance_t *k){
    for(int i = 0; i < k->size; i++)
      $clear_mem_sets(k, i);
}

/* Publishes current read a write sets to global arrays. Local sets are not cleared
*/
void $publish($cuda_kernel_instance_t *k, int cur_tid) {
    k->write_sets[cur_tid] = $write_set_peek();
    k->read_sets[cur_tid] = $read_set_peek();
}

int is_valid_width(int width){
    for(int i = 32; i > 1; i /= 2){
        if(width == i){
            return 1;
        }
    }
    return 0;
}

int exchange_data(unsigned mask, int var, int srcLane, int tid, $comm comm, $gbarrier* warpBarriers){
    $read_set_push();
	$write_set_push();
	int laneID = tid % 32;
	int warpID = tid / 32;
	int currLaneInMask = (mask >> laneID) & 1;
	int srcLaneInMask = (mask >> srcLane) & 1;
	int dest;
	$gbarrier gbarrier = warpBarriers[warpID];
	int numActiveThreads = 0;
	for (int i = 0; i < $get_nprocs(gbarrier); i++){
		numActiveThreads += (mask >> i) & 1;
	}
	if(currLaneInMask){
		$barrier barrier = $barrier_create($here, gbarrier, laneID);
		if(srcLaneInMask){
			$message request_message = $message_pack(tid, srcLane, 0, &tid, sizeof(int));
			$comm_enqueue(comm, request_message);
		}
		$barrier_call_subset(barrier, numActiveThreads);
		while ($comm_probe(comm, $COMM_ANY_SOURCE, 0)){
		    $yield();
			$message recv_request = $comm_dequeue(comm, $COMM_ANY_SOURCE, 0);
			$message_unpack(recv_request, &dest, sizeof(int));
			$message send_message = $message_pack(tid, dest, 1, &var, sizeof(int));
			$comm_enqueue(comm, send_message); 
		}
		if(srcLaneInMask){
		    $yield();
			$message recv_message = $comm_dequeue(comm, srcLane, 1);
			$message_unpack(recv_message, &var, sizeof(int));
		}
		else{
			$havoc(&var);
		}
		$barrier_destroy(barrier);
	}
	else{
		if(laneID != srcLane)
			$havoc(&var);
	}	
	$read_set_pop();
	$write_set_pop();
	return var;
}

int _cuda__shfl_sync(unsigned mask, int var, int srcLane, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
    $assert(is_valid_width(width));
	int subWarpID = tid / width;
	srcLane = srcLane % width + subWarpID * width;
	if(srcLane >= numThreads){
	    int lastSubWarpSize = numThreads % width;
	    srcLane = srcLane % lastSubWarpSize + subWarpID * width;
	}
    return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
}
int _cuda__shfl_up_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
    $assert(is_valid_width(width));
	int subWarpLaneID = tid % width;
	int srcLane = tid;
	if (subWarpLaneID - delta >= 0){
	    srcLane = tid - delta;
	}
    return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
}
int _cuda__shfl_down_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
	$assert(is_valid_width(width));
	int subWarpLaneID = tid % width;
	int srcLane = tid;
	if (subWarpLaneID + delta < width && tid + delta < numThreads) {
	    srcLane = tid + delta;
	}
    return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
}
int _cuda__shfl_xor_sync(unsigned mask, int var, int laneMask, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers){
    $assert(is_valid_width(width));
    int laneID = tid % 32;
	int warpID = tid / 32;
	int subWarpID = tid / width; 
	int srcLane = laneID ^ laneMask;
	if(!(srcLane / 32 == warpID && srcLane / width <= subWarpID && srcLane < numThreads)){
	    srcLane = tid;
	}
    return exchange_data(mask, var, srcLane, tid, comm, warpBarriers);
}