/* This header file contains helper functions for creating, manipulating
 * and destroying the CIVL models of various CUDA objects (grids, kernel
 * instances, streams, events and the global context).
 */
#ifdef __CUDA_HELPER__
#else
#define __CUDA_HELPER__

#include "civlc.cvh"
#include "cuda-types.cvh"
#include <string.h>
#include <stdio.h>

/* Flattens a three dimensional grid coordinate into a one dimensional
 * index. `size` is the extent of the grid along each axis and
 * `location` is the cell being addressed; x varies fastest, then y,
 * then z.
 */
int _index (dim3 size, uint3 location) {
  // (z * size.y + y) counts whole rows before the cell; scaling by
  // size.x and adding x selects the cell within its row
  return (location.z * size.y + location.y) * size.x + location.x;
}

/* Lifts a single integer x into a three dimensional vector representing
 * a one dimensional grid of length x
 */
dim3 _toDim3(int x) {
  dim3 d = { x, 1, 1 };

  return d;
}

/* Given a three dimensional vector representing a grid of size dim,
 * spawn one process for each cell in the grid and return only after
 * every one of those processes has been waited on.
 *
 * dim              - extent of the grid along x, y and z
 * spawningFunction - function run by each spawned process; it receives
 *                    the (x, y, z) location of its cell
 */
void _runProcs(dim3 dim, void spawningFunction(uint3)) {
// Comment toggle: the `$parfor` variant between the two `/*/` markers
// below is currently commented out and the `$spawn`/`$wait` variant
// after it is live. Changing the first marker to `//*/` swaps which of
// the two variants is compiled.
// NOTE(review): the disabled variant uses ranges `0 .. dim.x`; if CIVL
// ranges include their upper bound this would visit one cell too many
// along each axis -- confirm before re-enabling it.
/*/
	$range rx = 0 .. dim.x;
	$range ry = 0 .. dim.y;
	$range rz = 0 .. dim.z;
	$domain(3) dom = ($domain){rx, ry, rz};
	$parfor(int x,y,z : dom){
		uint3 id = { x, y, z };
		spawningFunction(id);
	}
/*/
  // one process handle per grid cell, indexed by the cell's location
  $proc procs[dim.x][dim.y][dim.z];
  // first pass: spawn every process without waiting, so that all of them
  // exist before any is waited on
  for (int x = 0; x < dim.x; x++) {
		for (int y = 0; y < dim.y; y++) {
			for (int z = 0; z < dim.z; z++) {
				uint3 id = { x, y, z };
				procs[x][y][z] = $spawn spawningFunction(id);
			}

		}
  }
  // second pass: wait on each spawned process in turn
  for (int x = 0; x < dim.x; x++) {
		for (int y = 0; y < dim.y; y++) {
			for (int z = 0; z < dim.z; z++) {
				$wait(procs[x][y][z]);
			}
		}
  }
//*/
}

// ------------------------------------------------

/* $wait on the given process if it is non-null; do nothing when p is
 * $proc_null.
 */
void _tryWait($proc p) {
  if (p == $proc_null)
    return;
  $wait(p);
}

/* The current state of the GPU: the single global context shared by the
 * helpers in this file. It starts out empty; _cudaInit() installs the
 * null stream and _cudaFinalize() tears the context down again.
 */
_cudaContext _context = { 
  // head of the linked list of stream nodes; NULL while the list is empty
  .headNode = NULL, 
  // the null stream; created by _cudaInit()
  .nullStream = NULL,
  // stream count; _allMostRecentKernels() walks this many nodes starting
  // at headNode
  .numStreams = 0
};

/* Allocate a new _kernelInstance in the $root scope and initialize it:
 * no process is attached yet ($proc_null) and the status is
 * _kernelStatusWaiting. Returns the new instance.
 */
_kernelInstance *_kernelInstanceCreate() {
  _kernelInstance *fresh;

  //printf("mallocing kernel instance\n");
  fresh = (_kernelInstance*)$malloc($root, sizeof(_kernelInstance));
  fresh->status = _kernelStatusWaiting;
  fresh->process = $proc_null;
  return fresh;
}

/* Release a _kernelInstance. If a process is attached to the instance
 * it is waited on first, then the instance itself is freed.
 */
void _kernelInstanceDestroy(_kernelInstance *i) {
  $proc attached = i->process;

  // no-op when the instance never had a process attached
  _tryWait(attached);
  //printf("freeing kernel instance\n");
  $free(i);
}

/* Allocate a new _kernelInstanceNode in the $root scope. The node is
 * returned unlinked: it carries no instance and has no successor.
 */
_kernelInstanceNode *_kernelInstanceNodeCreate() {
  _kernelInstanceNode *fresh;

  //printf("mallocing kernel instance node\n");
  fresh = (_kernelInstanceNode*)$malloc($root, sizeof(_kernelInstanceNode));
  fresh->next = NULL;
  fresh->instance = NULL;
  return fresh;
}

/* Release a _kernelInstanceNode together with the kernel instance it
 * carries. The node's successor is not touched.
 */
void _kernelInstanceNodeDestroy(_kernelInstanceNode *node) {
  _kernelInstance *payload = node->instance;

  // the instance goes first, then the node that referenced it
  _kernelInstanceDestroy(payload);
  //printf("freeing kernel instance node\n");
  $free(node);
}

/* malloc and initialize a new stream
 */
cudaStream_t _streamCreate() {
  cudaStream_t s;

  //printf("mallocing cuda stream\n");
  s = (cudaStream_t)$malloc($root, sizeof(_CUstream));
  s->mostRecent = _kernelInstanceNodeCreate();
  s->mostRecent->instance = _kernelInstanceCreate();
  s->mostRecent->instance->status = _kernelStatusFinished;
  s->usable = $true;
  return s;
}

/* Block until the kernel instance that is the most recent one on
 * stream s at the time of the call reports _kernelStatusFinished.
 */
void _streamWait(cudaStream_t s) {
  // snapshot the instance once: the guard below keeps watching this
  // instance even if s->mostRecent is replaced while we are blocked
  _kernelInstance *latest = s->mostRecent->instance;

  $when (_kernelStatusFinished == latest->status) ;
}

/* Call _streamWait() on the stream of every node in the context's
 * stream list, in list order. The null stream is not part of that list
 * and is not waited on here.
 */
void _streamWaitAll() {
  for (_cudaStreamNode *n = _context.headNode; n != NULL; n = n->next) {
    _streamWait(n->stream);
  }
}

/* Release a stream: destroy every kernel instance node reachable from
 * s->mostRecent, then free the stream object itself.
 */
void _streamDestroy(cudaStream_t s) {
  _kernelInstanceNode *node;
  _kernelInstanceNode *after;

  for (node = s->mostRecent; node != NULL; node = after) {
    // remember the successor before the node is freed
    after = node->next;
    _kernelInstanceNodeDestroy(node);
  }
  //printf("freeing cuda stream\n");
  $free(s);
}

/* Allocate a new _cudaStreamNode in the $root scope. The node is
 * returned unlinked: it refers to no stream and has no successor.
 */
_cudaStreamNode *_streamNodeCreate() {
  _cudaStreamNode *fresh;

  //printf("mallocing cuda stream node\n");
  fresh = (_cudaStreamNode*)$malloc($root, sizeof(_cudaStreamNode));
  fresh->next = NULL;
  fresh->stream = NULL;
  return fresh;
}

/* Release a _cudaStreamNode together with the stream it refers to.
 * Asserts that the stream is no longer marked usable before anything
 * is freed.
 */
void _streamNodeDestroy(_cudaStreamNode *node) {
  cudaStream_t owned = node->stream;

  $assert(!owned->usable);
  _streamDestroy(owned);
  //printf("freeing cuda stream node\n");
  $free(node);
}

/* Destroy all stream nodes contained in the context and leave the
 * context with an empty stream list.
 *
 * Every node is released with _streamNodeDestroy(), which also releases
 * the node's stream. Afterwards both the list head and the stream count
 * are reset, so the context again describes zero streams.
 */
void _streamNodeDestroyAll() {
  _cudaStreamNode *curNode = _context.headNode;
  _cudaStreamNode *nextNode;

  while (curNode != NULL) {
    // read the successor first: curNode is freed by the call below
    nextNode = curNode->next;
    _streamNodeDestroy(curNode);
    curNode = nextNode;
  }
  _context.headNode = NULL;
  // keep the count in step with the (now empty) list:
  // _allMostRecentKernels() walks numStreams nodes starting at headNode
  // and would dereference NULL if the count were left stale
  _context.numStreams = 0;
}

/* malloc and initialize a new event
 */
cudaEvent_t _eventCreate() {
  //printf("mallocing event\n");
  cudaEvent_t e = (cudaEvent_t)$malloc($root, sizeof(_CUevent));

  e->numInstances = 0;
  e->instances = NULL;
  return e;
}

/* Block until every _kernelInstance recorded in event e has reached
 * _kernelStatusFinished. The instances are waited on one after another,
 * in list order.
 */
void _eventWait(cudaEvent_t e) {
  int idx = 0;

  while (idx < e->numInstances) {
    $when (e->instances[idx]->status == _kernelStatusFinished) ;
    idx++;
  }
}

/* Release an event: free its instance list when it has one, then free
 * the event itself. The kernel instances the list points to are not
 * freed here.
 */
void _eventDestroy(cudaEvent_t e) {
  //printf("freeing instance list a\n");
  if (e->instances != NULL)
    $free(e->instances);

  //printf("freeing event\n");
  $free(e);
}

/* initialize the cuda context. must be called before any cuda functions.
 */
void _cudaInit() {
  _context.nullStream = _streamCreate();
}

/* cleanup the cuda context. must be called after all cuda functions.
 */
void _cudaFinalize() {
  _streamWaitAll();
  _streamWait(_context.nullStream);
  _streamNodeDestroyAll();
  _streamDestroy(_context.nullStream);
}

/* Builds a freshly allocated array holding the most recently enqueued
 * kernel instance of every stream. Slot 0 belongs to the null stream;
 * slots 1 .. numStreams follow the context's stream list in list order.
 * The array has numStreams + 1 entries and is owned by the caller.
 */
_kernelInstance **_allMostRecentKernels() {
  int count = _context.numStreams + 1;
  _cudaStreamNode *walker = _context.headNode;
  //printf("mallocing instance list a\n");
  _kernelInstance **latest = (_kernelInstance**)$malloc($root, count * sizeof(_kernelInstance*)) ;
  int slot = 1;

  latest[0] = _context.nullStream->mostRecent->instance;
  while (slot < count) {
    latest[slot] = walker->stream->mostRecent->instance;
    walker = walker->next;
    slot++;
  }
  return latest;
}

/* Create a kernel instance for the given function k, and enqueue it
 * onto the given stream.
 *
 * stream - target stream; NULL selects the context's null stream
 * k      - kernel body; it is spawned as a new process and receives its
 *          own _kernelInstance plus an event listing the instances it
 *          has to wait for
 *
 * The event handed to k depends on the target:
 *  - null stream: the most recent instance of every stream (including
 *    the null stream itself);
 *  - any other stream: the most recent instance of that stream and the
 *    most recent instance of the null stream.
 *
 * The target stream must be usable; this is asserted before anything is
 * allocated or linked.
 */
void _enqueueKernel(cudaStream_t stream, void (*k)(_kernelInstance*, cudaEvent_t)) {
  cudaStream_t s;
  cudaEvent_t e;
  _kernelInstanceNode *newNode;

  // resolve the target first so the precondition can be checked up front
  if (stream == NULL) {
    s = _context.nullStream;
  } else {
    s = stream;
  }
  // check before allocating, so a failing assertion leaves nothing behind
  $assert(s->usable);

  e = _eventCreate();
  newNode = _kernelInstanceNodeCreate();
  if (stream == NULL) {
    e->numInstances = _context.numStreams + 1;
    e->instances = _allMostRecentKernels();
  } else {
    e->numInstances = 2;
    //printf("mallocing instance list b\n");
    e->instances = (_kernelInstance**)$malloc($root, 2 * sizeof(_kernelInstance*)) ;
    e->instances[0] = s->mostRecent->instance;
    e->instances[1] = _context.nullStream->mostRecent->instance;
  }

  // push the new instance onto the front of the stream's queue; the
  // dependencies above were captured before this update
  newNode->instance = _kernelInstanceCreate();
  newNode->next = s->mostRecent;
  s->mostRecent = newNode;
  newNode->instance->process = $spawn k(newNode->instance, e);
}

/* Called by kernel processes before they start their real work. Blocks
 * until every instance listed in event e has finished, releases the
 * event, and then marks the calling kernel instance as
 * _kernelStatusRunning.
 */
void _waitInQueue (_kernelInstance *this, cudaEvent_t e) {
  _eventWait(e);
  // the event is only needed for this one wait; free it here
  _eventDestroy(e);

  this->status = _kernelStatusRunning;
}

/* Called by kernel processes as their last step. Sets the status of
 * kernel instance k to _kernelStatusFinished, which is the condition
 * that _streamWait() and _eventWait() block on.
 */
void _kernelFinish(_kernelInstance *k) {
  k->status = _kernelStatusFinished;
}

#endif