/**
 * TODO:
 * - implement cudaMemset and cudaMemsetAsync
 * - flesh out basic structure of cuda kernel:
 *   - spawn gridDim blocks
 *   - spawn blockDim threads
 *     Alternatively, spawn blockDim warps and then spawn warp's threads
 *   - wait for blocks to finish
 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
 * - Add in data race checking support and implement atomicAdd for integers
 * - Handle dependencies at atomic blocks (replace some with local blocks if possible)
 */

// NOTE(review): the header names of the six #include directives below were
// lost, apparently during an automated reformat of this file. They must be
// restored from the original CIVL-C CUDA model source -- TODO confirm which
// headers (likely civlc.cvh, comm.cvh, and C standard headers).
#include
#include
#include
#include
#include
#include

///////////
// Types //
///////////

// CUDA error codes; this model only ever reports success.
enum cudaError { cudaSuccess };
typedef enum cudaError cudaError_t;

// Direction of a memcpy, mirroring the real CUDA runtime enum.
typedef enum cudaMemcpyKind {
  cudaMemcpyHostToHost,
  cudaMemcpyHostToDevice,
  cudaMemcpyDeviceToHost,
  cudaMemcpyDeviceToDevice,
  cudaMemcpyDefault
} cudaMemcpyKind;

// Extent of a grid or block in each of the three dimensions.
typedef struct { unsigned int x, y, z; } dim3;
/* used to represent a location in a three dimensional grid */
typedef struct { unsigned int x, y, z; } uint3;

typedef struct $cuda_op_state* $cuda_op_state_t;
// State of one enqueued CUDA operation: the $proc executing it, and a flag
// that is set when the stream allows the operation to start running.
struct $cuda_op_state {
  _Bool start;
  $proc op;
};

typedef struct $cuda_op_state_node* $cuda_op_state_node_t;
// Singly-linked FIFO node holding one operation state within a stream.
struct $cuda_op_state_node {
  $cuda_op_state_t opState;
  $cuda_op_state_node_t next;
};

typedef struct cudaStream* cudaStream_t;
typedef struct $cuda_stream_node* $cuda_stream_node_t;

// A CUDA stream: a FIFO queue of operations, a liveness flag, and a back
// pointer to this stream's node in the global stream list.
struct cudaStream {
  $cuda_op_state_node_t head;
  $cuda_op_state_node_t tail;
  int numOps;
  $cuda_stream_node_t containingNode;
  _Bool alive;
};

// Doubly-linked node of the global list of streams.
struct $cuda_stream_node {
  cudaStream_t stream;
  $cuda_stream_node_t prev;
  $cuda_stream_node_t next;
};

// Device-side registry of all live streams.
typedef struct $cuda_context {
  $cuda_stream_node_t head; //list of streams
  int numStreams;
} $cuda_context;

// Argument bundle marshalled in a (possibly async) memcpy request message.
typedef struct $cuda_memcpy_data {
  void* dst;
  const void* src;
  size_t count;
  cudaMemcpyKind kind;
} $cuda_memcpy_data;

// Argument bundle for launching kernel_1 (generated by the transformer).
typedef struct $cuda_kernel_1_data {
  dim3 gridDim;
  dim3 blockDim;
  size_t $cudaMemSize;
  cudaStream_t $cudaStream;
  const float* A;
  const float* B;
  float* C;
  int numElements;
} $cuda_kernel_1_data;
//////////////////////
// Global Variables //
//////////////////////

// Global communicator with two places: host (place 0) and device (place 1).
$gcomm $cuda_gcomm = $gcomm_create($here, 2);
const int $CUDA_PLACE_HOST = 0;
const int $CUDA_PLACE_DEVICE = 1;
// The host's endpoint of the host<->device communicator.
$comm $cuda_host_comm = $comm_create($here, $cuda_gcomm, $CUDA_PLACE_HOST);

/**
 * Tags used for message-passing between host and device
 */
enum $cuda_tag {
  // Predefined tags
  $CUDA_TAG_TEARDOWN,
  $CUDA_TAG_SCOPE_REQUEST,
  $CUDA_TAG_cudaFree,
  $CUDA_TAG_cudaMemcpy,
  $CUDA_TAG_cudaMemcpyAsync,
  // Generated tags (by transformer)
  $CUDA_TAG_LAUNCH_kernel_1
};

////////////////////////////////////////////
// CUDA API Functions (For Host-use Only) //
////////////////////////////////////////////

/*
cudaError_t cudaMalloc(void** devPtr, size_t size) {
  $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc, &size, sizeof(size_t)));
  $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaMalloc);
  $message_unpack(response, devPtr, sizeof(void*));
  return cudaSuccess;
}
*/

// Asks the device for its root scope (used by the host to $malloc memory in
// the device's scope); blocks until the device replies.
$scope $cuda_host_request_device_scope() {
  $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST, NULL, 0));
  $message response = $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_SCOPE_REQUEST);
  $scope result;
  $message_unpack(response, &result, sizeof($scope));
  return result;
}

// Sends a free request for devPtr to the device and waits for its ack.
cudaError_t cudaFree(void* devPtr) {
  $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree, &devPtr, sizeof(void*)));
  $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_cudaFree);
  return cudaSuccess;
}

// Shared implementation of cudaMemcpy / cudaMemcpyAsync: host-to-host copies
// are performed locally with memcpy; anything involving the device is
// forwarded as a message and acknowledged by the device.
void $cuda_helper_host_memcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool async) {
  if (kind == cudaMemcpyHostToHost) {
    memcpy(dst, src, count);
  } else {
    $cuda_memcpy_data args;
    args.dst = dst;
    args.src = src;
    args.count = count;
    args.kind = kind;
    // Tag selects sync vs. async handling on the device side.
    int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy;
    $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, tag, &args, sizeof($cuda_memcpy_data)));
    $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, tag);
  }
}

cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
  $cuda_helper_host_memcpy(dst, src, count, kind, false);
  return cudaSuccess;
}

// NOTE(review): stream is currently ignored -- the device side enqueues all
// async copies on the default stream. Presumably a model simplification;
// confirm against the transformer.
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) {
  $cuda_helper_host_memcpy(dst, src, count, kind, true);
  return cudaSuccess;
}

/**
 * TODO:
 * - test
 * - atomic?
 */
/*
cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
  // Create new stream node in linked list
  $cuda_stream_node_t newHead = $create_new_stream_node();
  newHead->next = $cuda_global_context.head;
  $cuda_global_context.head->prev = newHead;
  // Update cuda context's head to be the new node we created
  $cuda_global_context.head = newHead;
  $cuda_global_context.numStreams++;
  return cudaSuccess;
}
*/

/**
 * TODO:
 * - test
 * - atomic?
*/
/*
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
  stream = $default_stream_if_null(stream);
  $assert(stream->alive, "Attempt to synchronize with a destroyed stream");
  $when(stream->head == NULL) return cudaSuccess;
}
*/

// TODO: atomic
/*
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
  $assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
  $assert(stream->alive, "Attempt to destroy an already destroyed stream");
  $destroy_stream_node(stream->containingNode);
  return cudaSuccess;
}
*/

/*
cudaError_t cudaDeviceSynchronize() {
  $proc* opsToWaitOn;
  int numOps = 0;
  $atomic {
    opsToWaitOn = ($proc*) malloc(sizeof($proc) * $cuda_global_context.numStreams);
    for ($cuda_stream_node_t node = $cuda_global_context.head; node != NULL; node = node->next) {
      if (node->stream->tail != NULL) {
        opsToWaitOn[numOps] = node->stream->tail->opState->op;
        numOps++;
      }
    }
  }
  $waitall(opsToWaitOn, numOps);
  return cudaSuccess;
}
*/

// Host-side stub for launching kernel_1: packs the launch configuration and
// kernel arguments into a message, sends it to the device, and waits for the
// device to acknowledge that the launch has been enqueued.
void $cuda_host_launch_kernel_1(dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream, const float* A, const float* B, float* C, int numElements) {
  $cuda_kernel_1_data args;
  args.gridDim = gridDim;
  args.blockDim = blockDim;
  args.$cudaMemSize = $cudaMemSize;
  args.$cudaStream = $cudaStream;
  args.A = A;
  args.B = B;
  args.C = C;
  args.numElements = numElements;
  $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1, &args, sizeof($cuda_kernel_1_data)));
  $comm_dequeue($cuda_host_comm, $CUDA_PLACE_DEVICE, $CUDA_TAG_LAUNCH_kernel_1);
}

/////////////////
// CUDA "file" //
/////////////////

// Device-side main: owns the device scope, stream registry, and default
// stream, and runs the request-dispatch loop at the bottom of this function.
// All device-side helpers are nested so they share this state.
void _cuda_main() {
  //////////////////////
  // Device Variables //
  //////////////////////
  $scope $cuda_scope = $here;
  $comm $cuda_device_comm = $comm_create($cuda_scope, $cuda_gcomm, 1);
  $cuda_context $cuda_global_context;
  cudaStream_t $cuda_default_stream;

  /////////////////////////////////
  // Context & Stream Management //
  /////////////////////////////////

  // Helper function to get the default stream if passed NULL, and just returns stream otherwise
  cudaStream_t $default_stream_if_null(cudaStream_t stream) {
    return stream == NULL ? $cuda_default_stream : stream;
  }

  // Allocates a fresh (empty, alive) stream wrapped in an unlinked list node;
  // the caller is responsible for splicing the node into the global list.
  $cuda_stream_node_t $create_new_stream_node() {
    cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
    newStream->head = NULL;
    newStream->tail = NULL;
    newStream->numOps = 0;
    newStream->alive = true;
    $cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
    newHead->stream = newStream;
    newStream->containingNode = newHead;
    newHead->prev = NULL;
    newHead->next = NULL;
    return newHead;
  }

  // Unlinks node from the stream list, marks the stream dead, and spawns a
  // $proc that frees the stream once its last pending operation finishes.
  // Returns that $proc so callers can $wait on final destruction.
  // NOTE(review): if node is the list head, $cuda_global_context.head is not
  // updated here -- TODO confirm callers only destroy non-head nodes or the
  // whole context is being torn down.
  //@ depends_on \nothing;
  $atomic_f $proc $destroy_stream_node($cuda_stream_node_t node) {
    $proc lastOpProc = $proc_null;
    cudaStream_t stream = node->stream;
    if (node->prev != NULL) {
      node->prev->next = node->next;
    }
    if (node->next != NULL) {
      node->next->prev = node->prev;
    }
    free(node);
    stream->alive = false;
    if(stream->tail != NULL)
      lastOpProc = stream->tail->opState->op;
    // Deferred destructor: waits for the last enqueued op, then frees.
    void $destroy_stream_when_complete($proc lastOpProc, cudaStream_t stream) {
      $wait(lastOpProc);
      free(stream);
    }
    return $spawn $destroy_stream_when_complete(lastOpProc, stream);
  }

  /**
   * Enqueues the calling $proc as a new cuda operation onto stream. Then blocks until the cuda operation is allowed to execute.
   *
   * Reasoning behind using enqueuedFlag:
   * + Enforces in the interface more explicitly that device proc shouldn't continue until new op state is created and properly filled out (including the $proc field)
   * + Keeps symmetry since this method means the op does both enqueueing and dequeueing. The alternative technique would have device enqueuing and op dequeuing.
   * + Reduces dependencies since device proc will not be manipulating the streams nor have direct access to the newly created op state.
   * + Keeps responsibility of device proc strictly to interpreting messages, spawning appropriate ops and sending message.
   */
  //@ depends_on \nothing;
  $atomic_f $cuda_op_state_t $stream_enqueue(_Bool** enqueuedFlag, cudaStream_t stream) {
    $cuda_op_state_t newOpState = ($cuda_op_state_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state));
    newOpState->start = false;
    newOpState->op = $self;
    $cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) $malloc($cuda_scope, sizeof(struct $cuda_op_state_node));
    newOpStateNode->opState = newOpState;
    newOpStateNode->next = NULL;
    stream = $default_stream_if_null(stream);
    $assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");
    if (stream->tail == NULL) {
      // Queue was empty: this op is at the front and may start immediately.
      stream->head = newOpStateNode;
      stream->tail = newOpStateNode;
      newOpState->start = true;
    } else {
      stream->tail->next = newOpStateNode;
      stream->tail = newOpStateNode;
    }
    stream->numOps++;
    // Signal the spawner that enqueueing is complete, then drop the pointer
    // so this op never touches the spawner's flag again.
    **enqueuedFlag = true;
    *enqueuedFlag = NULL;
    return newOpState;
  }

  // Pops the front (just-finished) operation off stream and lets the next
  // queued operation (if any) start.
  //@ depends_on \nothing;
  $atomic_f void $stream_dequeue(cudaStream_t stream) {
    stream = $default_stream_if_null(stream);
    $assert(stream->head != NULL, "Attempt to dequeue an empty stream");
    if (stream->head == stream->tail) {
      stream->tail = NULL;
    }
    $cuda_op_state_node_t oldHead = stream->head;
    stream->head = oldHead->next;
    if (stream->head != NULL) {
      stream->head->opState->start = true;
    }
    stream->numOps--;
    free(oldHead->opState);
    free(oldHead);
  }

  ///////////////////////////////
  // CUDA Function Definitions //
  ///////////////////////////////

  /**
   * Only called at start of program
   */
  void $cuda_setup() {
    $cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
    $cuda_default_stream = defaultStreamNode->stream;
    $cuda_global_context.head = defaultStreamNode;
    $cuda_global_context.numStreams = 1;
  }

  /**
   * Only called at end of program
   */
  void $cuda_teardown() {
    // Destroy the default stream and wait for its deferred destructor.
    $proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
    $wait(destructor);
    $comm_destroy($cuda_device_comm);
  }

  // Handles a cudaFree request from the host: unpacks the (hidden) device
  // pointer and frees it.
  $message $cuda_free($message request) {
    void* devPtr;
    $message_unpack(request, &devPtr, sizeof(void*));
free($reveal(devPtr)); //free(devPtr); return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_cudaFree, NULL, 0); } void $cuda_memcpy_proc(void* dst, const void* src, size_t count, cudaMemcpyKind kind, _Bool* enqueuedFlag, cudaStream_t stream) { $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, stream); $when(opState->start); if (kind == cudaMemcpyHostToDevice || cudaMemcpyDeviceToDevice) { dst = $reveal(dst); } if (kind == cudaMemcpyDeviceToHost || cudaMemcpyDeviceToDevice) { src = $reveal(src); } memcpy(dst, src, count); $stream_dequeue(stream); } $message $cuda_memcpy($message request, _Bool async) { $cuda_memcpy_data args; $message_unpack(request, &args, sizeof($cuda_memcpy_data)); _Bool enqueuedFlag = false; $proc memcpyProc = $spawn $cuda_memcpy_proc(args.dst, args.src, args.count, args.kind, &enqueuedFlag, $cuda_default_stream); $when(enqueuedFlag); if (!async && args.kind != cudaMemcpyDeviceToDevice) { $wait(memcpyProc); } int tag = async ? $CUDA_TAG_cudaMemcpyAsync : $CUDA_TAG_cudaMemcpy; return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, tag, NULL, 0); } //////////////////////// // Kernel Definitions // //////////////////////// // Helper function int $dim3_index(dim3 size, uint3 location) { return location.x + size.x * (location.y + size.y * location.z); } // Helper function int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx) { return $dim3_index(gDim, bIdx) * (bDim.x * bDim.y * bDim.z) + $dim3_index(bDim, tIdx); } void $cuda_run_and_wait_on_procs(dim3 dim, void spawningFunction(uint3)) { //TODO: calculate length and index, replace this function in the kernel $local_start(); int length = dim.x * dim.y * dim.z; $proc proc_array[length]; $range rx = 0 .. dim.x - 1; $range ry = 0 .. dim.y - 1; $range rz = 0 .. 
dim.z - 1;
    $domain(3) dom = ($domain(3)){rx, ry, rz};
    $for(int x,y,z : dom){
      uint3 id = { x, y, z };
      int index = $dim3_index(dim, id);
      proc_array[index] = $spawn spawningFunction(id);
    }
    $local_end();
    $waitall(proc_array,length);
  }

  // Generated from kernel_1 definition
  // Models the kernel launch: one $proc per block, and within each block one
  // $proc per thread, with a per-block $gbarrier backing __syncthreads().
  void $cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size, const float *A, const float *B, float *C, int numElements) {
    void _cuda_block(uint3 blockIdx) {
      // NOTE(review): numThreads and _block_root are currently unused here
      // (the barrier recomputes the thread count) -- presumably transformer
      // scaffolding; confirm before removing.
      int numThreads = (blockDim.x * blockDim.y) * blockDim.z;
      $scope _block_root = $here;
      // Block-wide barrier sized to the number of threads in this block.
      $gbarrier _cuda_block_barrier = $gbarrier_create($here, blockDim.x * blockDim.y * blockDim.z);
      void _cuda_thread(uint3 threadIdx) {
        int _cuda_tid = $dim3_index(blockDim, threadIdx);
        int _cuda_kid = $cuda_kernel_index(gridDim, blockDim, blockIdx, threadIdx);
        // This thread's handle on the block barrier.
        $barrier _cuda_thread_barrier = $barrier_create($here, _cuda_block_barrier, _cuda_tid);
        $local_start();
        // Kernel definition start
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements) {
          C[i] = A[i] + B[i];
        }
        // Kernel definition end
        $local_end();
        $barrier_destroy(_cuda_thread_barrier);
      }
      $cuda_run_and_wait_on_procs(blockDim, _cuda_thread);
      $gbarrier_destroy(_cuda_block_barrier);
    }
    $cuda_run_and_wait_on_procs(gridDim, _cuda_block);
  }

  // Stream-operation wrapper around $cuda_kernel_1: enqueues on $cudaStream,
  // waits for its turn, runs the kernel, then dequeues.
  void $cuda_kernel_1_proc (_Bool* enqueuedFlag, dim3 gridDim, dim3 blockDim, size_t $cudaMemSize, cudaStream_t $cudaStream, const float *A, const float *B, float *C, int numElements) {
    $cuda_op_state_t opState = $stream_enqueue(&enqueuedFlag, $cudaStream);
    $when(opState->start);
    $cuda_kernel_1(gridDim, blockDim, $cudaMemSize, A, B, C, numElements);
    $stream_dequeue($cudaStream);
  }

  // Handles a kernel_1 launch request: unpacks the arguments, reveals the
  // device pointers, spawns the kernel as a stream operation, and replies to
  // the host as soon as the operation is enqueued (launch is asynchronous).
  $message $cuda_device_launch_kernel_1($message request) {
    $cuda_kernel_1_data args;
    $message_unpack(request, &args, sizeof($cuda_kernel_1_data));
    _Bool enqueuedFlag = false;
    $spawn $cuda_kernel_1_proc(&enqueuedFlag, args.gridDim, args.blockDim, args.$cudaMemSize, args.$cudaStream, $reveal(args.A), $reveal(args.B), $reveal(args.C), args.numElements);
    $when(enqueuedFlag);
    return $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_LAUNCH_kernel_1, NULL, 0);
  }

  /////////////////
  // Device main //
  /////////////////
  $cuda_setup();
  // Request-dispatch loop: receive one host message at a time, handle it,
  // reply. Exits (returning from _cuda_main) on the TEARDOWN tag.
  while (true) {
    $message request = $comm_dequeue($cuda_device_comm, $CUDA_PLACE_HOST, $COMM_ANY_TAG);
    $message response;
    const int tag = $message_tag(request);
    switch(tag) {
      case $CUDA_TAG_SCOPE_REQUEST :
        response = $message_pack($CUDA_PLACE_DEVICE, $CUDA_PLACE_HOST, $CUDA_TAG_SCOPE_REQUEST, &$cuda_scope, sizeof($scope));
        break;
      case $CUDA_TAG_cudaFree :
        response = $cuda_free(request);
        break;
      case $CUDA_TAG_cudaMemcpy :
        response = $cuda_memcpy(request, false);
        break;
      case $CUDA_TAG_cudaMemcpyAsync :
        response = $cuda_memcpy(request, true);
        break;
      case $CUDA_TAG_LAUNCH_kernel_1 :
        response = $cuda_device_launch_kernel_1(request);
        break;
      case $CUDA_TAG_TEARDOWN :
        $cuda_teardown();
        return;
      default :
        $assert(false, "Unknown CUDA request");
    }
    $comm_enqueue($cuda_device_comm, response);
  }
}

///////////////
// Host file //
///////////////

// Symbolic inputs: vector length and the two input vectors.
$input int N;
$assume (N > 0);
$input float A[N];
$input float B[N];

// Host-side main: allocates device buffers, copies inputs, launches the
// vector-add kernel, checks the result, and tears down the device.
void _host_main() {
  int size = N * sizeof(float);
  int numBlocks = 2;
  // ceil(N/2) threads per block so 2 blocks cover all N elements.
  int numThreads = N%2 == 0?
N/2 : (N+1)/2;

  // "cudaMalloc" is modeled by requesting the device scope and allocating
  // there; the pointer is $hide-den so the host cannot dereference it.
  float* cuda_A;
  // cudaMalloc((void **)&cuda_A, size);
  {
    $scope deviceScope = $cuda_host_request_device_scope();
    cuda_A = $hide((float*)$malloc(deviceScope, size));
    //cuda_A = (float*)$malloc(deviceScope, size);
  }
  cudaMemcpy(cuda_A, A, size, cudaMemcpyHostToDevice);

  float* cuda_B;
  // cudaMalloc((void **)&cuda_B, size);
  {
    $scope deviceScope = $cuda_host_request_device_scope();
    cuda_B = $hide((float*)$malloc(deviceScope, size));
    //cuda_B = (float*)$malloc(deviceScope, size);
  }
  cudaMemcpy(cuda_B, B, size, cudaMemcpyHostToDevice);

  float* cuda_C;
  // cudaMalloc((void **)&cuda_C, size);
  {
    $scope deviceScope = $cuda_host_request_device_scope();
    cuda_C = $hide((float*)$malloc(deviceScope, size));
    //cuda_C = (float*)$malloc(deviceScope, size);
  }

  dim3 gridDim = {numBlocks, 1, 1};
  dim3 blockDim = {numThreads, 1, 1};
  // kernel_1<<>>(cuda_A, cuda_B, cuda_C, N);
  $cuda_host_launch_kernel_1(gridDim, blockDim, 0, NULL, cuda_A, cuda_B, cuda_C, N);

  //Checking correctness
  float* C = (float *)malloc(size);
  // Synchronous device-to-host copy also orders after the kernel launch on
  // the default stream.
  cudaMemcpy(C, cuda_C, size, cudaMemcpyDeviceToHost);
  for(int i = 0; i < N; i++)
    $assert(C[i] == A[i] + B[i]);
  free(C);

  cudaFree(cuda_A);
  cudaFree(cuda_B);
  cudaFree(cuda_C);

  // inserted by transformer
  // Tell the device loop to shut down, then destroy the host endpoint.
  $comm_enqueue($cuda_host_comm, $message_pack($CUDA_PLACE_HOST, $CUDA_PLACE_DEVICE, $CUDA_TAG_TEARDOWN, NULL, 0));
  $comm_destroy($cuda_host_comm);
}

// Spawns the host and device processes, waits for both, then destroys the
// global communicator.
int main() {
  $proc host = $spawn _host_main();
  $proc cuda = $spawn _cuda_main();
  $wait(host);
  $wait(cuda);
  $gcomm_destroy($cuda_gcomm, NULL);
}