/**
 * TODO:
 * - implement cudaMemset and cudaMemsetAsync
 * - flesh out basic structure of cuda kernel:
 *   - spawn gridDim blocks
 *   - spawn blockDim threads
 *     Alternatively, spawn blockDim warps and then spawn warp's threads
 *   - wait for blocks to finish
 * - Add in block-level barriers and a __syncthreads() nested function that uses the barrier.
 * - Add in data race checking support and implement atomicAdd for integers
 */

enum cudaError {
  cudaSuccess
};
typedef enum cudaError cudaError_t;

typedef enum cudaMemcpyKind {
	cudaMemcpyHostToHost,
	cudaMemcpyHostToDevice,
	cudaMemcpyDeviceToHost,
	cudaMemcpyDeviceToDevice,
	cudaMemcpyDefault
} cudaMemcpyKind;

typedef struct $cuda_op_state {
	_Bool start;
	$proc op;
}* $cuda_op_state_t;

typdef struct $cuda_op_state_node {
	$cuda_op_state_t opState;
	$cuda_op_state_node_t next;
}* $cuda_op_state_node_t;

typedef struct cudaStream {
	$cuda_op_state_node_t head;
	$cuda_op_state_node_t tail;
	int numOps;
	
	$cuda_stream_node_t containingNode;
	_Bool alive;
}* cudaStream_t;
cudaStream_t $cuda_default_stream;

typedef struct $cuda_stream_node {
	$cuda_stream_t stream;
	$cuda_stream_node_t prev;
	$cuda_stream_node_t next;
}* $cuda_stream_node_t;

typedef struct $cuda_context {
	$cuda_stream_node_t head;
	int numStreams;
} $cuda_context;
$cuda_context $cuda_global_context;

void $cuda_op_wait_start($cuda_op_state_t cudaOpState) {
	$when(cudaOpState->start) cudaOpState->op = $this;
}

// Helper function to get the default stream if passed NULL, and just returns stream otherwise
cudaStream_t $default_stream_if_null(cudaStream_t stream) {
	return stream == NULL ? $cuda_default_stream : stream;
}

$cuda_stream_node_t $create_new_stream_node() {
	cudaStream_t newStream = (cudaStream_t) malloc(sizeof(struct cudaStream));
	newStream->head = NULL;
	newStream->tail = NULL;
	newStream->numOps = 0;
	newStream->alive = true;
	
	$cuda_stream_node_t newHead = ($cuda_stream_node_t) malloc(sizeof(struct $cuda_stream_node));
	newHead->stream = newStream;
	newStream->containingNode = newHead;
	newHead->prev = NULL;
	newHead->next = NULL;
}

// TODO: atomic
cudaError_t cudaStreamCreate(cudaStream_t * pStream) {
	// Create new stream node in linked list
	$cuda_stream_node_t newHead = $create_new_stream_node();
	newHead->next = $cuda_global_context.head;
	$cuda_global_context.head->prev = newHead;
	
	// Update cuda context's head to be the new node we created
	$cuda_global_context.head = newHead;
	$cuda_global_context.numStreams++;
	
	return cudaSuccess
}

cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
	stream = $default_stream_if_null(stream);
	$assert(stream->alive, "Attempt to synchronize with a destroyed stream");
	$when(stream->head == NULL) return cudaSuccess;
}

$proc $destroy_stream_node($cuda_stream_node_t node) {
	if (node->prev != NULL) {
		node->prev->next = node->next;
	}
	if (node->next != NULL) {
		node->next->prev = node->prev;
	}
	free(node);
	node->stream->alive = false;
	
	void $destroy_stream_when_complete(cudaStream_t stream) {
		$when(stream->head==NULL) free(stream);
	}
	return $spawn $destroy_when_complete(node->stream);
}

// TODO: atomic
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
	$assert(stream != NULL && stream != $cuda_default_stream, "Attempt to destroy default stream");
	$assert(stream->alive, "Attempt to destroy an already destroyed stream);
	$destroy_stream_node(stream->containingNode);
	return cudaSuccess;
}

// TODO: atomic
$cuda_op_state_t $stream_enqueue(cudaStream_t stream) {
	stream = $default_stream_if_null(stream);
	$assert(stream->alive, "Attempt to enqueue a CUDA operation onto a destroyed stream");

	$cuda_op_state_t newOpState = ($cuda_op_state_t) malloc(sizeof(struct $cuda_op_state));
	newOpState->start = false;
	newOpState->op = NULL;

	$cuda_op_state_node_t newOpStateNode = ($cuda_op_state_node_t) malloc(sizeof($cuda_op_state_node));
	newOpStateNode->opState = newOpState;
	newOpStateNode->next = NULL;
	
	if (stream->tail == NULL) {
		stream->head = newOpStateNode;
		stream->tail = newOpStateNode;
		newOpState->start = true;
	} else {
		stream->tail->next = newOpStateNode;
		stream->tail = newOpStateNode;
	}
	stream->numOps++;
	return newOpState;
}

// TODO: atomic
void $stream_dequeue(cudaStream_t stream) {
	stream = $default_stream_if_null(stream);
	$assert(stream->head != NULL, "Attempt to dequeue an empty stream");
	
	if (stream->head == stream->tail) {
		stream->tail = NULL;
	}
	
	$cuda_op_state_node_t oldHead = stream->head;
	stream->head = oldHead->next;
	if (stream->head != NULL) {
		stream->head->op->start = true;
	}
	
	free(oldHead->opState);
	stream->numOps--;
}

void $cuda_memcpy_proc(void* dst, const void* src, size_t count,
                       $cuda_op_state_t opState, cudaStream_t stream) {
	$cuda_op_wait_start(opState);
	memcpy(dst, src, count);
	$stream_dequeue(stream);
}

cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind) {
	if (kind == cudaHostToHost) {
		memcpy(dst, src, count);
	} else {
		$cuda_op_state_t opState = $stream_enqueue($cuda_default_stream);
		$proc memcpyProc = $spawn $cuda_memcpy_proc(dst, src, count, opState, $cuda_default_stream);
		if (kind != cudaMemcpyDeviceToDevice) {
			$wait(memcpyProc);
		}
	}
	
	return cudaSuccess;
}

cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
                            cudaMemcpyKind kind, cudaStream_t stream) {
	if (kind == cudaMemcpyHostToHost) {
		memcpy(dst, src, count);
	} else {
		$cuda_op_state_t opState = $stream_enqueue(stream);
		$spawn $cuda_memcpy_proc(dst, src, count, opState, stream);
	}
	
	return cudaSuccess;
}

cudaError_t cudaDeviceSynchronize() {
	$proc opsToWaitOn[] = ($proc*) malloc(sizeof($proc) *
	                                               $cuda_global_context.numStreams);
	int numOps = 0;
	
	$atomic {
		for ($cuda_stream_node_t node = $cuda_global_context.head;
	         node != NULL;
	         node = node->next) {
			if (node->stream->tail != NULL) {
				opsToWaitOn[numOps] = node->stream->tail->opState->op;
				numOps++;
			}
		}
	}
  $waitall(opsToWaitOn, numOps);
	
	return cudaSuccess;
}

void $cuda_setup() {
	$cuda_stream_node_t defaultStreamNode = $create_new_stream_node();
	$cuda_default_stream = defaultStreamNode->stream;

	$cuda_global_context.head = defaultStreamNode;
	$cuda_global_context.count = 1;
}

void $cuda_teardown() {
	$proc destructor = $destroy_stream_node($cuda_default_stream->containingNode);
	$wait(destructor);
}

void _cuda_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cuda_mem_size,
                    int x) {
	
}
void $proc_kernel_1(dim3 gridDim, dim3 blockDim, size_t _cudaMemSize,
              cudaStream_t _cudaStream, $cuda_op_state_t _cudaOpState,
              int x) {
	$cuda_op_wait_start(_cudaOpState);
	_cuda_kernel_1(gridDim, blockDim, _cudaMemSize, x);
	$stream_dequeue(_cudaStream);
}

void _civl_main() {
	{
		$cuda_op_state_t _newOpState = $stream_enqueue(stream);
		$spawn $proc_kernel_1(gridDim, blockDim, 0, stream, _newOpState, x);
	}
}

int main() {
	$cuda_setup();
	_civl_main();
	$cuda_teardown();
}