/* This header file contains useful helper functions for manipulating
 * the CIVL versions of various Cuda objects.
 */

#ifndef _CUDA_HELPER_
#define _CUDA_HELPER_

#include <seq.cvh>
#include <mem.cvh>
#include <concurrency.cvh>
#include <comm.cvh>

/* Extent of a three dimensional grid: one unsigned component for each
 * of the x, y and z axes.
 */
typedef struct {
  unsigned int x;
  unsigned int y;
  unsigned int z;
} dim3;

/* Coordinates of a single cell inside a three dimensional grid: one
 * unsigned component for each of the x, y and z axes.
 */
typedef struct {
  unsigned int x;
  unsigned int y;
  unsigned int z;
} uint3;

/* Flag passed to cudaMemcpy to select the kind of memory transfer to
 * perform. The explicit values below only spell out the consecutive
 * numbering, starting at zero, that the enumerator order already implies.
 */
enum cudaMemcpyKind {
  cudaMemcpyHostToHost     = 0,
  cudaMemcpyHostToDevice   = 1,
  cudaMemcpyDeviceToHost   = 2,
  cudaMemcpyDeviceToDevice = 3,
  cudaMemcpyDefault        = 4
};

/* Status codes of the modeled Cuda API: the type returned by all Cuda
 * functions.
 *
 * No enumerator carries an explicit value, so they are numbered
 * consecutively in declaration order, starting with cudaSuccess == 0.
 * Inserting, removing or reordering an entry therefore renumbers every
 * enumerator that follows it.
 *
 * NOTE(review): the resulting integers are not guaranteed to equal the
 * values used by NVIDIA's own headers -- confirm before relying on a raw
 * numeric error code.
 */
enum cudaError {
  cudaSuccess,
  cudaErrorMissingConfiguration,
  cudaErrorMemoryAllocation,
  cudaErrorInitializationError,
  cudaErrorLaunchFailure,
  cudaErrorPriorLaunchFailure,
  cudaErrorLaunchTimeout,
  cudaErrorLaunchOutOfResources,
  cudaErrorInvalidDeviceFunction,
  cudaErrorInvalidConfiguration,
  cudaErrorInvalidDevice,
  cudaErrorInvalidValue,
  cudaErrorInvalidPitchValue,
  cudaErrorInvalidSymbol,
  cudaErrorMapBufferObjectFailed,
  cudaErrorUnmapBufferObjectFailed,
  cudaErrorInvalidHostPointer,
  cudaErrorInvalidDevicePointer,
  cudaErrorInvalidTexture,
  cudaErrorInvalidTextureBinding,
  cudaErrorInvalidChannelDescriptor,
  cudaErrorInvalidMemcpyDirection,
  cudaErrorAddressOfConstant,
  cudaErrorTextureFetchFailed,
  cudaErrorTextureNotBound,
  cudaErrorSynchronizationError,
  cudaErrorInvalidFilterSetting,
  cudaErrorInvalidNormSetting,
  cudaErrorMixedDeviceExecution,
  cudaErrorCudartUnloading,
  cudaErrorUnknown,
  cudaErrorNotYetImplemented,
  cudaErrorMemoryValueTooLarge,
  cudaErrorInvalidResourceHandle,
  cudaErrorNotReady,
  cudaErrorInsufficientDriver,
  cudaErrorSetOnActiveProcess,
  cudaErrorInvalidSurface,
  cudaErrorNoDevice,
  cudaErrorECCUncorrectable,
  cudaErrorSharedObjectSymbolNotFound,
  cudaErrorSharedObjectInitFailed,
  cudaErrorUnsupportedLimit,
  cudaErrorDuplicateVariableName,
  cudaErrorDuplicateTextureName,
  cudaErrorDuplicateSurfaceName,
  cudaErrorDevicesUnavailable,
  cudaErrorInvalidKernelImage,
  cudaErrorNoKernelImageForDevice,
  cudaErrorIncompatibleDriverContext,
  cudaErrorPeerAccessAlreadyEnabled,
  cudaErrorPeerAccessNotEnabled,
  cudaErrorDeviceAlreadyInUse,
  cudaErrorProfilerDisabled,
  cudaErrorProfilerNotInitialized,
  cudaErrorProfilerAlreadyStarted,
  cudaErrorProfilerAlreadyStopped,
  cudaErrorAssert,
  cudaErrorTooManyPeers,
  cudaErrorHostMemoryAlreadyRegistered,
  cudaErrorHostMemoryNotRegistered,
  cudaErrorOperatingSystem,
  cudaErrorStartupFailure,
  cudaErrorApiFailureBase
};
/* short alias so the type can be written without the enum keyword */
typedef enum cudaError cudaError_t;

/* Plain data record describing the properties of a Cuda device.
 *
 * The struct tag and the typedef share one name, so the type may be
 * written either as `struct cudaDeviceProp` or as `cudaDeviceProp`.
 * Fields are fixed-size scalars and arrays; nothing in the record is
 * dynamically allocated.
 *
 * NOTE(review): the field names appear to mirror the CUDA runtime's
 * cudaDeviceProp; which of them the CIVL model actually fills in is not
 * visible in this header -- confirm before depending on a value.
 * NOTE(review): size_t is not declared in this header; presumably one of
 * the included headers provides it.
 */
typedef struct cudaDeviceProp {
  char name[256];
  size_t totalGlobalMem;
  size_t sharedMemPerBlock;
  int regsPerBlock;
  int warpSize;
  size_t memPitch;
  int maxThreadsPerBlock;
  int maxThreadsDim[3];
  int maxGridSize[3];
  int clockRate;
  size_t totalConstMem;
  int major;
  int minor;
  size_t textureAlignment;
  size_t texturePitchAlignment;
  int deviceOverlap;
  int multiProcessorCount;
  int kernelExecTimeoutEnabled;
  int integrated;
  int canMapHostMemory;
  int computeMode;
  int maxTexture1D;
  int maxTexture1DLinear;
  int maxTexture2D[2];
  int maxTexture2DLinear[3];
  int maxTexture2DGather[2];
  int maxTexture3D[3];
  int maxTextureCubemap;
  int maxTexture1DLayered[2];
  int maxTexture2DLayered[3];
  int maxTextureCubemapLayered[2];
  int maxSurface1D;
  int maxSurface2D[2];
  int maxSurface3D[3];
  int maxSurface1DLayered[2];
  int maxSurface2DLayered[3];
  int maxSurfaceCubemap;
  int maxSurfaceCubemapLayered[2];
  size_t surfaceAlignment;
  int concurrentKernels;
  int ECCEnabled;
  int pciBusID;
  int pciDeviceID;
  int pciDomainID;
  int tccDriver;
  int asyncEngineCount;
  int unifiedAddressing;
  int memoryClockRate;
  int memoryBusWidth;
  int l2CacheSize;
  int maxThreadsPerMultiProcessor;
} cudaDeviceProp;

/* Status flag of a kernel instance. The three states are numbered
 * consecutively from zero in the order listed.
 */
enum $cuda_kernel_status {
  $cuda_kernel_status_waiting,
  $cuda_kernel_status_running,
  $cuda_kernel_status_finished
};
typedef enum $cuda_kernel_status $cuda_kernel_status;

/* Type used to represent an instance of a Cuda kernel. Only the struct
 * tag is declared in this header, so code that sees just this header can
 * hold and pass pointers to an instance but cannot access its fields.
 */
typedef struct $cuda_kernel_instance $cuda_kernel_instance_t;

/* $cuda_kernel_instance_t interface
 *
 * Reports the $cuda_kernel_status of the given kernel instance.
 * Prototype only; the definition is not part of this header.
 * NOTE(review): behavior for a null pointer is not specified here.
 */
$cuda_kernel_status $cuda_get_status($cuda_kernel_instance_t*);

/* A type that wraps a kernel instance for insertion into a list.
 * Declared as an opaque struct: its fields are not visible through this
 * header.
 */
typedef struct $cuda_kernel_instance_node $cuda_kernel_instance_node_t;

/* $cuda_kernel_instance_node_t interface
 *
 * Takes a node and yields a pointer to a kernel instance -- presumably
 * the instance the node wraps (confirm against the definition).
 */
$cuda_kernel_instance_t *$cuda_get_instance($cuda_kernel_instance_node_t*);

/* A type used to represent a Cuda stream. _CUstream is an opaque struct;
 * cudaStream_t is a pointer to it, so a stream value is passed around as
 * a handle.
 */
typedef struct _CUstream _CUstream;
typedef _CUstream* cudaStream_t;

/* _CUstream interface
 */

/* Takes a stream and yields a kernel instance node -- presumably the node
 * of the kernel most recently enqueued on that stream (confirm against the
 * definition).
 */
$cuda_kernel_instance_node_t *$cuda_get_most_recent(cudaStream_t);

/* Reads the stream's "usable" flag.
 * NOTE(review): what makes a stream usable is not documented here.
 */
_Bool $cuda_is_usable(cudaStream_t);

/* Writes the stream's "usable" flag; the _Bool argument is presumably the
 * new value of the flag.
 */
void $cuda_set_usable(cudaStream_t, _Bool);

/* A type that wraps a stream for insertion into a list. Declared as an
 * opaque struct: its fields are not visible through this header.
 */
typedef struct $cuda_stream_node $cuda_stream_node_t;

/* $cuda_stream_node_t interface
 */

/* Associates a stream with the given node (first argument: the node,
 * second argument: the stream).
 */
void $cuda_set_stream($cuda_stream_node_t*, cudaStream_t);

/* Links two nodes: presumably the second argument becomes the successor
 * of the first in the list (confirm against the definition).
 */
void $cuda_set_next($cuda_stream_node_t*, $cuda_stream_node_t*);

/* A type used to represent a Cuda event. _CUevent is an opaque struct;
 * cudaEvent_t is a pointer to it, so an event value is passed around as
 * a handle.
 */
typedef struct _CUevent _CUevent;
typedef _CUevent* cudaEvent_t;

/* _CUevent interface
 */

/* Yields the array of kernel instance pointers associated with the
 * event. Its length is presumably what $cuda_get_num_instances reports.
 */
$cuda_kernel_instance_t **$cuda_get_instances(cudaEvent_t);

/* Stores an array of kernel instance pointers in the event. The int
 * argument is presumably the number of entries in the array.
 * NOTE(review): ownership of the array after the call is not stated here.
 */
void $cuda_set_instances(cudaEvent_t, $cuda_kernel_instance_t**, int);

/* Yields the number of kernel instances associated with the event.
 */
int $cuda_get_num_instances(cudaEvent_t);


/* A type representing the state of a Cuda device. Declared as an opaque
 * struct: its fields are not visible through this header.
 */
typedef struct $cuda_context $cuda_context_t;

/* $cuda_context_t interface
 */

/* Yields the number of streams recorded in the context.
 */
int $cuda_get_num_streams($cuda_context_t*);

/* Yields the head node of the context's list of stream nodes.
 */
$cuda_stream_node_t *$cuda_get_head_node($cuda_context_t*);

/* Yields the context's null stream.
 * NOTE(review): presumably this models Cuda's default stream -- confirm.
 */
cudaStream_t $cuda_get_null_stream($cuda_context_t*);

/* Adds the given stream node to the context.
 * NOTE(review): where in the list the node is inserted is not stated here.
 */
void $cuda_add_new_stream($cuda_context_t*, $cuda_stream_node_t*);


/* Computes the one dimensional index of a grid cell at a given location
 * in a three dimensional grid of a given size.
 *
 * size:     extent of the grid along x, y and z
 * location: coordinates of the cell within that grid
 * returns:  the flattened index of the cell
 *
 * NOTE(review): the linearization order (which axis varies fastest) and
 * the behavior for a location outside the grid are not stated here.
 */
int $cuda_index (dim3 size, uint3 location);

/* Computes the one dimensional index of a specific thread in the grid
 * given the grid dimension, block dimension, block index, and thread index.
 *
 * gDim: grid dimension
 * bDim: block dimension
 * bIdx: index of the thread's block
 * tIdx: index of the thread
 * returns: the flattened index of the thread
 *
 * NOTE(review): the exact flattening formula is not stated here.
 */
int $cuda_kernel_index (dim3 gDim, dim3 bDim, uint3 bIdx, uint3 tIdx);

/* Lifts a single integer x into a three dimensional vector representing
 * a one dimensional grid of length x.
 *
 * NOTE(review): presumably the result is {x, 1, 1}; the values given to
 * the y and z components are not stated here. x is a signed int while
 * dim3 components are unsigned, so a negative x would be converted.
 */
dim3 $cuda_to_dim3(int x);

/* Given a three dimensional vector representing a grid of size dim,
 * creates and destroys a process, in parallel, for each cell in the grid.
 * The location of the cell is passed to the spawning function.
 *
 * dim:              extent of the grid
 * spawningFunction: function run once per cell; its uint3 argument is
 *                   the location of that cell
 */
void $cuda_run_procs(dim3 dim, void spawningFunction(uint3));

// ------------------------------------------------

/* $wait on a given process if it is non-null; a null process is skipped.
 */
void $cuda_try_wait($proc p);

/* The current state of the GPU, held in a single file-scope object.
 *
 * NOTE(review): this is declared without `extern`, and $cuda_context_t is
 * only forward-declared in this header; presumably the struct definition
 * is supplied by the implementation that includes this header -- confirm.
 */
$cuda_context_t $cuda_current_context;

/* Mallocs and initializes a new $cuda_kernel_instance_t.
 *
 * gDim: grid dimension for the new instance
 * bDim: block dimension for the new instance
 * returns: pointer to the new instance; release it with
 *          $cuda_kernel_instance_destroy
 */
$cuda_kernel_instance_t *$cuda_kernel_instance_create(dim3 gDim, dim3 bDim);

/* Cleans up and frees a given $cuda_kernel_instance_t.
 */
void $cuda_kernel_instance_destroy($cuda_kernel_instance_t *i);

/* Mallocs and initializes a new $cuda_kernel_instance_node_t.
 *
 * NOTE(review): the name ends in "_tCreate" while the sibling functions
 * use "_create" / "_destroy"; it is left as is because callers and the
 * definition use this spelling.
 */
$cuda_kernel_instance_node_t *$cuda_kernel_instance_node_tCreate(void);

/* Cleans up and frees a given $cuda_kernel_instance_node_t.
 * NOTE(review): whether the wrapped kernel instance is also destroyed is
 * not stated here.
 */
void $cuda_kernel_instance_node_destroy($cuda_kernel_instance_node_t *node);

/* Mallocs and initializes a new stream; release it with
 * $cuda_stream_destroy.
 */
cudaStream_t $cuda_stream_create(void);

/* Blocks until the most recently enqueued process on the given stream
 * has terminated (meaning all kernels in that stream have completed).
 */
void $cuda_stream_wait(cudaStream_t s);

/* Blocks until no more streams have kernels executing.
 */
void $cuda_stream_wait_all(void);

/* Cleans up and frees a given stream.
 * NOTE(review): whether this first waits for kernels still running on the
 * stream is not stated here.
 */
void $cuda_stream_destroy(cudaStream_t s);

/* Mallocs and initializes a new $cuda_stream_node_t; release it with
 * $cuda_stream_node_destroy.
 */
$cuda_stream_node_t *$cuda_stream_node_create(void);

/* Cleans up and frees a given $cuda_stream_node_t.
 * NOTE(review): whether the stream held by the node is also destroyed is
 * not stated here.
 */
void $cuda_stream_node_destroy($cuda_stream_node_t *node);

/* Destroys all stream nodes contained in the context.
 */
void $cuda_stream_node_destroy_all(void);

/* Mallocs and initializes a new event; release it with
 * $cuda_event_destroy.
 */
cudaEvent_t $cuda_event_create(void);

/* Blocks until all $cuda_kernel_instance_t objects contained in this
 * event have completed.
 */
void $cuda_event_wait(cudaEvent_t e);

/* Cleans up and frees a given event.
 */
void $cuda_event_destroy(cudaEvent_t e);

/* Initializes the cuda context. Must be called before any cuda functions.
 */
void $cuda_init(void);

/* Cleans up the cuda context. Must be called after all cuda functions.
 */
void $cuda_finalize(void);

/* Returns an array of pointers to the most recently enqueued kernel
 * of each stream.
 *
 * NOTE(review): the array length (presumably one entry per stream) and
 * who is responsible for freeing the array are not stated here.
 */
$cuda_kernel_instance_t **$cuda_all_most_recent_kernels(void);

/* Creates a kernel instance for the given function k and enqueues it
 * onto the given stream.
 *
 * stream: stream that receives the new kernel
 * k:      kernel function; it is handed a kernel instance and an event
 * gDim:   grid dimension for the kernel
 * bDim:   block dimension for the kernel
 */
void $cuda_enqueue_kernel(cudaStream_t stream,
                          void (*k)($cuda_kernel_instance_t*, cudaEvent_t),
                          dim3 gDim,
                          dim3 bDim);

/* Called by kernel processes. Waits on the given event, then updates
 * the status of the calling kernel to indicate it has finished waiting.
 *
 * this: the kernel instance of the caller
 * e:    the event to wait on
 */
void $cuda_wait_in_queue ($cuda_kernel_instance_t *this, cudaEvent_t e);

/* Called by kernel processes. Updates the status of the calling kernel
 * to indicate that it has completed execution.
 *
 * k: the kernel instance of the caller
 */
void $cuda_kernel_finish($cuda_kernel_instance_t *k);

/* A wrapper around a barrier call that also checks for data races.
 *
 * k: the kernel instance the caller belongs to
 * g: the barrier to synchronize on
 * NOTE(review): kernel_id is presumably the calling thread's id, like
 * cur_tid in the functions below -- confirm against the definition.
 */
void $cuda_barrier($cuda_kernel_instance_t *k, int kernel_id, $barrier g);

/* Checks for data races on behalf of thread cur_tid of kernel instance k.
 * NOTE(review): how a detected race is reported is not stated here.
 */
$atomic_f void $check_data_race($cuda_kernel_instance_t *k, int cur_tid);

/* Clears the read and write memory sets of the given thread.
 */
void $clear_mem_sets($cuda_kernel_instance_t *k, int cur_tid);

/* Presumably clears the read and write memory sets of every thread of
 * the kernel instance (confirm against the definition).
 */
void $clear_all_mem_sets($cuda_kernel_instance_t *k);

/* Publishes the current read and write sets to global arrays. Local sets
 * are not cleared.
 */
void $publish($cuda_kernel_instance_t *k, int cur_tid);

/* Shuffle helpers; presumably the CIVL models of the Cuda warp shuffle
 * intrinsics __shfl_sync, __shfl_up_sync, __shfl_down_sync and
 * __shfl_xor_sync (confirm against the definitions).
 *
 * mask, var, width and the third argument (srcLane / delta / laneMask)
 * correspond by name to the arguments of those intrinsics. The remaining
 * arguments are additions of this model:
 *   numThreads:   NOTE(review): presumably the number of participating
 *                 threads
 *   tid:          NOTE(review): presumably the id of the calling thread
 *   comm:         communicator, presumably used to exchange the values
 *   warpBarriers: NOTE(review): presumably one global barrier per warp
 * Each function returns an int, presumably the value received by the
 * calling thread.
 */
int _cuda__shfl_sync(unsigned mask, int var, int srcLane, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers);
int _cuda__shfl_up_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers);
int _cuda__shfl_down_sync(unsigned mask, int var, unsigned int delta, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers);
int _cuda__shfl_xor_sync(unsigned mask, int var, int laneMask, int width, int numThreads, int tid, $comm comm, $gbarrier* warpBarriers);

#endif