#include "civlc.h" #include $scope _global_scope = $here; $scope _host_scope; // Cuda data types typedef struct { unsigned int x, y, z; } dim3; typedef struct { unsigned int x, y, z; } uint3; enum cudaMemcpyKind { cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, cudaMemcpyDefault }; enum cudaError { cudaSuccess, cudaErrorMissingConfiguration, cudaErrorMemoryAllocation, cudaErrorInitializationError, cudaErrorLaunchFailure, cudaErrorPriorLaunchFailure, cudaErrorLaunchTimeout, cudaErrorLaunchOutOfResources, cudaErrorInvalidDeviceFunction, cudaErrorInvalidConfiguration, cudaErrorInvalidDevice, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidSymbol, cudaErrorMapBufferObjectFailed, cudaErrorUnmapBufferObjectFailed, cudaErrorInvalidHostPointer, cudaErrorInvalidDevicePointer, cudaErrorInvalidTexture, cudaErrorInvalidTextureBinding, cudaErrorInvalidChannelDescriptor, cudaErrorInvalidMemcpyDirection, cudaErrorAddressOfConstant, cudaErrorTextureFetchFailed, cudaErrorTextureNotBound, cudaErrorSynchronizationError, cudaErrorInvalidFilterSetting, cudaErrorInvalidNormSetting, cudaErrorMixedDeviceExecution, cudaErrorCudartUnloading, cudaErrorUnknown, cudaErrorNotYetImplemented, cudaErrorMemoryValueTooLarge, cudaErrorInvalidResourceHandle, cudaErrorNotReady, cudaErrorInsufficientDriver, cudaErrorSetOnActiveProcess, cudaErrorInvalidSurface, cudaErrorNoDevice, cudaErrorECCUncorrectable, cudaErrorSharedObjectSymbolNotFound, cudaErrorSharedObjectInitFailed, cudaErrorUnsupportedLimit, cudaErrorDuplicateVariableName, cudaErrorDuplicateTextureName, cudaErrorDuplicateSurfaceName, cudaErrorDevicesUnavailable, cudaErrorInvalidKernelImage, cudaErrorNoKernelImageForDevice, cudaErrorIncompatibleDriverContext, cudaErrorPeerAccessAlreadyEnabled, cudaErrorPeerAccessNotEnabled, cudaErrorDeviceAlreadyInUse, cudaErrorProfilerDisabled, cudaErrorProfilerNotInitialized, cudaErrorProfilerAlreadyStarted, cudaErrorProfilerAlreadyStopped, cudaErrorAssert, cudaErrorTooManyPeers, cudaErrorHostMemoryAlreadyRegistered, cudaErrorHostMemoryNotRegistered, cudaErrorOperatingSystem, cudaErrorStartupFailure, cudaErrorApiFailureBase }; typedef enum cudaError cudaError_t; // civl versions of cuda functions uint3 make_uint3(unsigned int a, unsigned int b, unsigned int c) { uint3 t; t.x = a; t.y = b; t.z = c; return t; } cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind ) { /* switch (kind) { case cudaMemcpyHostToHost: $assert($scopeof(*src) <= _host_scope); $assert($scopeof(*dst) <= _host_scope); break; case cudaMemcpyHostToDevice: $assert($scopeof(*src) <= _host_scope); $assert($scopeof(*dst) > _host_scope); break; case cudaMemcpyDeviceToHost: $assert($scopeof(*src) > _host_scope); $assert($scopeof(*dst) <= _host_scope); break; case cudaMemcpyDeviceToDevice: $assert($scopeof(*src) > _host_scope); $assert($scopeof(*dst) > _host_scope); break; default: break; } */ memcpy(dst, src, count); return cudaSuccess; } cudaError_t cudaFree(void *devPtr) { $assert($scopeof(*devPtr) > _host_scope); $free(devPtr); return cudaSuccess; } cudaError_t cudaSetDevice(int device_id) { return cudaSuccess; } cudaError_t cudaGetDeviceCount(int *count) { // possibly this should return an value specified as $input? *count = 1; return cudaSuccess; } // helper function, only called by transformed program int _index (dim3 size, uint3 indexes) { return indexes.x + size.x * (indexes.y + size.y * indexes.z); } dim3 _toDim3(int x) { dim3 d; d.x = x; d.y = 1; d.z = 1; return d; } void _createProcs(dim3 dim, void procFn(uint3)) { $proc procs[dim.x][dim.y][dim.z]; for (int x = 0; x < dim.x; x++) { for (int y = 0; y < dim.y; y++) { for (int z = 0; z < dim.z; z++) { uint3 bid = make_uint3(x, y, z); procs[x][y][z] = $spawn procFn(bid); } } } for (int x = 0; x < dim.x; x++) { for (int y = 0; y < dim.y; y++) { for (int z = 0; z < dim.z; z++) { $wait(procs[x][y][z]); } } } } typedef struct { $proc finalProcess; } _CUstream; typedef _CUstream* cudaStream_t; struct { cudaStream_t stream; struct _cudaStreamNode_ *next; } _cudaStreamNode_; typedef struct _cudaStreamNode_ _cudaStreamNode; typedef struct { _cudaStreamNode *headNode; $proc lastProc; } _cudaContext; _cudaContext _context = { .headNode = NULL, .lastProc = $proc_null }; cudaError_t cudaStreamCreate(cudaStream_t *pStream) { *pStream = (cudaStream_t)$malloc($root, sizeof(_CUstream)); (*pStream)->finalProcess = $proc_null; _cudaStreamNode *newNode = (_cudaStreamNode*)$malloc($root, sizeof(_cudaStreamNode)); _cudaStreamNode *oldHead = _context.headNode; newNode->stream = *pStream; newNode->next = oldHead; _context.headNode = newNode; return cudaSuccess; } cudaError_t cudaStreamDestroy(cudaStream_t pStream) { _cudaStreamNode *prevNode = NULL; _cudaStreamNode *curNode = _context.headNode; while (curNode != NULL) { if (pStream == curNode->stream) { if (prevNode == NULL) { _context.headNode = NULL; } else { prevNode->next = curNode->next; $free(pStream); $free(curNode); } return cudaSuccess; } } $assert($false); return cudaErrorInvalidResourceHandle; } void _tryWait($proc p) { if (p != $proc_null) $wait(p); } void _initKernel(cudaStream_t stream, void (*k)($proc)) { $proc prevProc; if (stream == NULL) { cudaDeviceSynchronize(); prevProc = _context.lastProc; _context.lastProc = $spawn k(prevProc); } else { prevProc = stream->lastProc; stream->lastProc = $spawn k(prevProc); } }