/*********************************************************************** * FILENAME: MM.cu * Matrix Multiplication * Matrix operands have row-major order. * * C = A * B * Multiplies two square matrices (NxN * NxN). * Matrix values have type double. * * A simple CUDA program has a basic workflow: * 1) Initialize matrix operands as double-precision arrays on host (CPU). * 2) Copy operands from host memory to GPU memory. * 3) Apply matrix operaton to operands on GPU * 4) Copy result from GPU memory to host memory. * * * CUDA C Programming Guide Version 4.2 (3.2.3, p.22): * http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf * * MM with linearized matrix operands: * http://www.hpcwire.com/hpcwire/2008-10-08/compilers_and_more_programming_gpus_today.html * *************************************************************************/ // online source: https://www.rcac.purdue.edu/userinfo/resources/carter/compile/MM.cu #include "civlc.h" #include "civlc-cuda.cvl" #include #include $input int N; /* size of square matrix */ $input int TILE_WIDTH; /* MM kernel using global (not shared) memory. */ void _kernel_myMM_global (dim3 gridDim, dim3 blockDim, cudaStream_t s, const double * const A, const double * const B, double *C, int width) { void _kernel (_kernelInstance *this, cudaEvent_t e) { _waitInQueue(this, e); void _block(uint3 blockIdx) { int _numThreads = blockDim.x * blockDim.y * blockDim.z; $gbarrier _block_barrier = $gbarrier_create($here, _numThreads); void _thread(uint3 threadIdx) { int _tid = _index(blockDim, threadIdx); $barrier _b = $barrier_create($here, _block_barrier, _tid); /* Get row and column from block and thread IDs */ int row = (blockDim.y*blockIdx.y) + threadIdx.y; int col = (blockDim.x*blockIdx.x) + threadIdx.x; /* Initialize result of one element which one thread computes. */ double result=0.0; /* Compute one element of the matrix product. */ for (int i = 0; i < width; ++i) result += A[row*width + i] * B[i*width + col]; /* Store the result of one matrix element in matrix C. */ C[row * width + col] = result; $barrier_destroy(_b); } _runProcs(blockDim, _thread); $gbarrier_destroy(_block_barrier); } _runProcs(gridDim, _block); _kernelFinish(this); } _enqueueKernel(s, _kernel); } /* MM kernel using shared memory. */ void _kernel_myMM_shared (dim3 gridDim, dim3 blockDim, cudaStream_t s, const double * const A, const double * const B, double* C, int width) { void _kernel (_kernelInstance *this, cudaEvent_t e) { _waitInQueue(this, e); void _block(uint3 blockIdx) { int _numThreads = blockDim.x * blockDim.y * blockDim.z; $gbarrier _block_barrier = $gbarrier_create($here, _numThreads); double A_shared[TILE_WIDTH][TILE_WIDTH]; double B_shared[TILE_WIDTH][TILE_WIDTH]; void _thread(uint3 threadIdx) { int _tid = _index(blockDim, threadIdx); $barrier _b = $barrier_create($here, _block_barrier, _tid); int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; /* Identify the row and column of the C element to work on. */ int row = by * TILE_WIDTH + ty; int col = bx * TILE_WIDTH + tx; double result = 0.0; /* Loop over the A and B tiles required to compute the C element. */ for (int phase = 0; phase < width/TILE_WIDTH; ++phase) { /* Shared effort: loading of A and B tiles into shared memory. */ A_shared[ty][tx] = A[row*width + (phase*TILE_WIDTH + tx)]; B_shared[ty][tx] = B[col + (phase*TILE_WIDTH + ty)*width]; $barrier_call(_b); for (int k = 0; k < TILE_WIDTH; ++k) { result += A_shared[ty][k] * B_shared[k][tx]; } $barrier_call(_b); } C[row*width+col] = result; $barrier_destroy(_b); } _runProcs(blockDim, _thread); $gbarrier_destroy(_block_barrier); } _runProcs(gridDim, _block); _kernelFinish(this); } _enqueueKernel(s, _kernel); } /************************************************************************/ /************************************************************************/ /************************************************************************/ int main (int argc, char** argv) { int _main ( void ) { /* Set device based on input from command line */ /* if (argc > 1) { if (cudaSetDevice(atoi(argv[1])) != cudaSuccess) { int num_devices; cudaGetDeviceCount(&num_devices); fprintf(stderr, "Error initializing device %s,\ device value must be 0-%d\n", argv[1], (num_devices-1)); return 0; } } else { fprintf(stderr, "Usage: %s gpu_device\n", argv[0]); return 0; } */ /* Declare CPU arrays. */ double A[N*N],B[N*N],C[N*N]; /* linearized CPU double arrays */ double cpuResult[N*N], gpuGlobalResult[N*N], gpuSharedResult[N*N]; int r,c; /* Declare GPU arrays. */ double *G_A,*G_B,*G_C; /* linearized GPU double arrays */ size_t size_a,size_b,size_c; /* size of linearized array in bytes */ size_a = size_b = size_c = N*N; /* Setup a clock. */ cudaEvent_t start, stop; float CPU_elapsedtime, GPU_global_elapsedtime, GPU_shared_elapsedtime; cudaEventCreate(&start); cudaEventCreate(&stop); /* 1) Initialize matrix operands as double-precision arrays on host (CPU). */ for (r=0;r= 0 && i < N*N} cpuResult[i] == gpuGlobalResult[i] && cpuResult[i] == gpuSharedResult[i]); for (int i = 0; i < N*N; i ++) { printf("i = %d\n", i); printf("cpu = %f, gpuGlobal = %f, gpuShared = %f\n", cpuResult[i], gpuGlobalResult[i], gpuSharedResult[i]); $assert(cpuResult[i] == gpuGlobalResult[i]); $assert(cpuResult[i] == gpuSharedResult[i]); } return 0; } printf("1\n"); _cudaInit(); printf("2\n"); _main(); printf("3\n"); _cudaFinalize(); printf("4\n"); return 0; }