// TODO add assertions to check equivalence of implementation results
#ifdef _CIVL
/* NOTE(review): the original include name was lost in text extraction;
 * CIVL CUDA examples pull in the CIVL-C header here -- confirm exact name. */
#include <civlc.cuh>
#endif
/***********************************************************************
 * FILENAME: MM.cu
 * Matrix Multiplication
 * Matrix operands have row-major order.
 *
 * C = A * B
 * Multiplies two square matrices (NxN * NxN).
 * Matrix values have type double.
 *
 * A simple CUDA program has a basic workflow:
 *  1) Initialize matrix operands as double-precision arrays on host (CPU).
 *  2) Copy operands from host memory to GPU memory.
 *  3) Apply matrix operaton to operands on GPU
 *  4) Copy result from GPU memory to host memory.
 *
 *
 * CUDA C Programming Guide Version 4.2 (3.2.3, p.22):
 * http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf
 *
 * MM with linearized matrix operands:
 * http://www.hpcwire.com/hpcwire/2008-10-08/compilers_and_more_programming_gpus_today.html
 *
 *************************************************************************/
// online source: https://www.rcac.purdue.edu/userinfo/resources/carter/compile/MM.cu

/* NOTE(review): header names below were lost in text extraction and are
 * reconstructed from usage (printf/fprintf -> stdio.h, atoi -> stdlib.h). */
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"

#ifdef _CIVL
$input int N;           /* symbolic matrix dimension under CIVL */
$input int TILE_WIDTH;  /* symbolic tile size under CIVL */
#else
#define N 1024          /* size of square matrix */
#define TILE_WIDTH 16
#endif

#ifdef _CIVL
$input double A[N*N];   /* symbolic operand matrices under CIVL */
$input double B[N*N];
#endif

/* MM kernel using global (not shared) memory.
 *
 * One thread computes one element C[row][col] of the product of two
 * width x width row-major matrices.  Expects a 2D grid of 2D blocks
 * covering the width x width index space.
 */
__global__ void myMM_global (const double * const A,
                             const double * const B,
                             double *C, int width)
{
  /* Get row and column from block and thread IDs */
  int row = (blockDim.y*blockIdx.y) + threadIdx.y;
  int col = (blockDim.x*blockIdx.x) + threadIdx.x;

  /* Guard: without this, a grid that over-covers the matrix would make
   * edge threads read and write out of bounds. */
  if (row >= width || col >= width)
    return;

  /* Initialize result of one element which one thread computes. */
  double result = 0.0;

  /* Compute one element of the matrix product: dot product of one row
   * of A with one column of B. */
  for (int i = 0; i < width; ++i)
    result += A[row*width + i] * B[i*width + col];

  /* Store the result of one matrix element in matrix C. */
  C[row * width + col] = result;
}

/* MM kernel using shared memory.
*/
/* One thread computes one element of C, cooperating with its block to
 * stage TILE_WIDTH x TILE_WIDTH tiles of A and B in shared memory.
 *
 * Preconditions (not checked): blockDim == (TILE_WIDTH, TILE_WIDTH, 1)
 * and width is a multiple of TILE_WIDTH -- the phase loop below iterates
 * width/TILE_WIDTH times, so a ragged edge would be skipped and tile
 * loads would run out of bounds. */
__global__ void myMM_shared (const double * const A,
                             const double * const B,
                             double* C, int width)
{
  __shared__ double A_shared[TILE_WIDTH][TILE_WIDTH];
  __shared__ double B_shared[TILE_WIDTH][TILE_WIDTH];

  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  /* Identify the row and column of the C element to work on. */
  int row = by * TILE_WIDTH + ty;
  int col = bx * TILE_WIDTH + tx;
  double result = 0.0;

  /* Loop over the A and B tiles required to compute the C element. */
  for (int phase = 0; phase < width/TILE_WIDTH; ++phase) {
    /* Shared effort: each thread loads one element of each tile. */
    A_shared[ty][tx] = A[row*width + (phase*TILE_WIDTH + tx)];
    B_shared[ty][tx] = B[col + (phase*TILE_WIDTH + ty)*width];
    __syncthreads();  /* tiles fully loaded before any thread reads them */

    for (int k = 0; k < TILE_WIDTH; ++k)
      result += A_shared[ty][k] * B_shared[k][tx];
    __syncthreads();  /* all reads done before next phase overwrites tiles */
  }
  C[row*width+col] = result;
}

/************************************************************************/
/************************************************************************/
/************************************************************************/

int main (int argc, char** argv)
{
#ifdef _CIVL
  $assume(argc == 2);
  $assume(atoi(argv[1]) == 0);
#endif
  /* Set device based on input from command line */
  if (argc > 1) {
    if (cudaSetDevice(atoi(argv[1])) != cudaSuccess) {
      int num_devices;

      cudaGetDeviceCount(&num_devices);
      /* FIX(review): original message used a broken backslash-space line
       * continuation inside the string literal; also exit nonzero so the
       * failure is visible to the shell. */
      fprintf(stderr,
              "Error initializing device %s, device value must be 0-%d\n",
              argv[1], (num_devices-1));
      return 1;
    }
  } else {
    fprintf(stderr, "Usage: %s gpu_device\n", argv[0]);
    return 1;  /* FIX(review): usage error previously returned success (0) */
  }

  /* Declare CPU arrays (under CIVL, A and B are the symbolic $input
   * arrays declared at file scope). */
#ifndef _CIVL
  double A[N*N], B[N*N];
#endif
  double C[N*N];            /* linearized CPU double arrays */
  int r, c;

  /* Declare GPU arrays. */
  double *G_A, *G_B, *G_C;  /* linearized GPU double arrays */
  /* Element counts of the linearized arrays -- NOT bytes: every
   * cudaMalloc/cudaMemcpy below multiplies by sizeof(double).
   * (The original comment claimed "size ... in bytes".) */
  size_t size_a, size_b, size_c;

  size_a = size_b = size_c = N*N;

  /* Setup a clock.
*/
/* NOTE(review): the remainder of main() below was garbled by text
 * extraction -- spans between '<' and '>' characters are missing.  Lost
 * pieces include the host initialization loop bodies, '#endif' markers,
 * the CPU reference multiply, the cudaMalloc/cudaMemcpy(HostToDevice)
 * setup, the dim3 grid/block declarations, and both kernel-launch
 * '<<<grid,block>>>' configurations (only '>>>(G_A,G_B,G_C,N)' survives);
 * the final print loop is truncated mid-token.  Do not repair this region
 * by hand -- recover it from the cited original:
 * https://www.rcac.purdue.edu/userinfo/resources/carter/compile/MM.cu */
cudaEvent_t start, stop; float CPU_elapsedtime, GPU_global_elapsedtime, GPU_shared_elapsedtime; cudaEventCreate(&start); cudaEventCreate(&stop); /* 1) Initialize matrix operands as double-precision arrays on host (CPU). */ #ifndef _CIVL for (r=0;r>>(G_A,G_B,G_C,N); /* grid(16,16,1) */ /* 4) Copy result from GPGPU memory to CPU memory. */ cudaMemcpy(C,G_C,size_c*sizeof(double),cudaMemcpyDeviceToHost); /* Deallocate memory on GPGPU. */ cudaFree(G_A); cudaFree(G_B); cudaFree(G_C); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&GPU_global_elapsedtime,start,stop); printf("Elapsed time in GPU (global memory): %7.1f milliseconds %5.1f\n", GPU_global_elapsedtime,CPU_elapsedtime/GPU_global_elapsedtime); //* printf("\nGLOBAL MEMORY:\n"); for (r=0;r>>(G_A,G_B,G_C,N); /* 4) Copy result from GPGPU memory to CPU memory. */ cudaMemcpy(C,G_C,size_c*sizeof(double),cudaMemcpyDeviceToHost); /* Deallocate memory on GPGPU. */ cudaFree(G_A); cudaFree(G_B); cudaFree(G_C); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&GPU_shared_elapsedtime,start,stop); printf("Elapsed time in GPU (shared memory): %7.1f milliseconds %5.1f\n", GPU_shared_elapsedtime,CPU_elapsedtime/GPU_shared_elapsedtime); //* printf("\nSHARED MEMORY:\n"); for (r=0;r