/** * This file is modified from the original pathfinder_cuda.cvl * program by the following way: * First, GPU/GPU_BLOCK/GPU_THREAD no longer access global variables of * int type, but use their value (by parameter passing) instead. * Second, the process that spawns GPU_BLOCK/GPU_THREAD * doen't reuse itself to run GPU_BLOCK/GPU_THREAD. */ #include //pyramid height of 0 does nothing, of 1 is normal stencil, 2 is where //the algorithm even kicks in #define pyramid_height 2 //This is how many iterations of the loop (minus 1). If less than //the pyramid_height, then the pyramid_height does nothing. #define rows 3 //I think this is the minimum this can be to be useful //(the number of cells) #define cols 2 //BLOCK_SIZE must satisfy (BLOCK_SIZE > pyramid_height*2) #define BLOCK_SIZE 5 #define HALO 1 int borderCols = (pyramid_height)*HALO; int smallBlockCol = BLOCK_SIZE - (pyramid_height) * HALO * 2; int blockCols = cols/smallBlockCol+((cols%smallBlockCol==0)?0:1); int result[cols]; //GPU MEMORY (couldn't do it scoped because need references to memory //on host side) int gpuResult[2][cols]; int gpuWall[rows*cols - cols]; //$input int wall[rows][cols]; #define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max)) #define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x ) #define MIN(a, b) ((a)<=(b) ? (a) : (b)) //Not implemented yet. //void __syncthreads() { //} void GPU(int iteration, int src, int dst, int startStep, int blocks, int threads, int gBorderCols) { void GPU_BLOCK(int bx, int bIteration, int bBorderCols, int src, int dst, int startStep){ //shared memory int prev[BLOCK_SIZE]; int result[BLOCK_SIZE]; void GPU_THREAD(int tx, int tbx, int tIteration, int tBorderCols, int src, int dst, int startStep, int* prev, int* result){ int small_block_cols = BLOCK_SIZE-tIteration*HALO*2; int blkX = small_block_cols*tbx-tBorderCols; int blkXmax = blkX+BLOCK_SIZE-1; // calculate the global thread coordination int xidx = blkX+tx; // effective range within this block that falls within // the valid range of the input data // used to rule out computation outside the boundary. int validXmin = (blkX < 0) ? -blkX : 0; int validXmax = (blkXmax > cols-1) ? BLOCK_SIZE-1-(blkXmax-cols+1) : BLOCK_SIZE-1; int W = tx-1; int E = tx+1; W = (W < validXmin) ? validXmin : W; E = (E > validXmax) ? validXmax : E; int isValid = IN_RANGE(tx, validXmin, validXmax); if(IN_RANGE(xidx, 0, cols-1)){ prev[tx] = gpuResult[src][xidx]; } int computed; for (int i=0; i pyramid_height*2); calc_path(); }