#include<civlc.cvh>

| 3 | //pyramid height of 0 does nothing, of 1 is normal stencil, 2 is where
|
|---|
| 4 | //the algorithm even kicks in
|
|---|
| 5 | #define pyramid_height 2
|
|---|
| 6 |
|
|---|
| 7 | //This is how many iterations of the loop (minus 1). If less than
|
|---|
| 8 | //the pyramid_height, then the pyramid_height does nothing.
|
|---|
| 9 | #define rows 3
|
|---|
| 10 |
|
|---|
| 11 | //I think this is the minimum this can be to be useful
|
|---|
| 12 | //(the number of cells)
|
|---|
| 13 | #define cols 2
|
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 | //BLOCK_SIZE must satisfy (BLOCK_SIZE > pyramid_height*2)
|
|---|
| 17 | #define BLOCK_SIZE 5
|
|---|
| 18 | #define HALO 1
|
|---|
| 19 |
|
|---|
| 20 | int borderCols = (pyramid_height)*HALO;
|
|---|
| 21 | int smallBlockCol = BLOCK_SIZE - (pyramid_height) * HALO * 2;
|
|---|
| 22 | int blockCols = cols/smallBlockCol+((cols%smallBlockCol==0)?0:1);
|
|---|
| 23 | $proc block_procs[blockCols];
|
|---|
| 24 | $proc thread_procs[BLOCK_SIZE*blockCols];
|
|---|
| 25 |
|
|---|
| 26 | int result[cols];
|
|---|
| 27 |
|
|---|
| 28 | //GPU MEMORY (couldn't do it scoped because need references to memory
|
|---|
| 29 | //on host side)
|
|---|
| 30 | int gpuResult[2][cols];
|
|---|
| 31 | int gpuWall[rows*cols - cols];
|
|---|
| 32 |
|
|---|
| 33 | $input int wall[rows][cols];
|
|---|
| 34 |
|
|---|
// True when min <= x <= max (both bounds inclusive).
// Each argument may be evaluated more than once; pass side-effect-free
// expressions.
#define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))

// Clamps the lvalue x into [min, max] in place and yields the stored value.
// Every argument and the whole expansion are parenthesized so the macro can
// be used inside a larger expression (e.g. CLAMP_RANGE(v, 0, 5) + 1) without
// the surrounding operators being absorbed into the last ternary arm.
// x is evaluated several times; pass a side-effect-free lvalue.
#define CLAMP_RANGE(x, min, max) ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))

// Smaller of a and b; yields a when they are equal.
// Each argument may be evaluated twice.
#define MIN(a, b) ((a)<=(b) ? (a) : (b))
// Stand-in for the barrier the thread code calls between its phases.
// Not implemented yet: the body does no work, so a call returns immediately
// and imposes no ordering between the GPU_THREAD processes.
void __syncthreads() {
  return;
}
| 43 | void GPU(int iteration, int *gpuSrc, int *gpuResults, int startStep, int blocks, int threads) {
|
|---|
| 44 |
|
|---|
| 45 | void GPU_BLOCK(int bx){
|
|---|
| 46 | //shared memory
|
|---|
| 47 | int prev[BLOCK_SIZE];
|
|---|
| 48 | int result[BLOCK_SIZE];
|
|---|
| 49 |
|
|---|
| 50 | void GPU_THREAD(int tx){
|
|---|
| 51 |
|
|---|
| 52 | int small_block_cols = BLOCK_SIZE-iteration*HALO*2;
|
|---|
| 53 | int blkX = small_block_cols*bx-borderCols;
|
|---|
| 54 | int blkXmax = blkX+BLOCK_SIZE-1;
|
|---|
| 55 |
|
|---|
| 56 | // calculate the global thread coordination
|
|---|
| 57 | int xidx = blkX+tx;
|
|---|
| 58 |
|
|---|
| 59 | // effective range within this block that falls within
|
|---|
| 60 | // the valid range of the input data
|
|---|
| 61 | // used to rule out computation outside the boundary.
|
|---|
| 62 | int validXmin = (blkX < 0) ? -blkX : 0;
|
|---|
| 63 | int validXmax = (blkXmax > cols-1) ? BLOCK_SIZE-1-(blkXmax-cols+1) : BLOCK_SIZE-1;
|
|---|
| 64 |
|
|---|
| 65 | int W = tx-1;
|
|---|
| 66 | int E = tx+1;
|
|---|
| 67 |
|
|---|
| 68 | W = (W < validXmin) ? validXmin : W;
|
|---|
| 69 | E = (E > validXmax) ? validXmax : E;
|
|---|
| 70 |
|
|---|
| 71 | int isValid = IN_RANGE(tx, validXmin, validXmax);
|
|---|
| 72 | if(IN_RANGE(xidx, 0, cols-1)){
|
|---|
| 73 | prev[tx] = gpuSrc[xidx];
|
|---|
| 74 | }
|
|---|
| 75 | __syncthreads();
|
|---|
| 76 | int computed;
|
|---|
| 77 |
|
|---|
| 78 | for (int i=0; i<iteration; i++){
|
|---|
| 79 | computed = 0;
|
|---|
| 80 | if( IN_RANGE(tx, i+1, BLOCK_SIZE-i-2) && isValid) {
|
|---|
| 81 | computed = 1;
|
|---|
| 82 | int left = prev[W];
|
|---|
| 83 | int up = prev[tx];
|
|---|
| 84 | int right = prev[E];
|
|---|
| 85 | int shortest = MIN(left, up);
|
|---|
| 86 | shortest = MIN(shortest, right);
|
|---|
| 87 | int index = cols*(startStep+i)+xidx;
|
|---|
| 88 | result[tx] = shortest + gpuWall[index];
|
|---|
| 89 |
|
|---|
| 90 | }
|
|---|
| 91 | __syncthreads();
|
|---|
| 92 |
|
|---|
| 93 | //Break not implemented yet...
|
|---|
| 94 | /* if(i==iteration-1) */
|
|---|
| 95 | /* break; */
|
|---|
| 96 | if(computed != 0) //Assign the computation range
|
|---|
| 97 | prev[tx]= result[tx];
|
|---|
| 98 | __syncthreads(); // [Ronny] Added sync to avoid race on prev Aug. 14 201
|
|---|
| 99 |
|
|---|
| 100 | }
|
|---|
| 101 |
|
|---|
| 102 | // update the global memory
|
|---|
| 103 | // after the last iteration, only threads coordinated within the
|
|---|
| 104 | // small block perform the calculation and switch on ``computed''
|
|---|
| 105 | if (computed != 0) {
|
|---|
| 106 | gpuResults[xidx]=result[tx];
|
|---|
| 107 | }
|
|---|
| 108 |
|
|---|
| 109 | }
|
|---|
| 110 |
|
|---|
| 111 | //Launch the threads per block
|
|---|
| 112 | for (int tp = 0; tp < BLOCK_SIZE-1; tp++) {
|
|---|
| 113 | thread_procs[bx*BLOCK_SIZE + tp] = $spawn GPU_THREAD(tp);
|
|---|
| 114 | }
|
|---|
| 115 | GPU_THREAD(BLOCK_SIZE-1);
|
|---|
| 116 |
|
|---|
| 117 | for (int tp = 0; tp < BLOCK_SIZE-1; tp++) {
|
|---|
| 118 | $wait thread_procs[bx*BLOCK_SIZE + tp];
|
|---|
| 119 | }
|
|---|
| 120 |
|
|---|
| 121 | }
|
|---|
| 122 |
|
|---|
| 123 | //Launch the blocks
|
|---|
| 124 | for (int b = 0; b < blocks-1; b++) {
|
|---|
| 125 | block_procs[b] = $spawn GPU_BLOCK(b);
|
|---|
| 126 | }
|
|---|
| 127 | GPU_BLOCK(blocks-1);
|
|---|
| 128 |
|
|---|
| 129 | for (int b = 0; b < blocks-1; b++) {
|
|---|
| 130 | $wait block_procs[b];
|
|---|
| 131 | }
|
|---|
| 132 | }
|
|---|
| 133 |
|
|---|
| 134 |
|
|---|
| 135 | void calc_path() {
|
|---|
| 136 | int src = 1, dst = 0;
|
|---|
| 137 | for (int t = 0; t < rows-1; t+=pyramid_height) {
|
|---|
| 138 | int temp = src;
|
|---|
| 139 | src = dst;
|
|---|
| 140 | dst = temp;
|
|---|
| 141 | GPU(MIN(pyramid_height, rows-t-1),gpuResult[src], gpuResult[dst],t, blockCols, BLOCK_SIZE);
|
|---|
| 142 | }
|
|---|
| 143 | }
|
|---|
| 144 |
|
|---|
| 145 | void main() {
|
|---|
| 146 | $assert(BLOCK_SIZE > pyramid_height*2);
|
|---|
| 147 | calc_path();
|
|---|
| 148 | $assert(1 == 1);
|
|---|
| 149 | }
|
|---|