| 1 | /**
|
|---|
| 2 | * This file is modified from the original pathfinder_cuda.cvl
|
|---|
| 3 | * program in the following ways:
|
|---|
| 4 | * First, GPU/GPU_BLOCK/GPU_THREAD no longer access global variables of
|
|---|
| 5 | * int type, but use their value (by parameter passing) instead.
|
|---|
| 6 | * Second, the process that spawns GPU_BLOCK/GPU_THREAD
|
|---|
| 7 | * doesn't reuse itself to run GPU_BLOCK/GPU_THREAD.
|
|---|
| 8 | */
|
|---|
| 9 |
|
|---|
| 10 | #include<civlc.cvh>
|
|---|
| 11 |
|
|---|
/* Problem dimensions and tiling parameters. */
#define pyramid_height 2
#define rows 3
#define cols 2
//BLOCK_SIZE must satisfy (BLOCK_SIZE > pyramid_height*2)
#define BLOCK_SIZE 5
#define HALO 1

/* Number of columns a block contributes once both halo borders are removed.
 * Kept as a macro so that the file-scope initializers below are constant
 * expressions. */
#define SMALL_BLOCK_COL (BLOCK_SIZE - (pyramid_height) * HALO * 2)

/* Width of the halo border on each side of a block. */
int borderCols = (pyramid_height)*HALO;
/* Columns per block after removing the halo on both sides. */
int smallBlockCol = SMALL_BLOCK_COL;
/* Number of blocks needed to cover all columns (ceiling division). */
int blockCols = cols/SMALL_BLOCK_COL+((cols%SMALL_BLOCK_COL==0)?0:1);

int result[cols];

/* Two rows indexed by the src/dst arguments passed to GPU(). */
int gpuResult[2][cols];
/* Flattened wall data, indexed as cols*row + column; holds rows-1 rows. */
int gpuWall[rows*cols - cols];

//$input int wall[rows][cols];

/* Evaluates to non-zero when min <= x <= max. */
#define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))

/* Statement macro: clamps the lvalue x into [min, max].
 * Wrapped in do/while(0) so it behaves as a single statement, including
 * inside an unbraced if/else; the caller supplies the trailing ';'. */
#define CLAMP_RANGE(x, min, max) \
  do { \
    if ((x) < (min)) (x) = (min); \
    else if ((x) > (max)) (x) = (max); \
  } while (0)

/* Statement macro: stores the smaller of a and b into the lvalue result.
 * Wrapped in do/while(0) for the same reason as CLAMP_RANGE; the caller
 * supplies the trailing ';'. */
#define MIN(result, a, b) \
  do { \
    if ((a)<=(b)) (result) = (a); else (result) = (b); \
  } while (0)
|
|---|
| 34 |
|
|---|
| 35 | //Not implemented yet.
|
|---|
| 36 | //void __syncthreads() {
|
|---|
| 37 | //}
|
|---|
| 38 |
|
|---|
| 39 | void GPU(int iteration, int src, int dst, int startStep, int blocks,
|
|---|
| 40 | int threads, int gBorderCols) {
|
|---|
| 41 |
|
|---|
| 42 | void GPU_BLOCK(int bx, int bIteration, int bBorderCols,
|
|---|
| 43 | int src, int dst, int startStep){
|
|---|
| 44 | int prev[BLOCK_SIZE];
|
|---|
| 45 | int result[BLOCK_SIZE];
|
|---|
| 46 |
|
|---|
| 47 | void GPU_THREAD(int tx, int tbx, int tIteration, int tBorderCols,
|
|---|
| 48 | int src, int dst, int startStep, int* prev, int* result) {
|
|---|
| 49 | int small_block_cols = BLOCK_SIZE-tIteration*HALO*2;
|
|---|
| 50 | int blkX = small_block_cols*tbx-tBorderCols;
|
|---|
| 51 | int blkXmax = blkX+BLOCK_SIZE-1;
|
|---|
| 52 | int xidx = blkX+tx;
|
|---|
| 53 | int validXmin, validXmax;
|
|---|
| 54 | int W = tx-1;
|
|---|
| 55 | int E = tx+1;
|
|---|
| 56 |
|
|---|
| 57 | if (blkX < 0)
|
|---|
| 58 | validXmin = -blkX;
|
|---|
| 59 | else
|
|---|
| 60 | validXmin = 0;
|
|---|
| 61 | if (blkXmax > cols-1)
|
|---|
| 62 | validXmax = BLOCK_SIZE-1-(blkXmax-cols+1);
|
|---|
| 63 | else
|
|---|
| 64 | validXmax = BLOCK_SIZE-1;
|
|---|
| 65 | if (W < validXmin)
|
|---|
| 66 | W = validXmin;
|
|---|
| 67 | if (E > validXmax)
|
|---|
| 68 | E = validXmax;
|
|---|
| 69 |
|
|---|
| 70 | int isValid = IN_RANGE(tx, validXmin, validXmax);
|
|---|
| 71 |
|
|---|
| 72 | if (IN_RANGE(xidx, 0, cols-1)) {
|
|---|
| 73 | prev[tx] = gpuResult[src][xidx];
|
|---|
| 74 | }
|
|---|
| 75 |
|
|---|
| 76 | int computed;
|
|---|
| 77 |
|
|---|
| 78 | for (int i=0; i<tIteration; i++) {
|
|---|
| 79 | computed = 0;
|
|---|
| 80 | if (IN_RANGE(tx, i+1, BLOCK_SIZE-i-2) && isValid) {
|
|---|
| 81 | computed = 1;
|
|---|
| 82 | int left = prev[W];
|
|---|
| 83 | int up = prev[tx];
|
|---|
| 84 | int right = prev[E];
|
|---|
| 85 | int shortest;
|
|---|
| 86 | MIN(shortest, left, up);
|
|---|
| 87 | MIN(shortest, shortest, right);
|
|---|
| 88 | int index = cols*(startStep+i)+xidx;
|
|---|
| 89 | result[tx] = shortest + gpuWall[index];
|
|---|
| 90 | }
|
|---|
| 91 | if (i==iteration-1) break;
|
|---|
| 92 | if (computed != 0) prev[tx]= result[tx];
|
|---|
| 93 | //__syncthreads();
|
|---|
| 94 | }
|
|---|
| 95 | if (computed != 0) {
|
|---|
| 96 | gpuResult[dst][xidx]=result[tx];
|
|---|
| 97 | }
|
|---|
| 98 | }
|
|---|
| 99 |
|
|---|
| 100 | $proc thread_procs[BLOCK_SIZE];
|
|---|
| 101 |
|
|---|
| 102 | for (int tp = 0; tp < BLOCK_SIZE; tp++) {
|
|---|
| 103 | thread_procs[tp] = $spawn GPU_THREAD(tp, bx, bIteration, bBorderCols,
|
|---|
| 104 | src, dst, startStep, prev, result);
|
|---|
| 105 | }
|
|---|
| 106 | for (int tp = 0; tp < BLOCK_SIZE; tp++) {
|
|---|
| 107 | $wait thread_procs[tp];
|
|---|
| 108 | }
|
|---|
| 109 | }
|
|---|
| 110 |
|
|---|
| 111 | $proc block_procs[blocks];
|
|---|
| 112 |
|
|---|
| 113 | for (int b = 0; b < blocks; b++)
|
|---|
| 114 | block_procs[b] = $spawn GPU_BLOCK(b, iteration, gBorderCols, src,
|
|---|
| 115 | dst, startStep);
|
|---|
| 116 | for (int b = 0; b < blocks; b++) $wait block_procs[b];
|
|---|
| 117 | }
|
|---|
| 118 |
|
|---|
| 119 | void calc_path() {
|
|---|
| 120 | int src = 1, dst = 0;
|
|---|
| 121 |
|
|---|
| 122 | for (int t = 0; t < rows-1; t+=pyramid_height) {
|
|---|
| 123 | int temp = src;
|
|---|
| 124 | int min;
|
|---|
| 125 |
|
|---|
| 126 | src = dst;
|
|---|
| 127 | dst = temp;
|
|---|
| 128 | MIN(min, pyramid_height, rows-t-1);
|
|---|
| 129 | GPU( min, src, dst, t, blockCols, BLOCK_SIZE, borderCols);
|
|---|
| 130 | }
|
|---|
| 131 | }
|
|---|
| 132 |
|
|---|
| 133 | void main() {
|
|---|
| 134 | $assert(BLOCK_SIZE > pyramid_height*2);
|
|---|
| 135 | calc_path();
|
|---|
| 136 | }
|
|---|