| 1 | /* dot product of two arrays.
|
|---|
| 2 | * Command line execution:
|
|---|
| 3 | *
|
|---|
| 4 | */
|
|---|
| 5 | #include <civlc.h>
|
|---|
| 6 | #include <stdio.h>
|
|---|
| 7 |
|
|---|
| 8 | #define imin(a,b) (a<b?a:b)
|
|---|
| 9 |
|
|---|
| 10 | $input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
|
|---|
| 11 | $input int N_BOUND;
|
|---|
| 12 | $input int N;
|
|---|
| 13 | $assume 0 < N && N <= N_BOUND;
|
|---|
| 14 | int const threadsPerBlock = THREADS_PER_BLOCK;
|
|---|
| 15 | int const blocksPerGrid =
|
|---|
| 16 | imin(32, (N+threadsPerBlock-1) / threadsPerBlock );
|
|---|
| 17 | double *a, *b, c, *partial_c;
|
|---|
| 18 |
|
|---|
| 19 | void gpu(){
|
|---|
| 20 | $proc blocks[blocksPerGrid];
|
|---|
| 21 |
|
|---|
| 22 | void gpuBlock(int blockID){
|
|---|
| 23 | int num_in_barrier =0;
|
|---|
| 24 | int barrier_size = 0;
|
|---|
| 25 | double cache[threadsPerBlock];
|
|---|
| 26 | int in_barrier[threadsPerBlock];
|
|---|
| 27 | $proc threads[threadsPerBlock];
|
|---|
| 28 |
|
|---|
| 29 | void __sync_init(int* in_barrier, int size) {
|
|---|
| 30 | barrier_size = size;
|
|---|
| 31 | for (int i=0; i<size; i++) in_barrier[i] = 0;
|
|---|
| 32 | }
|
|---|
| 33 |
|
|---|
| 34 | // model the synchronization of threads in the same block
|
|---|
| 35 | void __syncthreads(int* in_barrier, int tid) {
|
|---|
| 36 | $atomic {
|
|---|
| 37 | in_barrier[tid] = 1; // I am in the barrier
|
|---|
| 38 | num_in_barrier++; // increment number in barrier
|
|---|
| 39 | if (num_in_barrier == barrier_size) { // I am last to enter
|
|---|
| 40 | for (int i=0; i<barrier_size; i++) in_barrier[i] = 0; // release all
|
|---|
| 41 | num_in_barrier = 0; // now none are in barrier
|
|---|
| 42 | }
|
|---|
| 43 | }
|
|---|
| 44 | $when (in_barrier[tid] == 0); // wait till I am released
|
|---|
| 45 | }
|
|---|
| 46 |
|
|---|
| 47 | void gpuThread(int threadID){
|
|---|
| 48 | int tid = threadID + blockID * threadsPerBlock;
|
|---|
| 49 | int cacheIndex = threadID;
|
|---|
| 50 | double temp = 0;
|
|---|
| 51 |
|
|---|
| 52 | $atomic {
|
|---|
| 53 | while (tid < N) {
|
|---|
| 54 | temp += a[tid] * b[tid];
|
|---|
| 55 | tid += threadsPerBlock * blocksPerGrid;
|
|---|
| 56 | }
|
|---|
| 57 | // set cache values
|
|---|
| 58 | cache[cacheIndex] = temp;
|
|---|
| 59 | }
|
|---|
| 60 | // synchronize threads
|
|---|
| 61 | __syncthreads(in_barrier, threadID);
|
|---|
| 62 | int i = threadsPerBlock/2;
|
|---|
| 63 | while (i != 0) {
|
|---|
| 64 | if (cacheIndex < i)
|
|---|
| 65 | cache[cacheIndex] += cache[cacheIndex + i];
|
|---|
| 66 | // synchronize threads
|
|---|
| 67 | __syncthreads(in_barrier, threadID);
|
|---|
| 68 | i /= 2;
|
|---|
| 69 | }
|
|---|
| 70 | if (cacheIndex == 0)
|
|---|
| 71 | partial_c[blockID] = cache[0];
|
|---|
| 72 | }
|
|---|
| 73 |
|
|---|
| 74 | __sync_init(in_barrier, threadsPerBlock);
|
|---|
| 75 | for(int i = 0; i < threadsPerBlock; i++) {
|
|---|
| 76 | threads[i] = $spawn gpuThread(i);
|
|---|
| 77 | }
|
|---|
| 78 | for(int i = 0; i < threadsPerBlock; i++) {
|
|---|
| 79 | $wait(threads[i]);
|
|---|
| 80 | }
|
|---|
| 81 | }
|
|---|
| 82 |
|
|---|
| 83 | for(int i = 0; i < blocksPerGrid; i++) {
|
|---|
| 84 | blocks[i] = $spawn gpuBlock(i);
|
|---|
| 85 | }
|
|---|
| 86 | for(int i = 0; i < blocksPerGrid; i++) {
|
|---|
| 87 | $wait(blocks[i]);
|
|---|
| 88 | }
|
|---|
| 89 | }
|
|---|
| 90 |
|
|---|
| 91 | int main( void ) {
|
|---|
| 92 | $scope host = $here;
|
|---|
| 93 |
|
|---|
| 94 | // allocate memory on the cpu side
|
|---|
| 95 | a = (double *) $malloc(host, N*sizeof(double));
|
|---|
| 96 | b = (double *) $malloc(host, N*sizeof(double));
|
|---|
| 97 | partial_c = (double *) $malloc(host, blocksPerGrid*sizeof(double));
|
|---|
| 98 | // fill in the host memory with data
|
|---|
| 99 | for (int i=0; i<N; i++) {
|
|---|
| 100 | a[i] = i;
|
|---|
| 101 | b[i] = i*2;
|
|---|
| 102 | }
|
|---|
| 103 | gpu();
|
|---|
| 104 | // finish up on the CPU side
|
|---|
| 105 | c = 0;
|
|---|
| 106 | for (int i=0; i<blocksPerGrid; i++) {
|
|---|
| 107 | c += partial_c[i];
|
|---|
| 108 | }
|
|---|
| 109 | #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
|
|---|
| 110 | // check result
|
|---|
| 111 | $assert(c == 2 * sum_squares( (double)(N - 1) ));
|
|---|
| 112 | }
|
|---|