| 1 | /* dot product of two arrays.
|
|---|
| 2 | * Command line execution:
|
|---|
| 3 | * civl verify dot.cvl -inputN_BOUND=8 -inputTHREADS_PER_BLOCK=4
|
|---|
| 4 | */
|
|---|
| 5 | #include <civlc.cvh>
|
|---|
| 6 | #include <concurrency.cvh>
|
|---|
| 7 | #include <stdio.h>
|
|---|
| 8 |
|
|---|
| 9 | #define imin(a,b) (a<b?a:b)
|
|---|
| 10 |
|
|---|
| 11 | $input int THREADS_PER_BLOCK = 4; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
|
|---|
| 12 | $input int N_BOUND = 8;
|
|---|
| 13 | $input int N;
|
|---|
| 14 | $assume(0 < N && N <= N_BOUND);
|
|---|
| 15 | int const threadsPerBlock = THREADS_PER_BLOCK;
|
|---|
| 16 | int const blocksPerGrid =
|
|---|
| 17 | imin(32, (N+threadsPerBlock-1) / threadsPerBlock );
|
|---|
| 18 | double *a, *b, c, *partial_c;
|
|---|
| 19 |
|
|---|
| 20 | void gpu(){
|
|---|
| 21 | $proc blocks[blocksPerGrid];
|
|---|
| 22 |
|
|---|
| 23 | void gpuBlock(int blockID){
|
|---|
| 24 | int num_in_barrier =0;
|
|---|
| 25 | int barrier_size = 0;
|
|---|
| 26 | double cache[threadsPerBlock];
|
|---|
| 27 | $gbarrier gbarrier = $gbarrier_create($here, threadsPerBlock);
|
|---|
| 28 | $proc threads[threadsPerBlock];
|
|---|
| 29 |
|
|---|
| 30 | void gpuThread(int threadID){
|
|---|
| 31 | int tid = threadID + blockID * threadsPerBlock;
|
|---|
| 32 | int cacheIndex = threadID;
|
|---|
| 33 | double temp = 0;
|
|---|
| 34 | $barrier barrier = $barrier_create($here, gbarrier, threadID);
|
|---|
| 35 | int i;
|
|---|
| 36 |
|
|---|
| 37 | $atomic {
|
|---|
| 38 | while (tid < N) {
|
|---|
| 39 | temp += a[tid] * b[tid];
|
|---|
| 40 | tid += threadsPerBlock * blocksPerGrid;
|
|---|
| 41 | }
|
|---|
| 42 | // set cache values
|
|---|
| 43 | cache[cacheIndex] = temp;
|
|---|
| 44 | }
|
|---|
| 45 | // synchronize threads
|
|---|
| 46 | $barrier_call(barrier);
|
|---|
| 47 | i = threadsPerBlock/2;
|
|---|
| 48 | while (i != 0) {
|
|---|
| 49 | if (cacheIndex < i)
|
|---|
| 50 | cache[cacheIndex] += cache[cacheIndex + i];
|
|---|
| 51 | // synchronize threads
|
|---|
| 52 | $barrier_call(barrier);
|
|---|
| 53 | i /= 2;
|
|---|
| 54 | }
|
|---|
| 55 | if (cacheIndex == 0)
|
|---|
| 56 | partial_c[blockID] = cache[0];
|
|---|
| 57 | $barrier_destroy(barrier);
|
|---|
| 58 | }
|
|---|
| 59 |
|
|---|
| 60 | for(int i = 0; i < threadsPerBlock; i++) {
|
|---|
| 61 | threads[i] = $spawn gpuThread(i);
|
|---|
| 62 | }
|
|---|
| 63 | for(int i = 0; i < threadsPerBlock; i++) {
|
|---|
| 64 | $wait(threads[i]);
|
|---|
| 65 | }
|
|---|
| 66 | $gbarrier_destroy(gbarrier);
|
|---|
| 67 | }
|
|---|
| 68 | // spawns gpublock's
|
|---|
| 69 | for(int i = 0; i < blocksPerGrid; i++) {
|
|---|
| 70 | blocks[i] = $spawn gpuBlock(i);
|
|---|
| 71 | }
|
|---|
| 72 | // waits for gpublock's
|
|---|
| 73 | for(int i = 0; i < blocksPerGrid; i++) {
|
|---|
| 74 | $wait(blocks[i]);
|
|---|
| 75 | }
|
|---|
| 76 | }
|
|---|
| 77 |
|
|---|
| 78 | int main( void ) {
|
|---|
| 79 | $scope host = $here;
|
|---|
| 80 |
|
|---|
| 81 | // allocate memory on the cpu side
|
|---|
| 82 | a = (double *) $malloc(host, N*sizeof(double));
|
|---|
| 83 | b = (double *) $malloc(host, N*sizeof(double));
|
|---|
| 84 | partial_c = (double *) $malloc(host, blocksPerGrid*sizeof(double));
|
|---|
| 85 | // fill in the host memory with data
|
|---|
| 86 | for (int i=0; i<N; i++) {
|
|---|
| 87 | a[i] = i;
|
|---|
| 88 | b[i] = i*2;
|
|---|
| 89 | }
|
|---|
| 90 | gpu();
|
|---|
| 91 | // finish up on the CPU side
|
|---|
| 92 | c = 0;
|
|---|
| 93 | for (int i=0; i<blocksPerGrid; i++) {
|
|---|
| 94 | c += partial_c[i];
|
|---|
| 95 | }
|
|---|
| 96 | #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
|
|---|
| 97 | // check result
|
|---|
| 98 | $assert((c == 2 * sum_squares( (double)(N - 1) )));
|
|---|
| 99 | $free(a);
|
|---|
| 100 | $free(b);
|
|---|
| 101 | $free(partial_c);
|
|---|
| 102 | }
|
|---|