source: CIVL/examples/cuda/dot.cvl@ fbc5eb8

1.23 2.0 main test-branch
Last change on this file since fbc5eb8 was a82987f, checked in by Manchun Zheng <zmanchun@…>, 12 years ago

$wait is re-implemented as a system function now; examples are updated accordingly.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@664 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 3.1 KB
Line 
1/* dot product of two arrays.
2 * Command line execution:
3 *
4 */
5#include <civlc.h>
6#include <stdio.h>
7
8#define imin(a,b) (a<b?a:b)
9
10$input int THREADS_PER_BLOCK; // thread number per block: must be a power of 2, due to the while loop at the end of gpuThread();
11$input int N_BOUND;
12$input int N;
13$assume 0 < N && N <= N_BOUND;
14int const threadsPerBlock = THREADS_PER_BLOCK;
15int const blocksPerGrid =
16 imin(32, (N+threadsPerBlock-1) / threadsPerBlock );
17double *a, *b, c, *partial_c;
18
19void gpu(){
20 $proc blocks[blocksPerGrid];
21
22 void gpuBlock(int blockID){
23 int num_in_barrier =0;
24 int barrier_size = 0;
25 double cache[threadsPerBlock];
26 int in_barrier[threadsPerBlock];
27 $proc threads[threadsPerBlock];
28
29 void __sync_init(int* in_barrier, int size) {
30 barrier_size = size;
31 for (int i=0; i<size; i++) in_barrier[i] = 0;
32 }
33
34 // model the synchronization of threads in the same block
35 void __syncthreads(int* in_barrier, int tid) {
36 $atomic {
37 in_barrier[tid] = 1; // I am in the barrier
38 num_in_barrier++; // increment number in barrier
39 if (num_in_barrier == barrier_size) { // I am last to enter
40 for (int i=0; i<barrier_size; i++) in_barrier[i] = 0; // release all
41 num_in_barrier = 0; // now none are in barrier
42 }
43 }
44 $when (in_barrier[tid] == 0); // wait till I am released
45 }
46
47 void gpuThread(int threadID){
48 int tid = threadID + blockID * threadsPerBlock;
49 int cacheIndex = threadID;
50 double temp = 0;
51
52 $atomic {
53 while (tid < N) {
54 temp += a[tid] * b[tid];
55 tid += threadsPerBlock * blocksPerGrid;
56 }
57 // set cache values
58 cache[cacheIndex] = temp;
59 }
60 // synchronize threads
61 __syncthreads(in_barrier, threadID);
62 int i = threadsPerBlock/2;
63 while (i != 0) {
64 if (cacheIndex < i)
65 cache[cacheIndex] += cache[cacheIndex + i];
66 // synchronize threads
67 __syncthreads(in_barrier, threadID);
68 i /= 2;
69 }
70 if (cacheIndex == 0)
71 partial_c[blockID] = cache[0];
72 }
73
74 __sync_init(in_barrier, threadsPerBlock);
75 for(int i = 0; i < threadsPerBlock; i++) {
76 threads[i] = $spawn gpuThread(i);
77 }
78 for(int i = 0; i < threadsPerBlock; i++) {
79 $wait(threads[i]);
80 }
81 }
82
83 for(int i = 0; i < blocksPerGrid; i++) {
84 blocks[i] = $spawn gpuBlock(i);
85 }
86 for(int i = 0; i < blocksPerGrid; i++) {
87 $wait(blocks[i]);
88 }
89}
90
91int main( void ) {
92 $scope host = $here;
93
94 // allocate memory on the cpu side
95 a = (double *) $malloc(host, N*sizeof(double));
96 b = (double *) $malloc(host, N*sizeof(double));
97 partial_c = (double *) $malloc(host, blocksPerGrid*sizeof(double));
98 // fill in the host memory with data
99 for (int i=0; i<N; i++) {
100 a[i] = i;
101 b[i] = i*2;
102 }
103 gpu();
104 // finish up on the CPU side
105 c = 0;
106 for (int i=0; i<blocksPerGrid; i++) {
107 c += partial_c[i];
108 }
109 #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
110 // check result
111 $assert(c == 2 * sum_squares( (double)(N - 1) ));
112}
Note: See TracBrowser for help on using the repository browser.