| [33534bb] | 1 | /*
|
|---|
| 2 | * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
|
|---|
| 3 | *
|
|---|
| 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property
|
|---|
| 5 | * and proprietary rights in and to this software, related documentation
|
|---|
| 6 | * and any modifications thereto. Any use, reproduction, disclosure or
|
|---|
| 7 | * distribution of this software and related documentation without an express
|
|---|
| 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|---|
| 9 | *
|
|---|
| 10 | */
|
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 | /*
|
|---|
| 14 | * Jacobi iteration example using OpenACC in C
|
|---|
| 15 | * Build with
|
|---|
| 16 | * pgcc -acc -Minfo=accel -fast c3.c
|
|---|
| 17 | */
|
|---|
| 18 |
|
|---|
| 19 | #include <stdio.h>
|
|---|
| 20 | #include <stdlib.h>
|
|---|
| 21 | #include <assert.h>
|
|---|
| 22 | #include <openacc.h>
|
|---|
| 23 | #include <math.h>
|
|---|
| 24 |
|
|---|
| 25 | #if defined(_WIN32) || defined(_WIN64)
|
|---|
| 26 | #include <sys/timeb.h>
|
|---|
| 27 | #define gettime(a) _ftime(a)
|
|---|
| 28 | #define usec(t1,t2) ((((t2).time-(t1).time)*1000+((t2).millitm-(t1).millitm))*100)
|
|---|
| 29 | typedef struct _timeb timestruct;
|
|---|
| 30 | #else
|
|---|
| 31 | #include <sys/time.h>
|
|---|
| 32 | #define gettime(a) gettimeofday(a,NULL)
|
|---|
| 33 | #define usec(t1,t2) (((t2).tv_sec-(t1).tv_sec)*1000000+((t2).tv_usec-(t1).tv_usec))
|
|---|
| 34 | typedef struct timeval timestruct;
|
|---|
| 35 | #endif
|
|---|
| 36 |
|
|---|
| 37 | void
|
|---|
| 38 | smooth( float*restrict a, float*restrict b, float w0, float w1, float w2, int n, int m, int niters )
|
|---|
| 39 | {
|
|---|
| 40 | int i, j, iter;
|
|---|
| 41 | float* tmp;
|
|---|
| 42 | for( iter = 1; iter <= niters; ++iter ){
|
|---|
| 43 | #pragma acc kernels loop copyin(b[0:n*m]) copy(a[0:n*m])
|
|---|
| 44 | for( i = 1; i < n-1; ++i )
|
|---|
| 45 | for( j = 1; j < m-1; ++j )
|
|---|
| 46 | a[i*m+j] = w0 * b[i*m+j] +
|
|---|
| 47 | w1*(b[(i-1)*m+j] + b[(i+1)*m+j] + b[i*m+j-1] + b[i*m+j+1]) +
|
|---|
| 48 | w2*(b[(i-1)*m+j-1] + b[(i-1)*m+j+1] + b[(i+1)*m+j-1] + b[(i+1)*m+j+1]);
|
|---|
| 49 | tmp = a; a = b; b = tmp;
|
|---|
| 50 | }
|
|---|
| 51 | }
|
|---|
| 52 |
|
|---|
| 53 | void
|
|---|
| 54 | smoothhost( float*restrict a, float*restrict b, float w0, float w1, float w2, int n, int m, int niters )
|
|---|
| 55 | {
|
|---|
| 56 | int i, j, iter;
|
|---|
| 57 | float* tmp;
|
|---|
| 58 | for( iter = 1; iter <= niters; ++iter ){
|
|---|
| 59 | for( i = 1; i < n-1; ++i ){
|
|---|
| 60 | for( j = 1; j < m-1; ++j ){
|
|---|
| 61 | a[i*m+j] = w0 * b[i*m+j] +
|
|---|
| 62 | w1*(b[(i-1)*m+j] + b[(i+1)*m+j] + b[i*m+j-1] + b[i*m+j+1]) +
|
|---|
| 63 | w2*(b[(i-1)*m+j-1] + b[(i-1)*m+j+1] + b[(i+1)*m+j-1] + b[(i+1)*m+j+1]);
|
|---|
| 64 | }
|
|---|
| 65 | }
|
|---|
| 66 | tmp = a; a = b; b = tmp;
|
|---|
| 67 | }
|
|---|
| 68 | }
|
|---|
| 69 |
|
|---|
| 70 | void
|
|---|
| 71 | doprt( char* s, float*restrict a, float*restrict ah, int i, int j, int n, int m )
|
|---|
| 72 | {
|
|---|
| 73 | printf( "%s[%d][%d] = %g = %g\n", s, i, j, a[i*m+j], ah[i*m+j] );
|
|---|
| 74 | }
|
|---|
| 75 |
|
|---|
| 76 | int
|
|---|
| 77 | main( int argc, char* argv[] )
|
|---|
| 78 | {
|
|---|
| 79 | float *aa, *bb, *aahost, *bbhost;
|
|---|
| 80 | int i,j;
|
|---|
| 81 | float w0, w1, w2;
|
|---|
| 82 | int n, m, aerrs, berrs, iters;
|
|---|
| 83 | float dif, rdif, tol;
|
|---|
| 84 | timestruct t1, t2, t3;
|
|---|
| 85 | long long cgpu, chost;
|
|---|
| 86 |
|
|---|
| 87 | n = 0;
|
|---|
| 88 | m = 0;
|
|---|
| 89 | iters = 0;
|
|---|
| 90 |
|
|---|
| 91 | if( argc > 1 ){
|
|---|
| 92 | n = atoi( argv[1] );
|
|---|
| 93 | if( argc > 2 ){
|
|---|
| 94 | m = atoi( argv[2] );
|
|---|
| 95 | if( argc > 3 ){
|
|---|
| 96 | iters = atoi( argv[3] );
|
|---|
| 97 | if( argc > 4 ){
|
|---|
| 98 | if( !strcmp( argv[4], "host" ) ||
|
|---|
| 99 | !strcmp( argv[4], "HOST" ) ){
|
|---|
| 100 | acc_set_device( acc_device_host );
|
|---|
| 101 | printf( "using host\n" );
|
|---|
| 102 | }else
|
|---|
| 103 | if( !strcmp( argv[4], "nvidia" ) ||
|
|---|
| 104 | !strcmp( argv[4], "NVIDIA" ) ){
|
|---|
| 105 | acc_set_device( acc_device_nvidia );
|
|---|
| 106 | acc_init( acc_device_nvidia );
|
|---|
| 107 | printf( "using nvidia\n" );
|
|---|
| 108 | }else{
|
|---|
| 109 | printf( "unknown device: %s\nUsing default\n", argv[4] );
|
|---|
| 110 | }
|
|---|
| 111 | }
|
|---|
| 112 | }
|
|---|
| 113 | }
|
|---|
| 114 | }
|
|---|
| 115 |
|
|---|
| 116 | if( n <= 0 ) n = 1000;
|
|---|
| 117 | if( m <= 0 ) m = n;
|
|---|
| 118 | if( iters <= 0 ) iters = 10;
|
|---|
| 119 |
|
|---|
| 120 | aa = (float*) malloc( sizeof(float) * n * m );
|
|---|
| 121 | aahost = (float*) malloc( sizeof(float) * n * m );
|
|---|
| 122 | bb = (float*)malloc( sizeof(float) * n * m );
|
|---|
| 123 | bbhost = (float*)malloc( sizeof(float) * n * m );
|
|---|
| 124 | for( i = 0; i < n; ++i ){
|
|---|
| 125 | for( j = 0; j < m; ++j ){
|
|---|
| 126 | aa[i*m+j] = 0;
|
|---|
| 127 | aahost[i*m+j] = 0;
|
|---|
| 128 | bb[i*m+j] = i*1000 + j;
|
|---|
| 129 | bbhost[i*m+j] = i*1000 + j;
|
|---|
| 130 | }
|
|---|
| 131 | }
|
|---|
| 132 | w0 = 0.5;
|
|---|
| 133 | w1 = 0.3;
|
|---|
| 134 | w2 = 0.2;
|
|---|
| 135 | gettime( &t1 );
|
|---|
| 136 | smooth( aa, bb, w0, w1, w2, n, m, iters );
|
|---|
| 137 | gettime( &t2 );
|
|---|
| 138 | smoothhost( aahost, bbhost, w0, w1, w2, n, m, iters );
|
|---|
| 139 | gettime( &t3 );
|
|---|
| 140 |
|
|---|
| 141 | cgpu = usec(t1,t2);
|
|---|
| 142 | chost = usec(t2,t3);
|
|---|
| 143 |
|
|---|
| 144 | printf( "matrix %d x %d, %d iterations\n", n, m, iters );
|
|---|
| 145 | printf( "%13ld microseconds optimized\n", cgpu );
|
|---|
| 146 | printf( "%13ld microseconds on host\n", chost );
|
|---|
| 147 |
|
|---|
| 148 | aerrs = berrs = 0;
|
|---|
| 149 | tol = 0.000005;
|
|---|
| 150 | for( i = 0; i < n; ++i ){
|
|---|
| 151 | for( j = 0; j < m; ++j ){
|
|---|
| 152 | rdif = dif = fabsf(aa[i*m+j] - aahost[i*m+j]);
|
|---|
| 153 | if( aahost[i*m+j] ) rdif = fabsf(dif / aahost[i*m+j]);
|
|---|
| 154 | if( rdif > tol ){
|
|---|
| 155 | ++aerrs;
|
|---|
| 156 | if( aerrs < 10 ){
|
|---|
| 157 | printf( "aa[%d][%d] = %12.7e != %12.7e, dif=%12.7e\n", i, j, (double)aa[i*m+j], (double)aahost[i*m+j], (double)dif );
|
|---|
| 158 | }
|
|---|
| 159 | }
|
|---|
| 160 | rdif = dif = fabsf(bb[i*m+j] - bbhost[i*m+j]);
|
|---|
| 161 | if( bbhost[i*m+j] ) rdif = fabsf(dif / bbhost[i*m+j]);
|
|---|
| 162 | if( rdif > tol ){
|
|---|
| 163 | ++berrs;
|
|---|
| 164 | if( berrs < 10 ){
|
|---|
| 165 | printf( "bb[%d][%d] = %12.7e != %12.7e, dif=%12.7e\n", i, j, (double)bb[i*m+j], (double)bbhost[i*m+j], (double)dif );
|
|---|
| 166 | }
|
|---|
| 167 | }
|
|---|
| 168 | }
|
|---|
| 169 | }
|
|---|
| 170 | if( aerrs == 0 && berrs == 0 ){
|
|---|
| 171 | printf( "Test PASSED\n" );
|
|---|
| 172 | return 0;
|
|---|
| 173 | }else{
|
|---|
| 174 | printf( "Test FAILED\n" );
|
|---|
| 175 | printf( "%d ERRORS found\n", aerrs + berrs );
|
|---|
| 176 | return 1;
|
|---|
| 177 | }
|
|---|
| 178 | }
|
|---|