| 1 | /*
|
|---|
| 2 | * Copyright (c) 2016, NVIDIA Corporation. All rights reserved.
|
|---|
| 3 | *
|
|---|
| 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
|
|---|
| 5 | * with this source code for terms and conditions that govern your use of
|
|---|
| 6 | * this software. Any use, reproduction, disclosure, or distribution of
|
|---|
| 7 | * this software and related documentation outside the terms of the EULA
|
|---|
| 8 | * is strictly prohibited.
|
|---|
| 9 | *
|
|---|
| 10 | */
|
|---|
| 11 |
|
|---|
| 12 | #include <stdlib.h>
|
|---|
| 13 | #include <math.h>
|
|---|
| 14 | #include <stdio.h>
|
|---|
| 15 | #include <string.h>
|
|---|
| 16 |
|
|---|
| 17 | #ifdef _WIN32
|
|---|
| 18 | #define WIN32_LEAN_AND_MEAN
|
|---|
| 19 | #include <windows.h>
|
|---|
| 20 | #else
|
|---|
| 21 | #include <sys/time.h>
|
|---|
| 22 | #endif
|
|---|
| 23 |
|
|---|
| 24 | #ifndef FP32
|
|---|
| 25 | #define FP64
|
|---|
| 26 | #endif
|
|---|
| 27 |
|
|---|
| 28 | #ifdef FP64
|
|---|
| 29 | typedef double real;
|
|---|
| 30 | int flopsPerInteraction = 30;
|
|---|
| 31 | const real SOFTENING_SQUARED = 0.01;
|
|---|
| 32 | #define RSQRT(x) 1.0 / sqrt((x))
|
|---|
| 33 | #else
|
|---|
| 34 | typedef float real;
|
|---|
| 35 | int flopsPerInteraction = 20;
|
|---|
| 36 | const real SOFTENING_SQUARED = 0.01f;
|
|---|
| 37 | #define RSQRT(x) 1.0f / sqrtf((x))
|
|---|
| 38 | #endif
|
|---|
| 39 |
|
|---|
| 40 | typedef struct { real x, y, z; } real3;
|
|---|
| 41 | typedef struct { real x, y, z, w; } real4;
|
|---|
| 42 |
|
|---|
| 43 | real3 bodyBodyInteraction(real iPosx, real iPosy, real iPosz,
|
|---|
| 44 | real jPosx, real jPosy, real jPosz, real jMass)
|
|---|
| 45 | {
|
|---|
| 46 | real rx, ry, rz;
|
|---|
| 47 |
|
|---|
| 48 | // r_01 [3 FLOPS]
|
|---|
| 49 | rx = jPosx - iPosx;
|
|---|
| 50 | ry = jPosy - iPosy;
|
|---|
| 51 | rz = jPosz - iPosz;
|
|---|
| 52 |
|
|---|
| 53 | // d^2 + e^2 [6 FLOPS]
|
|---|
| 54 | real distSqr = rx*rx+ry*ry+rz*rz;;
|
|---|
| 55 | distSqr += SOFTENING_SQUARED;
|
|---|
| 56 |
|
|---|
| 57 | // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
|
|---|
| 58 | real invDist = RSQRT(distSqr);
|
|---|
| 59 | real invDistCube = invDist * invDist * invDist;
|
|---|
| 60 |
|
|---|
| 61 | // s = m_j * invDistCube [1 FLOP]
|
|---|
| 62 | real s = jMass * invDistCube;
|
|---|
| 63 |
|
|---|
| 64 | // (m_1 * r_01) / (d^2 + e^2)^(3/2) [6 FLOPS]
|
|---|
| 65 | real3 f;
|
|---|
| 66 | f.x = rx * s;
|
|---|
| 67 | f.y = ry * s;
|
|---|
| 68 | f.z = rz * s;
|
|---|
| 69 |
|
|---|
| 70 | return f;
|
|---|
| 71 | }
|
|---|
| 72 |
|
|---|
| 73 | void seqintegrate(real4 * restrict out,
|
|---|
| 74 | real4 * restrict in,
|
|---|
| 75 | real3 * restrict vel,
|
|---|
| 76 | real3 * restrict force,
|
|---|
| 77 | real dt,
|
|---|
| 78 | int n)
|
|---|
| 79 | {
|
|---|
| 80 |
|
|---|
| 81 | for (int i = 0; i < n; i++)
|
|---|
| 82 | {
|
|---|
| 83 | real fx, fy, fz;
|
|---|
| 84 | fx = fy = fz = 0;
|
|---|
| 85 |
|
|---|
| 86 | real iPosx = in[i].x;
|
|---|
| 87 | real iPosy = in[i].y;
|
|---|
| 88 | real iPosz = in[i].z;
|
|---|
| 89 |
|
|---|
| 90 | for (int j = 0; j < n; j++)
|
|---|
| 91 | {
|
|---|
| 92 | real3 ff = bodyBodyInteraction(iPosx, iPosy, iPosz,
|
|---|
| 93 | in[j].x, in[j].y, in[j].z, in[j].w);
|
|---|
| 94 | fx += ff.x;
|
|---|
| 95 | fy += ff.y;
|
|---|
| 96 | fz += ff.z;
|
|---|
| 97 | }
|
|---|
| 98 |
|
|---|
| 99 | force[i].x = fx;
|
|---|
| 100 | force[i].y = fy;
|
|---|
| 101 | force[i].z = fz;
|
|---|
| 102 | }
|
|---|
| 103 |
|
|---|
| 104 | for (int i = 0; i < n; i++)
|
|---|
| 105 | {
|
|---|
| 106 | real fx = force[i].x;
|
|---|
| 107 | real fy = force[i].y;
|
|---|
| 108 | real fz = force[i].z;
|
|---|
| 109 |
|
|---|
| 110 | real px = in[i].x;
|
|---|
| 111 | real py = in[i].y;
|
|---|
| 112 | real pz = in[i].z;
|
|---|
| 113 | real invMass = in[i].w;
|
|---|
| 114 |
|
|---|
| 115 | real vx = vel[i].x;
|
|---|
| 116 | real vy = vel[i].y;
|
|---|
| 117 | real vz = vel[i].z;
|
|---|
| 118 |
|
|---|
| 119 | // acceleration = force / mass;
|
|---|
| 120 | // new velocity = old velocity + acceleration * deltaTime
|
|---|
| 121 | vx += (fx * invMass) * dt;
|
|---|
| 122 | vy += (fy * invMass) * dt;
|
|---|
| 123 | vz += (fz * invMass) * dt;
|
|---|
| 124 |
|
|---|
| 125 | // new position = old position + velocity * deltaTime
|
|---|
| 126 | px += vx * dt;
|
|---|
| 127 | py += vy * dt;
|
|---|
| 128 | pz += vz * dt;
|
|---|
| 129 |
|
|---|
| 130 | out[i].x = px;
|
|---|
| 131 | out[i].y = py;
|
|---|
| 132 | out[i].z = pz;
|
|---|
| 133 | out[i].w = invMass;
|
|---|
| 134 |
|
|---|
| 135 | vel[i].x = vx;
|
|---|
| 136 | vel[i].y = vy;
|
|---|
| 137 | vel[i].z = vz;
|
|---|
| 138 | }
|
|---|
| 139 | }
|
|---|
| 140 |
|
|---|
| 141 | void integrate(real4 * restrict out,
|
|---|
| 142 | real4 * restrict in,
|
|---|
| 143 | real3 * restrict vel,
|
|---|
| 144 | real3 * restrict force,
|
|---|
| 145 | real dt,
|
|---|
| 146 | int n)
|
|---|
| 147 | {
|
|---|
| 148 |
|
|---|
| 149 | #pragma acc data pcopyin(in[0:n]) pcopyout(out[0:n]) pcopy(force[0:n], vel[0:n])
|
|---|
| 150 | {
|
|---|
| 151 | #pragma acc parallel loop
|
|---|
| 152 | for (int i = 0; i < n; i++)
|
|---|
| 153 | {
|
|---|
| 154 | real fx, fy, fz;
|
|---|
| 155 | fx = fy = fz = 0;
|
|---|
| 156 |
|
|---|
| 157 | real iPosx = in[i].x;
|
|---|
| 158 | real iPosy = in[i].y;
|
|---|
| 159 | real iPosz = in[i].z;
|
|---|
| 160 |
|
|---|
| 161 | for (int j = 0; j < n; j++)
|
|---|
| 162 | {
|
|---|
| 163 | real3 ff = bodyBodyInteraction(iPosx, iPosy, iPosz,
|
|---|
| 164 | in[j].x, in[j].y, in[j].z, in[j].w);
|
|---|
| 165 | fx += ff.x;
|
|---|
| 166 | fy += ff.y;
|
|---|
| 167 | fz += ff.z;
|
|---|
| 168 | }
|
|---|
| 169 |
|
|---|
| 170 | force[i].x = fx;
|
|---|
| 171 | force[i].y = fy;
|
|---|
| 172 | force[i].z = fz;
|
|---|
| 173 | }
|
|---|
| 174 |
|
|---|
| 175 | #pragma acc parallel loop
|
|---|
| 176 | for (int i = 0; i < n; i++)
|
|---|
| 177 | {
|
|---|
| 178 | real fx = force[i].x;
|
|---|
| 179 | real fy = force[i].y;
|
|---|
| 180 | real fz = force[i].z;
|
|---|
| 181 |
|
|---|
| 182 | real px = in[i].x;
|
|---|
| 183 | real py = in[i].y;
|
|---|
| 184 | real pz = in[i].z;
|
|---|
| 185 | real invMass = in[i].w;
|
|---|
| 186 |
|
|---|
| 187 | real vx = vel[i].x;
|
|---|
| 188 | real vy = vel[i].y;
|
|---|
| 189 | real vz = vel[i].z;
|
|---|
| 190 |
|
|---|
| 191 | // acceleration = force / mass;
|
|---|
| 192 | // new velocity = old velocity + acceleration * deltaTime
|
|---|
| 193 | vx += (fx * invMass) * dt;
|
|---|
| 194 | vy += (fy * invMass) * dt;
|
|---|
| 195 | vz += (fz * invMass) * dt;
|
|---|
| 196 |
|
|---|
| 197 | // new position = old position + velocity * deltaTime
|
|---|
| 198 | px += vx * dt;
|
|---|
| 199 | py += vy * dt;
|
|---|
| 200 | pz += vz * dt;
|
|---|
| 201 |
|
|---|
| 202 | out[i].x = px;
|
|---|
| 203 | out[i].y = py;
|
|---|
| 204 | out[i].z = pz;
|
|---|
| 205 | out[i].w = invMass;
|
|---|
| 206 |
|
|---|
| 207 | vel[i].x = vx;
|
|---|
| 208 | vel[i].y = vy;
|
|---|
| 209 | vel[i].z = vz;
|
|---|
| 210 | }
|
|---|
| 211 | }
|
|---|
| 212 | }
|
|---|
| 213 |
|
|---|
| 214 | real
|
|---|
| 215 | dot(real v0[3], real v1[3])
|
|---|
| 216 | {
|
|---|
| 217 | return v0[0]*v1[0]+v0[1]*v1[1]+v0[2]*v1[2];
|
|---|
| 218 | }
|
|---|
| 219 |
|
|---|
| 220 | real
|
|---|
| 221 | normalize(real vector[3])
|
|---|
| 222 | {
|
|---|
| 223 | float dist = sqrt(dot(vector, vector));
|
|---|
| 224 | if (dist > 1e-6)
|
|---|
| 225 | {
|
|---|
| 226 | vector[0] /= dist;
|
|---|
| 227 | vector[1] /= dist;
|
|---|
| 228 | vector[2] /= dist;
|
|---|
| 229 | }
|
|---|
| 230 | return dist;
|
|---|
| 231 | }
|
|---|
| 232 |
|
|---|
| 233 | void cross(real out[3], real v0[3], real v1[3])
|
|---|
| 234 | {
|
|---|
| 235 | out[0] = v0[1]*v1[2]-v0[2]*v1[1];
|
|---|
| 236 | out[1] = v0[2]*v1[0]-v0[0]*v1[2];
|
|---|
| 237 | out[2] = v0[0]*v1[1]-v0[1]*v1[0];
|
|---|
| 238 | }
|
|---|
| 239 |
|
|---|
| 240 | void randomizeBodies(real4* pos,
|
|---|
| 241 | real3* vel,
|
|---|
| 242 | float clusterScale,
|
|---|
| 243 | float velocityScale,
|
|---|
| 244 | int n)
|
|---|
| 245 | {
|
|---|
| 246 | srand(42);
|
|---|
| 247 | float scale = clusterScale;
|
|---|
| 248 | float vscale = scale * velocityScale;
|
|---|
| 249 | float inner = 2.5f * scale;
|
|---|
| 250 | float outer = 4.0f * scale;
|
|---|
| 251 |
|
|---|
| 252 | int p = 0, v=0;
|
|---|
| 253 | int i = 0;
|
|---|
| 254 | while (i < n)
|
|---|
| 255 | {
|
|---|
| 256 | real x, y, z;
|
|---|
| 257 | x = rand() / (float) RAND_MAX * 2 - 1;
|
|---|
| 258 | y = rand() / (float) RAND_MAX * 2 - 1;
|
|---|
| 259 | z = rand() / (float) RAND_MAX * 2 - 1;
|
|---|
| 260 |
|
|---|
| 261 | real point[3] = {x, y, z};
|
|---|
| 262 | real len = normalize(point);
|
|---|
| 263 | if (len > 1)
|
|---|
| 264 | continue;
|
|---|
| 265 |
|
|---|
| 266 | pos[i].x = point[0] * (inner + (outer - inner) * rand() / (real) RAND_MAX);
|
|---|
| 267 | pos[i].y = point[1] * (inner + (outer - inner) * rand() / (real) RAND_MAX);
|
|---|
| 268 | pos[i].z = point[2] * (inner + (outer - inner) * rand() / (real) RAND_MAX);
|
|---|
| 269 | pos[i].w = 1.0f;
|
|---|
| 270 |
|
|---|
| 271 | x = 0.0f;
|
|---|
| 272 | y = 0.0f;
|
|---|
| 273 | z = 1.0f;
|
|---|
| 274 | real axis[3] = {x, y, z};
|
|---|
| 275 | normalize(axis);
|
|---|
| 276 |
|
|---|
| 277 | if (1 - dot(point, axis) < 1e-6)
|
|---|
| 278 | {
|
|---|
| 279 | axis[0] = point[1];
|
|---|
| 280 | axis[1] = point[0];
|
|---|
| 281 | normalize(axis);
|
|---|
| 282 | }
|
|---|
| 283 | //if (point.y < 0) axis = scalevec(axis, -1);
|
|---|
| 284 | real vv[3] = {(real)pos[i].x, (real)pos[i].y, (real)pos[i].z};
|
|---|
| 285 | cross(vv, vv, axis);
|
|---|
| 286 | vel[i].x = vv[0] * vscale;
|
|---|
| 287 | vel[i].y = vv[1] * vscale;
|
|---|
| 288 | vel[i].z = vv[2] * vscale;
|
|---|
| 289 |
|
|---|
| 290 | i++;
|
|---|
| 291 | }
|
|---|
| 292 | }
|
|---|
| 293 |
|
|---|
| 294 | #ifdef _WIN32
|
|---|
| 295 | double PCFreq = 0.0;
|
|---|
| 296 | __int64 timerStart = 0;
|
|---|
| 297 | #else
|
|---|
| 298 | struct timeval timerStart;
|
|---|
| 299 | #endif
|
|---|
| 300 |
|
|---|
| 301 | void StartTimer()
|
|---|
| 302 | {
|
|---|
| 303 | #ifdef _WIN32
|
|---|
| 304 | LARGE_INTEGER li;
|
|---|
| 305 | if(!QueryPerformanceFrequency(&li))
|
|---|
| 306 | printf("QueryPerformanceFrequency failed!\n");
|
|---|
| 307 |
|
|---|
| 308 | PCFreq = (double)li.QuadPart/1000.0;
|
|---|
| 309 |
|
|---|
| 310 | QueryPerformanceCounter(&li);
|
|---|
| 311 | timerStart = li.QuadPart;
|
|---|
| 312 | #else
|
|---|
| 313 | gettimeofday(&timerStart, NULL);
|
|---|
| 314 | #endif
|
|---|
| 315 | }
|
|---|
| 316 |
|
|---|
| 317 | // time elapsed in ms
|
|---|
| 318 | double GetTimer()
|
|---|
| 319 | {
|
|---|
| 320 | #ifdef _WIN32
|
|---|
| 321 | LARGE_INTEGER li;
|
|---|
| 322 | QueryPerformanceCounter(&li);
|
|---|
| 323 | return (double)(li.QuadPart-timerStart)/PCFreq;
|
|---|
| 324 | #else
|
|---|
| 325 | struct timeval timerStop, timerElapsed;
|
|---|
| 326 | gettimeofday(&timerStop, NULL);
|
|---|
| 327 | timersub(&timerStop, &timerStart, &timerElapsed);
|
|---|
| 328 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0;
|
|---|
| 329 | #endif
|
|---|
| 330 | }
|
|---|
| 331 |
|
|---|
| 332 | // run one iteration and compare to non-accelerated CPU version
|
|---|
| 333 | void checkCorrectness(real4 *restrict pin,
|
|---|
| 334 | real4 *restrict pout,
|
|---|
| 335 | real3 *restrict v,
|
|---|
| 336 | real dt,
|
|---|
| 337 | int n)
|
|---|
| 338 | {
|
|---|
| 339 | real4 *pin_ref = (real4*)malloc(n * sizeof(real4));
|
|---|
| 340 | real4 *pout_ref = (real4*)malloc(n * sizeof(real4));
|
|---|
| 341 | real3 *v_ref = (real3*)malloc(n * sizeof(real3));
|
|---|
| 342 | real3 *f = (real3*)malloc(n * sizeof(real3));
|
|---|
| 343 | real3 *f_ref = (real3*)malloc(n * sizeof(real3));
|
|---|
| 344 |
|
|---|
| 345 | randomizeBodies(pin_ref, v_ref, 1.54f, 8.0f, n);
|
|---|
| 346 | memcpy( pin, pin_ref, sizeof(real4)*n);
|
|---|
| 347 | memcpy( v, v_ref, sizeof(real3)*n);
|
|---|
| 348 | seqintegrate(pout_ref, pin_ref, v_ref, f_ref, dt, n);
|
|---|
| 349 | integrate(pout, pin, v, f, dt, n);
|
|---|
| 350 |
|
|---|
| 351 | #ifdef FP64
|
|---|
| 352 | checkdtol( pout, pout_ref, 4*n );
|
|---|
| 353 | #else
|
|---|
| 354 | checkftol5( pout, pout_ref, 4*n );
|
|---|
| 355 | #endif
|
|---|
| 356 |
|
|---|
| 357 | free(pin_ref);
|
|---|
| 358 | free(pout_ref);
|
|---|
| 359 | free(v_ref);
|
|---|
| 360 | }
|
|---|
| 361 |
|
|---|
| 362 | double computePerfStats(float milliseconds, int iterations, int n)
|
|---|
| 363 | {
|
|---|
| 364 | // double precision uses intrinsic operation followed by refinement,
|
|---|
| 365 | // resulting in higher operation count per interaction.
|
|---|
| 366 | // (Note Astrophysicists use 38 flops per interaction no matter what,
|
|---|
| 367 | // based on "historical precedent", but they are using FLOP/s as a
|
|---|
| 368 | // measure of "science throughput". We are using it as a measure of
|
|---|
| 369 | // hardware throughput. They should really use interactions/s...
|
|---|
| 370 | // const int flopsPerInteraction = fp64 ? 30 : 20;
|
|---|
| 371 | double interactionsPerSecond = (double)n * (double)n;
|
|---|
| 372 | interactionsPerSecond *= 1e-9 * iterations * 1000 / milliseconds;
|
|---|
| 373 | return interactionsPerSecond * (double)flopsPerInteraction;
|
|---|
| 374 | }
|
|---|
| 375 |
|
|---|
| 376 | int main(int argc, char** argv)
|
|---|
| 377 | {
|
|---|
| 378 | int n = 1024;
|
|---|
| 379 | int iterations = 20;
|
|---|
| 380 | real dt = 0.01667;
|
|---|
| 381 |
|
|---|
| 382 | if (argc >= 2) n = atoi(argv[1]);
|
|---|
| 383 | if (argc >= 3) iterations = atoi(argv[2]);
|
|---|
| 384 |
|
|---|
| 385 | real4 *pin = (real4*)malloc(n * sizeof(real4));
|
|---|
| 386 | real4 *pout = (real4*)malloc(n * sizeof(real4));
|
|---|
| 387 | real3 *v = (real3*)malloc(n * sizeof(real3));
|
|---|
| 388 | real3 *f = (real3*)malloc(n * sizeof(real3));
|
|---|
| 389 |
|
|---|
| 390 | randomizeBodies(pin, v, 1.54f, 8.0f, n);
|
|---|
| 391 |
|
|---|
| 392 | integrate(pout, pin, v, f, dt, n);
|
|---|
| 393 | checkCorrectness(pin, pout, v, dt, n);
|
|---|
| 394 |
|
|---|
| 395 | StartTimer();
|
|---|
| 396 | for (int i = 0; i < iterations; i++)
|
|---|
| 397 | {
|
|---|
| 398 | integrate(pout, pin, v, f, dt, n);
|
|---|
| 399 | real4 *t = pout;
|
|---|
| 400 | pout = pin;
|
|---|
| 401 | pin = t;
|
|---|
| 402 | }
|
|---|
| 403 | double ms = GetTimer();
|
|---|
| 404 |
|
|---|
| 405 | StartTimer();
|
|---|
| 406 | for (int i = 0; i < iterations; i++)
|
|---|
| 407 | {
|
|---|
| 408 | seqintegrate(pout, pin, v, f, dt, n);
|
|---|
| 409 | real4 *t = pout;
|
|---|
| 410 | pout = pin;
|
|---|
| 411 | pin = t;
|
|---|
| 412 | }
|
|---|
| 413 | double msUnaccelerated = GetTimer();
|
|---|
| 414 |
|
|---|
| 415 | double gf = computePerfStats(ms, iterations, n);
|
|---|
| 416 | double gfUnaccelerated = computePerfStats(msUnaccelerated, iterations, n);
|
|---|
| 417 |
|
|---|
| 418 | printf("n=%d bodies for %d iterations\n", n, iterations);
|
|---|
| 419 | #ifdef _OPENACC
|
|---|
| 420 | printf("OpenACC: %f ms: %f GFLOP/s\n", ms, gf);
|
|---|
| 421 | #else
|
|---|
| 422 | printf("OpenMP: %f ms: %f GFLOP/s\n", ms, gf);
|
|---|
| 423 | #endif
|
|---|
| 424 | printf("Sequential: %f ms: %f GFLOP/s\n", msUnaccelerated, gfUnaccelerated);
|
|---|
| 425 |
|
|---|
| 426 | free(pin);
|
|---|
| 427 | free(pout);
|
|---|
| 428 | free(v);
|
|---|
| 429 |
|
|---|
| 430 | return 0;
|
|---|
| 431 | }
|
|---|