source: CIVL/examples/openacc/nbody/nbodyacc.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 11.0 KB
Line 
1/*
2 * Copyright (c) 2016, NVIDIA Corporation. All rights reserved.
3 *
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
9 *
10 */
11
12#include <stdlib.h>
13#include <math.h>
14#include <stdio.h>
15#include <string.h>
16
17#ifdef _WIN32
18#define WIN32_LEAN_AND_MEAN
19#include <windows.h>
20#else
21#include <sys/time.h>
22#endif
23
24#ifndef FP32
25#define FP64
26#endif
27
28#ifdef FP64
29typedef double real;
30int flopsPerInteraction = 30;
31const real SOFTENING_SQUARED = 0.01;
32#define RSQRT(x) 1.0 / sqrt((x))
33#else
34typedef float real;
35int flopsPerInteraction = 20;
36const real SOFTENING_SQUARED = 0.01f;
37#define RSQRT(x) 1.0f / sqrtf((x))
38#endif
39
40typedef struct { real x, y, z; } real3;
41typedef struct { real x, y, z, w; } real4;
42
43real3 bodyBodyInteraction(real iPosx, real iPosy, real iPosz,
44 real jPosx, real jPosy, real jPosz, real jMass)
45{
46 real rx, ry, rz;
47
48 // r_01 [3 FLOPS]
49 rx = jPosx - iPosx;
50 ry = jPosy - iPosy;
51 rz = jPosz - iPosz;
52
53 // d^2 + e^2 [6 FLOPS]
54 real distSqr = rx*rx+ry*ry+rz*rz;;
55 distSqr += SOFTENING_SQUARED;
56
57 // invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
58 real invDist = RSQRT(distSqr);
59 real invDistCube = invDist * invDist * invDist;
60
61 // s = m_j * invDistCube [1 FLOP]
62 real s = jMass * invDistCube;
63
64 // (m_1 * r_01) / (d^2 + e^2)^(3/2) [6 FLOPS]
65 real3 f;
66 f.x = rx * s;
67 f.y = ry * s;
68 f.z = rz * s;
69
70 return f;
71}
72
73void seqintegrate(real4 * restrict out,
74 real4 * restrict in,
75 real3 * restrict vel,
76 real3 * restrict force,
77 real dt,
78 int n)
79{
80
81 for (int i = 0; i < n; i++)
82 {
83 real fx, fy, fz;
84 fx = fy = fz = 0;
85
86 real iPosx = in[i].x;
87 real iPosy = in[i].y;
88 real iPosz = in[i].z;
89
90 for (int j = 0; j < n; j++)
91 {
92 real3 ff = bodyBodyInteraction(iPosx, iPosy, iPosz,
93 in[j].x, in[j].y, in[j].z, in[j].w);
94 fx += ff.x;
95 fy += ff.y;
96 fz += ff.z;
97 }
98
99 force[i].x = fx;
100 force[i].y = fy;
101 force[i].z = fz;
102 }
103
104 for (int i = 0; i < n; i++)
105 {
106 real fx = force[i].x;
107 real fy = force[i].y;
108 real fz = force[i].z;
109
110 real px = in[i].x;
111 real py = in[i].y;
112 real pz = in[i].z;
113 real invMass = in[i].w;
114
115 real vx = vel[i].x;
116 real vy = vel[i].y;
117 real vz = vel[i].z;
118
119 // acceleration = force / mass;
120 // new velocity = old velocity + acceleration * deltaTime
121 vx += (fx * invMass) * dt;
122 vy += (fy * invMass) * dt;
123 vz += (fz * invMass) * dt;
124
125 // new position = old position + velocity * deltaTime
126 px += vx * dt;
127 py += vy * dt;
128 pz += vz * dt;
129
130 out[i].x = px;
131 out[i].y = py;
132 out[i].z = pz;
133 out[i].w = invMass;
134
135 vel[i].x = vx;
136 vel[i].y = vy;
137 vel[i].z = vz;
138 }
139}
140
141void integrate(real4 * restrict out,
142 real4 * restrict in,
143 real3 * restrict vel,
144 real3 * restrict force,
145 real dt,
146 int n)
147{
148
149#pragma acc data pcopyin(in[0:n]) pcopyout(out[0:n]) pcopy(force[0:n], vel[0:n])
150 {
151#pragma acc parallel loop
152 for (int i = 0; i < n; i++)
153 {
154 real fx, fy, fz;
155 fx = fy = fz = 0;
156
157 real iPosx = in[i].x;
158 real iPosy = in[i].y;
159 real iPosz = in[i].z;
160
161 for (int j = 0; j < n; j++)
162 {
163 real3 ff = bodyBodyInteraction(iPosx, iPosy, iPosz,
164 in[j].x, in[j].y, in[j].z, in[j].w);
165 fx += ff.x;
166 fy += ff.y;
167 fz += ff.z;
168 }
169
170 force[i].x = fx;
171 force[i].y = fy;
172 force[i].z = fz;
173 }
174
175#pragma acc parallel loop
176 for (int i = 0; i < n; i++)
177 {
178 real fx = force[i].x;
179 real fy = force[i].y;
180 real fz = force[i].z;
181
182 real px = in[i].x;
183 real py = in[i].y;
184 real pz = in[i].z;
185 real invMass = in[i].w;
186
187 real vx = vel[i].x;
188 real vy = vel[i].y;
189 real vz = vel[i].z;
190
191 // acceleration = force / mass;
192 // new velocity = old velocity + acceleration * deltaTime
193 vx += (fx * invMass) * dt;
194 vy += (fy * invMass) * dt;
195 vz += (fz * invMass) * dt;
196
197 // new position = old position + velocity * deltaTime
198 px += vx * dt;
199 py += vy * dt;
200 pz += vz * dt;
201
202 out[i].x = px;
203 out[i].y = py;
204 out[i].z = pz;
205 out[i].w = invMass;
206
207 vel[i].x = vx;
208 vel[i].y = vy;
209 vel[i].z = vz;
210 }
211 }
212}
213
214real
215dot(real v0[3], real v1[3])
216{
217 return v0[0]*v1[0]+v0[1]*v1[1]+v0[2]*v1[2];
218}
219
220real
221normalize(real vector[3])
222{
223 float dist = sqrt(dot(vector, vector));
224 if (dist > 1e-6)
225 {
226 vector[0] /= dist;
227 vector[1] /= dist;
228 vector[2] /= dist;
229 }
230 return dist;
231}
232
233void cross(real out[3], real v0[3], real v1[3])
234{
235 out[0] = v0[1]*v1[2]-v0[2]*v1[1];
236 out[1] = v0[2]*v1[0]-v0[0]*v1[2];
237 out[2] = v0[0]*v1[1]-v0[1]*v1[0];
238}
239
240void randomizeBodies(real4* pos,
241 real3* vel,
242 float clusterScale,
243 float velocityScale,
244 int n)
245{
246 srand(42);
247 float scale = clusterScale;
248 float vscale = scale * velocityScale;
249 float inner = 2.5f * scale;
250 float outer = 4.0f * scale;
251
252 int p = 0, v=0;
253 int i = 0;
254 while (i < n)
255 {
256 real x, y, z;
257 x = rand() / (float) RAND_MAX * 2 - 1;
258 y = rand() / (float) RAND_MAX * 2 - 1;
259 z = rand() / (float) RAND_MAX * 2 - 1;
260
261 real point[3] = {x, y, z};
262 real len = normalize(point);
263 if (len > 1)
264 continue;
265
266 pos[i].x = point[0] * (inner + (outer - inner) * rand() / (real) RAND_MAX);
267 pos[i].y = point[1] * (inner + (outer - inner) * rand() / (real) RAND_MAX);
268 pos[i].z = point[2] * (inner + (outer - inner) * rand() / (real) RAND_MAX);
269 pos[i].w = 1.0f;
270
271 x = 0.0f;
272 y = 0.0f;
273 z = 1.0f;
274 real axis[3] = {x, y, z};
275 normalize(axis);
276
277 if (1 - dot(point, axis) < 1e-6)
278 {
279 axis[0] = point[1];
280 axis[1] = point[0];
281 normalize(axis);
282 }
283 //if (point.y < 0) axis = scalevec(axis, -1);
284 real vv[3] = {(real)pos[i].x, (real)pos[i].y, (real)pos[i].z};
285 cross(vv, vv, axis);
286 vel[i].x = vv[0] * vscale;
287 vel[i].y = vv[1] * vscale;
288 vel[i].z = vv[2] * vscale;
289
290 i++;
291 }
292}
293
294#ifdef _WIN32
295double PCFreq = 0.0;
296__int64 timerStart = 0;
297#else
298struct timeval timerStart;
299#endif
300
301void StartTimer()
302{
303#ifdef _WIN32
304 LARGE_INTEGER li;
305 if(!QueryPerformanceFrequency(&li))
306 printf("QueryPerformanceFrequency failed!\n");
307
308 PCFreq = (double)li.QuadPart/1000.0;
309
310 QueryPerformanceCounter(&li);
311 timerStart = li.QuadPart;
312#else
313 gettimeofday(&timerStart, NULL);
314#endif
315}
316
317// time elapsed in ms
318double GetTimer()
319{
320#ifdef _WIN32
321 LARGE_INTEGER li;
322 QueryPerformanceCounter(&li);
323 return (double)(li.QuadPart-timerStart)/PCFreq;
324#else
325 struct timeval timerStop, timerElapsed;
326 gettimeofday(&timerStop, NULL);
327 timersub(&timerStop, &timerStart, &timerElapsed);
328 return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0;
329#endif
330}
331
332// run one iteration and compare to non-accelerated CPU version
333void checkCorrectness(real4 *restrict pin,
334 real4 *restrict pout,
335 real3 *restrict v,
336 real dt,
337 int n)
338{
339 real4 *pin_ref = (real4*)malloc(n * sizeof(real4));
340 real4 *pout_ref = (real4*)malloc(n * sizeof(real4));
341 real3 *v_ref = (real3*)malloc(n * sizeof(real3));
342 real3 *f = (real3*)malloc(n * sizeof(real3));
343 real3 *f_ref = (real3*)malloc(n * sizeof(real3));
344
345 randomizeBodies(pin_ref, v_ref, 1.54f, 8.0f, n);
346 memcpy( pin, pin_ref, sizeof(real4)*n);
347 memcpy( v, v_ref, sizeof(real3)*n);
348 seqintegrate(pout_ref, pin_ref, v_ref, f_ref, dt, n);
349 integrate(pout, pin, v, f, dt, n);
350
351#ifdef FP64
352 checkdtol( pout, pout_ref, 4*n );
353#else
354 checkftol5( pout, pout_ref, 4*n );
355#endif
356
357 free(pin_ref);
358 free(pout_ref);
359 free(v_ref);
360}
361
362double computePerfStats(float milliseconds, int iterations, int n)
363{
364 // double precision uses intrinsic operation followed by refinement,
365 // resulting in higher operation count per interaction.
366 // (Note Astrophysicists use 38 flops per interaction no matter what,
367 // based on "historical precedent", but they are using FLOP/s as a
368 // measure of "science throughput". We are using it as a measure of
369 // hardware throughput. They should really use interactions/s...
370 // const int flopsPerInteraction = fp64 ? 30 : 20;
371 double interactionsPerSecond = (double)n * (double)n;
372 interactionsPerSecond *= 1e-9 * iterations * 1000 / milliseconds;
373 return interactionsPerSecond * (double)flopsPerInteraction;
374}
375
376int main(int argc, char** argv)
377{
378 int n = 1024;
379 int iterations = 20;
380 real dt = 0.01667;
381
382 if (argc >= 2) n = atoi(argv[1]);
383 if (argc >= 3) iterations = atoi(argv[2]);
384
385 real4 *pin = (real4*)malloc(n * sizeof(real4));
386 real4 *pout = (real4*)malloc(n * sizeof(real4));
387 real3 *v = (real3*)malloc(n * sizeof(real3));
388 real3 *f = (real3*)malloc(n * sizeof(real3));
389
390 randomizeBodies(pin, v, 1.54f, 8.0f, n);
391
392 integrate(pout, pin, v, f, dt, n);
393 checkCorrectness(pin, pout, v, dt, n);
394
395 StartTimer();
396 for (int i = 0; i < iterations; i++)
397 {
398 integrate(pout, pin, v, f, dt, n);
399 real4 *t = pout;
400 pout = pin;
401 pin = t;
402 }
403 double ms = GetTimer();
404
405 StartTimer();
406 for (int i = 0; i < iterations; i++)
407 {
408 seqintegrate(pout, pin, v, f, dt, n);
409 real4 *t = pout;
410 pout = pin;
411 pin = t;
412 }
413 double msUnaccelerated = GetTimer();
414
415 double gf = computePerfStats(ms, iterations, n);
416 double gfUnaccelerated = computePerfStats(msUnaccelerated, iterations, n);
417
418 printf("n=%d bodies for %d iterations\n", n, iterations);
419#ifdef _OPENACC
420 printf("OpenACC: %f ms: %f GFLOP/s\n", ms, gf);
421#else
422 printf("OpenMP: %f ms: %f GFLOP/s\n", ms, gf);
423#endif
424 printf("Sequential: %f ms: %f GFLOP/s\n", msUnaccelerated, gfUnaccelerated);
425
426 free(pin);
427 free(pout);
428 free(v);
429
430 return 0;
431}
Note: See TracBrowser for help on using the repository browser.