source: CIVL/examples/omp/c_fft6.c@ 1aaefd4

main test-branch
Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 17.1 KB
RevLine 
[feadd65]1/* ***********************************************************************
2 This program is part of the
3 OpenMP Source Code Repository
4
5 http://www.pcg.ull.es/ompscr/
6 e-mail: ompscr@etsii.ull.es
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 (LICENSE file) along with this program; if not, write to
20 the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
21 Boston, MA 02111-1307 USA
22
23 FILE: c_fft6.c
24 VERSION: 1.0
25 DATE: May 2004
26 AUTHOR: F. de Sande
27 COMMENTS TO: sande@csi.ull.es
28 DESCRIPTION: Bailey's 6-step 1D Fast Fourier Transform
29 Computes the 1D FFT of an input signal (complex array)
30 COMMENTS: The code performs a number (ITERS) of iterations of the
31 Bailey's 6-step FFT algorithm (following the ideas in the
32 CMU Task parallel suite).
33 1.- Generates an input signal vector (dgen) with size
34 n=n1xn2 stored in row major order
35 In this code the size of the input signal
36 is NN=NxN (n=NN, n1=n2=N)
37 2.- Transpose (tpose) A to have it stored in column
38
39 major order
40 3.- Perform independent FFTs on the rows (cffts)
41 4.- Scale each element of the resulting array by a
42 factor of w[n]**(p*q)
43 5.- Transpose (tpose) to prepair it for the next step
44 6.- Perform independent FFTs on the rows (cffts)
45 7.- Transpose the resulting matrix
46 The code requires nested Parallelism.
47 REFERENCES: David H. Bailey
48 FFTs in External or Hierarchical Memory,
49 The Journal of Supercomputing, vol. 4, no. 1, pg. 23-35. Mar 1990,
50 vol. 4, no. 7, pg. 321. Jul 1961
51 http://www-2.cs.cmu.edu/~fx/tpsuite.html
52 http://en.wikipedia.org/wiki/Cooley-Tukey_FFT_algorithm
53 BASIC PRAGMAS: parallel for
54 USAGE: ./c_fft6.par 1024 10
55 INPUT: The size of the input signal.
56 The number of iterations to perform
57 OUTPUT: The code tests the correctness of the output signal
58 FILE FORMATS: -
59 RESTRICTIONS: The size of the input signal MUST BE a power of 2
60 REVISION HISTORY:
61**************************************************************************/
62//#include "OmpSCR.h"
63#include <omp.h>
64#include <stdlib.h>
65#include <math.h>
66#include <stdio.h>
67
68#define PI (3.141592653589793f)
69#define NUM_ARGS 2
70#define NUM_TIMERS 4
71typedef double real_type;
72
73#define NINIT 2
74#define ITERSINIT 2
75
76/* Input values (CONSTANTS) */
77unsigned NN; /* input vector is NN complex values */
78unsigned LOGNN; /* log base 2 of NN */
79unsigned N; /* square root of NN */
80unsigned LOGN; /* log base 2 of N */
81int ITERS; /* number of input vectors */
82
83typedef struct { real_type re, im; } complex;
84/* -----------------------------------------------------------------------
85 PROTOTYPES
86 * ----------------------------------------------------------------------- */
87int prperf(double tpose_time, double scale_time, double ffts_time, int nn, int lognn, int iters);
88int gen_bit_reverse_table(int *brt, int n);
89int gen_w_table(complex *w, int n, int logn, int ndv2);
90int gen_v_table(complex *v, int n);
91int dgen(complex *xin, int n);
92int tpose(complex *a, complex *b, int n);
93int tpose_seq(complex *a, complex *b, const int n);
94int cffts(complex *a, int *brt, complex *w, int n, int logn, int ndv2);
95int cffts_seq(complex *a, int *brt, complex *w, const int n, int logn, int ndv2);
96int fft(complex *a, int *brt, complex *w, int n, int logn, int ndv2);
97int scale(complex *a, complex *v, int n);
98int scale_seq(complex *a, complex *v, const int n);
99int chkmat(complex *a, int n);
100complex complex_pow(complex z, int n);
101int log2_int(int n);
102int print_mat(complex *a, int n);
103int print_vec(complex *a, int n);
104/* -----------------------------------------------------------------
105 IMPLEMENTATION
106 * ----------------------------------------------------------------- */
107/* -----------------------------------------------------------------
108 gen_bit_reverse_table - initialize bit reverse table
109 Postcondition: br(i) = bit-reverse(i-1) + 1
110 ----------------------------------------------------------------- */
111int gen_bit_reverse_table(int *brt, int n) {
112 register int i, j, k, nv2;
113
114 nv2 = n / 2;
115 j = 1;
116 brt[0] = j;
117 for (i = 1; i < n; ++i) {
118 k = nv2;
119 while (k < j) {
120 j -= k;
121 k /= 2;
122 }
123 j += k;
124 brt[i] = j;
125 }
126 return 0;
127}
128/* -----------------------------------------------------------------------
129 gen_w_table: generate powers of w.
130 postcondition: w(i) = w**(i-1)
131 * ----------------------------------------------------------------------- */
132int gen_w_table(complex *w, int n, int logn, int ndv2) {
133 register int i;
134 complex ww, pt;
135
136 ww.re = cos(PI / ndv2);
137 ww.im = -sin(PI / ndv2);
138 w[0].re = 1.f;
139 w[0].im = 0.f;
140 pt.re = 1.f;
141 pt.im = 0.f;
142 for (i = 1; i < ndv2; ++i) {
143 w[i].re = pt.re * ww.re - pt.im * ww.im;
144 w[i].im = pt.re * ww.im + pt.im * ww.re;
145 pt = w[i];
146 }
147 return 0;
148}
149/* -----------------------------------------------------------------------
150 gen_v_table - gen 2d twiddle factors
151 * ----------------------------------------------------------------------- */
152int gen_v_table(complex *v, int n) {
153 register int j, k;
154 complex wn;
155
156 wn.re = cos(PI * 2.f / (n * n));
157 wn.im = -sin(PI * 2.f / (n * n));
158 for (j = 0; j < n; ++j) {
159 for (k = 0; k < n; ++k) {
160 v[j * n + k] = complex_pow(wn, j * k);
161 }
162 }
163 return 0;
164}
165/* -----------------------------------------------------------------------
166 z^n = modulo^n (cos(n * alfa) + i sin(n * alfa)
167 * ----------------------------------------------------------------------- */
168complex complex_pow(complex z, int n) {
169 complex temp;
170 real_type pot, nangulo, modulo, angulo;
171
172 modulo = sqrt(z.re * z.re + z.im * z.im);
173 angulo = atan(z.im / z.re);
174 pot = pow(modulo, n);
175 nangulo = n * angulo;
176
177 temp.re = pot * cos(nangulo);
178 temp.im = pot * sin(nangulo);
179 return(temp);
180}
181/* -----------------------------------------------------------------------
182 A: Para una señal de entrada de tamaño N y de forma
183 (1,0), (1,0), ..., (1,0)
184 la salida debe ser de la forma
185 (N,0), (0,0), ..., (0,0)
186
187 B: Para una señal de entrada de tamaño N y de forma
188 (0,0), (0,0), ..., (N*N, 0), ..., (0,0)
189 la salida debe ser de la forma (N*N, 0), (0,0), ..., (0,0)
190
191 Propiedad útil: C: FFT(s1 + s2) = FFT(s1) + FFT(s2)
192
193 * ----------------------------------------------------------------------- */
194int dgen(complex *xin, int n) {
195 register int i;
196 int nn = n * n;
197 /* Señal de forma B */
198 for (i = 0; i < nn; ++i) {
199 xin[i].re = 0.f;
200 xin[i].im = 0.f;
201 }
202 xin[nn / 2].re = (real_type)nn;
203
204 /* Señal de forma A */
205 /*
206 for (i = 0; i < nn; ++i) {
207 xin[i].re = 1.0;
208 xin[i].im = 0.f;
209 }
210 */
211 return 0;
212}
213/* ----------------------------------------------------------------- */
214int tpose(complex *a, complex *b, const int n) {
215 register int i, j;
216
217#pragma omp parallel for private(i, j)
218 for (i = 0; i < n; ++i) {
219 for (j = i; j < n; ++j) {
220 b[i * n + j] = a[j * n + i];
221 b[j * n + i] = a[i * n + j];
222 }
223 }
224 return 0;
225}
226
227/* ----------------------------------------------------------------- */
228int tpose_seq(complex *a, complex *b, const int n) {
229 register int i, j;
230
231 for (i = 0; i < n; ++i) {
232 for (j = i; j < n; ++j) {
233 b[i * n + j] = a[j * n + i];
234 b[j * n + i] = a[i * n + j];
235 }
236 }
237 return 0;
238}
239/* ----------------------------------------------------------------- */
240int cffts(complex *a, int *brt, complex *w, const int n, int logn, int ndv2) {
241 register int i;
242
243#pragma omp parallel for private(i)
244 for (i = 0; i < n; ++i)
245 fft(&a[i * n], brt, w, n, logn, ndv2);
246 /* fft(a + i * n, brt, w, n, logn, ndv2); */
247 return 0;
248}
249/* ----------------------------------------------------------------- */
250int cffts_seq(complex *a, int *brt, complex *w, const int n, int logn, int ndv2) {
251 register int i;
252
253 for (i = 0; i < n; ++i)
254 fft(&a[i * n], brt, w, n, logn, ndv2);
255 return 0;
256}
257/* -----------------------------------------------------------------------
258 Fast Fourier Transform
259 1D in-place complex-complex decimation-in-time Cooley-Tukey
260 * ----------------------------------------------------------------------- */
261int fft(complex *a, int *brt, complex *w, int n, int logn, int ndv2) {
262 register int i, j;
263 int powerOfW, stage, first, spowerOfW;
264 int ijDiff;
265 int stride;
266 complex ii, jj, temp, pw;
267
268 /* bit reverse step */
269 for (i = 0; i < n; ++i) {
270 j = brt[i];
271 if (i < (j-1)) {
272 temp = a[j - 1];
273 a[j - 1] = a[i];
274 a[i] = temp;
275 }
276 }
277
278 /* butterfly computations */
279 ijDiff = 1;
280 stride = 2;
281 spowerOfW = ndv2;
282 for (stage = 0; stage < logn; ++stage) {
283 /* Invariant: stride = 2 ** stage
284 Invariant: ijDiff = 2 ** (stage - 1) */
285 first = 0;
286 for (powerOfW = 0; powerOfW < ndv2; powerOfW += spowerOfW) {
287 pw = w[powerOfW];
288 /* Invariant: pwr + sqrt(-1)*pwi = W**(powerOfW - 1) */
289 for (i = first; i < n; i += stride) {
290 j = i + ijDiff;
291 jj = a[j];
292 ii = a[i];
293 temp.re = jj.re * pw.re - jj.im * pw.im;
294 temp.im = jj.re * pw.im + jj.im * pw.re;
295 a[j].re = ii.re - temp.re;
296 a[j].im = ii.im - temp.im;
297 a[i].re = ii.re + temp.re;
298 a[i].im = ii.im + temp.im;
299 }
300 ++first;
301 }
302 ijDiff = stride;
303 stride <<= 1;
304 spowerOfW /= 2;
305 }
306 return 0;
307} /* fft */
308/* ----------------------------------------------------------------- */
309int scale(complex *a, complex *v, const int n) {
310 register int i, j, index;
311 complex aa, vv;
312
313#pragma omp parallel for private(i, j, index, aa, vv)
314 for (i = 0; i < n; ++i) {
315 for (j = 0; j < n; ++j) {
316 index = i * n + j;
317 aa = a[index];
318 vv = v[index];
319 a[index].re = aa.re * vv.re - aa.im * vv.im;
320 a[index].im = aa.re * vv.im + aa.im * vv.re;
321 }
322 }
323 return 0;
324}
325/* ----------------------------------------------------------------- */
326int scale_seq(complex *a, complex *v, const int n) {
327 register int i, j, index;
328 complex aa, vv;
329
330 for (i = 0; i < n; ++i) {
331 for (j = 0; j < n; ++j) {
332 index = i * n + j;
333 aa = a[index];
334 vv = v[index];
335 a[index].re = aa.re * vv.re - aa.im * vv.im;
336 a[index].im = aa.re * vv.im + aa.im * vv.re;
337 }
338 }
339 return 0;
340}
341/* -----------------------------------------------------------------------
342 chkmat - check the output matrix for correctness
343 * ----------------------------------------------------------------------- */
344int chkmat(complex *a, int n) {
345 int sign, i, j, nn, errors;
346 real_type EPSILON = 1e-4f;
347
348 errors = 0;
349 nn = n * n;
350 for (i = 0; i < n; ++i) {
351 sign = 1;
352 for (j = 0; j < n; ++j) {
353 if (a[i * n + j].re > nn * sign + EPSILON)
354 ++errors;
355 if (a[i * n + j].re < nn * sign - EPSILON)
356 ++errors;
357 sign = -sign;
358 }
359 }
360 if (errors > 0) {
361 printf("Errors = %d\n", errors);
362 exit(-1);
363 }
364 return 0;
365}
366/* ----------------------------------------------------------------- */
367int prperf(double tpose_time, double scale_time, double ffts_time, int nn, int lognn, int iters) {
368 double secs;
369 double fpercent, spercent, tpercent;
370 double flops, mflops;
371
372 tpose_time /= iters;
373 scale_time /= iters;
374 ffts_time /= iters;
375 secs = tpose_time + scale_time + ffts_time;
376 tpercent = tpose_time / secs * 100;
377 spercent = scale_time / secs * 100;
378 fpercent = ffts_time / secs * 100;
379 flops = (real_type) (nn * 5 * lognn);
380 mflops = flops / 1e6f / secs;
381 printf("***********************\n");
382 printf("1D FFT %d points\n" ,nn);
383 printf("***********************\n");
384 printf("Time per input vector:\n");
385 printf("tpose : %lf %lf percent\n", tpose_time, tpercent);
386 printf("scale : %lf %lf percent\n", scale_time, spercent);
387 printf("ffts : %lf %lf percent\n", ffts_time, fpercent);
388 printf("total(s) : %lf\n", secs);
389 printf("mflop/s : %lf\n", mflops);
390 return 0;
391} /* prperf */
392/* -----------------------------------------------------------------------
393 Base 2 logarithm
394 * ----------------------------------------------------------------------- */
395int log2_int(int n) {
396 register int i, aux;
397
398 aux = 1;
399 for (i = 0; i <= 128; i++) {
400 if (aux > n)
401 return (i - 1);
402 aux <<= 1;
403 }
404 return -1;
405}
406/* ----------------------------------------------------------------- */
407int print_mat(complex *a, int n) {
408 register int i, j;
409
410 for (i = 0; i < n; ++i) {
411 for (j = 0; j < n; ++j) {
412 printf(" (%.0f, %.0f)", a[i * n + j].re, a[i * n + j].im);
413 }
414 printf("\n");
415 }
416 printf("\n");
417 return 0;
418}
419/* ----------------------------------------------------------------- */
420int print_vec(complex *a, int n) {
421 register int i;
422
423 for (i = 0; i < n*n; ++i)
424 printf(" (%.0f, %.0f)", a[i].re, a[i].im);
425 printf("\n\n");
426 return 0;
427}
428/* ----------------------------------------------------------------- */
429int main(int argc, char *argv[]) {
430 complex **xin; /* input vector */
431 complex **aux; /* intermediate array (same size) */
432 /* precomputed vectors */
433 int *brt; /* bit reverse table for 1d fft's */
434 complex *w; /* twiddles for 1d fft's */
435 complex *v; /* twiddles for 2d fft */
436 double total_time; /* timing */
437 int k;
438 int NUMTHREADS, MAX_THREADS;
439#define ID omp_get_thread_num()
440 float MEMORY; /* number of bytes required */
441// char *PARAM_NAMES[NUM_ARGS] = {"Size (a power of 2).", "No. of iterations"};
442// char *TIMERS_NAMES[NUM_TIMERS] = {"Total_time", "Tpose time", "Scale time", "Column FFTs time" };
443 // char *DEFAULT_VALUES[NUM_ARGS] = {"64", "10"};
444
445
[ee9e9c3]446 NUMTHREADS = 1; //omp_get_num_threads();
[feadd65]447 //OSCR_init (NUMTHREADS, "Bailey's '6-step' Fast Fourier Transform", "Use 'fft6' <size (in K)> <iters>", NUM_ARGS,
448 // PARAM_NAMES, DEFAULT_VALUES , NUM_TIMERS, NUM_TIMERS, TIMERS_NAMES,
449 // argc, argv);
450
451
452 /* Command line arguments processing */
453 N = NINIT; // OSCR_getarg_int(1);
454 ITERS = ITERSINIT; // OSCR_getarg_int(2);
455 /*
456 if(argc == 3) {
457 N = atoi(argv[1]);
458 ITERS = atoi(argv[2]);
459 }
460 else {
461 printf("Usage: %s N ITERS\n", argv[0]);
462 printf("N is the size of the input signal. Should be a power of two.\n");
463 N = 64;
464 ITERS = 10;
465 }
466 */
467
[ee9e9c3]468 MAX_THREADS = 1; //omp_get_num_threads();
[feadd65]469 NN = N * N;
470 LOGN = log2_int(N);
471 LOGNN = log2_int(NN);
472 MEMORY = N * sizeof(int) +
473 2 * (NN) * sizeof(complex) * MAX_THREADS + (NN) * sizeof(complex) +
474 (N / 2) * sizeof(complex);
475
476 /* Memory allocation */
477 xin = calloc(MAX_THREADS, sizeof(*xin));
478 aux = calloc(MAX_THREADS, sizeof(*aux));
479 brt = (int *)calloc(N, sizeof(int));
480 v = (complex *)calloc(N * N, sizeof(complex)); /* twiddles for 2d fft */
481 w = (complex *)calloc((N / 2), sizeof(complex)); /* twiddles for 1d fft's */
482
483 printf("Input values:\n");
484 printf("=============\n");
485 printf("N : %u\n", N);
486 printf("ITERS : %u\n", ITERS);
487 printf("NN : %u\n", NN);
488 printf("LOGN : %u\n", LOGN);
489 printf("LOGNN : %u\n", LOGNN);
490 printf("Memory : %.2f Kbytes\n", MEMORY / 1024);
491
492 /* precompute the input array and fft constants */
493 gen_bit_reverse_table(brt, N);
494 gen_w_table(w, N, LOGN, N / 2);
495 gen_v_table(v, N);
496
497 for (k = 0; k < MAX_THREADS; k++) {
498 xin[k] = (complex *)calloc(N * N, sizeof(complex));
499 aux[k] = (complex *)calloc(N * N, sizeof(complex));
500 }
501#define TOTAL 0
502
503//OSCR_timer_start(TOTAL);
504#pragma omp parallel for private(k)
505 for(k = 0; k < ITERS; k++) {
506 dgen(xin[ID], N);
507
508 tpose(xin[ID], aux[ID], N);
509
510 cffts(aux[ID], brt, w, N, LOGN, N / 2);
511
512 scale(aux[ID], v, N);
513
514 tpose(aux[ID], xin[ID], N);
515
516 cffts(xin[ID], brt, w, N, LOGN, N / 2);
517
518 tpose(xin[ID], aux[ID], N);
519
520 chkmat(aux[ID], N);
521 }
522 //OSCR_timer_stop(TOTAL);
523 total_time = 1; //OSCR_timer_read(TOTAL);
524
525 /* Display results and time */
526 //OSCR_report(1, TIMERS_NAMES);
527
528 printf("============================");
529 printf("\n# THREADS : %d\n", NUMTHREADS);
530 printf("N : %d\n", N);
531 printf("ITERS : %d\n", ITERS);
532 printf("total TIME: %.6lf secs.\n", total_time);
533
534#undef ID
535#undef MAX_THREADS
536 return 0;
537}
538/* ----------------------------------------------------------------- */
539
540/*
541 * vim:ts=2:sw=2:set nonu:
542 */
Note: See TracBrowser for help on using the repository browser.