| 1 | /* ***********************************************************************
|
|---|
| 2 | This program is part of the
|
|---|
| 3 | OpenMP Source Code Repository
|
|---|
| 4 |
|
|---|
| 5 | http://www.pcg.ull.es/ompscr/
|
|---|
| 6 | e-mail: ompscr@etsii.ull.es
|
|---|
| 7 |
|
|---|
| 8 | This program is free software; you can redistribute it and/or modify
|
|---|
| 9 | it under the terms of the GNU General Public License as published by
|
|---|
| 10 | the Free Software Foundation; either version 2 of the License, or
|
|---|
| 11 | (at your option) any later version.
|
|---|
| 12 |
|
|---|
| 13 | This program is distributed in the hope that it will be useful,
|
|---|
| 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 16 | GNU General Public License for more details.
|
|---|
| 17 |
|
|---|
| 18 | You should have received a copy of the GNU General Public License
|
|---|
| 19 | (LICENSE file) along with this program; if not, write to
|
|---|
| 20 | the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
|
|---|
| 21 | Boston, MA 02111-1307 USA
|
|---|
| 22 |
|
|---|
| 23 | FILE: c_fft6.c
|
|---|
| 24 | VERSION: 1.0
|
|---|
| 25 | DATE: May 2004
|
|---|
| 26 | AUTHOR: F. de Sande
|
|---|
| 27 | COMMENTS TO: sande@csi.ull.es
|
|---|
| 28 | DESCRIPTION: Bailey's 6-step 1D Fast Fourier Transform
|
|---|
| 29 | Computes the 1D FFT of an input signal (complex array)
|
|---|
| 30 | COMMENTS: The code performs a number (ITERS) of iterations of the
|
|---|
| 31 | Bailey's 6-step FFT algorithm (following the ideas in the
|
|---|
| 32 | CMU Task parallel suite).
|
|---|
| 33 | 1.- Generates an input signal vector (dgen) with size
|
|---|
| 34 | n=n1xn2 stored in row major order
|
|---|
| 35 | In this code the size of the input signal
|
|---|
| 36 | is NN=NxN (n=NN, n1=n2=N)
|
|---|
| 37 | 2.- Transpose (tpose) A to have it stored in column
|
|---|
| 38 |
|
|---|
| 39 | major order
|
|---|
| 40 | 3.- Perform independent FFTs on the rows (cffts)
|
|---|
| 41 | 4.- Scale each element of the resulting array by a
|
|---|
| 42 | factor of w[n]**(p*q)
|
|---|
| 43 | 5.- Transpose (tpose) to prepair it for the next step
|
|---|
| 44 | 6.- Perform independent FFTs on the rows (cffts)
|
|---|
| 45 | 7.- Transpose the resulting matrix
|
|---|
| 46 | The code requires nested Parallelism.
|
|---|
| 47 | REFERENCES: David H. Bailey
|
|---|
| 48 | FFTs in External or Hierarchical Memory,
|
|---|
| 49 | The Journal of Supercomputing, vol. 4, no. 1, pg. 23-35. Mar 1990,
|
|---|
| 50 | vol. 4, no. 7, pg. 321. Jul 1961
|
|---|
| 51 | http://www-2.cs.cmu.edu/~fx/tpsuite.html
|
|---|
| 52 | http://en.wikipedia.org/wiki/Cooley-Tukey_FFT_algorithm
|
|---|
| 53 | BASIC PRAGMAS: parallel for
|
|---|
| 54 | USAGE: ./c_fft6.par 1024 10
|
|---|
| 55 | INPUT: The size of the input signal.
|
|---|
| 56 | The number of iterations to perform
|
|---|
| 57 | OUTPUT: The code tests the correctness of the output signal
|
|---|
| 58 | FILE FORMATS: -
|
|---|
| 59 | RESTRICTIONS: The size of the input signal MUST BE a power of 2
|
|---|
| 60 | REVISION HISTORY:
|
|---|
| 61 | **************************************************************************/
|
|---|
| 62 | //#include "OmpSCR.h"
|
|---|
| 63 | #include <omp.h>
|
|---|
| 64 | #include <stdlib.h>
|
|---|
| 65 | #include <math.h>
|
|---|
| 66 | #include <stdio.h>
|
|---|
| 67 |
|
|---|
| 68 | #define PI (3.141592653589793f)
|
|---|
| 69 | #define NUM_ARGS 2
|
|---|
| 70 | #define NUM_TIMERS 4
|
|---|
| 71 | typedef double real_type;
|
|---|
| 72 |
|
|---|
| 73 | #define NINIT 2
|
|---|
| 74 | #define ITERSINIT 2
|
|---|
| 75 |
|
|---|
| 76 | /* Input values (CONSTANTS) */
|
|---|
| 77 | unsigned NN; /* input vector is NN complex values */
|
|---|
| 78 | unsigned LOGNN; /* log base 2 of NN */
|
|---|
| 79 | unsigned N; /* square root of NN */
|
|---|
| 80 | unsigned LOGN; /* log base 2 of N */
|
|---|
| 81 | int ITERS; /* number of input vectors */
|
|---|
| 82 |
|
|---|
| 83 | typedef struct { real_type re, im; } complex;
|
|---|
| 84 | /* -----------------------------------------------------------------------
|
|---|
| 85 | PROTOTYPES
|
|---|
| 86 | * ----------------------------------------------------------------------- */
|
|---|
| 87 | int prperf(double tpose_time, double scale_time, double ffts_time, int nn, int lognn, int iters);
|
|---|
| 88 | int gen_bit_reverse_table(int *brt, int n);
|
|---|
| 89 | int gen_w_table(complex *w, int n, int logn, int ndv2);
|
|---|
| 90 | int gen_v_table(complex *v, int n);
|
|---|
| 91 | int dgen(complex *xin, int n);
|
|---|
| 92 | int tpose(complex *a, complex *b, int n);
|
|---|
| 93 | int tpose_seq(complex *a, complex *b, const int n);
|
|---|
| 94 | int cffts(complex *a, int *brt, complex *w, int n, int logn, int ndv2);
|
|---|
| 95 | int cffts_seq(complex *a, int *brt, complex *w, const int n, int logn, int ndv2);
|
|---|
| 96 | int fft(complex *a, int *brt, complex *w, int n, int logn, int ndv2);
|
|---|
| 97 | int scale(complex *a, complex *v, int n);
|
|---|
| 98 | int scale_seq(complex *a, complex *v, const int n);
|
|---|
| 99 | int chkmat(complex *a, int n);
|
|---|
| 100 | complex complex_pow(complex z, int n);
|
|---|
| 101 | int log2_int(int n);
|
|---|
| 102 | int print_mat(complex *a, int n);
|
|---|
| 103 | int print_vec(complex *a, int n);
|
|---|
| 104 | /* -----------------------------------------------------------------
|
|---|
| 105 | IMPLEMENTATION
|
|---|
| 106 | * ----------------------------------------------------------------- */
|
|---|
| 107 | /* -----------------------------------------------------------------
|
|---|
| 108 | gen_bit_reverse_table - initialize bit reverse table
|
|---|
| 109 | Postcondition: br(i) = bit-reverse(i-1) + 1
|
|---|
| 110 | ----------------------------------------------------------------- */
|
|---|
| 111 | int gen_bit_reverse_table(int *brt, int n) {
|
|---|
| 112 | register int i, j, k, nv2;
|
|---|
| 113 |
|
|---|
| 114 | nv2 = n / 2;
|
|---|
| 115 | j = 1;
|
|---|
| 116 | brt[0] = j;
|
|---|
| 117 | for (i = 1; i < n; ++i) {
|
|---|
| 118 | k = nv2;
|
|---|
| 119 | while (k < j) {
|
|---|
| 120 | j -= k;
|
|---|
| 121 | k /= 2;
|
|---|
| 122 | }
|
|---|
| 123 | j += k;
|
|---|
| 124 | brt[i] = j;
|
|---|
| 125 | }
|
|---|
| 126 | return 0;
|
|---|
| 127 | }
|
|---|
| 128 | /* -----------------------------------------------------------------------
|
|---|
| 129 | gen_w_table: generate powers of w.
|
|---|
| 130 | postcondition: w(i) = w**(i-1)
|
|---|
| 131 | * ----------------------------------------------------------------------- */
|
|---|
| 132 | int gen_w_table(complex *w, int n, int logn, int ndv2) {
|
|---|
| 133 | register int i;
|
|---|
| 134 | complex ww, pt;
|
|---|
| 135 |
|
|---|
| 136 | ww.re = cos(PI / ndv2);
|
|---|
| 137 | ww.im = -sin(PI / ndv2);
|
|---|
| 138 | w[0].re = 1.f;
|
|---|
| 139 | w[0].im = 0.f;
|
|---|
| 140 | pt.re = 1.f;
|
|---|
| 141 | pt.im = 0.f;
|
|---|
| 142 | for (i = 1; i < ndv2; ++i) {
|
|---|
| 143 | w[i].re = pt.re * ww.re - pt.im * ww.im;
|
|---|
| 144 | w[i].im = pt.re * ww.im + pt.im * ww.re;
|
|---|
| 145 | pt = w[i];
|
|---|
| 146 | }
|
|---|
| 147 | return 0;
|
|---|
| 148 | }
|
|---|
| 149 | /* -----------------------------------------------------------------------
|
|---|
| 150 | gen_v_table - gen 2d twiddle factors
|
|---|
| 151 | * ----------------------------------------------------------------------- */
|
|---|
| 152 | int gen_v_table(complex *v, int n) {
|
|---|
| 153 | register int j, k;
|
|---|
| 154 | complex wn;
|
|---|
| 155 |
|
|---|
| 156 | wn.re = cos(PI * 2.f / (n * n));
|
|---|
| 157 | wn.im = -sin(PI * 2.f / (n * n));
|
|---|
| 158 | for (j = 0; j < n; ++j) {
|
|---|
| 159 | for (k = 0; k < n; ++k) {
|
|---|
| 160 | v[j * n + k] = complex_pow(wn, j * k);
|
|---|
| 161 | }
|
|---|
| 162 | }
|
|---|
| 163 | return 0;
|
|---|
| 164 | }
|
|---|
| 165 | /* -----------------------------------------------------------------------
|
|---|
| 166 | z^n = modulo^n (cos(n * alfa) + i sin(n * alfa)
|
|---|
| 167 | * ----------------------------------------------------------------------- */
|
|---|
| 168 | complex complex_pow(complex z, int n) {
|
|---|
| 169 | complex temp;
|
|---|
| 170 | real_type pot, nangulo, modulo, angulo;
|
|---|
| 171 |
|
|---|
| 172 | modulo = sqrt(z.re * z.re + z.im * z.im);
|
|---|
| 173 | angulo = atan(z.im / z.re);
|
|---|
| 174 | pot = pow(modulo, n);
|
|---|
| 175 | nangulo = n * angulo;
|
|---|
| 176 |
|
|---|
| 177 | temp.re = pot * cos(nangulo);
|
|---|
| 178 | temp.im = pot * sin(nangulo);
|
|---|
| 179 | return(temp);
|
|---|
| 180 | }
|
|---|
| 181 | /* -----------------------------------------------------------------------
|
|---|
| 182 | A: Para una señal de entrada de tamaño N y de forma
|
|---|
| 183 | (1,0), (1,0), ..., (1,0)
|
|---|
| 184 | la salida debe ser de la forma
|
|---|
| 185 | (N,0), (0,0), ..., (0,0)
|
|---|
| 186 |
|
|---|
| 187 | B: Para una señal de entrada de tamaño N y de forma
|
|---|
| 188 | (0,0), (0,0), ..., (N*N, 0), ..., (0,0)
|
|---|
| 189 | la salida debe ser de la forma (N*N, 0), (0,0), ..., (0,0)
|
|---|
| 190 |
|
|---|
| 191 | Propiedad útil: C: FFT(s1 + s2) = FFT(s1) + FFT(s2)
|
|---|
| 192 |
|
|---|
| 193 | * ----------------------------------------------------------------------- */
|
|---|
| 194 | int dgen(complex *xin, int n) {
|
|---|
| 195 | register int i;
|
|---|
| 196 | int nn = n * n;
|
|---|
| 197 | /* Señal de forma B */
|
|---|
| 198 | for (i = 0; i < nn; ++i) {
|
|---|
| 199 | xin[i].re = 0.f;
|
|---|
| 200 | xin[i].im = 0.f;
|
|---|
| 201 | }
|
|---|
| 202 | xin[nn / 2].re = (real_type)nn;
|
|---|
| 203 |
|
|---|
| 204 | /* Señal de forma A */
|
|---|
| 205 | /*
|
|---|
| 206 | for (i = 0; i < nn; ++i) {
|
|---|
| 207 | xin[i].re = 1.0;
|
|---|
| 208 | xin[i].im = 0.f;
|
|---|
| 209 | }
|
|---|
| 210 | */
|
|---|
| 211 | return 0;
|
|---|
| 212 | }
|
|---|
| 213 | /* ----------------------------------------------------------------- */
|
|---|
| 214 | int tpose(complex *a, complex *b, const int n) {
|
|---|
| 215 | register int i, j;
|
|---|
| 216 |
|
|---|
| 217 | #pragma omp parallel for private(i, j)
|
|---|
| 218 | for (i = 0; i < n; ++i) {
|
|---|
| 219 | for (j = i; j < n; ++j) {
|
|---|
| 220 | b[i * n + j] = a[j * n + i];
|
|---|
| 221 | b[j * n + i] = a[i * n + j];
|
|---|
| 222 | }
|
|---|
| 223 | }
|
|---|
| 224 | return 0;
|
|---|
| 225 | }
|
|---|
| 226 |
|
|---|
| 227 | /* ----------------------------------------------------------------- */
|
|---|
| 228 | int tpose_seq(complex *a, complex *b, const int n) {
|
|---|
| 229 | register int i, j;
|
|---|
| 230 |
|
|---|
| 231 | for (i = 0; i < n; ++i) {
|
|---|
| 232 | for (j = i; j < n; ++j) {
|
|---|
| 233 | b[i * n + j] = a[j * n + i];
|
|---|
| 234 | b[j * n + i] = a[i * n + j];
|
|---|
| 235 | }
|
|---|
| 236 | }
|
|---|
| 237 | return 0;
|
|---|
| 238 | }
|
|---|
| 239 | /* ----------------------------------------------------------------- */
|
|---|
| 240 | int cffts(complex *a, int *brt, complex *w, const int n, int logn, int ndv2) {
|
|---|
| 241 | register int i;
|
|---|
| 242 |
|
|---|
| 243 | #pragma omp parallel for private(i)
|
|---|
| 244 | for (i = 0; i < n; ++i)
|
|---|
| 245 | fft(&a[i * n], brt, w, n, logn, ndv2);
|
|---|
| 246 | /* fft(a + i * n, brt, w, n, logn, ndv2); */
|
|---|
| 247 | return 0;
|
|---|
| 248 | }
|
|---|
| 249 | /* ----------------------------------------------------------------- */
|
|---|
| 250 | int cffts_seq(complex *a, int *brt, complex *w, const int n, int logn, int ndv2) {
|
|---|
| 251 | register int i;
|
|---|
| 252 |
|
|---|
| 253 | for (i = 0; i < n; ++i)
|
|---|
| 254 | fft(&a[i * n], brt, w, n, logn, ndv2);
|
|---|
| 255 | return 0;
|
|---|
| 256 | }
|
|---|
| 257 | /* -----------------------------------------------------------------------
|
|---|
| 258 | Fast Fourier Transform
|
|---|
| 259 | 1D in-place complex-complex decimation-in-time Cooley-Tukey
|
|---|
| 260 | * ----------------------------------------------------------------------- */
|
|---|
| 261 | int fft(complex *a, int *brt, complex *w, int n, int logn, int ndv2) {
|
|---|
| 262 | register int i, j;
|
|---|
| 263 | int powerOfW, stage, first, spowerOfW;
|
|---|
| 264 | int ijDiff;
|
|---|
| 265 | int stride;
|
|---|
| 266 | complex ii, jj, temp, pw;
|
|---|
| 267 |
|
|---|
| 268 | /* bit reverse step */
|
|---|
| 269 | for (i = 0; i < n; ++i) {
|
|---|
| 270 | j = brt[i];
|
|---|
| 271 | if (i < (j-1)) {
|
|---|
| 272 | temp = a[j - 1];
|
|---|
| 273 | a[j - 1] = a[i];
|
|---|
| 274 | a[i] = temp;
|
|---|
| 275 | }
|
|---|
| 276 | }
|
|---|
| 277 |
|
|---|
| 278 | /* butterfly computations */
|
|---|
| 279 | ijDiff = 1;
|
|---|
| 280 | stride = 2;
|
|---|
| 281 | spowerOfW = ndv2;
|
|---|
| 282 | for (stage = 0; stage < logn; ++stage) {
|
|---|
| 283 | /* Invariant: stride = 2 ** stage
|
|---|
| 284 | Invariant: ijDiff = 2 ** (stage - 1) */
|
|---|
| 285 | first = 0;
|
|---|
| 286 | for (powerOfW = 0; powerOfW < ndv2; powerOfW += spowerOfW) {
|
|---|
| 287 | pw = w[powerOfW];
|
|---|
| 288 | /* Invariant: pwr + sqrt(-1)*pwi = W**(powerOfW - 1) */
|
|---|
| 289 | for (i = first; i < n; i += stride) {
|
|---|
| 290 | j = i + ijDiff;
|
|---|
| 291 | jj = a[j];
|
|---|
| 292 | ii = a[i];
|
|---|
| 293 | temp.re = jj.re * pw.re - jj.im * pw.im;
|
|---|
| 294 | temp.im = jj.re * pw.im + jj.im * pw.re;
|
|---|
| 295 | a[j].re = ii.re - temp.re;
|
|---|
| 296 | a[j].im = ii.im - temp.im;
|
|---|
| 297 | a[i].re = ii.re + temp.re;
|
|---|
| 298 | a[i].im = ii.im + temp.im;
|
|---|
| 299 | }
|
|---|
| 300 | ++first;
|
|---|
| 301 | }
|
|---|
| 302 | ijDiff = stride;
|
|---|
| 303 | stride <<= 1;
|
|---|
| 304 | spowerOfW /= 2;
|
|---|
| 305 | }
|
|---|
| 306 | return 0;
|
|---|
| 307 | } /* fft */
|
|---|
| 308 | /* ----------------------------------------------------------------- */
|
|---|
| 309 | int scale(complex *a, complex *v, const int n) {
|
|---|
| 310 | register int i, j, index;
|
|---|
| 311 | complex aa, vv;
|
|---|
| 312 |
|
|---|
| 313 | #pragma omp parallel for private(i, j, index, aa, vv)
|
|---|
| 314 | for (i = 0; i < n; ++i) {
|
|---|
| 315 | for (j = 0; j < n; ++j) {
|
|---|
| 316 | index = i * n + j;
|
|---|
| 317 | aa = a[index];
|
|---|
| 318 | vv = v[index];
|
|---|
| 319 | a[index].re = aa.re * vv.re - aa.im * vv.im;
|
|---|
| 320 | a[index].im = aa.re * vv.im + aa.im * vv.re;
|
|---|
| 321 | }
|
|---|
| 322 | }
|
|---|
| 323 | return 0;
|
|---|
| 324 | }
|
|---|
| 325 | /* ----------------------------------------------------------------- */
|
|---|
| 326 | int scale_seq(complex *a, complex *v, const int n) {
|
|---|
| 327 | register int i, j, index;
|
|---|
| 328 | complex aa, vv;
|
|---|
| 329 |
|
|---|
| 330 | for (i = 0; i < n; ++i) {
|
|---|
| 331 | for (j = 0; j < n; ++j) {
|
|---|
| 332 | index = i * n + j;
|
|---|
| 333 | aa = a[index];
|
|---|
| 334 | vv = v[index];
|
|---|
| 335 | a[index].re = aa.re * vv.re - aa.im * vv.im;
|
|---|
| 336 | a[index].im = aa.re * vv.im + aa.im * vv.re;
|
|---|
| 337 | }
|
|---|
| 338 | }
|
|---|
| 339 | return 0;
|
|---|
| 340 | }
|
|---|
| 341 | /* -----------------------------------------------------------------------
|
|---|
| 342 | chkmat - check the output matrix for correctness
|
|---|
| 343 | * ----------------------------------------------------------------------- */
|
|---|
| 344 | int chkmat(complex *a, int n) {
|
|---|
| 345 | int sign, i, j, nn, errors;
|
|---|
| 346 | real_type EPSILON = 1e-4f;
|
|---|
| 347 |
|
|---|
| 348 | errors = 0;
|
|---|
| 349 | nn = n * n;
|
|---|
| 350 | for (i = 0; i < n; ++i) {
|
|---|
| 351 | sign = 1;
|
|---|
| 352 | for (j = 0; j < n; ++j) {
|
|---|
| 353 | if (a[i * n + j].re > nn * sign + EPSILON)
|
|---|
| 354 | ++errors;
|
|---|
| 355 | if (a[i * n + j].re < nn * sign - EPSILON)
|
|---|
| 356 | ++errors;
|
|---|
| 357 | sign = -sign;
|
|---|
| 358 | }
|
|---|
| 359 | }
|
|---|
| 360 | if (errors > 0) {
|
|---|
| 361 | printf("Errors = %d\n", errors);
|
|---|
| 362 | exit(-1);
|
|---|
| 363 | }
|
|---|
| 364 | return 0;
|
|---|
| 365 | }
|
|---|
| 366 | /* ----------------------------------------------------------------- */
|
|---|
| 367 | int prperf(double tpose_time, double scale_time, double ffts_time, int nn, int lognn, int iters) {
|
|---|
| 368 | double secs;
|
|---|
| 369 | double fpercent, spercent, tpercent;
|
|---|
| 370 | double flops, mflops;
|
|---|
| 371 |
|
|---|
| 372 | tpose_time /= iters;
|
|---|
| 373 | scale_time /= iters;
|
|---|
| 374 | ffts_time /= iters;
|
|---|
| 375 | secs = tpose_time + scale_time + ffts_time;
|
|---|
| 376 | tpercent = tpose_time / secs * 100;
|
|---|
| 377 | spercent = scale_time / secs * 100;
|
|---|
| 378 | fpercent = ffts_time / secs * 100;
|
|---|
| 379 | flops = (real_type) (nn * 5 * lognn);
|
|---|
| 380 | mflops = flops / 1e6f / secs;
|
|---|
| 381 | printf("***********************\n");
|
|---|
| 382 | printf("1D FFT %d points\n" ,nn);
|
|---|
| 383 | printf("***********************\n");
|
|---|
| 384 | printf("Time per input vector:\n");
|
|---|
| 385 | printf("tpose : %lf %lf percent\n", tpose_time, tpercent);
|
|---|
| 386 | printf("scale : %lf %lf percent\n", scale_time, spercent);
|
|---|
| 387 | printf("ffts : %lf %lf percent\n", ffts_time, fpercent);
|
|---|
| 388 | printf("total(s) : %lf\n", secs);
|
|---|
| 389 | printf("mflop/s : %lf\n", mflops);
|
|---|
| 390 | return 0;
|
|---|
| 391 | } /* prperf */
|
|---|
| 392 | /* -----------------------------------------------------------------------
|
|---|
| 393 | Base 2 logarithm
|
|---|
| 394 | * ----------------------------------------------------------------------- */
|
|---|
| 395 | int log2_int(int n) {
|
|---|
| 396 | register int i, aux;
|
|---|
| 397 |
|
|---|
| 398 | aux = 1;
|
|---|
| 399 | for (i = 0; i <= 128; i++) {
|
|---|
| 400 | if (aux > n)
|
|---|
| 401 | return (i - 1);
|
|---|
| 402 | aux <<= 1;
|
|---|
| 403 | }
|
|---|
| 404 | return -1;
|
|---|
| 405 | }
|
|---|
| 406 | /* ----------------------------------------------------------------- */
|
|---|
| 407 | int print_mat(complex *a, int n) {
|
|---|
| 408 | register int i, j;
|
|---|
| 409 |
|
|---|
| 410 | for (i = 0; i < n; ++i) {
|
|---|
| 411 | for (j = 0; j < n; ++j) {
|
|---|
| 412 | printf(" (%.0f, %.0f)", a[i * n + j].re, a[i * n + j].im);
|
|---|
| 413 | }
|
|---|
| 414 | printf("\n");
|
|---|
| 415 | }
|
|---|
| 416 | printf("\n");
|
|---|
| 417 | return 0;
|
|---|
| 418 | }
|
|---|
| 419 | /* ----------------------------------------------------------------- */
|
|---|
| 420 | int print_vec(complex *a, int n) {
|
|---|
| 421 | register int i;
|
|---|
| 422 |
|
|---|
| 423 | for (i = 0; i < n*n; ++i)
|
|---|
| 424 | printf(" (%.0f, %.0f)", a[i].re, a[i].im);
|
|---|
| 425 | printf("\n\n");
|
|---|
| 426 | return 0;
|
|---|
| 427 | }
|
|---|
| 428 | /* ----------------------------------------------------------------- */
|
|---|
| 429 | int main(int argc, char *argv[]) {
|
|---|
| 430 | complex **xin; /* input vector */
|
|---|
| 431 | complex **aux; /* intermediate array (same size) */
|
|---|
| 432 | /* precomputed vectors */
|
|---|
| 433 | int *brt; /* bit reverse table for 1d fft's */
|
|---|
| 434 | complex *w; /* twiddles for 1d fft's */
|
|---|
| 435 | complex *v; /* twiddles for 2d fft */
|
|---|
| 436 | double total_time; /* timing */
|
|---|
| 437 | int k;
|
|---|
| 438 | int NUMTHREADS, MAX_THREADS;
|
|---|
| 439 | #define ID omp_get_thread_num()
|
|---|
| 440 | float MEMORY; /* number of bytes required */
|
|---|
| 441 | // char *PARAM_NAMES[NUM_ARGS] = {"Size (a power of 2).", "No. of iterations"};
|
|---|
| 442 | // char *TIMERS_NAMES[NUM_TIMERS] = {"Total_time", "Tpose time", "Scale time", "Column FFTs time" };
|
|---|
| 443 | // char *DEFAULT_VALUES[NUM_ARGS] = {"64", "10"};
|
|---|
| 444 |
|
|---|
| 445 |
|
|---|
| 446 | NUMTHREADS = 1; //omp_get_num_threads();
|
|---|
| 447 | //OSCR_init (NUMTHREADS, "Bailey's '6-step' Fast Fourier Transform", "Use 'fft6' <size (in K)> <iters>", NUM_ARGS,
|
|---|
| 448 | // PARAM_NAMES, DEFAULT_VALUES , NUM_TIMERS, NUM_TIMERS, TIMERS_NAMES,
|
|---|
| 449 | // argc, argv);
|
|---|
| 450 |
|
|---|
| 451 |
|
|---|
| 452 | /* Command line arguments processing */
|
|---|
| 453 | N = NINIT; // OSCR_getarg_int(1);
|
|---|
| 454 | ITERS = ITERSINIT; // OSCR_getarg_int(2);
|
|---|
| 455 | /*
|
|---|
| 456 | if(argc == 3) {
|
|---|
| 457 | N = atoi(argv[1]);
|
|---|
| 458 | ITERS = atoi(argv[2]);
|
|---|
| 459 | }
|
|---|
| 460 | else {
|
|---|
| 461 | printf("Usage: %s N ITERS\n", argv[0]);
|
|---|
| 462 | printf("N is the size of the input signal. Should be a power of two.\n");
|
|---|
| 463 | N = 64;
|
|---|
| 464 | ITERS = 10;
|
|---|
| 465 | }
|
|---|
| 466 | */
|
|---|
| 467 |
|
|---|
| 468 | MAX_THREADS = 1; //omp_get_num_threads();
|
|---|
| 469 | NN = N * N;
|
|---|
| 470 | LOGN = log2_int(N);
|
|---|
| 471 | LOGNN = log2_int(NN);
|
|---|
| 472 | MEMORY = N * sizeof(int) +
|
|---|
| 473 | 2 * (NN) * sizeof(complex) * MAX_THREADS + (NN) * sizeof(complex) +
|
|---|
| 474 | (N / 2) * sizeof(complex);
|
|---|
| 475 |
|
|---|
| 476 | /* Memory allocation */
|
|---|
| 477 | xin = calloc(MAX_THREADS, sizeof(*xin));
|
|---|
| 478 | aux = calloc(MAX_THREADS, sizeof(*aux));
|
|---|
| 479 | brt = (int *)calloc(N, sizeof(int));
|
|---|
| 480 | v = (complex *)calloc(N * N, sizeof(complex)); /* twiddles for 2d fft */
|
|---|
| 481 | w = (complex *)calloc((N / 2), sizeof(complex)); /* twiddles for 1d fft's */
|
|---|
| 482 |
|
|---|
| 483 | printf("Input values:\n");
|
|---|
| 484 | printf("=============\n");
|
|---|
| 485 | printf("N : %u\n", N);
|
|---|
| 486 | printf("ITERS : %u\n", ITERS);
|
|---|
| 487 | printf("NN : %u\n", NN);
|
|---|
| 488 | printf("LOGN : %u\n", LOGN);
|
|---|
| 489 | printf("LOGNN : %u\n", LOGNN);
|
|---|
| 490 | printf("Memory : %.2f Kbytes\n", MEMORY / 1024);
|
|---|
| 491 |
|
|---|
| 492 | /* precompute the input array and fft constants */
|
|---|
| 493 | gen_bit_reverse_table(brt, N);
|
|---|
| 494 | gen_w_table(w, N, LOGN, N / 2);
|
|---|
| 495 | gen_v_table(v, N);
|
|---|
| 496 |
|
|---|
| 497 | for (k = 0; k < MAX_THREADS; k++) {
|
|---|
| 498 | xin[k] = (complex *)calloc(N * N, sizeof(complex));
|
|---|
| 499 | aux[k] = (complex *)calloc(N * N, sizeof(complex));
|
|---|
| 500 | }
|
|---|
| 501 | #define TOTAL 0
|
|---|
| 502 |
|
|---|
| 503 | //OSCR_timer_start(TOTAL);
|
|---|
| 504 | #pragma omp parallel for private(k)
|
|---|
| 505 | for(k = 0; k < ITERS; k++) {
|
|---|
| 506 | dgen(xin[ID], N);
|
|---|
| 507 |
|
|---|
| 508 | tpose(xin[ID], aux[ID], N);
|
|---|
| 509 |
|
|---|
| 510 | cffts(aux[ID], brt, w, N, LOGN, N / 2);
|
|---|
| 511 |
|
|---|
| 512 | scale(aux[ID], v, N);
|
|---|
| 513 |
|
|---|
| 514 | tpose(aux[ID], xin[ID], N);
|
|---|
| 515 |
|
|---|
| 516 | cffts(xin[ID], brt, w, N, LOGN, N / 2);
|
|---|
| 517 |
|
|---|
| 518 | tpose(xin[ID], aux[ID], N);
|
|---|
| 519 |
|
|---|
| 520 | chkmat(aux[ID], N);
|
|---|
| 521 | }
|
|---|
| 522 | //OSCR_timer_stop(TOTAL);
|
|---|
| 523 | total_time = 1; //OSCR_timer_read(TOTAL);
|
|---|
| 524 |
|
|---|
| 525 | /* Display results and time */
|
|---|
| 526 | //OSCR_report(1, TIMERS_NAMES);
|
|---|
| 527 |
|
|---|
| 528 | printf("============================");
|
|---|
| 529 | printf("\n# THREADS : %d\n", NUMTHREADS);
|
|---|
| 530 | printf("N : %d\n", N);
|
|---|
| 531 | printf("ITERS : %d\n", ITERS);
|
|---|
| 532 | printf("total TIME: %.6lf secs.\n", total_time);
|
|---|
| 533 |
|
|---|
| 534 | #undef ID
|
|---|
| 535 | #undef MAX_THREADS
|
|---|
| 536 | return 0;
|
|---|
| 537 | }
|
|---|
| 538 | /* ----------------------------------------------------------------- */
|
|---|
| 539 |
|
|---|
| 540 | /*
|
|---|
| 541 | * vim:ts=2:sw=2:set nonu:
|
|---|
| 542 | */
|
|---|