Context Navigation

c_fft6.c@ 1aaefd4

main test-branch

Last change on this file since 1aaefd4 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 17.1 KB

Line
1	/* ***********************************************************************
2	This program is part of the
3	OpenMP Source Code Repository
4
5	http://www.pcg.ull.es/ompscr/
6	e-mail: ompscr@etsii.ull.es
7
8	This program is free software; you can redistribute it and/or modify
9	it under the terms of the GNU General Public License as published by
10	the Free Software Foundation; either version 2 of the License, or
11	(at your option) any later version.
12
13	This program is distributed in the hope that it will be useful,
14	but WITHOUT ANY WARRANTY; without even the implied warranty of
15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	GNU General Public License for more details.
17
18	You should have received a copy of the GNU General Public License
19	(LICENSE file) along with this program; if not, write to
20	the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
21	Boston, MA 02111-1307 USA
22
23	FILE: c_fft6.c
24	VERSION: 1.0
25	DATE: May 2004
26	AUTHOR: F. de Sande
27	COMMENTS TO: sande@csi.ull.es
28	DESCRIPTION: Bailey's 6-step 1D Fast Fourier Transform
29	Computes the 1D FFT of an input signal (complex array)
30	COMMENTS: The code performs a number (ITERS) of iterations of the
31	Bailey's 6-step FFT algorithm (following the ideas in the
32	CMU Task parallel suite).
33	1.- Generates an input signal vector (dgen) with size
34	n=n1xn2 stored in row major order
35	In this code the size of the input signal
36	is NN=NxN (n=NN, n1=n2=N)
37	2.- Transpose (tpose) A to have it stored in column
38
39	major order
40	3.- Perform independent FFTs on the rows (cffts)
41	4.- Scale each element of the resulting array by a
42	factor of w[n]*(pq)
43	5.- Transpose (tpose) to prepair it for the next step
44	6.- Perform independent FFTs on the rows (cffts)
45	7.- Transpose the resulting matrix
46	The code requires nested Parallelism.
47	REFERENCES: David H. Bailey
48	FFTs in External or Hierarchical Memory,
49	The Journal of Supercomputing, vol. 4, no. 1, pg. 23-35. Mar 1990,
50	vol. 4, no. 7, pg. 321. Jul 1961
51	http://www-2.cs.cmu.edu/~fx/tpsuite.html
52	http://en.wikipedia.org/wiki/Cooley-Tukey_FFT_algorithm
53	BASIC PRAGMAS: parallel for
54	USAGE: ./c_fft6.par 1024 10
55	INPUT: The size of the input signal.
56	The number of iterations to perform
57	OUTPUT: The code tests the correctness of the output signal
58	FILE FORMATS: -
59	RESTRICTIONS: The size of the input signal MUST BE a power of 2
60	REVISION HISTORY:
61	**************************************************************************/
62	//#include "OmpSCR.h"
63	#include <omp.h>
64	#include <stdlib.h>
65	#include <math.h>
66	#include <stdio.h>
67
68	#define PI (3.141592653589793f)
69	#define NUM_ARGS 2
70	#define NUM_TIMERS 4
71	typedef double real_type;
72
73	#define NINIT 2
74	#define ITERSINIT 2
75
76	/* Input values (CONSTANTS) */
77	unsigned NN; /* input vector is NN complex values */
78	unsigned LOGNN; /* log base 2 of NN */
79	unsigned N; /* square root of NN */
80	unsigned LOGN; /* log base 2 of N */
81	int ITERS; /* number of input vectors */
82
83	typedef struct { real_type re, im; } complex;
84	/* -----------------------------------------------------------------------
85	PROTOTYPES
86	* ----------------------------------------------------------------------- */
87	int prperf(double tpose_time, double scale_time, double ffts_time, int nn, int lognn, int iters);
88	int gen_bit_reverse_table(int *brt, int n);
89	int gen_w_table(complex *w, int n, int logn, int ndv2);
90	int gen_v_table(complex *v, int n);
91	int dgen(complex *xin, int n);
92	int tpose(complex a, complex b, int n);
93	int tpose_seq(complex a, complex b, const int n);
94	int cffts(complex a, int brt, complex *w, int n, int logn, int ndv2);
95	int cffts_seq(complex a, int brt, complex *w, const int n, int logn, int ndv2);
96	int fft(complex a, int brt, complex *w, int n, int logn, int ndv2);
97	int scale(complex a, complex v, int n);
98	int scale_seq(complex a, complex v, const int n);
99	int chkmat(complex *a, int n);
100	complex complex_pow(complex z, int n);
101	int log2_int(int n);
102	int print_mat(complex *a, int n);
103	int print_vec(complex *a, int n);
104	/* -----------------------------------------------------------------
105	IMPLEMENTATION
106	* ----------------------------------------------------------------- */
107	/* -----------------------------------------------------------------
108	gen_bit_reverse_table - initialize bit reverse table
109	Postcondition: br(i) = bit-reverse(i-1) + 1
110	----------------------------------------------------------------- */
111	int gen_bit_reverse_table(int *brt, int n) {
112	register int i, j, k, nv2;
113
114	nv2 = n / 2;
115	j = 1;
116	brt[0] = j;
117	for (i = 1; i < n; ++i) {
118	k = nv2;
119	while (k < j) {
120	j -= k;
121	k /= 2;
122	}
123	j += k;
124	brt[i] = j;
125	}
126	return 0;
127	}
128	/* -----------------------------------------------------------------------
129	gen_w_table: generate powers of w.
130	postcondition: w(i) = w**(i-1)
131	* ----------------------------------------------------------------------- */
132	int gen_w_table(complex *w, int n, int logn, int ndv2) {
133	register int i;
134	complex ww, pt;
135
136	ww.re = cos(PI / ndv2);
137	ww.im = -sin(PI / ndv2);
138	w[0].re = 1.f;
139	w[0].im = 0.f;
140	pt.re = 1.f;
141	pt.im = 0.f;
142	for (i = 1; i < ndv2; ++i) {
143	w[i].re = pt.re * ww.re - pt.im * ww.im;
144	w[i].im = pt.re * ww.im + pt.im * ww.re;
145	pt = w[i];
146	}
147	return 0;
148	}
149	/* -----------------------------------------------------------------------
150	gen_v_table - gen 2d twiddle factors
151	* ----------------------------------------------------------------------- */
152	int gen_v_table(complex *v, int n) {
153	register int j, k;
154	complex wn;
155
156	wn.re = cos(PI * 2.f / (n * n));
157	wn.im = -sin(PI * 2.f / (n * n));
158	for (j = 0; j < n; ++j) {
159	for (k = 0; k < n; ++k) {
160	v[j * n + k] = complex_pow(wn, j * k);
161	}
162	}
163	return 0;
164	}
165	/* -----------------------------------------------------------------------
166	z^n = modulo^n (cos(n * alfa) + i sin(n * alfa)
167	* ----------------------------------------------------------------------- */
168	complex complex_pow(complex z, int n) {
169	complex temp;
170	real_type pot, nangulo, modulo, angulo;
171
172	modulo = sqrt(z.re * z.re + z.im * z.im);
173	angulo = atan(z.im / z.re);
174	pot = pow(modulo, n);
175	nangulo = n * angulo;
176
177	temp.re = pot * cos(nangulo);
178	temp.im = pot * sin(nangulo);
179	return(temp);
180	}
181	/* -----------------------------------------------------------------------
182	A: Para una señal de entrada de tamaño N y de forma
183	(1,0), (1,0), ..., (1,0)
184	la salida debe ser de la forma
185	(N,0), (0,0), ..., (0,0)
186
187	B: Para una señal de entrada de tamaño N y de forma
188	(0,0), (0,0), ..., (N*N, 0), ..., (0,0)
189	la salida debe ser de la forma (N*N, 0), (0,0), ..., (0,0)
190
191	Propiedad útil: C: FFT(s1 + s2) = FFT(s1) + FFT(s2)
192
193	* ----------------------------------------------------------------------- */
194	int dgen(complex *xin, int n) {
195	register int i;
196	int nn = n * n;
197	/* Señal de forma B */
198	for (i = 0; i < nn; ++i) {
199	xin[i].re = 0.f;
200	xin[i].im = 0.f;
201	}
202	xin[nn / 2].re = (real_type)nn;
203
204	/* Señal de forma A */
205	/*
206	for (i = 0; i < nn; ++i) {
207	xin[i].re = 1.0;
208	xin[i].im = 0.f;
209	}
210	*/
211	return 0;
212	}
213	/* ----------------------------------------------------------------- */
214	int tpose(complex a, complex b, const int n) {
215	register int i, j;
216
217	#pragma omp parallel for private(i, j)
218	for (i = 0; i < n; ++i) {
219	for (j = i; j < n; ++j) {
220	b[i * n + j] = a[j * n + i];
221	b[j * n + i] = a[i * n + j];
222	}
223	}
224	return 0;
225	}
226
227	/* ----------------------------------------------------------------- */
228	int tpose_seq(complex a, complex b, const int n) {
229	register int i, j;
230
231	for (i = 0; i < n; ++i) {
232	for (j = i; j < n; ++j) {
233	b[i * n + j] = a[j * n + i];
234	b[j * n + i] = a[i * n + j];
235	}
236	}
237	return 0;
238	}
239	/* ----------------------------------------------------------------- */
240	int cffts(complex a, int brt, complex *w, const int n, int logn, int ndv2) {
241	register int i;
242
243	#pragma omp parallel for private(i)
244	for (i = 0; i < n; ++i)
245	fft(&a[i * n], brt, w, n, logn, ndv2);
246	/* fft(a + i * n, brt, w, n, logn, ndv2); */
247	return 0;
248	}
249	/* ----------------------------------------------------------------- */
250	int cffts_seq(complex a, int brt, complex *w, const int n, int logn, int ndv2) {
251	register int i;
252
253	for (i = 0; i < n; ++i)
254	fft(&a[i * n], brt, w, n, logn, ndv2);
255	return 0;
256	}
257	/* -----------------------------------------------------------------------
258	Fast Fourier Transform
259	1D in-place complex-complex decimation-in-time Cooley-Tukey
260	* ----------------------------------------------------------------------- */
261	int fft(complex a, int brt, complex *w, int n, int logn, int ndv2) {
262	register int i, j;
263	int powerOfW, stage, first, spowerOfW;
264	int ijDiff;
265	int stride;
266	complex ii, jj, temp, pw;
267
268	/* bit reverse step */
269	for (i = 0; i < n; ++i) {
270	j = brt[i];
271	if (i < (j-1)) {
272	temp = a[j - 1];
273	a[j - 1] = a[i];
274	a[i] = temp;
275	}
276	}
277
278	/* butterfly computations */
279	ijDiff = 1;
280	stride = 2;
281	spowerOfW = ndv2;
282	for (stage = 0; stage < logn; ++stage) {
283	/* Invariant: stride = 2 ** stage
284	Invariant: ijDiff = 2 ** (stage - 1) */
285	first = 0;
286	for (powerOfW = 0; powerOfW < ndv2; powerOfW += spowerOfW) {
287	pw = w[powerOfW];
288	/* Invariant: pwr + sqrt(-1)pwi = W(powerOfW - 1) /
289	for (i = first; i < n; i += stride) {
290	j = i + ijDiff;
291	jj = a[j];
292	ii = a[i];
293	temp.re = jj.re * pw.re - jj.im * pw.im;
294	temp.im = jj.re * pw.im + jj.im * pw.re;
295	a[j].re = ii.re - temp.re;
296	a[j].im = ii.im - temp.im;
297	a[i].re = ii.re + temp.re;
298	a[i].im = ii.im + temp.im;
299	}
300	++first;
301	}
302	ijDiff = stride;
303	stride <<= 1;
304	spowerOfW /= 2;
305	}
306	return 0;
307	} /* fft */
308	/* ----------------------------------------------------------------- */
309	int scale(complex a, complex v, const int n) {
310	register int i, j, index;
311	complex aa, vv;
312
313	#pragma omp parallel for private(i, j, index, aa, vv)
314	for (i = 0; i < n; ++i) {
315	for (j = 0; j < n; ++j) {
316	index = i * n + j;
317	aa = a[index];
318	vv = v[index];
319	a[index].re = aa.re * vv.re - aa.im * vv.im;
320	a[index].im = aa.re * vv.im + aa.im * vv.re;
321	}
322	}
323	return 0;
324	}
325	/* ----------------------------------------------------------------- */
326	int scale_seq(complex a, complex v, const int n) {
327	register int i, j, index;
328	complex aa, vv;
329
330	for (i = 0; i < n; ++i) {
331	for (j = 0; j < n; ++j) {
332	index = i * n + j;
333	aa = a[index];
334	vv = v[index];
335	a[index].re = aa.re * vv.re - aa.im * vv.im;
336	a[index].im = aa.re * vv.im + aa.im * vv.re;
337	}
338	}
339	return 0;
340	}
341	/* -----------------------------------------------------------------------
342	chkmat - check the output matrix for correctness
343	* ----------------------------------------------------------------------- */
344	int chkmat(complex *a, int n) {
345	int sign, i, j, nn, errors;
346	real_type EPSILON = 1e-4f;
347
348	errors = 0;
349	nn = n * n;
350	for (i = 0; i < n; ++i) {
351	sign = 1;
352	for (j = 0; j < n; ++j) {
353	if (a[i * n + j].re > nn * sign + EPSILON)
354	++errors;
355	if (a[i * n + j].re < nn * sign - EPSILON)
356	++errors;
357	sign = -sign;
358	}
359	}
360	if (errors > 0) {
361	printf("Errors = %d\n", errors);
362	exit(-1);
363	}
364	return 0;
365	}
366	/* ----------------------------------------------------------------- */
367	int prperf(double tpose_time, double scale_time, double ffts_time, int nn, int lognn, int iters) {
368	double secs;
369	double fpercent, spercent, tpercent;
370	double flops, mflops;
371
372	tpose_time /= iters;
373	scale_time /= iters;
374	ffts_time /= iters;
375	secs = tpose_time + scale_time + ffts_time;
376	tpercent = tpose_time / secs * 100;
377	spercent = scale_time / secs * 100;
378	fpercent = ffts_time / secs * 100;
379	flops = (real_type) (nn * 5 * lognn);
380	mflops = flops / 1e6f / secs;
381	printf("***********************\n");
382	printf("1D FFT %d points\n" ,nn);
383	printf("***********************\n");
384	printf("Time per input vector:\n");
385	printf("tpose : %lf %lf percent\n", tpose_time, tpercent);
386	printf("scale : %lf %lf percent\n", scale_time, spercent);
387	printf("ffts : %lf %lf percent\n", ffts_time, fpercent);
388	printf("total(s) : %lf\n", secs);
389	printf("mflop/s : %lf\n", mflops);
390	return 0;
391	} /* prperf */
392	/* -----------------------------------------------------------------------
393	Base 2 logarithm
394	* ----------------------------------------------------------------------- */
395	int log2_int(int n) {
396	register int i, aux;
397
398	aux = 1;
399	for (i = 0; i <= 128; i++) {
400	if (aux > n)
401	return (i - 1);
402	aux <<= 1;
403	}
404	return -1;
405	}
406	/* ----------------------------------------------------------------- */
407	int print_mat(complex *a, int n) {
408	register int i, j;
409
410	for (i = 0; i < n; ++i) {
411	for (j = 0; j < n; ++j) {
412	printf(" (%.0f, %.0f)", a[i * n + j].re, a[i * n + j].im);
413	}
414	printf("\n");
415	}
416	printf("\n");
417	return 0;
418	}
419	/* ----------------------------------------------------------------- */
420	int print_vec(complex *a, int n) {
421	register int i;
422
423	for (i = 0; i < n*n; ++i)
424	printf(" (%.0f, %.0f)", a[i].re, a[i].im);
425	printf("\n\n");
426	return 0;
427	}
428	/* ----------------------------------------------------------------- */
429	int main(int argc, char *argv[]) {
430	complex *xin; / input vector */
431	complex *aux; / intermediate array (same size) */
432	/* precomputed vectors */
433	int brt; / bit reverse table for 1d fft's */
434	complex w; / twiddles for 1d fft's */
435	complex v; / twiddles for 2d fft */
436	double total_time; /* timing */
437	int k;
438	int NUMTHREADS, MAX_THREADS;
439	#define ID omp_get_thread_num()
440	float MEMORY; /* number of bytes required */
441	// char *PARAM_NAMES[NUM_ARGS] = {"Size (a power of 2).", "No. of iterations"};
442	// char *TIMERS_NAMES[NUM_TIMERS] = {"Total_time", "Tpose time", "Scale time", "Column FFTs time" };
443	// char *DEFAULT_VALUES[NUM_ARGS] = {"64", "10"};
444
445
446	NUMTHREADS = 1; //omp_get_num_threads();
447	//OSCR_init (NUMTHREADS, "Bailey's '6-step' Fast Fourier Transform", "Use 'fft6' <size (in K)> <iters>", NUM_ARGS,
448	// PARAM_NAMES, DEFAULT_VALUES , NUM_TIMERS, NUM_TIMERS, TIMERS_NAMES,
449	// argc, argv);
450
451
452	/* Command line arguments processing */
453	N = NINIT; // OSCR_getarg_int(1);
454	ITERS = ITERSINIT; // OSCR_getarg_int(2);
455	/*
456	if(argc == 3) {
457	N = atoi(argv[1]);
458	ITERS = atoi(argv[2]);
459	}
460	else {
461	printf("Usage: %s N ITERS\n", argv[0]);
462	printf("N is the size of the input signal. Should be a power of two.\n");
463	N = 64;
464	ITERS = 10;
465	}
466	*/
467
468	MAX_THREADS = 1; //omp_get_num_threads();
469	NN = N * N;
470	LOGN = log2_int(N);
471	LOGNN = log2_int(NN);
472	MEMORY = N * sizeof(int) +
473	2 * (NN) * sizeof(complex) * MAX_THREADS + (NN) * sizeof(complex) +
474	(N / 2) * sizeof(complex);
475
476	/* Memory allocation */
477	xin = calloc(MAX_THREADS, sizeof(*xin));
478	aux = calloc(MAX_THREADS, sizeof(*aux));
479	brt = (int *)calloc(N, sizeof(int));
480	v = (complex )calloc(N N, sizeof(complex)); /* twiddles for 2d fft */
481	w = (complex )calloc((N / 2), sizeof(complex)); / twiddles for 1d fft's */
482
483	printf("Input values:\n");
484	printf("=============\n");
485	printf("N : %u\n", N);
486	printf("ITERS : %u\n", ITERS);
487	printf("NN : %u\n", NN);
488	printf("LOGN : %u\n", LOGN);
489	printf("LOGNN : %u\n", LOGNN);
490	printf("Memory : %.2f Kbytes\n", MEMORY / 1024);
491
492	/* precompute the input array and fft constants */
493	gen_bit_reverse_table(brt, N);
494	gen_w_table(w, N, LOGN, N / 2);
495	gen_v_table(v, N);
496
497	for (k = 0; k < MAX_THREADS; k++) {
498	xin[k] = (complex )calloc(N N, sizeof(complex));
499	aux[k] = (complex )calloc(N N, sizeof(complex));
500	}
501	#define TOTAL 0
502
503	//OSCR_timer_start(TOTAL);
504	#pragma omp parallel for private(k)
505	for(k = 0; k < ITERS; k++) {
506	dgen(xin[ID], N);
507
508	tpose(xin[ID], aux[ID], N);
509
510	cffts(aux[ID], brt, w, N, LOGN, N / 2);
511
512	scale(aux[ID], v, N);
513
514	tpose(aux[ID], xin[ID], N);
515
516	cffts(xin[ID], brt, w, N, LOGN, N / 2);
517
518	tpose(xin[ID], aux[ID], N);
519
520	chkmat(aux[ID], N);
521	}
522	//OSCR_timer_stop(TOTAL);
523	total_time = 1; //OSCR_timer_read(TOTAL);
524
525	/* Display results and time */
526	//OSCR_report(1, TIMERS_NAMES);
527
528	printf("============================");
529	printf("\n# THREADS : %d\n", NUMTHREADS);
530	printf("N : %d\n", N);
531	printf("ITERS : %d\n", ITERS);
532	printf("total TIME: %.6lf secs.\n", total_time);
533
534	#undef ID
535	#undef MAX_THREADS
536	return 0;
537	}
538	/* ----------------------------------------------------------------- */
539
540	/*
541	* vim:ts=2:sw=2:set nonu:
542	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/omp/c_fft6.c@ 1aaefd4

Download in other formats: