Context Navigation

omp_SH_to_spat.gen.c

main

Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 17.8 KB

Line
1	/*
2	* Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3	* written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4	*
5	* nathanael.schaeffer@ujf-grenoble.fr
6	*
7	* This software is governed by the CeCILL license under French law and
8	* abiding by the rules of distribution of free software. You can use,
9	* modify and/or redistribute the software under the terms of the CeCILL
10	* license as circulated by CEA, CNRS and INRIA at the following URL
11	* "http://www.cecill.info".
12	*
13	* The fact that you are presently reading this means that you have had
14	* knowledge of the CeCILL license and that you accept its terms.
15	*
16	*/
17
18	# This file is meta-code for SHT.c (spherical harmonic transform).
19	# it is intended for "make" to generate C code for similar SHT functions,
20	# from one generic function + tags.
21	# > See Makefile and SHT.c
22	# Basically, there are tags at the beginning of lines that are information
23	# to keep or remove the line depending on the function to build.
24	# tags :
25	# Q : line for scalar transform
26	# V : line for vector transform (both spheroidal and toroidal)
27	# S : line for vector transfrom, spheroidal component
28	# T : line for vector transform, toroidal component.
29
30	static
31	3 void GEN3(_sy3,NWAY,SUFFIX)(shtns_cfg shtns, cplx Qlm, cplx Slm, cplx Tlm, v2d BrF, v2d BtF, v2d BpF, const long int llim, const int imlim) {
32	QX void GEN3(_sy1,NWAY,SUFFIX)(shtns_cfg shtns, cplx Qlm, v2d BrF, const long int llim, const int imlim) {
33	#ifndef SHT_GRAD
34	VX void GEN3(_sy2,NWAY,SUFFIX)(shtns_cfg shtns, cplx Slm, cplx Tlm, v2d BtF, v2d BpF, const long int llim, const int imlim) {
35	#else
36	S void GEN3(_sy1s,NWAY,SUFFIX)(shtns_cfg shtns, cplx Slm, v2d BtF, v2d *BpF, const long int llim, const int imlim) {
37	T void GEN3(_sy1t,NWAY,SUFFIX)(shtns_cfg shtns, cplx Tlm, v2d BtF, v2d *BpF, const long int llim, const int imlim) {
38	#endif
39
40	#ifndef SHT_AXISYM
41	Q #define BR0(i) ((double )BrF)[2(i)]
42	V #define BT0(i) ((double )BtF)[2(i)]
43	V #define BP0(i) ((double )BpF)[2(i)]
44	Q #define qr(l) vall(creal(Ql[l]))
45	Q #define qi(l) vall(cimag(Ql[l]))
46	S #define sr(l) vall(creal(Sl[l]))
47	S #define si(l) vall(cimag(Sl[l]))
48	T #define tr(l) vall(creal(Tl[l]))
49	T #define ti(l) vall(cimag(Tl[l]))
50	V double m_1;
51	unsigned im;
52	#else
53	Q #define BR0(i) ((double *)BrF)[i]
54	S #define BT0(i) ((double *)BtF)[i]
55	T #define BP0(i) ((double *)BpF)[i]
56	#endif
57	unsigned m0, mstep;
58	long int nk,k,l,m;
59	double alm, al;
60	double ct, st;
61	Q double Ql0[llim+1];
62	S double Sl0[llim];
63	T double Tl0[llim];
64
65	ct = shtns->ct; st = shtns->st;
66	nk = NLAT_2;
67	#if _GCC_VEC_
68	nk = ((unsigned)(nk+VSIZE2-1)) / VSIZE2;
69	#endif
70
71	#ifndef _OPENMP
72	m0 = 0; mstep = 1;
73	#else
74	m0 = omp_get_thread_num();
75	mstep = omp_get_num_threads();
76	if (m0 == 0)
77	#endif
78	{ // im=0;
79	#ifdef SHT_GRAD
80	#ifndef SHT_AXISYM
81	#ifdef _GCC_VEC_
82	S k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT_2);
83	T k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT_2);
84	#else
85	S k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT);
86	T k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT);
87	#endif
88	#else
89	S if (BpF != NULL) { int k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT_2); }
90	T if (BtF != NULL) { int k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT_2); }
91	#endif
92	#endif
93	l=1;
94	alm = shtns->alm;
95	Q Ql0[0] = (double) Qlm[0]; // l=0
96	do { // for m=0, compress the complex Q,S,T to double
97	Q Ql0[l] = (double) Qlm[l]; // Ql[l+1] = (double) Qlm[l+1];
98	S Sl0[l-1] = (double) Slm[l]; // Sl[l] = (double) Slm[l+1];
99	T Tl0[l-1] = (double) Tlm[l]; // Tl[l] = (double) Tlm[l+1];
100	++l;
101	} while(l<=llim);
102	k=0;
103	do {
104	l=0; al = alm;
105	rnd cost[NWAY], y0[NWAY], y1[NWAY];
106	V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
107	Q rnd re[NWAY], ro[NWAY];
108	S rnd te[NWAY], to[NWAY];
109	T rnd pe[NWAY], po[NWAY];
110	for (int j=0; j<NWAY; ++j) {
111	cost[j] = vread(ct, j+k);
112	V sint[j] = -vread(st, j+k);
113	y0[j] = vall(al[0]);
114	V dy0[j] = vall(0.0);
115	Q re[j] = y0[j] * vall(Ql0[0]);
116	S to[j] = dy0[j];
117	T po[j] = dy0[j];
118	}
119	for (int j=0; j<NWAY; ++j) {
120	y1[j] = vall(al[0]al[1]) cost[j];
121	V dy1[j] = vall(al[0]al[1]) sint[j];
122	}
123	for (int j=0; j<NWAY; ++j) {
124	Q ro[j] = y1[j] * vall(Ql0[1]);
125	S te[j] = dy1[j] * vall(Sl0[0]);
126	T pe[j] = -dy1[j] * vall(Tl0[0]);
127	}
128	al+=2; l+=2;
129	while(l<llim) {
130	for (int j=0; j<NWAY; ++j) {
131	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]sint[j]) + vall(al[0])dy0[j];
132	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
133	}
134	for (int j=0; j<NWAY; ++j) {
135	Q re[j] += y0[j] * vall(Ql0[l]);
136	S to[j] += dy0[j] * vall(Sl0[l-1]);
137	T po[j] -= dy0[j] * vall(Tl0[l-1]);
138	}
139	for (int j=0; j<NWAY; ++j) {
140	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]sint[j]) + vall(al[2])dy1[j];
141	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
142	}
143	for (int j=0; j<NWAY; ++j) {
144	Q ro[j] += y1[j] * vall(Ql0[l+1]);
145	S te[j] += dy1[j] * vall(Sl0[l]);
146	T pe[j] -= dy1[j] * vall(Tl0[l]);
147	}
148	al+=4; l+=2;
149	}
150	if (l==llim) {
151	for (int j=0; j<NWAY; ++j) {
152	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]sint[j]) + vall(al[0])dy0[j];
153	y0[j] = vall(al[1])cost[j]y1[j] + vall(al[0])*y0[j];
154	}
155	for (int j=0; j<NWAY; ++j) {
156	Q re[j] += y0[j] * vall(Ql0[l]);
157	S to[j] += dy0[j] * vall(Sl0[l-1]);
158	T po[j] -= dy0[j] * vall(Tl0[l-1]);
159	}
160	}
161	#if _GCC_VEC_
162	for (int j=0; j<NWAY; ++j) {
163	Q S2D_STORE(BrF, j+k, re[j], ro[j])
164	S S2D_STORE(BtF, j+k, te[j], to[j])
165	T S2D_STORE(BpF, j+k, pe[j], po[j])
166	}
167	#else
168	for (int j=0; j<NWAY; ++j) {
169	Q BR0(k+j) = (re[j]+ro[j]);
170	Q BR0(NLAT-k-1-j) = (re[j]-ro[j]);
171	S BT0(k+j) = (te[j]+to[j]);
172	S BT0(NLAT-k-1-j) = (te[j]-to[j]);
173	T BP0(k+j) = (pe[j]+po[j]);
174	T BP0(NLAT-k-1-j) = (pe[j]-po[j]);
175	}
176	#endif
177	k+=NWAY;
178	} while (k < nk);
179	m0=mstep;
180	}
181
182	#ifndef SHT_AXISYM
183	#if _GCC_VEC_
184	Q BrF += m0*NLAT_2;
185	V BtF += m0NLAT_2; BpF += m0NLAT_2;
186	#else
187	Q BrF += m0*NLAT;
188	V BtF += m0NLAT; BpF += m0NLAT;
189	#endif
190	for (im=m0; im<imlim; im+=mstep) {
191	m = im*MRES;
192	//l = LiM(shtns, 0,im);
193	l = (im(2(LMAX+1)-(m+MRES)))>>1;
194	V m_1 = 1.0/m;
195	//alm = shtns->alm[im];
196	alm = shtns->alm + im(2LMAX -m+MRES);
197	Q cplx* Ql = &Qlm[l]; // virtual pointer for l=0 and im
198	S cplx* Sl = &Slm[l]; // virtual pointer for l=0 and im
199	T cplx* Tl = &Tlm[l];
200	k=0; l=shtns->tm[im];
201	#if _GCC_VEC_
202	l>>=1; // stay on a 16 byte boundary
203	while (k<l) { // polar optimization
204	Q BrF[k] = vdup(0.0); BrF[(NPHI-2im)NLAT_2 + k] = vdup(0.0);
205	Q BrF[NLAT_2-l+k] = vdup(0.0); BrF[(NPHI+1-2im)NLAT_2 -l+k] = vdup(0.0);
206	V BtF[k] = vdup(0.0); BtF[(NPHI-2im)NLAT_2 + k] = vdup(0.0);
207	V BtF[NLAT_2-l+k] = vdup(0.0); BtF[(NPHI+1-2im)NLAT_2 -l+k] = vdup(0.0);
208	V BpF[k] = vdup(0.0); BpF[(NPHI-2im)NLAT_2 + k] = vdup(0.0);
209	V BpF[NLAT_2-l+k] = vdup(0.0); BpF[(NPHI+1-2im)NLAT_2 -l+k] = vdup(0.0);
210	++k;
211	}
212	k = ((unsigned) k) / (VSIZE2/2);
213	#else
214	while (k<l) { // polar optimization
215	Q BrF[k] = 0.0; BrF[NLAT-l+k] = 0.0;
216	V BtF[k] = 0.0; BtF[NLAT-l+k] = 0.0;
217	V BpF[k] = 0.0; BpF[NLAT-l+k] = 0.0;
218	++k;
219	}
220	#endif
221	do {
222	al = alm;
223	rnd cost[NWAY], y0[NWAY], y1[NWAY];
224	V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
225	Q rnd rer[NWAY], rei[NWAY], ror[NWAY], roi[NWAY];
226	V rnd ter[NWAY], tei[NWAY], tor[NWAY], toi[NWAY];
227	V rnd per[NWAY], pei[NWAY], por[NWAY], poi[NWAY];
228	for (int j=0; j<NWAY; ++j) {
229	cost[j] = vread(st, k+j);
230	y0[j] = vall(1.0);
231	V st2[j] = cost[j]cost[j]vall(-m_1);
232	V y0[j] = vall(m); // for the vector transform, compute ylm*m/sint
233	}
234	Q l=m;
235	V l=m-1;
236	long int ny = 0;
237	if ((int)llim <= SHT_L_RESCALE_FLY) {
238	do { // sin(theta)^m
239	if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
240	for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
241	} while(l >>= 1);
242	} else {
243	long int nsint = 0;
244	do { // sin(theta)^m (use rescaling to avoid underflow)
245	if (l&1) {
246	for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
247	ny += nsint;
248	if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
249	ny--;
250	for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
251	}
252	}
253	for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
254	nsint += nsint;
255	if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
256	nsint--;
257	for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
258	}
259	} while(l >>= 1);
260	}
261	for (int j=0; j<NWAY; ++j) {
262	y0[j] *= vall(al[0]);
263	cost[j] = vread(ct, j+k);
264	V dy0[j] = cost[j]*y0[j];
265	Q ror[j] = vall(0.0); roi[j] = vall(0.0);
266	Q rer[j] = vall(0.0); rei[j] = vall(0.0);
267	}
268	for (int j=0; j<NWAY; ++j) {
269	y1[j] = (vall(al[1])y0[j]) cost[j]; // y1[j] = vall(al[1])cost[j]y0[j];
270	V por[j] = vall(0.0); tei[j] = vall(0.0);
271	V tor[j] = vall(0.0); pei[j] = vall(0.0);
272	V dy1[j] = (vall(al[1])y0[j]) (cost[j]cost[j] + st2[j]); // dy1[j] = vall(al[1])(cost[j]dy0[j] - y0[j]st2[j]);
273	V poi[j] = vall(0.0); ter[j] = vall(0.0);
274	V toi[j] = vall(0.0); per[j] = vall(0.0);
275	}
276	l=m; al+=2;
277	while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
278	for (int j=0; j<NWAY; ++j) {
279	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
280	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]st2[j]) + vall(al[0])dy0[j];
281	}
282	for (int j=0; j<NWAY; ++j) {
283	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
284	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]st2[j]) + vall(al[2])dy1[j];
285	}
286	l+=2; al+=4;
287	if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
288	++ny;
289	for (int j=0; j<NWAY; ++j) {
290	y0[j] = vall(1.0/SHT_SCALE_FACTOR); y1[j] = vall(1.0/SHT_SCALE_FACTOR);
291	V dy0[j] = vall(1.0/SHT_SCALE_FACTOR); dy1[j] = vall(1.0/SHT_SCALE_FACTOR);
292	}
293	}
294	}
295	if (ny == 0) {
296	while (l<llim) { // compute even and odd parts
297	Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
298	Q for (int j=0; j<NWAY; ++j) { ror[j] += y1[j] * qr(l+1); roi[j] += y1[j] * qi(l+1); }
299	#ifdef SHT_GRAD
300	S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
301	S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
302	T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
303	T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
304	S for (int j=0; j<NWAY; ++j) { ter[j] += dy1[j] * sr(l+1); poi[j] += y1[j] * sr(l+1); }
305	S for (int j=0; j<NWAY; ++j) { tei[j] += dy1[j] * si(l+1); por[j] -= y1[j] * si(l+1); }
306	T for (int j=0; j<NWAY; ++j) { per[j] -= dy1[j] * tr(l+1); toi[j] += y1[j] * tr(l+1); }
307	T for (int j=0; j<NWAY; ++j) { pei[j] -= dy1[j] * ti(l+1); tor[j] -= y1[j] * ti(l+1); }
308	#else
309	V for (int j=0; j<NWAY; ++j) {
310	V tor[j] += dy0[j] * sr(l) - y1[j] * ti(l+1);
311	V pei[j] += y0[j] * sr(l) - dy1[j] * ti(l+1);
312	V }
313	V for (int j=0; j<NWAY; ++j) {
314	V poi[j] -= dy0[j] * ti(l) - y1[j] * sr(l+1);
315	V ter[j] -= y0[j] * ti(l) - dy1[j] * sr(l+1);
316	V }
317	V for (int j=0; j<NWAY; ++j) {
318	V toi[j] += dy0[j] * si(l) + y1[j] * tr(l+1);
319	V per[j] -= y0[j] * si(l) + dy1[j] * tr(l+1);
320	V }
321	V for (int j=0; j<NWAY; ++j) {
322	V por[j] -= dy0[j] * tr(l) + y1[j] * si(l+1);
323	V tei[j] += y0[j] * tr(l) + dy1[j] * si(l+1);
324	V }
325	#endif
326	for (int j=0; j<NWAY; ++j) {
327	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]st2[j]) + vall(al[0])dy0[j];
328	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
329	}
330	for (int j=0; j<NWAY; ++j) {
331	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]st2[j]) + vall(al[2])dy1[j];
332	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
333	}
334	l+=2; al+=4;
335	}
336	if (l==llim) {
337	Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
338	S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
339	S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
340	T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
341	T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
342	}
343	3 for (int j=0; j<NWAY; ++j) cost[j] = vread(st, k+j) * vall(m_1);
344	3 for (int j=0; j<NWAY; ++j) { rer[j] = cost[j]; ror[j] = cost[j]; rei[j] = cost[j]; roi[j] = cost[j]; }
345	}
346	#if _GCC_VEC_
347	for (int j=0; j<NWAY; ++j) {
348	Q S2D_CSTORE(BrF, k+j, rer[j], ror[j], rei[j], roi[j])
349	V S2D_CSTORE(BtF, k+j, ter[j], tor[j], tei[j], toi[j])
350	V S2D_CSTORE(BpF, k+j, per[j], por[j], pei[j], poi[j])
351	}
352	#else
353	for (int j=0; j<NWAY; ++j) {
354	Q BrF[k+j] = (rer[j]+ror[j]) + I*(rei[j]+roi[j]);
355	Q BrF[NLAT-k-1-j] = (rer[j]-ror[j]) + I*(rei[j]-roi[j]);
356	V BtF[k+j] = (ter[j]+tor[j]) + I*(tei[j]+toi[j]);
357	V BtF[NLAT-1-k-j] = (ter[j]-tor[j]) + I*(tei[j]-toi[j]);
358	V BpF[k+j] = (per[j]+por[j]) + I*(pei[j]+poi[j]);
359	V BpF[NLAT-1-k-j] = (per[j]-por[j]) + I*(pei[j]-poi[j]);
360	}
361	#endif
362	k+=NWAY;
363	} while (k < nk);
364	#if _GCC_VEC_
365	Q BrF += mstep*NLAT_2;
366	V BtF += mstepNLAT_2; BpF += mstepNLAT_2;
367	#else
368	Q BrF += mstep*NLAT;
369	V BtF += mstepNLAT; BpF += mstepNLAT;
370	#endif
371	}
372
373	#if _GCC_VEC_
374	while(im <= NPHI-imlim) { // padding for high m's
375	k=0;
376	do {
377	Q BrF[k] = vdup(0.0);
378	V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
379	} while (++k < NLAT_2);
380	Q BrF += mstep*NLAT_2;
381	V BtF += mstepNLAT_2; BpF += mstepNLAT_2;
382	im+=mstep;
383	}
384	#else
385	while(im <= NPHI/2) { // padding for high m's
386	k=0;
387	do {
388	Q BrF[k] = 0.0;
389	V BtF[k] = 0.0; BpF[k] = 0.0;
390	} while (++k < NLAT);
391	Q BrF += mstep*NLAT;
392	V BtF += mstepNLAT; BpF += mstepNLAT;
393	im+=mstep;
394	}
395	#endif
396	#endif
397	}
398
399	Q #undef BR0
400	V #undef BT0
401	V #undef BP0
402	Q #undef qr
403	Q #undef qi
404	S #undef sr
405	S #undef si
406	T #undef tr
407	T #undef ti
408
409	3 static void GEN3(SHqst_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx Qlm, cplx Slm, cplx Tlm, double Vr, double Vt, double Vp, long int llim) {
410	QX static void GEN3(SH_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx Qlm, double Vr, long int llim) {
411	#ifndef SHT_GRAD
412	VX static void GEN3(SHsphtor_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx Slm, cplx Tlm, double Vt, double Vp, long int llim) {
413	#else
414	S static void GEN3(SHsph_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx Slm, double Vt, double *Vp, long int llim) {
415	T static void GEN3(SHtor_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx Tlm, double Vt, double *Vp, long int llim) {
416	#endif
417
418	int k;
419	unsigned imlim = 0;
420	Q v2d* BrF = (v2d*) Vr;
421	V v2d* BtF = (v2d) Vt; v2d BpF = (v2d*) Vp;
422
423	#ifndef SHT_AXISYM
424	imlim = MTR;
425	#ifdef SHT_VAR_LTR
426	if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
427	#endif
428	#ifdef _GCC_VEC_
429	if (shtns->fftc_mode > 0) { // alloc memory for the FFT
430	unsigned long nv = shtns->nspat;
431	QX BrF = (v2d) VMALLOC( nv sizeof(double) );
432	VX BtF = (v2d) VMALLOC( 2nv * sizeof(double) );
433	VX BpF = BtF + nv/2;
434	3 BrF = (v2d) VMALLOC( 3nv * sizeof(double) );
435	3 BtF = BrF + nv/2; BpF = BrF + nv;
436	}
437	#else
438	if (shtns->ncplx_fft > 0) { // alloc memory for the FFT
439	QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(cplx) );
440	VX BtF = VMALLOC( 2* shtns->ncplx_fft * sizeof(cplx) );
441	VX BpF = BtF + shtns->ncplx_fft;
442	3 BrF = VMALLOC( 3* shtns->ncplx_fft * sizeof(cplx) );
443	3 BtF = BrF + shtns->ncplx_fft; BpF = BtF + shtns->ncplx_fft;
444	}
445	#endif
446	#endif
447	imlim += 1;
448
449	#pragma omp parallel num_threads(shtns->nthreads)
450	{
451	3 GEN3(_sy3,NWAY,SUFFIX)(shtns, Qlm, Slm, Tlm, BrF, BtF, BpF, llim, imlim);
452	QX GEN3(_sy1,NWAY,SUFFIX)(shtns, Qlm, BrF, llim, imlim);
453	#ifndef SHT_GRAD
454	VX GEN3(_sy2,NWAY,SUFFIX)(shtns, Slm, Tlm, BtF, BpF, llim, imlim);
455	#else
456	S GEN3(_sy1s,NWAY,SUFFIX)(shtns, Slm, BtF, BpF, llim, imlim);
457	T GEN3(_sy1t,NWAY,SUFFIX)(shtns, Tlm, BtF, BpF, llim, imlim);
458	#endif
459
460	#ifndef SHT_AXISYM
461	V #ifndef HAVE_LIBFFTW3_OMP
462	V #pragma omp barrier
463	V #if _GCC_VEC_
464	V if (shtns->fftc_mode == 0) {
465	3 #pragma omp single nowait
466	3 fftw_execute_dft(shtns->ifftc, (cplx ) BrF, (cplx ) Vr);
467	V #pragma omp single nowait
468	V fftw_execute_dft(shtns->ifftc, (cplx ) BtF, (cplx ) Vt);
469	V #pragma omp single nowait
470	V fftw_execute_dft(shtns->ifftc, (cplx ) BpF, (cplx ) Vp);
471	V } else if (shtns->fftc_mode > 0) { // split dft
472	3 #pragma omp single nowait
473	3 fftw_execute_split_dft(shtns->ifftc,((double)BrF)+1, ((double)BrF), Vr+NPHI, Vr);
474	V #pragma omp single nowait
475	V fftw_execute_split_dft(shtns->ifftc,((double)BtF)+1, ((double)BtF), Vt+NPHI, Vt);
476	V #pragma omp single nowait
477	V fftw_execute_split_dft(shtns->ifftc,((double)BpF)+1, ((double)BpF), Vp+NPHI, Vp);
478	V }
479	V #else
480	3 #pragma omp single nowait
481	3 fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr);
482	V #pragma omp single nowait
483	V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt);
484	V #pragma omp single nowait
485	V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp);
486	V #endif
487	V #endif
488	#endif
489
490	}
491
492	#ifndef SHT_AXISYM
493	// NPHI > 1 as SHT_AXISYM is not defined.
494	#if _GCC_VEC_
495	if (shtns->fftc_mode >= 0) {
496	if (shtns->fftc_mode == 0) {
497	V #ifdef HAVE_LIBFFTW3_OMP
498	Q fftw_execute_dft(shtns->ifftc, (cplx ) BrF, (cplx ) Vr);
499	V fftw_execute_dft(shtns->ifftc, (cplx ) BtF, (cplx ) Vt);
500	V fftw_execute_dft(shtns->ifftc, (cplx ) BpF, (cplx ) Vp);
501	V #endif
502	} else { // split dft
503	V #ifdef HAVE_LIBFFTW3_OMP
504	Q fftw_execute_split_dft(shtns->ifftc,((double)BrF)+1, ((double)BrF), Vr+NPHI, Vr);
505	V fftw_execute_split_dft(shtns->ifftc,((double)BtF)+1, ((double)BtF), Vt+NPHI, Vt);
506	V fftw_execute_split_dft(shtns->ifftc,((double)BpF)+1, ((double)BpF), Vp+NPHI, Vp);
507	V #endif
508	Q VFREE(BrF);
509	VX VFREE(BtF); // this frees also BpF.
510	}
511	}
512	#else
513	if (shtns->ncplx_fft >= 0) {
514	V #ifdef HAVE_LIBFFTW3_OMP
515	Q fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr);
516	V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt);
517	V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp);
518	V #endif
519	if (shtns->ncplx_fft > 0) { // free memory
520	Q VFREE(BrF);
521	VX VFREE(BtF); // this frees also BpF.
522	}
523	}
524	#endif
525	#endif
526
527	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/omp/shtns/SHT/omp_SH_to_spat.gen.c

Download in other formats: