source: CIVL/examples/omp/shtns/SHT/omp_SH_to_spat.gen.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 17.8 KB
Line 
1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18# This file is meta-code for SHT.c (spherical harmonic transform).
19# it is intended for "make" to generate C code for similar SHT functions,
20# from one generic function + tags.
21# > See Makefile and SHT.c
22# Basically, there are tags at the beginning of lines that are information
23# to keep or remove the line depending on the function to build.
24# tags :
25# Q : line for scalar transform
26# V : line for vector transform (both spheroidal and toroidal)
27# S : line for vector transfrom, spheroidal component
28# T : line for vector transform, toroidal component.
29
30 static
313 void GEN3(_sy3,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, v2d *BrF, v2d *BtF, v2d *BpF, const long int llim, const int imlim) {
32QX void GEN3(_sy1,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, v2d *BrF, const long int llim, const int imlim) {
33 #ifndef SHT_GRAD
34VX void GEN3(_sy2,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, v2d *BtF, v2d *BpF, const long int llim, const int imlim) {
35 #else
36S void GEN3(_sy1s,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, v2d *BtF, v2d *BpF, const long int llim, const int imlim) {
37T void GEN3(_sy1t,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Tlm, v2d *BtF, v2d *BpF, const long int llim, const int imlim) {
38 #endif
39
40 #ifndef SHT_AXISYM
41Q #define BR0(i) ((double *)BrF)[2*(i)]
42V #define BT0(i) ((double *)BtF)[2*(i)]
43V #define BP0(i) ((double *)BpF)[2*(i)]
44Q #define qr(l) vall(creal(Ql[l]))
45Q #define qi(l) vall(cimag(Ql[l]))
46S #define sr(l) vall(creal(Sl[l]))
47S #define si(l) vall(cimag(Sl[l]))
48T #define tr(l) vall(creal(Tl[l]))
49T #define ti(l) vall(cimag(Tl[l]))
50V double m_1;
51 unsigned im;
52 #else
53Q #define BR0(i) ((double *)BrF)[i]
54S #define BT0(i) ((double *)BtF)[i]
55T #define BP0(i) ((double *)BpF)[i]
56 #endif
57 unsigned m0, mstep;
58 long int nk,k,l,m;
59 double *alm, *al;
60 double *ct, *st;
61Q double Ql0[llim+1];
62S double Sl0[llim];
63T double Tl0[llim];
64
65 ct = shtns->ct; st = shtns->st;
66 nk = NLAT_2;
67 #if _GCC_VEC_
68 nk = ((unsigned)(nk+VSIZE2-1)) / VSIZE2;
69 #endif
70
71 #ifndef _OPENMP
72 m0 = 0; mstep = 1;
73 #else
74 m0 = omp_get_thread_num();
75 mstep = omp_get_num_threads();
76 if (m0 == 0)
77 #endif
78 { // im=0;
79 #ifdef SHT_GRAD
80 #ifndef SHT_AXISYM
81 #ifdef _GCC_VEC_
82S k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT_2);
83T k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT_2);
84 #else
85S k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT);
86T k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT);
87 #endif
88 #else
89S if (BpF != NULL) { int k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT_2); }
90T if (BtF != NULL) { int k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT_2); }
91 #endif
92 #endif
93 l=1;
94 alm = shtns->alm;
95Q Ql0[0] = (double) Qlm[0]; // l=0
96 do { // for m=0, compress the complex Q,S,T to double
97Q Ql0[l] = (double) Qlm[l]; // Ql[l+1] = (double) Qlm[l+1];
98S Sl0[l-1] = (double) Slm[l]; // Sl[l] = (double) Slm[l+1];
99T Tl0[l-1] = (double) Tlm[l]; // Tl[l] = (double) Tlm[l+1];
100 ++l;
101 } while(l<=llim);
102 k=0;
103 do {
104 l=0; al = alm;
105 rnd cost[NWAY], y0[NWAY], y1[NWAY];
106V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
107Q rnd re[NWAY], ro[NWAY];
108S rnd te[NWAY], to[NWAY];
109T rnd pe[NWAY], po[NWAY];
110 for (int j=0; j<NWAY; ++j) {
111 cost[j] = vread(ct, j+k);
112V sint[j] = -vread(st, j+k);
113 y0[j] = vall(al[0]);
114V dy0[j] = vall(0.0);
115Q re[j] = y0[j] * vall(Ql0[0]);
116S to[j] = dy0[j];
117T po[j] = dy0[j];
118 }
119 for (int j=0; j<NWAY; ++j) {
120 y1[j] = vall(al[0]*al[1]) * cost[j];
121V dy1[j] = vall(al[0]*al[1]) * sint[j];
122 }
123 for (int j=0; j<NWAY; ++j) {
124Q ro[j] = y1[j] * vall(Ql0[1]);
125S te[j] = dy1[j] * vall(Sl0[0]);
126T pe[j] = -dy1[j] * vall(Tl0[0]);
127 }
128 al+=2; l+=2;
129 while(l<llim) {
130 for (int j=0; j<NWAY; ++j) {
131V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
132 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
133 }
134 for (int j=0; j<NWAY; ++j) {
135Q re[j] += y0[j] * vall(Ql0[l]);
136S to[j] += dy0[j] * vall(Sl0[l-1]);
137T po[j] -= dy0[j] * vall(Tl0[l-1]);
138 }
139 for (int j=0; j<NWAY; ++j) {
140V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
141 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
142 }
143 for (int j=0; j<NWAY; ++j) {
144Q ro[j] += y1[j] * vall(Ql0[l+1]);
145S te[j] += dy1[j] * vall(Sl0[l]);
146T pe[j] -= dy1[j] * vall(Tl0[l]);
147 }
148 al+=4; l+=2;
149 }
150 if (l==llim) {
151 for (int j=0; j<NWAY; ++j) {
152V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
153 y0[j] = vall(al[1])*cost[j]*y1[j] + vall(al[0])*y0[j];
154 }
155 for (int j=0; j<NWAY; ++j) {
156Q re[j] += y0[j] * vall(Ql0[l]);
157S to[j] += dy0[j] * vall(Sl0[l-1]);
158T po[j] -= dy0[j] * vall(Tl0[l-1]);
159 }
160 }
161 #if _GCC_VEC_
162 for (int j=0; j<NWAY; ++j) {
163Q S2D_STORE(BrF, j+k, re[j], ro[j])
164S S2D_STORE(BtF, j+k, te[j], to[j])
165T S2D_STORE(BpF, j+k, pe[j], po[j])
166 }
167 #else
168 for (int j=0; j<NWAY; ++j) {
169Q BR0(k+j) = (re[j]+ro[j]);
170Q BR0(NLAT-k-1-j) = (re[j]-ro[j]);
171S BT0(k+j) = (te[j]+to[j]);
172S BT0(NLAT-k-1-j) = (te[j]-to[j]);
173T BP0(k+j) = (pe[j]+po[j]);
174T BP0(NLAT-k-1-j) = (pe[j]-po[j]);
175 }
176 #endif
177 k+=NWAY;
178 } while (k < nk);
179 m0=mstep;
180 }
181
182 #ifndef SHT_AXISYM
183 #if _GCC_VEC_
184Q BrF += m0*NLAT_2;
185V BtF += m0*NLAT_2; BpF += m0*NLAT_2;
186 #else
187Q BrF += m0*NLAT;
188V BtF += m0*NLAT; BpF += m0*NLAT;
189 #endif
190 for (im=m0; im<imlim; im+=mstep) {
191 m = im*MRES;
192 //l = LiM(shtns, 0,im);
193 l = (im*(2*(LMAX+1)-(m+MRES)))>>1;
194V m_1 = 1.0/m;
195 //alm = shtns->alm[im];
196 alm = shtns->alm + im*(2*LMAX -m+MRES);
197Q cplx* Ql = &Qlm[l]; // virtual pointer for l=0 and im
198S cplx* Sl = &Slm[l]; // virtual pointer for l=0 and im
199T cplx* Tl = &Tlm[l];
200 k=0; l=shtns->tm[im];
201 #if _GCC_VEC_
202 l>>=1; // stay on a 16 byte boundary
203 while (k<l) { // polar optimization
204Q BrF[k] = vdup(0.0); BrF[(NPHI-2*im)*NLAT_2 + k] = vdup(0.0);
205Q BrF[NLAT_2-l+k] = vdup(0.0); BrF[(NPHI+1-2*im)*NLAT_2 -l+k] = vdup(0.0);
206V BtF[k] = vdup(0.0); BtF[(NPHI-2*im)*NLAT_2 + k] = vdup(0.0);
207V BtF[NLAT_2-l+k] = vdup(0.0); BtF[(NPHI+1-2*im)*NLAT_2 -l+k] = vdup(0.0);
208V BpF[k] = vdup(0.0); BpF[(NPHI-2*im)*NLAT_2 + k] = vdup(0.0);
209V BpF[NLAT_2-l+k] = vdup(0.0); BpF[(NPHI+1-2*im)*NLAT_2 -l+k] = vdup(0.0);
210 ++k;
211 }
212 k = ((unsigned) k) / (VSIZE2/2);
213 #else
214 while (k<l) { // polar optimization
215Q BrF[k] = 0.0; BrF[NLAT-l+k] = 0.0;
216V BtF[k] = 0.0; BtF[NLAT-l+k] = 0.0;
217V BpF[k] = 0.0; BpF[NLAT-l+k] = 0.0;
218 ++k;
219 }
220 #endif
221 do {
222 al = alm;
223 rnd cost[NWAY], y0[NWAY], y1[NWAY];
224V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
225Q rnd rer[NWAY], rei[NWAY], ror[NWAY], roi[NWAY];
226V rnd ter[NWAY], tei[NWAY], tor[NWAY], toi[NWAY];
227V rnd per[NWAY], pei[NWAY], por[NWAY], poi[NWAY];
228 for (int j=0; j<NWAY; ++j) {
229 cost[j] = vread(st, k+j);
230 y0[j] = vall(1.0);
231V st2[j] = cost[j]*cost[j]*vall(-m_1);
232V y0[j] = vall(m); // for the vector transform, compute ylm*m/sint
233 }
234Q l=m;
235V l=m-1;
236 long int ny = 0;
237 if ((int)llim <= SHT_L_RESCALE_FLY) {
238 do { // sin(theta)^m
239 if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
240 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
241 } while(l >>= 1);
242 } else {
243 long int nsint = 0;
244 do { // sin(theta)^m (use rescaling to avoid underflow)
245 if (l&1) {
246 for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
247 ny += nsint;
248 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
249 ny--;
250 for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
251 }
252 }
253 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
254 nsint += nsint;
255 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
256 nsint--;
257 for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
258 }
259 } while(l >>= 1);
260 }
261 for (int j=0; j<NWAY; ++j) {
262 y0[j] *= vall(al[0]);
263 cost[j] = vread(ct, j+k);
264V dy0[j] = cost[j]*y0[j];
265Q ror[j] = vall(0.0); roi[j] = vall(0.0);
266Q rer[j] = vall(0.0); rei[j] = vall(0.0);
267 }
268 for (int j=0; j<NWAY; ++j) {
269 y1[j] = (vall(al[1])*y0[j]) *cost[j]; // y1[j] = vall(al[1])*cost[j]*y0[j];
270V por[j] = vall(0.0); tei[j] = vall(0.0);
271V tor[j] = vall(0.0); pei[j] = vall(0.0);
272V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]); // dy1[j] = vall(al[1])*(cost[j]*dy0[j] - y0[j]*st2[j]);
273V poi[j] = vall(0.0); ter[j] = vall(0.0);
274V toi[j] = vall(0.0); per[j] = vall(0.0);
275 }
276 l=m; al+=2;
277 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
278 for (int j=0; j<NWAY; ++j) {
279 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
280V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
281 }
282 for (int j=0; j<NWAY; ++j) {
283 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
284V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
285 }
286 l+=2; al+=4;
287 if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
288 ++ny;
289 for (int j=0; j<NWAY; ++j) {
290 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
291V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
292 }
293 }
294 }
295 if (ny == 0) {
296 while (l<llim) { // compute even and odd parts
297Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
298Q for (int j=0; j<NWAY; ++j) { ror[j] += y1[j] * qr(l+1); roi[j] += y1[j] * qi(l+1); }
299 #ifdef SHT_GRAD
300S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
301S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
302T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
303T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
304S for (int j=0; j<NWAY; ++j) { ter[j] += dy1[j] * sr(l+1); poi[j] += y1[j] * sr(l+1); }
305S for (int j=0; j<NWAY; ++j) { tei[j] += dy1[j] * si(l+1); por[j] -= y1[j] * si(l+1); }
306T for (int j=0; j<NWAY; ++j) { per[j] -= dy1[j] * tr(l+1); toi[j] += y1[j] * tr(l+1); }
307T for (int j=0; j<NWAY; ++j) { pei[j] -= dy1[j] * ti(l+1); tor[j] -= y1[j] * ti(l+1); }
308 #else
309V for (int j=0; j<NWAY; ++j) {
310V tor[j] += dy0[j] * sr(l) - y1[j] * ti(l+1);
311V pei[j] += y0[j] * sr(l) - dy1[j] * ti(l+1);
312V }
313V for (int j=0; j<NWAY; ++j) {
314V poi[j] -= dy0[j] * ti(l) - y1[j] * sr(l+1);
315V ter[j] -= y0[j] * ti(l) - dy1[j] * sr(l+1);
316V }
317V for (int j=0; j<NWAY; ++j) {
318V toi[j] += dy0[j] * si(l) + y1[j] * tr(l+1);
319V per[j] -= y0[j] * si(l) + dy1[j] * tr(l+1);
320V }
321V for (int j=0; j<NWAY; ++j) {
322V por[j] -= dy0[j] * tr(l) + y1[j] * si(l+1);
323V tei[j] += y0[j] * tr(l) + dy1[j] * si(l+1);
324V }
325 #endif
326 for (int j=0; j<NWAY; ++j) {
327V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
328 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
329 }
330 for (int j=0; j<NWAY; ++j) {
331V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
332 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
333 }
334 l+=2; al+=4;
335 }
336 if (l==llim) {
337Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
338S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
339S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
340T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
341T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
342 }
3433 for (int j=0; j<NWAY; ++j) cost[j] = vread(st, k+j) * vall(m_1);
3443 for (int j=0; j<NWAY; ++j) { rer[j] *= cost[j]; ror[j] *= cost[j]; rei[j] *= cost[j]; roi[j] *= cost[j]; }
345 }
346 #if _GCC_VEC_
347 for (int j=0; j<NWAY; ++j) {
348Q S2D_CSTORE(BrF, k+j, rer[j], ror[j], rei[j], roi[j])
349V S2D_CSTORE(BtF, k+j, ter[j], tor[j], tei[j], toi[j])
350V S2D_CSTORE(BpF, k+j, per[j], por[j], pei[j], poi[j])
351 }
352 #else
353 for (int j=0; j<NWAY; ++j) {
354Q BrF[k+j] = (rer[j]+ror[j]) + I*(rei[j]+roi[j]);
355Q BrF[NLAT-k-1-j] = (rer[j]-ror[j]) + I*(rei[j]-roi[j]);
356V BtF[k+j] = (ter[j]+tor[j]) + I*(tei[j]+toi[j]);
357V BtF[NLAT-1-k-j] = (ter[j]-tor[j]) + I*(tei[j]-toi[j]);
358V BpF[k+j] = (per[j]+por[j]) + I*(pei[j]+poi[j]);
359V BpF[NLAT-1-k-j] = (per[j]-por[j]) + I*(pei[j]-poi[j]);
360 }
361 #endif
362 k+=NWAY;
363 } while (k < nk);
364 #if _GCC_VEC_
365Q BrF += mstep*NLAT_2;
366V BtF += mstep*NLAT_2; BpF += mstep*NLAT_2;
367 #else
368Q BrF += mstep*NLAT;
369V BtF += mstep*NLAT; BpF += mstep*NLAT;
370 #endif
371 }
372
373 #if _GCC_VEC_
374 while(im <= NPHI-imlim) { // padding for high m's
375 k=0;
376 do {
377Q BrF[k] = vdup(0.0);
378V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
379 } while (++k < NLAT_2);
380Q BrF += mstep*NLAT_2;
381V BtF += mstep*NLAT_2; BpF += mstep*NLAT_2;
382 im+=mstep;
383 }
384 #else
385 while(im <= NPHI/2) { // padding for high m's
386 k=0;
387 do {
388Q BrF[k] = 0.0;
389V BtF[k] = 0.0; BpF[k] = 0.0;
390 } while (++k < NLAT);
391Q BrF += mstep*NLAT;
392V BtF += mstep*NLAT; BpF += mstep*NLAT;
393 im+=mstep;
394 }
395 #endif
396 #endif
397}
398
399Q #undef BR0
400V #undef BT0
401V #undef BP0
402Q #undef qr
403Q #undef qi
404S #undef sr
405S #undef si
406T #undef tr
407T #undef ti
408
4093 static void GEN3(SHqst_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, double *Vr, double *Vt, double *Vp, long int llim) {
410QX static void GEN3(SH_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, double *Vr, long int llim) {
411 #ifndef SHT_GRAD
412VX static void GEN3(SHsphtor_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, double *Vt, double *Vp, long int llim) {
413 #else
414S static void GEN3(SHsph_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, double *Vt, double *Vp, long int llim) {
415T static void GEN3(SHtor_to_spat_omp,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Tlm, double *Vt, double *Vp, long int llim) {
416 #endif
417
418 int k;
419 unsigned imlim = 0;
420Q v2d* BrF = (v2d*) Vr;
421V v2d* BtF = (v2d*) Vt; v2d* BpF = (v2d*) Vp;
422
423 #ifndef SHT_AXISYM
424 imlim = MTR;
425 #ifdef SHT_VAR_LTR
426 if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
427 #endif
428 #ifdef _GCC_VEC_
429 if (shtns->fftc_mode > 0) { // alloc memory for the FFT
430 unsigned long nv = shtns->nspat;
431QX BrF = (v2d*) VMALLOC( nv * sizeof(double) );
432VX BtF = (v2d*) VMALLOC( 2*nv * sizeof(double) );
433VX BpF = BtF + nv/2;
4343 BrF = (v2d*) VMALLOC( 3*nv * sizeof(double) );
4353 BtF = BrF + nv/2; BpF = BrF + nv;
436 }
437 #else
438 if (shtns->ncplx_fft > 0) { // alloc memory for the FFT
439QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(cplx) );
440VX BtF = VMALLOC( 2* shtns->ncplx_fft * sizeof(cplx) );
441VX BpF = BtF + shtns->ncplx_fft;
4423 BrF = VMALLOC( 3* shtns->ncplx_fft * sizeof(cplx) );
4433 BtF = BrF + shtns->ncplx_fft; BpF = BtF + shtns->ncplx_fft;
444 }
445 #endif
446 #endif
447 imlim += 1;
448
449 #pragma omp parallel num_threads(shtns->nthreads)
450 {
4513 GEN3(_sy3,NWAY,SUFFIX)(shtns, Qlm, Slm, Tlm, BrF, BtF, BpF, llim, imlim);
452QX GEN3(_sy1,NWAY,SUFFIX)(shtns, Qlm, BrF, llim, imlim);
453 #ifndef SHT_GRAD
454VX GEN3(_sy2,NWAY,SUFFIX)(shtns, Slm, Tlm, BtF, BpF, llim, imlim);
455 #else
456S GEN3(_sy1s,NWAY,SUFFIX)(shtns, Slm, BtF, BpF, llim, imlim);
457T GEN3(_sy1t,NWAY,SUFFIX)(shtns, Tlm, BtF, BpF, llim, imlim);
458 #endif
459
460 #ifndef SHT_AXISYM
461V #ifndef HAVE_LIBFFTW3_OMP
462V #pragma omp barrier
463V #if _GCC_VEC_
464V if (shtns->fftc_mode == 0) {
4653 #pragma omp single nowait
4663 fftw_execute_dft(shtns->ifftc, (cplx *) BrF, (cplx *) Vr);
467V #pragma omp single nowait
468V fftw_execute_dft(shtns->ifftc, (cplx *) BtF, (cplx *) Vt);
469V #pragma omp single nowait
470V fftw_execute_dft(shtns->ifftc, (cplx *) BpF, (cplx *) Vp);
471V } else if (shtns->fftc_mode > 0) { // split dft
4723 #pragma omp single nowait
4733 fftw_execute_split_dft(shtns->ifftc,((double*)BrF)+1, ((double*)BrF), Vr+NPHI, Vr);
474V #pragma omp single nowait
475V fftw_execute_split_dft(shtns->ifftc,((double*)BtF)+1, ((double*)BtF), Vt+NPHI, Vt);
476V #pragma omp single nowait
477V fftw_execute_split_dft(shtns->ifftc,((double*)BpF)+1, ((double*)BpF), Vp+NPHI, Vp);
478V }
479V #else
4803 #pragma omp single nowait
4813 fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr);
482V #pragma omp single nowait
483V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt);
484V #pragma omp single nowait
485V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp);
486V #endif
487V #endif
488 #endif
489
490 }
491
492 #ifndef SHT_AXISYM
493 // NPHI > 1 as SHT_AXISYM is not defined.
494 #if _GCC_VEC_
495 if (shtns->fftc_mode >= 0) {
496 if (shtns->fftc_mode == 0) {
497V #ifdef HAVE_LIBFFTW3_OMP
498Q fftw_execute_dft(shtns->ifftc, (cplx *) BrF, (cplx *) Vr);
499V fftw_execute_dft(shtns->ifftc, (cplx *) BtF, (cplx *) Vt);
500V fftw_execute_dft(shtns->ifftc, (cplx *) BpF, (cplx *) Vp);
501V #endif
502 } else { // split dft
503V #ifdef HAVE_LIBFFTW3_OMP
504Q fftw_execute_split_dft(shtns->ifftc,((double*)BrF)+1, ((double*)BrF), Vr+NPHI, Vr);
505V fftw_execute_split_dft(shtns->ifftc,((double*)BtF)+1, ((double*)BtF), Vt+NPHI, Vt);
506V fftw_execute_split_dft(shtns->ifftc,((double*)BpF)+1, ((double*)BpF), Vp+NPHI, Vp);
507V #endif
508Q VFREE(BrF);
509VX VFREE(BtF); // this frees also BpF.
510 }
511 }
512 #else
513 if (shtns->ncplx_fft >= 0) {
514V #ifdef HAVE_LIBFFTW3_OMP
515Q fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr);
516V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt);
517V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp);
518V #endif
519 if (shtns->ncplx_fft > 0) { // free memory
520Q VFREE(BrF);
521VX VFREE(BtF); // this frees also BpF.
522 }
523 }
524 #endif
525 #endif
526
527 }
Note: See TracBrowser for help on using the repository browser.