source: CIVL/examples/omp/shtns/SHT/fly_SH_to_spat.gen.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 25.8 KB
Line 
1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18# This file is meta-code for SHT.c (spherical harmonic transform).
19# it is intended for "make" to generate C code for similar SHT functions,
20# from one generic function + tags.
21# > See Makefile and SHT.c
22# Basically, there are tags at the beginning of lines that are information
23# to keep or remove the line depending on the function to build.
24# tags :
25# Q : line for scalar transform
26# V : line for vector transform (both spheroidal and toroidal)
27# S : line for vector transfrom, spheroidal component
28# T : line for vector transform, toroidal component.
29
303 static void GEN3(SHqst_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, double *Vr, double *Vt, double *Vp, long int llim) {
31QX static void GEN3(SH_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, double *Vr, long int llim) {
32 #ifndef SHT_GRAD
33VX static void GEN3(SHsphtor_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, double *Vt, double *Vp, long int llim) {
34 #else
35S static void GEN3(SHsph_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, double *Vt, double *Vp, long int llim) {
36T static void GEN3(SHtor_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Tlm, double *Vt, double *Vp, long int llim) {
37 #endif
38
39Q v2d *BrF;
40 #ifndef SHT_AXISYM
41V v2d *BtF, *BpF;
42Q #define BR0(i) ((double *)BrF)[2*(i)]
43V #define BT0(i) ((double *)BtF)[2*(i)]
44V #define BP0(i) ((double *)BpF)[2*(i)]
45Q #define qr(l) vall(creal(Ql[l]))
46Q #define qi(l) vall(cimag(Ql[l]))
47S #define sr(l) vall(creal(Sl[l]))
48S #define si(l) vall(cimag(Sl[l]))
49T #define tr(l) vall(creal(Tl[l]))
50T #define ti(l) vall(cimag(Tl[l]))
51V double m_1;
52 unsigned im, imlim;
53 #else
54S v2d *BtF;
55T v2d *BpF;
56Q #define BR0(i) ((double *)BrF)[i]
57S #define BT0(i) ((double *)BtF)[i]
58T #define BP0(i) ((double *)BpF)[i]
59 #endif
60 long int nk, k,l,m;
61 double *alm, *al;
62 s2d *ct, *st;
63Q double Ql0[llim+1];
64S double Sl0[llim];
65T double Tl0[llim];
66
67 #ifndef SHT_AXISYM
68Q BrF = (v2d*) Vr;
69V BtF = (v2d*) Vt; BpF = (v2d*) Vp;
70 #ifdef _GCC_VEC_
71 if (shtns->fftc_mode > 0) { // alloc memory for the FFT
72 unsigned long nv = shtns->nspat;
73QX BrF = (v2d*) VMALLOC( nv * sizeof(double) );
74VX BtF = (v2d*) VMALLOC( 2*nv * sizeof(double) );
75VX BpF = BtF + nv/2;
763 BrF = (v2d*) VMALLOC( 3*nv * sizeof(double) );
773 BtF = BrF + nv/2; BpF = BrF + nv;
78 }
79 #ifdef SHT_GRAD
80S k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT_2);
81T k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT_2);
82 #endif
83 #else
84 if (shtns->ncplx_fft > 0) { // alloc memory for the FFT
85QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(cplx) );
86VX BtF = VMALLOC( 2* shtns->ncplx_fft * sizeof(cplx) );
87VX BpF = BtF + shtns->ncplx_fft;
883 BrF = VMALLOC( 3* shtns->ncplx_fft * sizeof(cplx) );
893 BtF = BrF + shtns->ncplx_fft; BpF = BtF + shtns->ncplx_fft;
90 }
91 #ifdef SHT_GRAD
92S k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT);
93T k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT);
94 #endif
95 #endif
96 imlim = MTR;
97 #ifdef SHT_VAR_LTR
98 if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
99 #endif
100 #else
101 #ifdef SHT_GRAD
102S if (Vp != NULL) { k=0; do { ((v2d*)Vp)[k]=vdup(0.0); } while(++k<NLAT_2); }
103T if (Vt != NULL) { k=0; do { ((v2d*)Vt)[k]=vdup(0.0); } while(++k<NLAT_2); }
104 #endif
105Q BrF = (v2d*) Vr;
106S BtF = (v2d*) Vt;
107T BpF = (v2d*) Vp;
108 #endif
109
110 ct = (s2d*) shtns->ct; st = (s2d*) shtns->st;
111 // im=0;
112 l=1;
113 alm = shtns->alm;
114Q Ql0[0] = (double) Qlm[0]; // l=0
115 do { // for m=0, compress the complex Q,S,T to double
116Q Ql0[l] = (double) Qlm[l]; // Ql[l+1] = (double) Qlm[l+1];
117S Sl0[l-1] = (double) Slm[l]; // Sl[l] = (double) Slm[l+1];
118T Tl0[l-1] = (double) Tlm[l]; // Tl[l] = (double) Tlm[l+1];
119 ++l;
120 } while(l<=llim);
121 k=0; nk = NLAT_2;
122 #if _GCC_VEC_
123 nk = ((unsigned)(nk+VSIZE2-1)) / VSIZE2;
124 #endif
125 do {
126 l=0; al = alm;
127 rnd cost[NWAY], y0[NWAY], y1[NWAY];
128V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
129Q rnd re[NWAY], ro[NWAY];
130S rnd te[NWAY], to[NWAY];
131T rnd pe[NWAY], po[NWAY];
132 for (int j=0; j<NWAY; ++j) {
133 cost[j] = vread(ct, j+k);
134V sint[j] = -vread(st, j+k);
135 y0[j] = vall(al[0]);
136V dy0[j] = vall(0.0);
137Q re[j] = y0[j] * vall(Ql0[0]);
138S to[j] = dy0[j];
139T po[j] = dy0[j];
140 }
141 for (int j=0; j<NWAY; ++j) {
142 y1[j] = vall(al[0]*al[1]) * cost[j];
143V dy1[j] = vall(al[0]*al[1]) * sint[j];
144 }
145 for (int j=0; j<NWAY; ++j) {
146Q ro[j] = y1[j] * vall(Ql0[1]);
147S te[j] = dy1[j] * vall(Sl0[0]);
148T pe[j] = -dy1[j] * vall(Tl0[0]);
149 }
150 al+=2; l+=2;
151 while(l<llim) {
152 for (int j=0; j<NWAY; ++j) {
153V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
154 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
155 }
156 for (int j=0; j<NWAY; ++j) {
157Q re[j] += y0[j] * vall(Ql0[l]);
158S to[j] += dy0[j] * vall(Sl0[l-1]);
159T po[j] -= dy0[j] * vall(Tl0[l-1]);
160 }
161 for (int j=0; j<NWAY; ++j) {
162V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
163 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
164 }
165 for (int j=0; j<NWAY; ++j) {
166Q ro[j] += y1[j] * vall(Ql0[l+1]);
167S te[j] += dy1[j] * vall(Sl0[l]);
168T pe[j] -= dy1[j] * vall(Tl0[l]);
169 }
170 al+=4; l+=2;
171 }
172 if (l==llim) {
173 for (int j=0; j<NWAY; ++j) {
174V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
175 y0[j] = vall(al[1])*cost[j]*y1[j] + vall(al[0])*y0[j];
176 }
177 for (int j=0; j<NWAY; ++j) {
178Q re[j] += y0[j] * vall(Ql0[l]);
179S to[j] += dy0[j] * vall(Sl0[l-1]);
180T po[j] -= dy0[j] * vall(Tl0[l-1]);
181 }
182 }
183 #if _GCC_VEC_
184 for (int j=0; j<NWAY; ++j) {
185Q S2D_STORE(BrF, j+k, re[j], ro[j])
186S S2D_STORE(BtF, j+k, te[j], to[j])
187T S2D_STORE(BpF, j+k, pe[j], po[j])
188 }
189 #else
190 for (int j=0; j<NWAY; ++j) {
191Q BR0(k+j) = (re[j]+ro[j]);
192Q BR0(NLAT-k-1-j) = (re[j]-ro[j]);
193S BT0(k+j) = (te[j]+to[j]);
194S BT0(NLAT-k-1-j) = (te[j]-to[j]);
195T BP0(k+j) = (pe[j]+po[j]);
196T BP0(NLAT-k-1-j) = (pe[j]-po[j]);
197 }
198 #endif
199 k+=NWAY;
200 } while (k < nk);
201
202 #ifndef SHT_AXISYM
203 #if _GCC_VEC_
204Q BrF += NLAT_2;
205V BtF += NLAT_2; BpF += NLAT_2;
206 #else
207Q BrF += NLAT;
208V BtF += NLAT; BpF += NLAT;
209 #endif
210 for(im=1; im<=imlim; ++im) {
211 m = im*MRES;
212 //l = LiM(shtns, 0,im);
213 l = (im*(2*(LMAX+1)-(m+MRES)))>>1;
214V m_1 = 1.0/m;
215 //alm = shtns->alm[im];
216 //alm = shtns->alm[0] + im*(2*LMAX - (im-1)*MRES); // for m > 0
217 alm += 2*(LMAX-m+MRES);
218Q cplx* Ql = &Qlm[l]; // virtual pointer for l=0 and im
219S cplx* Sl = &Slm[l]; // virtual pointer for l=0 and im
220T cplx* Tl = &Tlm[l];
221 k=0; l=shtns->tm[im];
222 #if _GCC_VEC_
223 l>>=1; // stay on a 16 byte boundary
224 while (k<l) { // polar optimization
225Q BrF[k] = vdup(0.0); BrF[(NPHI-2*im)*NLAT_2 + k] = vdup(0.0);
226Q BrF[NLAT_2-l+k] = vdup(0.0); BrF[(NPHI+1-2*im)*NLAT_2 -l+k] = vdup(0.0);
227V BtF[k] = vdup(0.0); BtF[(NPHI-2*im)*NLAT_2 + k] = vdup(0.0);
228V BtF[NLAT_2-l+k] = vdup(0.0); BtF[(NPHI+1-2*im)*NLAT_2 -l+k] = vdup(0.0);
229V BpF[k] = vdup(0.0); BpF[(NPHI-2*im)*NLAT_2 + k] = vdup(0.0);
230V BpF[NLAT_2-l+k] = vdup(0.0); BpF[(NPHI+1-2*im)*NLAT_2 -l+k] = vdup(0.0);
231 ++k;
232 }
233 k = ((unsigned) k) / (VSIZE2/2);
234 #else
235 while (k<l) { // polar optimization
236Q BrF[k] = 0.0; BrF[NLAT-l+k] = 0.0;
237V BtF[k] = 0.0; BtF[NLAT-l+k] = 0.0;
238V BpF[k] = 0.0; BpF[NLAT-l+k] = 0.0;
239 ++k;
240 }
241 #endif
242 do {
243 al = alm;
244 rnd cost[NWAY], y0[NWAY], y1[NWAY];
245V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
246Q rnd rer[NWAY], rei[NWAY], ror[NWAY], roi[NWAY];
247V rnd ter[NWAY], tei[NWAY], tor[NWAY], toi[NWAY];
248V rnd per[NWAY], pei[NWAY], por[NWAY], poi[NWAY];
249 for (int j=0; j<NWAY; ++j) {
250 cost[j] = vread(st, k+j);
251QX y0[j] = vall(1.0);
252V st2[j] = cost[j]*cost[j]*vall(-m_1);
253V y0[j] = vall(m); // for the vector transform, compute ylm*m/sint
254 }
255Q l=m;
256V l=m-1;
257 long int ny = 0;
258 if ((int)llim <= SHT_L_RESCALE_FLY) {
259 do { // sin(theta)^m
260 if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
261 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
262 } while(l >>= 1);
263 } else {
264 long int nsint = 0;
265 do { // sin(theta)^m (use rescaling to avoid underflow)
266 if (l&1) {
267 for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
268 ny += nsint;
269 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
270 ny--;
271 for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
272 }
273 }
274 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
275 nsint += nsint;
276 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
277 nsint--;
278 for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
279 }
280 } while(l >>= 1);
281 }
282 for (int j=0; j<NWAY; ++j) {
283 y0[j] *= vall(al[0]);
284 cost[j] = vread(ct, j+k);
285V dy0[j] = cost[j]*y0[j];
286Q ror[j] = vall(0.0); roi[j] = vall(0.0);
287Q rer[j] = vall(0.0); rei[j] = vall(0.0);
288 }
289 for (int j=0; j<NWAY; ++j) {
290 y1[j] = (vall(al[1])*y0[j]) *cost[j]; // y1[j] = vall(al[1])*cost[j]*y0[j];
291V por[j] = vall(0.0); tei[j] = vall(0.0);
292V tor[j] = vall(0.0); pei[j] = vall(0.0);
293V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]); // dy1[j] = vall(al[1])*(cost[j]*dy0[j] - y0[j]*st2[j]);
294V poi[j] = vall(0.0); ter[j] = vall(0.0);
295V toi[j] = vall(0.0); per[j] = vall(0.0);
296 }
297 l=m; al+=2;
298 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
299 for (int j=0; j<NWAY; ++j) {
300 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
301V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
302 }
303 for (int j=0; j<NWAY; ++j) {
304 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
305V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
306 }
307 l+=2; al+=4;
308 if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
309 ++ny;
310 for (int j=0; j<NWAY; ++j) {
311 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
312V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
313 }
314 }
315 }
316 if (ny == 0) {
317 while (l<llim) { // compute even and odd parts
318Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
319Q for (int j=0; j<NWAY; ++j) { ror[j] += y1[j] * qr(l+1); roi[j] += y1[j] * qi(l+1); }
320 #ifdef SHT_GRAD
321S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
322S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
323T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
324T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
325S for (int j=0; j<NWAY; ++j) { ter[j] += dy1[j] * sr(l+1); poi[j] += y1[j] * sr(l+1); }
326S for (int j=0; j<NWAY; ++j) { tei[j] += dy1[j] * si(l+1); por[j] -= y1[j] * si(l+1); }
327T for (int j=0; j<NWAY; ++j) { per[j] -= dy1[j] * tr(l+1); toi[j] += y1[j] * tr(l+1); }
328T for (int j=0; j<NWAY; ++j) { pei[j] -= dy1[j] * ti(l+1); tor[j] -= y1[j] * ti(l+1); }
329 #else
330V for (int j=0; j<NWAY; ++j) {
331V tor[j] += dy0[j] * sr(l) - y1[j] * ti(l+1);
332V pei[j] += y0[j] * sr(l) - dy1[j] * ti(l+1);
333V }
334V for (int j=0; j<NWAY; ++j) {
335V poi[j] -= dy0[j] * ti(l) - y1[j] * sr(l+1);
336V ter[j] -= y0[j] * ti(l) - dy1[j] * sr(l+1);
337V }
338V for (int j=0; j<NWAY; ++j) {
339V toi[j] += dy0[j] * si(l) + y1[j] * tr(l+1);
340V per[j] -= y0[j] * si(l) + dy1[j] * tr(l+1);
341V }
342V for (int j=0; j<NWAY; ++j) {
343V por[j] -= dy0[j] * tr(l) + y1[j] * si(l+1);
344V tei[j] += y0[j] * tr(l) + dy1[j] * si(l+1);
345V }
346 #endif
347 for (int j=0; j<NWAY; ++j) {
348V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
349 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
350 }
351 for (int j=0; j<NWAY; ++j) {
352V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
353 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
354 }
355 l+=2; al+=4;
356 }
357 if (l==llim) {
358Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
359S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
360S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
361T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
362T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
363 }
3643 for (int j=0; j<NWAY; ++j) cost[j] = vread(st, k+j) * vall(m_1);
3653 for (int j=0; j<NWAY; ++j) { rer[j] *= cost[j]; ror[j] *= cost[j]; rei[j] *= cost[j]; roi[j] *= cost[j]; }
366 }
367 #if _GCC_VEC_
368 for (int j=0; j<NWAY; ++j) {
369Q S2D_CSTORE(BrF, k+j, rer[j], ror[j], rei[j], roi[j])
370V S2D_CSTORE(BtF, k+j, ter[j], tor[j], tei[j], toi[j])
371V S2D_CSTORE(BpF, k+j, per[j], por[j], pei[j], poi[j])
372 }
373 #else
374 for (int j=0; j<NWAY; ++j) {
375Q BrF[k+j] = (rer[j]+ror[j]) + I*(rei[j]+roi[j]);
376Q BrF[NLAT-k-1-j] = (rer[j]-ror[j]) + I*(rei[j]-roi[j]);
377V BtF[k+j] = (ter[j]+tor[j]) + I*(tei[j]+toi[j]);
378V BtF[NLAT-1-k-j] = (ter[j]-tor[j]) + I*(tei[j]-toi[j]);
379V BpF[k+j] = (per[j]+por[j]) + I*(pei[j]+poi[j]);
380V BpF[NLAT-1-k-j] = (per[j]-por[j]) + I*(pei[j]-poi[j]);
381 }
382 #endif
383 k+=NWAY;
384 } while (k < nk);
385 #if _GCC_VEC_
386Q BrF += NLAT_2;
387V BtF += NLAT_2; BpF += NLAT_2;
388 #else
389Q BrF += NLAT;
390V BtF += NLAT; BpF += NLAT;
391 #endif
392 }
393
394 #if _GCC_VEC_
395 for (k=0; k < NLAT_2*(NPHI-1-2*imlim); ++k) { // padding for high m's
396Q BrF[k] = vdup(0.0);
397V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
398 }
399Q BrF -= NLAT_2*(imlim+1); // restore original pointer
400V BtF -= NLAT_2*(imlim+1); BpF -= NLAT_2*(imlim+1);
401 #else
402 for (k=0; k < NLAT*((NPHI>>1) -imlim); ++k) { // padding for high m's
403Q BrF[k] = 0.0;
404V BtF[k] = 0.0; BpF[k] = 0.0;
405 }
406Q BrF -= NLAT*(imlim+1); // restore original pointer
407V BtF -= NLAT*(imlim+1); BpF -= NLAT*(imlim+1); // restore original pointer
408 #endif
409 // NPHI > 1 as SHT_AXISYM is not defined.
410 #if _GCC_VEC_
411 if (shtns->fftc_mode >= 0) {
412 if (shtns->fftc_mode == 0) {
413Q fftw_execute_dft(shtns->ifftc, (cplx *) BrF, (cplx *) Vr);
414V fftw_execute_dft(shtns->ifftc, (cplx *) BtF, (cplx *) Vt);
415V fftw_execute_dft(shtns->ifftc, (cplx *) BpF, (cplx *) Vp);
416 } else { // split dft
417Q fftw_execute_split_dft(shtns->ifftc,((double*)BrF)+1, ((double*)BrF), Vr+NPHI, Vr);
418V fftw_execute_split_dft(shtns->ifftc,((double*)BtF)+1, ((double*)BtF), Vt+NPHI, Vt);
419V fftw_execute_split_dft(shtns->ifftc,((double*)BpF)+1, ((double*)BpF), Vp+NPHI, Vp);
420Q VFREE(BrF);
421VX VFREE(BtF); // this frees also BpF.
422 }
423 }
424 #else
425 if (shtns->ncplx_fft >= 0) {
426Q fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr);
427V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt);
428V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp);
429 if (shtns->ncplx_fft > 0) { // free memory
430Q VFREE(BrF);
431VX VFREE(BtF); // this frees also BpF.
432 }
433 }
434 #endif
435 #endif
436
437Q #undef BR0
438V #undef BT0
439V #undef BP0
440Q #undef qr
441Q #undef qi
442S #undef sr
443S #undef si
444T #undef tr
445T #undef ti
446 }
447
448 #ifndef SHT_AXISYM
449
4503 static void GEN3(SHqst_m_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Qlm, cplx *Slm, cplx *Tlm, cplx *Vr, cplx *Vt, cplx *Vp, long int llim) {
451QX static void GEN3(SH_m_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Qlm, cplx *Vr, long int llim) {
452 #ifndef SHT_GRAD
453VX static void GEN3(SHsphtor_m_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Slm, cplx *Tlm, cplx *Vt, cplx *Vp, long int llim) {
454 #else
455S static void GEN3(SHsph_m_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Slm, cplx *Vt, cplx *Vp, long int llim) {
456T static void GEN3(SHtor_m_to_spat_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Tlm, cplx *Vt, cplx *Vp, long int llim) {
457 #endif
458
459Q v2d *BrF;
460V v2d *BtF, *BpF;
461Q #define qr(l) vall(creal(Qlm[l]))
462Q #define qi(l) vall(cimag(Qlm[l]))
463S #define sr(l) vall(creal(Slm[l]))
464S #define si(l) vall(cimag(Slm[l]))
465T #define tr(l) vall(creal(Tlm[l]))
466T #define ti(l) vall(cimag(Tlm[l]))
467V double m_1;
468 long int nk, k,l,m;
469 double *alm, *al;
470 s2d *ct, *st;
471
472Q BrF = (v2d*) Vr;
473V BtF = (v2d*) Vt; BpF = (v2d*) Vp;
474
475 nk = NLAT_2;
476 #if _GCC_VEC_
477 nk = ((unsigned)(nk+VSIZE2-1)) / VSIZE2;
478 #endif
479 ct = (s2d*) shtns->ct; st = (s2d*) shtns->st;
480
481 if (im == 0) {
482Q double Ql0[llim+1];
483S double Sl0[llim];
484T double Tl0[llim];
485
486 #ifdef SHT_GRAD
487S k=0; do { BpF[k]=vdup(0.0); } while(++k<NLAT);
488T k=0; do { BtF[k]=vdup(0.0); } while(++k<NLAT);
489 #endif
490
491 l=1;
492 alm = shtns->alm;
493Q Ql0[0] = (double) Qlm[0]; // l=0
494 do { // for m=0, compress the complex Q,S,T to double
495Q Ql0[l] = (double) Qlm[l]; // Ql[l+1] = (double) Qlm[l+1];
496S Sl0[l-1] = (double) Slm[l]; // Sl[l] = (double) Slm[l+1];
497T Tl0[l-1] = (double) Tlm[l]; // Tl[l] = (double) Tlm[l+1];
498 ++l;
499 } while(l<=llim);
500 k=0;
501 do {
502 l=0; al = alm;
503 rnd cost[NWAY], y0[NWAY], y1[NWAY];
504V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
505Q rnd re[NWAY], ro[NWAY];
506S rnd te[NWAY], to[NWAY];
507T rnd pe[NWAY], po[NWAY];
508 for (int j=0; j<NWAY; ++j) {
509 cost[j] = vread(ct, j+k);
510V sint[j] = -vread(st, j+k);
511 y0[j] = vall(al[0]);
512V dy0[j] = vall(0.0);
513Q re[j] = y0[j] * vall(Ql0[0]);
514S to[j] = dy0[j];
515T po[j] = dy0[j];
516 }
517 for (int j=0; j<NWAY; ++j) {
518 y1[j] = vall(al[0]*al[1]) * cost[j];
519V dy1[j] = vall(al[0]*al[1]) * sint[j];
520 }
521 for (int j=0; j<NWAY; ++j) {
522Q ro[j] = y1[j] * vall(Ql0[1]);
523S te[j] = dy1[j] * vall(Sl0[0]);
524T pe[j] = -dy1[j] * vall(Tl0[0]);
525 }
526 al+=2; l+=2;
527 while(l<llim) {
528 for (int j=0; j<NWAY; ++j) {
529V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
530 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
531 }
532 for (int j=0; j<NWAY; ++j) {
533Q re[j] += y0[j] * vall(Ql0[l]);
534S to[j] += dy0[j] * vall(Sl0[l-1]);
535T po[j] -= dy0[j] * vall(Tl0[l-1]);
536 }
537 for (int j=0; j<NWAY; ++j) {
538V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
539 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
540 }
541 for (int j=0; j<NWAY; ++j) {
542Q ro[j] += y1[j] * vall(Ql0[l+1]);
543S te[j] += dy1[j] * vall(Sl0[l]);
544T pe[j] -= dy1[j] * vall(Tl0[l]);
545 }
546 al+=4; l+=2;
547 }
548 if (l==llim) {
549 for (int j=0; j<NWAY; ++j) {
550V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
551 y0[j] = vall(al[1])*cost[j]*y1[j] + vall(al[0])*y0[j];
552 }
553 for (int j=0; j<NWAY; ++j) {
554Q re[j] += y0[j] * vall(Ql0[l]);
555S to[j] += dy0[j] * vall(Sl0[l-1]);
556T po[j] -= dy0[j] * vall(Tl0[l-1]);
557 }
558 }
559 #if _GCC_VEC_
560 for (int j=0; j<NWAY; ++j) {
561Q S2D_CSTORE2(BrF, k+j, re[j], ro[j], vall(0), vall(0))
562S S2D_CSTORE2(BtF, k+j, te[j], to[j], vall(0), vall(0))
563T S2D_CSTORE2(BpF, k+j, pe[j], po[j], vall(0), vall(0))
564 }
565 #else
566 for (int j=0; j<NWAY; ++j) {
567Q BrF[k+j] = (re[j]+ro[j]);
568Q BrF[NLAT-k-1-j] = (re[j]-ro[j]);
569S BtF[k+j] = (te[j]+to[j]);
570S BtF[NLAT-1-k-j] = (te[j]-to[j]);
571T BpF[k+j] = (pe[j]+po[j]);
572T BpF[NLAT-1-k-j] = (pe[j]-po[j]);
573 }
574 #endif
575 k+=NWAY;
576 } while (k < nk);
577
578 } else { // im > 0
579
580 m = im*MRES;
581V m_1 = 1.0/m;
582 alm = shtns->alm + im*(2*LMAX -m+MRES);
583Q Qlm -= m; // virtual pointer for l=0
584S Slm -= m; // virtual pointer for l=0
585T Tlm -= m;
586 k=0; l=shtns->tm[im];
587 while (k < l) { // polar optimization
588Q BrF[k] = vdup(0.0); BrF[NLAT-l+k] = vdup(0.0);
589V BtF[k] = vdup(0.0); BtF[NLAT-l+k] = vdup(0.0);
590V BpF[k] = vdup(0.0); BpF[NLAT-l+k] = vdup(0.0);
591 ++k;
592 }
593 k = ((unsigned) l) / VSIZE2;
594 do {
595 al = alm;
596 rnd cost[NWAY], y0[NWAY], y1[NWAY];
597V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
598Q rnd rer[NWAY], rei[NWAY], ror[NWAY], roi[NWAY];
599V rnd ter[NWAY], tei[NWAY], tor[NWAY], toi[NWAY];
600V rnd per[NWAY], pei[NWAY], por[NWAY], poi[NWAY];
601 for (int j=0; j<NWAY; ++j) {
602 cost[j] = vread(st, k+j);
603QX y0[j] = vall(1.0);
604V st2[j] = cost[j]*cost[j]*vall(-m_1);
605V y0[j] = vall(m); // for the vector transform, compute ylm*m/sint
606 }
607Q l=m;
608V l=m-1;
609 long int ny = 0;
610 if ((int)llim <= SHT_L_RESCALE_FLY) {
611 do { // sin(theta)^m
612 if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
613 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
614 } while(l >>= 1);
615 } else {
616 long int nsint = 0;
617 do { // sin(theta)^m (use rescaling to avoid underflow)
618 if (l&1) {
619 for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
620 ny += nsint;
621 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
622 ny--;
623 for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
624 }
625 }
626 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
627 nsint += nsint;
628 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
629 nsint--;
630 for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
631 }
632 } while(l >>= 1);
633 }
634 for (int j=0; j<NWAY; ++j) {
635 y0[j] *= vall(al[0]);
636 cost[j] = vread(ct, j+k);
637V dy0[j] = cost[j]*y0[j];
638Q ror[j] = vall(0.0); roi[j] = vall(0.0);
639Q rer[j] = vall(0.0); rei[j] = vall(0.0);
640 }
641 for (int j=0; j<NWAY; ++j) {
642 y1[j] = (vall(al[1])*y0[j]) *cost[j]; // y1[j] = vall(al[1])*cost[j]*y0[j];
643V por[j] = vall(0.0); tei[j] = vall(0.0);
644V tor[j] = vall(0.0); pei[j] = vall(0.0);
645V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]); // dy1[j] = vall(al[1])*(cost[j]*dy0[j] - y0[j]*st2[j]);
646V poi[j] = vall(0.0); ter[j] = vall(0.0);
647V toi[j] = vall(0.0); per[j] = vall(0.0);
648 }
649 l=m; al+=2;
650 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
651 for (int j=0; j<NWAY; ++j) {
652 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
653V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
654 }
655 for (int j=0; j<NWAY; ++j) {
656 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
657V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
658 }
659 l+=2; al+=4;
660 if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
661 ++ny;
662 for (int j=0; j<NWAY; ++j) {
663 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
664V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
665 }
666 }
667 }
668 if (ny == 0) {
669 while (l<llim) { // compute even and odd parts
670Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
671Q for (int j=0; j<NWAY; ++j) { ror[j] += y1[j] * qr(l+1); roi[j] += y1[j] * qi(l+1); }
672 #ifdef SHT_GRAD
673S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
674S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
675T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
676T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
677S for (int j=0; j<NWAY; ++j) { ter[j] += dy1[j] * sr(l+1); poi[j] += y1[j] * sr(l+1); }
678S for (int j=0; j<NWAY; ++j) { tei[j] += dy1[j] * si(l+1); por[j] -= y1[j] * si(l+1); }
679T for (int j=0; j<NWAY; ++j) { per[j] -= dy1[j] * tr(l+1); toi[j] += y1[j] * tr(l+1); }
680T for (int j=0; j<NWAY; ++j) { pei[j] -= dy1[j] * ti(l+1); tor[j] -= y1[j] * ti(l+1); }
681 #else
682V for (int j=0; j<NWAY; ++j) {
683V tor[j] += dy0[j] * sr(l) - y1[j] * ti(l+1);
684V pei[j] += y0[j] * sr(l) - dy1[j] * ti(l+1);
685V }
686V for (int j=0; j<NWAY; ++j) {
687V poi[j] -= dy0[j] * ti(l) - y1[j] * sr(l+1);
688V ter[j] -= y0[j] * ti(l) - dy1[j] * sr(l+1);
689V }
690V for (int j=0; j<NWAY; ++j) {
691V toi[j] += dy0[j] * si(l) + y1[j] * tr(l+1);
692V per[j] -= y0[j] * si(l) + dy1[j] * tr(l+1);
693V }
694V for (int j=0; j<NWAY; ++j) {
695V por[j] -= dy0[j] * tr(l) + y1[j] * si(l+1);
696V tei[j] += y0[j] * tr(l) + dy1[j] * si(l+1);
697V }
698 #endif
699 for (int j=0; j<NWAY; ++j) {
700V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
701 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
702 }
703 for (int j=0; j<NWAY; ++j) {
704V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
705 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
706 }
707 l+=2; al+=4;
708 }
709 if (l==llim) {
710Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
711S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
712S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
713T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
714T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
715 }
7163 for (int j=0; j<NWAY; ++j) cost[j] = vread(st, k+j) * vall(m_1);
7173 for (int j=0; j<NWAY; ++j) { rer[j] *= cost[j]; ror[j] *= cost[j]; rei[j] *= cost[j]; roi[j] *= cost[j]; }
718 }
719 #if _GCC_VEC_
720 for (int j=0; j<NWAY; ++j) {
721Q S2D_CSTORE2(BrF, k+j, rer[j], ror[j], rei[j], roi[j])
722V S2D_CSTORE2(BtF, k+j, ter[j], tor[j], tei[j], toi[j])
723V S2D_CSTORE2(BpF, k+j, per[j], por[j], pei[j], poi[j])
724 }
725 #else
726 for (int j=0; j<NWAY; ++j) {
727Q BrF[k+j] = (rer[j]+ror[j]) + I*(rei[j]+roi[j]);
728Q BrF[NLAT-k-1-j] = (rer[j]-ror[j]) + I*(rei[j]-roi[j]);
729V BtF[k+j] = (ter[j]+tor[j]) + I*(tei[j]+toi[j]);
730V BtF[NLAT-1-k-j] = (ter[j]-tor[j]) + I*(tei[j]-toi[j]);
731V BpF[k+j] = (per[j]+por[j]) + I*(pei[j]+poi[j]);
732V BpF[NLAT-1-k-j] = (per[j]-por[j]) + I*(pei[j]-poi[j]);
733 }
734 #endif
735 k+=NWAY;
736 } while (k < nk);
737 }
738
739Q #undef qr
740Q #undef qi
741S #undef sr
742S #undef si
743T #undef tr
744T #undef ti
745 }
746
747 #endif
Note: See TracBrowser for help on using the repository browser.