source: CIVL/examples/omp/shtns/SHT/hyb_SH_to_spat.gen.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 18.5 KB
Line 
1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18# This file is meta-code for SHT.c (spherical harmonic transform).
19# it is intended for "make" to generate C code for similar SHT functions,
20# from one generic function + tags.
21# > See Makefile and SHT.c
22# Basically, there are tags at the beginning of lines that are information
23# to keep or remove the line depending on the function to build.
24# tags :
25# Q : line for scalar transform
26# V : line for vector transform (both spheroidal and toroidal)
27# S : line for vector transfrom, spheroidal component
28# T : line for vector transform, toroidal component.
29
30/////////////////////////////////////////////////////
31 #ifdef SHT_AXISYM
32/// Only the \b axisymmetric (m=0) component is transformed, and the resulting spatial fields have size NLAT.
33 #endif
34
35/// Truncation and spatial discretization are defined by \ref shtns_create and \ref shtns_set_grid_*
36/// \param[in] shtns = a configuration created by \ref shtns_create with a grid set by shtns_set_grid_*
37Q/// \param[in] Qlm = spherical harmonics coefficients :
38Q/// complex double arrays of size shtns->nlm [unmodified].
39S/// \param[in] Slm = spherical harmonics coefficients of \b Spheroidal scalar :
40S/// complex double array of size shtns->nlm [unmodified].
41T/// \param[in] Tlm = spherical harmonics coefficients of \b Toroidal scalar :
42T/// complex double array of size shtns->nlm [unmodified].
43Q/// \param[out] Vr = spatial scalar field : double array.
44 #ifndef SHT_AXISYM
45V/// \param[out] Vt, Vp = (theta,phi)-components of spatial vector : double arrays.
46 #else
47S/// \param[out] Vt = theta-component of spatial vector : double array.
48T/// \param[out] Vp = phi-component of spatial vector : double array.
49 #endif
50 #ifdef SHT_VAR_LTR
51/// \param[in] llim = specify maximum degree of spherical harmonic. llim must be at most LMAX, and all spherical harmonic degree higher than llim are ignored.
52 #else
53/// \param[in] llim MUST be shtns->lmax.
54 #endif
55
56// MTR_DCT : <0 => SHT_NO_DCT must be defined !!! mem only transform
57// 0 => dct for m=0 only
58// m => dct up to m, (!!! MTR_DCT <= MTR !!!)
59
603 static void GEN3(SHqst_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, double *Vr, double *Vt, double *Vp, long int llim) {
61QX static void GEN3(SH_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Qlm, double *Vr, long int llim) {
62 #ifndef SHT_GRAD
63VX static void GEN3(SHsphtor_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, double *Vt, double *Vp, long int llim) {
64 #else
65S static void GEN3(SHsph_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Slm, double *Vt, double *Vp, long int llim) {
66T static void GEN3(SHtor_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Tlm, double *Vt, double *Vp, long int llim) {
67 #endif
68
69Q v2d *BrF;
70 #ifndef SHT_AXISYM
71// with m>0, 3 components not possible with DCT acceleration
723 #define SHT_NO_DCT
73V v2d *BtF, *BpF;
74V struct DtDp *dyl;
75Q cplx re,ro;
76V cplx te,to, pe,po;
77V cplx dte,dto, dpe,dpo;
78Q #define BR0(i) ((double *)BrF)[2*(i)]
79V #define BT0(i) ((double *)BtF)[2*(i)]
80V #define BP0(i) ((double *)BpF)[2*(i)]
81 unsigned im, imlim;
82 int imlim_dct;
83 #else
84S v2d *BtF;
85T v2d *BpF;
86Q double re,ro;
87S double te,to;
88T double pe,po;
89Q #define BR0(i) ((double *)BrF)[i]
90S #define BT0(i) ((double *)BtF)[i]
91T #define BP0(i) ((double *)BpF)[i]
92 #endif
93 long int k,m,l;
94 #ifdef _GCC_VEC_
95QX s2d Ql0[(llim+5)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!)
963 s2d Ql0[(llim+4)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!).
97S s2d Sl0[(llim+3)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!).
98T s2d Tl0[(llim+3)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!).
99 #endif
100
101 #ifndef SHT_AXISYM
102Q BrF = (v2d *) Vr;
103V BtF = (v2d *) Vt; BpF = (v2d *) Vp;
104 if (shtns->ncplx_fft > 0) { // alloc memory for the FFT
105QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(cplx) );
106VX BtF = VMALLOC( 2* shtns->ncplx_fft * sizeof(cplx) );
107VX BpF = BtF + shtns->ncplx_fft;
1083 BrF = VMALLOC( 3* shtns->ncplx_fft * sizeof(cplx) );
1093 BtF = BrF + shtns->ncplx_fft; BpF = BtF + shtns->ncplx_fft;
110 }
111 imlim = MTR;
112 #ifdef SHT_VAR_LTR
113 if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
114 #endif
115 #ifndef SHT_NO_DCT
116 imlim_dct = MTR_DCT;
117 #ifdef SHT_VAR_LTR
118 if (imlim_dct > imlim) imlim_dct = imlim;
119 #endif
120 #endif
121 #else
122 #ifdef SHT_GRAD
123S if (Vp != NULL) { k=0; do { ((s2d*)Vp)[k]=vdup(0.0); } while(++k<NLAT/2); Vp[NLAT-1]=0.0; }
124T if (Vt != NULL) { k=0; do { ((s2d*)Vt)[k]=vdup(0.0); } while(++k<NLAT/2); Vt[NLAT-1]=0.0; }
125 #endif
126Q BrF = (v2d*) Vr;
127S BtF = (v2d*) Vt;
128T BpF = (v2d*) Vp;
129 #endif
130
131 // im=0;
132 #ifdef _GCC_VEC_
133 { // store the m=0 coefficients in an efficient & vectorizable way.
134Q double* Ql = (double*) &Ql0;
135S double* Sl = (double*) &Sl0;
136T double* Tl = (double*) &Tl0;
137Q Ql[0] = (double) Qlm[0]; // l=0
138 l=1;
139 do { // for m=0, compress the complex Q,S,T to double
140S Sl[l-1] = (double) Slm[l];
141T Tl[l-1] = (double) Tlm[l];
142Q Ql[l] = (double) Qlm[l];
143 } while(++l<=llim); // l=llim+1
144S Sl[l-1] = 0.0;
145T Tl[l-1] = 0.0;
1463 Sl[l] = 0.0; Tl[l] = 0.0; Ql[l] = 0.0; Ql[l+1] = 0.0;
147QX Ql[l] = 0.0; Ql[l+1] = 0.0; Ql[l+2] = 0.0;
148 }
149 #endif
150 #ifndef SHT_NO_DCT
151 { // m=0 dct
152Q s2d* yl = (s2d*) shtns->ykm_dct[0];
153V s2d* dyl0 = (s2d*) shtns->dykm_dct[0]; // only theta derivative (d/dphi = 0 for m=0)
154 #ifndef _GCC_VEC_
155Q double* Ql = (double*) Qlm;
156S double* Sl = (double*) Slm;
157T double* Tl = (double*) Tlm;
158Q re = 0.0; ro = 0.0;
159QE re = yl[0] * Ql[0];
160QO ro = yl[1] * Ql[2];
161Q yl+=2;
162 k=0; l = 2;
163 do {
164S te = 0.0; to = 0.0;
165T pe = 0.0; po = 0.0;
166SO te = dyl0[0] * Sl[2*l-2]; // l-1
167TE pe = -dyl0[0] * Tl[2*l-2]; // l-1
168V ++dyl0;
169 while(l<llim) {
170QE re += yl[0] * Ql[2*l];
171QO ro += yl[1] * Ql[2*l+2];
172SE to += dyl0[0] * Sl[2*l];
173TO po -= dyl0[0] * Tl[2*l];
174SO te += dyl0[1] * Sl[2*l+2];
175TE pe -= dyl0[1] * Tl[2*l+2];
176 l+=2;
177Q yl+=2;
178V dyl0+=2;
179 }
180 if (l==llim) {
181QE re += yl[0] * Ql[2*l];
182SE to += dyl0[0] * Sl[2*l];
183TO po -= dyl0[0] * Tl[2*l];
184Q yl+=2;
185 }
186V ++dyl0;
187Q BR0(k) = re; BR0(k+1) = ro;
188VX #ifndef SHT_AXISYM
189VX BT0(k) = 0.0; BT0(k+1) = 0.0; // required for tor or sph only transform
190VX BP0(k) = 0.0; BP0(k+1) = 0.0;
191VX #endif
192S BT0(k) = te; BT0(k+1) = to;
193T BP0(k) = pe; BP0(k+1) = po;
194Q re = 0.0; ro = 0.0;
195 #ifdef SHT_VAR_LTR
196Q yl += ((LMAX>>1) - (llim>>1))*2;
197V dyl0 += (((LMAX+1)>>1) - ((llim+1)>>1))*2;
198 #endif
199 k+=2;
200 l = k;
201 } while (k<=llim+1);
202 #else
203Q s2d r = yl[0] * Ql0[0]; // first Q [l=0 and 1]
204 l=0; k=0;
205 do {
206 l >>= 1; // l = l/2;
207S s2d t = vdup(0.0);
208T s2d p = vdup(0.0);
209QX s2d r1 = vdup(0.0);
210 do {
211Q r += yl[l+1] * Ql0[l+1]; // { re, ro }
212S t += dyl0[l] * Sl0[l]; // { te, to }
213T p -= dyl0[l] * Tl0[l]; // { pe, po }
214V ++l;
215QX r1 += yl[l+2] * Ql0[l+2]; // { re, ro }
216QX l+=2;
217QX } while(2*l < llim-1); // 2*(l+1) <= llim
218V } while(2*l < llim);
219QX r += r1;
220Q yl += (LMAX -k)>>1;
221V dyl0 += (LMAX+1 -k)>>1;
222 #ifndef SHT_AXISYM
223Q BR0(k) = vlo_to_dbl(r); BR0(k+1) = vhi_to_dbl(r);
224VX BT0(k) = 0.0; BT0(k+1) = 0.0;
225S BT0(k) = vlo_to_dbl(t); BT0(k+1) = vhi_to_dbl(t);
226VX BP0(k) = 0.0; BP0(k+1) = 0.0;
227T BP0(k) = vlo_to_dbl(p); BP0(k+1) = vhi_to_dbl(p);
228 #else
229Q *((s2d*)(((double*)BrF)+k)) = r;
230S *((s2d*)(((double*)BtF)+k)) = t;
231T *((s2d*)(((double*)BpF)+k)) = p;
232 #endif
233 l = k+1;
234 k+=2;
235Q r = vdup(0.0);
236QX } while (l<llim); // (k<=llim);
237V } while (l<=llim); // (k<=llim+1);
238 #endif
239 while (k<NLAT) { // dct padding (NLAT is even)
240Q BR0(k) = 0.0; BR0(k+1) = 0.0;
241 #ifndef SHT_AXISYM
242V BT0(k) = 0.0; BT0(k+1) = 0.0; // required for tor or sph only transform
243V BP0(k) = 0.0; BP0(k+1) = 0.0;
244 #else
245S BT0(k) = 0.0; BT0(k+1) = 0.0;
246T BP0(k) = 0.0; BP0(k+1) = 0.0;
247 #endif
248 k+=2;
249 }
250 #ifdef SHT_AXISYM
251Q fftw_execute_r2r(shtns->idct,Vr, Vr); // iDCT m=0
252S fftw_execute_r2r(shtns->idct,Vt, Vt); // iDCT m=0
253T fftw_execute_r2r(shtns->idct,Vp, Vp); // iDCT m=0
254V double* st_1 = shtns->st_1;
255V k=0; do {
256V #ifdef _GCC_VEC_
257V v2d sin_1 = ((v2d *)st_1)[k];
258S ((v2d *)Vt)[k] *= sin_1;
259T ((v2d *)Vp)[k] *= sin_1;
260V #else
261V double sin_1 = st_1[2*k]; double sin_2 = st_1[2*k+1];
262S Vt[2*k] *= sin_1; Vt[2*k+1] *= sin_2;
263T Vp[2*k] *= sin_1; Vp[2*k+1] *= sin_2;
264V #endif
265V } while (++k<NLAT_2);
266 #endif
267 }
268 #else // ifndef SHT_NO_DCT
269 { // m=0 no dct
270 #ifndef _GCC_VEC_
271Q double* Ql = (double*) Qlm;
272S double* Sl = (double*) Slm;
273T double* Tl = (double*) Tlm;
274 #endif
275 k=0;
276Q s2d* yl = (s2d*) shtns->ylm[0];
277V s2d* dyl0 = (s2d*) shtns->dylm[0]; // only theta derivative (d/dphi = 0 for m=0)
278 do { // ops : NLAT_2 * [ (lmax-m+1) + 4] : almost twice as fast.
279 #ifndef _GCC_VEC_
280 l=1;
281Q re = yl[0] * Ql[0]; // re += ylm[im][k*(LMAX-m+1) + (l-m)] * Qlm[LiM(l,im)];
282Q ++yl;
283Q ro = 0.0;
284S te = 0.0; to = 0.0;
285T pe = 0.0; po = 0.0;
286 while (l<llim) { // compute even and odd parts
287Q ro += yl[0] * Ql[2*l]; // re += ylm[im][k*(LMAX-m+1) + (l-m)] * Qlm[LiM(l,im)];
288QB re += yl[1] * Ql[2*l+2]; // ro += ylm[im][k*(LMAX-m+1) + (l+1-m)] * Qlm[LiM(l+1,im)];
289T pe -= dyl0[0] * Tl[2*l]; // m=0 : everything is real.
290SB te += dyl0[0] * Sl[2*l];
291TB po -= dyl0[1] * Tl[2*l+2];
292S to += dyl0[1] * Sl[2*l+2];
293 l+=2;
294Q yl+=2;
295V dyl0+=2;
296 }
297 if (l==llim) {
298Q ro += yl[0] * Ql[2*l];
299TB pe -= dyl0[0] * Tl[2*l];
300S te += dyl0[0] * Sl[2*l];
301V dyl0+=2;
302 }
303Q ++yl;
304 #ifdef SHT_VAR_LTR
305Q yl += ((LMAX>>1) - (llim>>1))*2;
306V dyl0 += (((LMAX+1)>>1) - ((llim+1)>>1))*2;
307 #endif
308Q BR0(k) = re + ro;
309V #ifndef SHT_AXISYM
310V BT0(k) = 0.0; BP0(k) = 0.0; // required for partial tor or sph transform
311V #endif
312S BT0(k) = te + to; // Bt = dS/dt
313T BP0(k) = pe + po; // Bp = - dT/dt
314QB BR0(NLAT-1-k) = re - ro;
315VB #ifndef SHT_AXISYM
316VB BT0(NLAT-1-k) = 0.0; BP0(NLAT-1-k) = 0.0; // required for partial tor or sph transform
317VB #endif
318SB BT0(NLAT-1-k) = te - to;
319TB BP0(NLAT-1-k) = pe - po;
320 #else
321 l=0;
322S s2d t = vdup(0.0);
323T s2d p = vdup(0.0);
324Q s2d r = vdup(0.0);
325QX s2d r1 = vdup(0.0);
326 do {
327S t += dyl0[l] * Sl0[l]; // { te, to }
328T p -= dyl0[l] * Tl0[l]; // { pe, po }
329Q r += yl[l] * Ql0[l]; // { re, ro }
330 ++l;
331QX r1 += yl[l] * Ql0[l]; // reduce dependency
332QX ++l;
333Q } while (2*l <= llim);
334VX } while (2*l < llim);
335QX r += r1;
336Q yl += (LMAX+2)>>1;
337V dyl0 += (LMAX+1)>>1;
338 #if __SSE3__
339 /* alternate code, which may be faster (slightly) on SSE3. */
340Q r = addi(r,r); // { re-ro , re+ro }
341S t = addi(t,t); // { te-to , te+to }
342T p = addi(p,p); // { pe-po , pe+po }
343Q BR0(k) = vhi_to_dbl(r);
344V #ifndef SHT_AXISYM
345V BT0(k) = 0.0; BP0(k) = 0.0; // required for partial tor or sph transform
346V #endif
347S BT0(k) = vhi_to_dbl(t); // Bt = dS/dt
348T BP0(k) = vhi_to_dbl(p); // Bp = - dT/dt
349QB BR0(NLAT-1-k) = vlo_to_dbl(r);
350VB #ifndef SHT_AXISYM
351VB BT0(NLAT-1-k) = 0.0; BP0(NLAT-1-k) = 0.0; // required for partial tor or sph transform
352VB #endif
353SB BT0(NLAT-1-k) = vlo_to_dbl(t);
354TB BP0(NLAT-1-k) = vlo_to_dbl(p);
355 #else
356Q BR0(k) = vhi_to_dbl(r) + vlo_to_dbl(r);
357V #ifndef SHT_AXISYM
358V BT0(k) = 0.0; BP0(k) = 0.0; // required for partial tor or sph transform
359V #endif
360S BT0(k) = vhi_to_dbl(t) + vlo_to_dbl(t); // Bt = dS/dt
361T BP0(k) = vhi_to_dbl(p) + vlo_to_dbl(p); // Bp = - dT/dt
362QB BR0(NLAT-1-k) = vlo_to_dbl(r) - vhi_to_dbl(r);
363VB #ifndef SHT_AXISYM
364VB BT0(NLAT-1-k) = 0.0; BP0(NLAT-1-k) = 0.0; // required for partial tor or sph transform
365VB #endif
366SB BT0(NLAT-1-k) = vlo_to_dbl(t) - vhi_to_dbl(t);
367TB BP0(NLAT-1-k) = vlo_to_dbl(p) - vhi_to_dbl(p);
368 #endif
369 #endif
370 } while (++k < NLAT_2);
371 }
372 #endif // ifndef SHT_NO_DCT
373
374 #ifndef SHT_AXISYM
375 im=1;
376Q BrF += NLAT;
377V BtF += NLAT; BpF += NLAT;
378 #ifndef SHT_NO_DCT
379 while(im<=imlim_dct) { // dct for im <= MTR_DCT
380 m=im*MRES;
381 l = LiM(shtns, 0,im);
382Q v2d* Ql = (v2d*) &Qlm[l]; // virtual pointer for l=0 and im
383S v2d* Sl = (v2d*) &Slm[l];
384T v2d* Tl = (v2d*) &Tlm[l];
385Q double* yl = shtns->ykm_dct[im];
386V dyl = shtns->dykm_dct[im];
387 k=0; l=m;
388 do {
389Q v2d re = vdup(0.0); v2d ro = vdup(0.0);
390V v2d te = vdup(0.0); v2d to = vdup(0.0); v2d pe = vdup(0.0); v2d po = vdup(0.0);
391V v2d dte = vdup(0.0); v2d dto = vdup(0.0); v2d dpe = vdup(0.0); v2d dpo = vdup(0.0);
392 while(l<llim) {
393QE re += vdup(yl[0]) * Ql[l];
394QO ro += vdup(yl[1]) * Ql[l+1];
395TO dte += vdup(dyl[0].p) * Tl[l];
396TO po -= vdup(dyl[0].t) * Tl[l];
397SE dpe += vdup(dyl[0].p) * Sl[l];
398SE to += vdup(dyl[0].t) * Sl[l];
399SO dpo += vdup(dyl[1].p) * Sl[l+1];
400SO te += vdup(dyl[1].t) * Sl[l+1];
401TE dto += vdup(dyl[1].p) * Tl[l+1];
402TE pe -= vdup(dyl[1].t) * Tl[l+1];
403 l+=2;
404Q yl+=2;
405V dyl+=2;
406 }
407 if (l==llim) {
408QE re += vdup(yl[0]) * Ql[l];
409TO dte += vdup(dyl[0].p) * Tl[l];
410TO po -= vdup(dyl[0].t) * Tl[l];
411SE dpe += vdup(dyl[0].p) * Sl[l];
412SE to += vdup(dyl[0].t) * Sl[l];
413Q ++yl;
414V ++dyl;
415 }
416Q BrF[k] = re; BrF[k+1] = ro;
417V BtF[k] = addi(te, dte); BtF[k+1] = addi(to, dto);
418V BpF[k] = addi(pe, dpe); BpF[k+1] = addi(po, dpo);
419V l = (k < m) ? m : k+(m&1);
420 k+=2;
421 #ifdef SHT_VAR_LTR
422Q yl+= (LMAX-llim);
423V dyl+= (LMAX-llim);
424 #endif
425Q l = (k < m) ? m : k-(m&1);
426V } while (k<=llim+1-(m&1));
427Q } while (k<=llim);
428QE if (l==llim) {
429QE BrF[k] = vdup(yl[0]) * Ql[l];
430QE BrF[k+1] = vdup(0.0);
431QE k+=2;
432QE }
433 while (k<NLAT) { // NLAT even
434Q BrF[k] = vdup(0.0); BrF[k+1] = vdup(0.0);
435V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
436V BtF[k+1] = vdup(0.0); BpF[k+1] = vdup(0.0);
437 k+=2;
438 }
439 ++im;
440Q BrF += NLAT;
441V BtF += NLAT; BpF += NLAT;
442 }
443 #endif
444
445 while(im<=imlim) { // regular for MTR_DCT < im <= MTR
446 m = im*MRES;
4473 double m_1 = 1.0/m;
448 l = LiM(shtns, 0,im);
449Q v2d* Ql = (v2d*) &Qlm[l]; // virtual pointer for l=0 and im
450S v2d* Sl = (v2d*) &Slm[l]; // virtual pointer for l=0 and im
451T v2d* Tl = (v2d*) &Tlm[l];
452 k=0; l=shtns->tm[im];
453 while (k<l) { // polar optimization
454Q BrF[k] = vdup(0.0);
455QB BrF[NLAT-l + k] = vdup(0.0); // south pole zeroes <=> BrF[im*NLAT + NLAT-(k+1)] = 0.0;
456V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
457VB BtF[NLAT-l + k] = vdup(0.0); BpF[NLAT-l + k] = vdup(0.0); // south pole zeroes
458 ++k;
459 }
460Q double* yl = shtns->ylm[im];
461V dyl = shtns->dylm[im];
462 do { // ops : NLAT_2 * [ (lmax-m+1)*2 + 4] : almost twice as fast.
463 l=m;
464Q v2d re = vdup(0.0); v2d ro = vdup(0.0);
465V v2d dte = vdup(0.0); v2d dto = vdup(0.0); v2d pe = vdup(0.0); v2d po = vdup(0.0);
466V v2d dpe = vdup(0.0); v2d dpo = vdup(0.0); v2d te = vdup(0.0); v2d to = vdup(0.0);
467 while (l<llim) { // compute even and odd parts
468QX re += vdup(yl[0]) * Ql[l]; // re += ylm[im][k*(LMAX-m+1) + (l-m)] * Qlm[LiM(l,im)];
469QX ro += vdup(yl[1]) * Ql[l+1]; // ro += ylm[im][k*(LMAX-m+1) + (l+1-m)] * Qlm[LiM(l+1,im)];
470TB dpo -= vdup(dyl[0].t) * Tl[l];
471S dto += vdup(dyl[0].t) * Sl[l];
472TB te += vdup(dyl[0].p) * Tl[l];
473S pe += vdup(dyl[0].p) * Sl[l];
4743 re += vdup(dyl[0].p) * Ql[l];
475T dpe -= vdup(dyl[1].t) * Tl[l+1];
476SB dte += vdup(dyl[1].t) * Sl[l+1];
477T to += vdup(dyl[1].p) * Tl[l+1];
478SB po += vdup(dyl[1].p) * Sl[l+1];
4793 ro += vdup(dyl[1].p) * Ql[l+1];
480 l+=2;
481Q yl+=2;
482V dyl+=2;
483 }
484 if (l==llim) {
485QX re += vdup(yl[0]) * Ql[l]; // re += ylm[im][k*(LMAX-m+1) + (l-m)] * Qlm[LiM(l,im)];
486TO dpo -= vdup(dyl[0].t) * Tl[l];
487SE dto += vdup(dyl[0].t) * Sl[l];
488TO te += vdup(dyl[0].p) * Tl[l];
489SE pe += vdup(dyl[0].p) * Sl[l];
4903 re += vdup(dyl[0].p) * Ql[l];
491Q ++yl;
492V ++dyl;
493 }
4943 s2d qv = vdup(shtns->st[k]*m_1);
495V BtF[k] = addi(dte+dto, te+to); // Bt = dS/dt + I.m/sint *T
496VB BtF[NLAT-1-k] = addi(dte-dto, te-to);
4973 re *= qv; ro *= qv;
498V BpF[k] = addi(dpe+dpo, pe+po); // Bp = I.m/sint * S - dT/dt
499VB BpF[NLAT-1-k] = addi(dpe-dpo, pe-po);
500Q BrF[k] = re + ro;
501QB BrF[NLAT-1-k] = re - ro;
502 #ifdef SHT_VAR_LTR
503Q yl += (LMAX-llim);
504V dyl += (LMAX-llim);
505 #endif
506 } while (++k < NLAT_2);
507 ++im;
508Q BrF += NLAT;
509V BtF += NLAT; BpF += NLAT;
510 }
511 for (k=0; k < NLAT*((NPHI>>1) -imlim); ++k) { // padding for high m's
512Q BrF[k] = vdup(0.0);
513V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
514 }
515Q BrF -= NLAT*(imlim+1); // restore original pointer
516V BtF -= NLAT*(imlim+1); BpF -= NLAT*(imlim+1); // restore original pointer
517
518 // NPHI > 1 as SHT_AXISYM is not defined.
519 #ifndef SHT_NO_DCT
520Q fftw_execute_r2r(shtns->idct,(double *) BrF, (double *) BrF); // iDCT
521V fftw_execute_r2r(shtns->idct,(double *) BtF, (double *) BtF); // iDCT
522V fftw_execute_r2r(shtns->idct,(double *) BpF, (double *) BpF); // iDCT
523V double* st_1 = shtns->st_1;
524V k=0; do { // m=0
525V double sin_1 = st_1[k]; double sin_2 = st_1[k+1];
526V ((double *)BtF)[2*k] *= sin_1; ((double *)BpF)[2*k] *= sin_1;
527V ((double *)BtF)[2*k+2] *= sin_2; ((double *)BpF)[2*k+2] *= sin_2;
528V k+=2;
529V } while(k<NLAT);
530Q if (MRES & 1) { // odd m's must be divided by sin(theta)
531Q double* st_1 = shtns->st_1;
532Q for (im=1; im<=MTR_DCT; im+=2) { // odd m's
533Q k=0; do {
534Q ((v2d *)BrF)[im*NLAT + k] *= vdup(st_1[k]); ((v2d *)BrF)[im*NLAT + k+1] *= vdup(st_1[k+1]);
535Q k+=2;
536Q } while(k<NLAT);
537Q }
538Q }
539V l = (MRES & 1) + 1; // im-stride (l=1 if MRES even, l=2 if MRES odd)
540V for (im=l; im<=MTR_DCT; im+=l) { //even m's must be divided by sin(theta)
541V k=0; do {
542V s2d sin_1 = vdup(st_1[k]); s2d sin_2 = vdup(st_1[k+1]);
543V ((v2d *)BtF)[im*NLAT + k] *= sin_1; ((v2d *)BpF)[im*NLAT + k] *= sin_1;
544V ((v2d *)BtF)[im*NLAT + k+1] *= sin_2; ((v2d *)BpF)[im*NLAT + k+1] *= sin_2;
545V k+=2;
546V } while(k<NLAT);
547V }
548 #endif
549 if (shtns->ncplx_fft >= 0) {
550Q fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr);
551V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt);
552V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp);
553 if (shtns->ncplx_fft > 0) { // free memory
554Q VFREE(BrF);
555VX VFREE(BtF); // this frees also BpF.
556 }
557 }
558 #endif
559
560Q #undef BR0
561V #undef BT0
562V #undef BP0
563 }
Note: See TracBrowser for help on using the repository browser.