source: CIVL/examples/omp/shtns/SHT/fly_spat_to_SH.gen.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 28.7 KB
Line 
1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18# This file is meta-code for SHT.c (spherical harmonic transform).
19# it is intended for "make" to generate C code for 3 similar SHT functions,
20# (namely spat_to_SH [Q tag]), spat_to_SHsphtor [V tag], spat_to_SH3 [both Q&V tags])
21# from one generic function + tags.
22# Basically, there are tags at the beginning of lines (Q,V) that are information
23# to keep or remove the line depending on the function to build. (Q for scalar, V for vector, # for comment)
24#
25//////////////////////////////////////////////////
26
27QX static void GEN3(spat_to_SH_fly,NWAY,SUFFIX)(shtns_cfg shtns, double *Vr, cplx *Qlm, long int llim) {
28VX static void GEN3(spat_to_SHsphtor_fly,NWAY,SUFFIX)(shtns_cfg shtns, double *Vt, double *Vp, cplx *Slm, cplx *Tlm, long int llim) {
293 static void GEN3(spat_to_SHqst_fly,NWAY,SUFFIX)(shtns_cfg shtns, double *Vr, double *Vt, double *Vp, cplx *Qlm, cplx *Slm, cplx *Tlm, long int llim) {
30
31Q double *BrF; // contains the Fourier transformed data
32V double *BtF, *BpF; // contains the Fourier transformed data
33 double *alm, *al;
34 double *wg, *ct, *st;
35V double *l_2;
36 long int nk, k, l,m;
37 #ifndef SHT_AXISYM
38V double m_1;
39 unsigned imlim, im;
40 int k_inc, m_inc;
41 #else
42 const int k_inc = 1;
43 #endif
44 #if _GCC_VEC_
45Q rnd qq[2*llim];
46V rnd ss[2*llim];
47V rnd tt[2*llim];
48 #else
49Q double qq[llim];
50V double ss[llim];
51V double tt[llim];
52 #endif
53
54Q double rer[NLAT_2 + NWAY*VSIZE2] SSE;
55Q double ror[NLAT_2 + NWAY*VSIZE2] SSE;
56V double ter[NLAT_2 + NWAY*VSIZE2] SSE;
57V double tor[NLAT_2 + NWAY*VSIZE2] SSE;
58V double per[NLAT_2 + NWAY*VSIZE2] SSE;
59V double por[NLAT_2 + NWAY*VSIZE2] SSE;
60 #ifndef SHT_AXISYM
61Q double rei[NLAT_2 + NWAY*VSIZE2] SSE;
62Q double roi[NLAT_2 + NWAY*VSIZE2] SSE;
63V double tei[NLAT_2 + NWAY*VSIZE2] SSE;
64V double toi[NLAT_2 + NWAY*VSIZE2] SSE;
65V double pei[NLAT_2 + NWAY*VSIZE2] SSE;
66V double poi[NLAT_2 + NWAY*VSIZE2] SSE;
67 #endif
68
69Q BrF = Vr;
70V BtF = Vt; BpF = Vp;
71 #ifndef SHT_AXISYM
72 if (shtns->fftc_mode >= 0) {
73 if (shtns->fftc_mode == 0) { // in-place
74Q fftw_execute_dft(shtns->fftc,(cplx*)BrF, (cplx*)BrF);
75V fftw_execute_dft(shtns->fftc,(cplx*)BtF, (cplx*)BtF);
76V fftw_execute_dft(shtns->fftc,(cplx*)BpF, (cplx*)BpF);
77 } else { // alloc memory for the transpose FFT
78 unsigned long nv = shtns->nspat;
79QX BrF = (double*) VMALLOC( nv * sizeof(double) );
80VX BtF = (double*) VMALLOC( 2*nv * sizeof(double) );
81VX BpF = BtF + nv;
823 BrF = (double*) VMALLOC( 3*nv * sizeof(double) );
833 BtF = BrF + nv; BpF = BtF + nv;
84Q fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, ((double*)BrF)+1, ((double*)BrF));
85V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, ((double*)BtF)+1, ((double*)BtF));
86V fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, ((double*)BpF)+1, ((double*)BpF));
87 }
88 }
89 imlim = MTR;
90 #ifdef SHT_VAR_LTR
91 if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
92 #endif
93
94 // ACCESS PATTERN
95 k_inc = shtns->k_stride_a;
96 m_inc = shtns->m_stride_a;
97 #endif
98
99 nk = NLAT_2; // copy NLAT_2 to a local variable for faster access (inner loop limit)
100 wg = shtns->wg; ct = shtns->ct; st = shtns->st;
101 #if _GCC_VEC_
102 nk = ((unsigned) nk+(VSIZE2-1))/VSIZE2;
103 #endif
104V l_2 = shtns->l_2;
105 alm = shtns->blm;
106V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
107V double an = BtF[k*k_inc]; double bn = BtF[k*k_inc +1];
108V double bs = BtF[(NLAT-2-k)*k_inc]; double as = BtF[(NLAT-2-k)*k_inc +1];
109V ter[k] = an+as; tor[k] = an-as;
110V ter[k+1] = bn+bs; tor[k+1] = bn-bs;
111V k+=2;
112V } while(k < nk*VSIZE2);
113V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
114V double an = BpF[k*k_inc]; double bn = BpF[k*k_inc +1];
115V double bs = BpF[(NLAT-2-k)*k_inc]; double as = BpF[(NLAT-2-k)*k_inc +1];
116V per[k] = an+as; por[k] = an-as;
117V per[k+1] = bn+bs; por[k+1] = bn-bs;
118V k+=2;
119V } while(k < nk*VSIZE2);
120Q double r0a = 0.0; double r0b = 0.0;
121Q k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
122Q double an = BrF[k*k_inc]; double bn = BrF[k*k_inc +1];
123Q double bs = BrF[(NLAT-2-k)*k_inc]; double as = BrF[(NLAT-2-k)*k_inc +1];
124Q rer[k] = an+as; ror[k] = an-as;
125Q rer[k+1] = bn+bs; ror[k+1] = bn-bs;
126Q r0a += (an+as)*wg[k]; r0b += (bn+bs)*wg[k+1];
127Q k+=2;
128Q } while(k < nk*VSIZE2);
129 for (k=nk*VSIZE2; k<(nk-1+NWAY)*VSIZE2; ++k) {
130Q rer[k] = 0.0; ror[k] = 0.0;
131V ter[k] = 0.0; tor[k] = 0.0;
132V per[k] = 0.0; por[k] = 0.0;
133 }
134Q Qlm[0] = (r0a+r0b) * alm[0]; // l=0 is done.
135V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform.
136 k = 0;
137 for (l=0;l<llim;++l) {
138Q qq[l] = vall(0.0);
139V ss[l] = vall(0.0); tt[l] = vall(0.0);
140 }
141 do {
142 al = alm;
143 rnd cost[NWAY], y0[NWAY], y1[NWAY];
144V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
145Q rnd rerk[NWAY], rork[NWAY]; // help the compiler to cache into registers.
146V rnd terk[NWAY], tork[NWAY], perk[NWAY], pork[NWAY];
147 for (int j=0; j<NWAY; ++j) {
148 cost[j] = vread(ct, k+j);
149 y0[j] = vall(al[0]) * vread(wg, k+j); // weight of Gauss quadrature appears here
150V dy0[j] = vall(0.0);
151V sint[j] = -vread(st, k+j);
152 y1[j] = (vall(al[1])*y0[j]) * cost[j];
153V dy1[j] = (vall(al[1])*y0[j]) * sint[j];
154Q rerk[j] = vread(rer, k+j); rork[j] = vread(ror, k+j); // cache into registers.
155V terk[j] = vread(ter, k+j); tork[j] = vread(tor, k+j);
156V perk[j] = vread(per, k+j); pork[j] = vread(por, k+j);
157 }
158 al+=2; l=1;
159 while(l<llim) {
160 for (int j=0; j<NWAY; ++j) {
161V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
162 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
163 }
164 for (int j=0; j<NWAY; ++j) {
165Q qq[l-1] += y1[j] * rork[j];
166V ss[l-1] += dy1[j] * terk[j];
167V tt[l-1] -= dy1[j] * perk[j];
168 }
169 for (int j=0; j<NWAY; ++j) {
170V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
171 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
172 }
173 for (int j=0; j<NWAY; ++j) {
174Q qq[l] += y0[j] * rerk[j];
175V ss[l] += dy0[j] * tork[j];
176V tt[l] -= dy0[j] * pork[j];
177 }
178 al+=4; l+=2;
179 }
180 if (l==llim) {
181 for (int j=0; j<NWAY; ++j) {
182Q qq[l-1] += y1[j] * rork[j];
183V ss[l-1] += dy1[j] * terk[j];
184V tt[l-1] -= dy1[j] * perk[j];
185 }
186 }
187 k+=NWAY;
188 } while (k < nk); // limit: k=nk-1 => k=nk-1+NWAY is never read.
189 for (l=1; l<=llim; ++l) {
190 #if _GCC_VEC_
191Q ((v2d*)Qlm)[l] = v2d_reduce(qq[l-1], vall(0));
192V ((v2d*)Slm)[l] = v2d_reduce(ss[l-1], vall(0)) * vdup(l_2[l]);
193V ((v2d*)Tlm)[l] = v2d_reduce(tt[l-1], vall(0)) * vdup(l_2[l]);
194 #else
195Q Qlm[l] = qq[l-1];
196V Slm[l] = ss[l-1]*l_2[l]; Tlm[l] = tt[l-1]*l_2[l];
197 #endif
198 }
199 #ifdef SHT_VAR_LTR
200 for (l=llim+1; l<= LMAX; ++l) {
201Q Qlm[l] = 0.0;
202V Slm[l] = 0.0; Tlm[l] = 0.0;
203 }
204 #endif
205
206 #ifndef SHT_AXISYM
207 for (k=nk*VSIZE2; k<(nk-1+NWAY)*VSIZE2; ++k) { // never written, so this is now done for all m's (real parts already zero)
208Q rei[k] = 0.0; roi[k] = 0.0;
209V tei[k] = 0.0; toi[k] = 0.0;
210V pei[k] = 0.0; poi[k] = 0.0;
211 }
212 for (im=1;im<=imlim;++im) {
213 m = im*MRES;
214 l = shtns->tm[im] / VSIZE2;
215 //alm = shtns->blm[im];
216 alm += 2*(LMAX-m+MRES);
217Q k = ((l*VSIZE2)>>1)*2; // k must be even here.
218Q do { // compute symmetric and antisymmetric parts, and reorganize data.
219Q double an, bn, ani, bni, bs, as, bsi, asi, t;
2203 double sina = st[k]; double sinb = st[k+1];
221Q ani = BrF[im*m_inc + k*k_inc]; bni = BrF[im*m_inc + k*k_inc +1]; // north
222Q an = BrF[(NPHI-im)*m_inc + k*k_inc]; bn = BrF[(NPHI-im)*m_inc + k*k_inc +1];
223Q t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
2243 an *= sina; ani*= sina; bn *= sinb; bni *= sinb;
225Q bsi = BrF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BrF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
226Q bs = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
227Q t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
2283 as *= sina; asi*= sina; bs *= sinb; bsi *= sinb;
229Q rer[k] = an+as; rei[k] = ani+asi; rer[k+1] = bn+bs; rei[k+1] = bni+bsi;
230Q ror[k] = an-as; roi[k] = ani-asi; ror[k+1] = bn-bs; roi[k+1] = bni-bsi;
231Q k+=2;
232Q } while (k<nk*VSIZE2);
233V k = ((l*VSIZE2)>>1)*2; // k must be even here.
234V do { // compute symmetric and antisymmetric parts, and reorganize data.
235V double an, bn, ani, bni, bs, as, bsi, asi, t;
236V ani = BtF[im*m_inc + k*k_inc]; bni = BtF[im*m_inc + k*k_inc +1]; // north
237V an = BtF[(NPHI-im)*m_inc + k*k_inc]; bn = BtF[(NPHI-im)*m_inc + k*k_inc +1];
238V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
239V bsi = BtF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BtF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
240V bs = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
241V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
242V ter[k] = an+as; tei[k] = ani+asi; ter[k+1] = bn+bs; tei[k+1] = bni+bsi;
243V tor[k] = an-as; toi[k] = ani-asi; tor[k+1] = bn-bs; toi[k+1] = bni-bsi;
244V k+=2;
245V } while (k<nk*VSIZE2);
246V k = ((l*VSIZE2)>>1)*2; // k must be even here.
247V do { // compute symmetric and antisymmetric parts, and reorganize data.
248V double an, bn, ani, bni, bs, as, bsi, asi, t;
249V ani = BpF[im*m_inc + k*k_inc]; bni = BpF[im*m_inc + k*k_inc +1]; // north
250V an = BpF[(NPHI-im)*m_inc + k*k_inc]; bn = BpF[(NPHI-im)*m_inc + k*k_inc +1];
251V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
252V bsi = BpF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BpF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
253V bs = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
254V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
255V per[k] = an+as; pei[k] = ani+asi; per[k+1] = bn+bs; pei[k+1] = bni+bsi;
256V por[k] = an-as; poi[k] = ani-asi; por[k+1] = bn-bs; poi[k+1] = bni-bsi;
257V k+=2;
258V } while (k<nk*VSIZE2);
259V m_1 = 1.0/m;
260 k=l;
261 #if _GCC_VEC_
262Q rnd* q = qq;
263V rnd* s = ss; rnd* t = tt;
264 #else
265 l = LiM(shtns, m, im);
266Q double* q = (double *) &Qlm[l];
267V double* s = (double *) &Slm[l];
268V double* t = (double *) &Tlm[l];
269 #endif
270 for (l=llim-m; l>=0; l--) {
271Q q[0] = vall(0.0); q[1] = vall(0.0); q+=2;
272V s[0] = vall(0.0); s[1] = vall(0.0); s+=2;
273V t[0] = vall(0.0); t[1] = vall(0.0); t+=2;
274 }
275 do {
276 #if _GCC_VEC_
277Q rnd* q = qq;
278V rnd* s = ss; rnd* t = tt;
279 #else
280 l = LiM(shtns, m, im);
281Q double* q = (double *) &Qlm[l];
282V double* s = (double *) &Slm[l];
283V double* t = (double *) &Tlm[l];
284 #endif
285 al = alm;
286 rnd cost[NWAY], y0[NWAY], y1[NWAY];
287V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
288Q rnd rerk[NWAY], reik[NWAY], rork[NWAY], roik[NWAY]; // help the compiler to cache into registers.
289V rnd terk[NWAY], teik[NWAY], tork[NWAY], toik[NWAY];
290V rnd perk[NWAY], peik[NWAY], pork[NWAY], poik[NWAY];
291 for (int j=0; j<NWAY; ++j) {
292 cost[j] = vread(st, k+j);
293 y0[j] = vall(0.5);
294V st2[j] = cost[j]*cost[j]*vall(-m_1);
295V y0[j] *= vall(m); // for the vector transform, compute ylm*m/sint
296 }
297Q l=m;
298V l=m-1;
299 long int ny = 0; // exponent to extend double precision range.
300 if ((int)llim <= SHT_L_RESCALE_FLY) {
301 do { // sin(theta)^m
302 if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
303 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
304 } while(l >>= 1);
305 } else {
306 long int nsint = 0;
307 do { // sin(theta)^m (use rescaling to avoid underflow)
308 if (l&1) {
309 for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
310 ny += nsint;
311 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
312 ny--;
313 for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
314 }
315 }
316 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
317 nsint += nsint;
318 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
319 nsint--;
320 for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
321 }
322 } while(l >>= 1);
323 }
324 for (int j=0; j<NWAY; ++j) {
325 y0[j] *= vall(al[0]);
326 cost[j] = vread(ct, k+j);
327V dy0[j] = cost[j]*y0[j];
328 y1[j] = (vall(al[1])*y0[j]) *cost[j];
329V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]);
330 }
331 l=m; al+=2;
332 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
333 for (int j=0; j<NWAY; ++j) {
334V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
335 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
336 }
337 for (int j=0; j<NWAY; ++j) {
338V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
339 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
340 }
341 l+=2; al+=4;
342 if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
343 ++ny;
344 for (int j=0; j<NWAY; ++j) {
345 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
346V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
347 }
348 }
349 }
350 if (ny == 0) {
351Q q+=2*(l-m);
352V s+=2*(l-m); t+=2*(l-m);
353 for (int j=0; j<NWAY; ++j) { // prefetch
354 y0[j] *= vread(wg, k+j); y1[j] *= vread(wg, k+j); // weight appears here (must be after the previous accuracy loop).
355V dy0[j] *= vread(wg, k+j); dy1[j] *= vread(wg, k+j);
356Q rerk[j] = vread( rer, k+j); reik[j] = vread( rei, k+j); rork[j] = vread( ror, k+j); roik[j] = vread( roi, k+j);
357V terk[j] = vread( ter, k+j); teik[j] = vread( tei, k+j); tork[j] = vread( tor, k+j); toik[j] = vread( toi, k+j);
358V perk[j] = vread( per, k+j); peik[j] = vread( pei, k+j); pork[j] = vread( por, k+j); poik[j] = vread( poi, k+j);
359 }
360 while (l<llim) { // compute even and odd parts
361Q for (int j=0; j<NWAY; ++j) q[0] += y0[j] * rerk[j]; // real even
362Q for (int j=0; j<NWAY; ++j) q[1] += y0[j] * reik[j]; // imag even
363V for (int j=0; j<NWAY; ++j) s[0] += dy0[j] * tork[j] + y0[j] * peik[j];
364V for (int j=0; j<NWAY; ++j) s[1] += dy0[j] * toik[j] - y0[j] * perk[j];
365V for (int j=0; j<NWAY; ++j) t[0] -= dy0[j] * pork[j] - y0[j] * teik[j];
366V for (int j=0; j<NWAY; ++j) t[1] -= dy0[j] * poik[j] + y0[j] * terk[j];
367 for (int j=0; j<NWAY; ++j) {
368V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
369 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
370 }
371Q for (int j=0; j<NWAY; ++j) q[2] += y1[j] * rork[j]; // real odd
372Q for (int j=0; j<NWAY; ++j) q[3] += y1[j] * roik[j]; // imag odd
373V for (int j=0; j<NWAY; ++j) s[2] += dy1[j] * terk[j] + y1[j] * poik[j];
374V for (int j=0; j<NWAY; ++j) s[3] += dy1[j] * teik[j] - y1[j] * pork[j];
375V for (int j=0; j<NWAY; ++j) t[2] -= dy1[j] * perk[j] - y1[j] * toik[j];
376V for (int j=0; j<NWAY; ++j) t[3] -= dy1[j] * peik[j] + y1[j] * tork[j];
377Q q+=4;
378V s+=4; t+=4;
379 for (int j=0; j<NWAY; ++j) {
380V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
381 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
382 }
383 l+=2; al+=4;
384 }
385 if (l==llim) {
386Q for (int j=0; j<NWAY; ++j) q[0] += y0[j] * rerk[j]; // real even
387Q for (int j=0; j<NWAY; ++j) q[1] += y0[j] * reik[j]; // imag even
388V for (int j=0; j<NWAY; ++j) s[0] += dy0[j] * tork[j] + y0[j] * peik[j];
389V for (int j=0; j<NWAY; ++j) s[1] += dy0[j] * toik[j] - y0[j] * perk[j];
390V for (int j=0; j<NWAY; ++j) t[0] -= dy0[j] * pork[j] - y0[j] * teik[j];
391V for (int j=0; j<NWAY; ++j) t[1] -= dy0[j] * poik[j] + y0[j] * terk[j];
392 }
393 }
394 k+=NWAY;
395 } while (k < nk); // limit: k=nk-1 => k=nk-1+NWAY is never read.
396 l = LiM(shtns, m, im);
397Q v2d *Ql = (v2d*) &Qlm[l];
398V v2d *Sl = (v2d*) &Slm[l];
399V v2d *Tl = (v2d*) &Tlm[l];
400 #if _GCC_VEC_
401 for (l=0; l<=llim-m; ++l) {
402QX Ql[l] = v2d_reduce(qq[2*l], qq[2*l+1]);
4033 Ql[l] = v2d_reduce(qq[2*l], qq[2*l+1]) * vdup(m_1);
404V Sl[l] = v2d_reduce(ss[2*l], ss[2*l+1]) * vdup(l_2[l+m]);
405V Tl[l] = v2d_reduce(tt[2*l], tt[2*l+1]) * vdup(l_2[l+m]);
406 }
407 #else
408V for (l=0; l<=llim-m; ++l) {
4093 Ql[l] *= m_1;
410V Sl[l] *= l_2[l+m];
411V Tl[l] *= l_2[l+m];
412V }
413 #endif
414 #ifdef SHT_VAR_LTR
415 for (l=llim+1-m; l<=LMAX-m; ++l) {
416Q Ql[l] = vdup(0.0);
417V Sl[l] = vdup(0.0); Tl[l] = vdup(0.0);
418 }
419 #endif
420 }
421 #ifdef SHT_VAR_LTR
422 if (imlim < MMAX) {
423 im = imlim+1;
424 l = LiM(shtns, im*MRES, im);
425 do {
426Q ((v2d*)Qlm)[l] = vdup(0.0);
427V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
428 } while(++l < shtns->nlm);
429 }
430 #endif
431
432 if (shtns->fftc_mode > 0) { // free memory
433Q VFREE(BrF);
434VX VFREE(BtF); // this frees also BpF.
435 }
436 #endif
437
438 }
439
440
441 #ifndef SHT_AXISYM
442
443QX static void GEN3(spat_to_SH_m_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Vr, cplx *Qlm, long int llim) {
444VX static void GEN3(spat_to_SHsphtor_m_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Vt, cplx *Vp, cplx *Slm, cplx *Tlm, long int llim) {
4453 static void GEN3(spat_to_SHqst_m_fly,NWAY,SUFFIX)(shtns_cfg shtns, int im, cplx *Vr, cplx *Vt, cplx *Vp, cplx *Qlm, cplx *Slm, cplx *Tlm, long int llim) {
446
447 double *alm, *al;
448 double *wg, *ct, *st;
449V double *l_2;
450 long int nk, k, l,m;
451 double alm0_rescale;
452V double m_1;
453 #if _GCC_VEC_
454Q rnd qq[2*llim];
455V rnd ss[2*llim];
456V rnd tt[2*llim];
457 #else
458Q double qq[llim];
459V double ss[llim];
460V double tt[llim];
461 #endif
462
463Q double rer[NLAT_2 + NWAY*VSIZE2] SSE;
464Q double ror[NLAT_2 + NWAY*VSIZE2] SSE;
465V double ter[NLAT_2 + NWAY*VSIZE2] SSE;
466V double tor[NLAT_2 + NWAY*VSIZE2] SSE;
467V double per[NLAT_2 + NWAY*VSIZE2] SSE;
468V double por[NLAT_2 + NWAY*VSIZE2] SSE;
469Q double rei[NLAT_2 + NWAY*VSIZE2] SSE;
470Q double roi[NLAT_2 + NWAY*VSIZE2] SSE;
471V double tei[NLAT_2 + NWAY*VSIZE2] SSE;
472V double toi[NLAT_2 + NWAY*VSIZE2] SSE;
473V double pei[NLAT_2 + NWAY*VSIZE2] SSE;
474V double poi[NLAT_2 + NWAY*VSIZE2] SSE;
475
476 nk = NLAT_2; // copy NLAT_2 to a local variable for faster access (inner loop limit)
477 #if _GCC_VEC_
478 nk = ((unsigned) nk+(VSIZE2-1))/VSIZE2;
479 #endif
480 wg = shtns->wg; ct = shtns->ct; st = shtns->st;
481V l_2 = shtns->l_2;
482
483 for (k=nk*VSIZE2; k<(nk-1+NWAY)*VSIZE2; ++k) {
484Q rer[k] = 0.0; ror[k] = 0.0;
485V ter[k] = 0.0; tor[k] = 0.0;
486V per[k] = 0.0; por[k] = 0.0;
487 }
488
489 if (im == 0) { // im=0
490 alm = shtns->blm;
491V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
492V double n = creal(Vt[k]); double s = creal(Vt[NLAT-1-k]);
493V ter[k] = n+s; tor[k] = n-s;
494V } while(++k < nk*VSIZE2);
495V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
496V double n = creal(Vp[k]); double s = creal(Vp[NLAT-1-k]);
497V per[k] = n+s; por[k] = n-s;
498V } while(++k < nk*VSIZE2);
499Q double r0 = 0.0;
500Q k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
501Q double n = creal(Vr[k]); double s = creal(Vr[NLAT-1-k]);
502Q rer[k] = n+s; ror[k] = n-s;
503Q r0 += (n+s)*wg[k];
504Q } while(++k < nk*VSIZE2);
505 alm0_rescale = alm[0] * shtns->nphi; // alm[0] takes into account the fftw normalization, *nphi cancels it
506V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform.
507Q Qlm[0] = r0 * alm0_rescale; // l=0 is done.
508 k = 0;
509 for (l=0;l<llim;++l) {
510Q qq[l] = vall(0.0);
511V ss[l] = vall(0.0); tt[l] = vall(0.0);
512 }
513 do {
514 al = alm;
515 rnd cost[NWAY], y0[NWAY], y1[NWAY];
516V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
517Q rnd rerk[NWAY], rork[NWAY]; // help the compiler to cache into registers.
518V rnd terk[NWAY], tork[NWAY], perk[NWAY], pork[NWAY];
519 for (int j=0; j<NWAY; ++j) {
520 cost[j] = vread(ct, k+j);
521 y0[j] = vall(alm0_rescale) * vread(wg, k+j); // weight of Gauss quadrature appears here
522V dy0[j] = vall(0.0);
523V sint[j] = -vread(st, k+j);
524 y1[j] = (vall(al[1])*y0[j]) * cost[j];
525V dy1[j] = (vall(al[1])*y0[j]) * sint[j];
526Q rerk[j] = vread(rer, k+j); rork[j] = vread(ror, k+j); // cache into registers.
527V terk[j] = vread(ter, k+j); tork[j] = vread(tor, k+j);
528V perk[j] = vread(per, k+j); pork[j] = vread(por, k+j);
529 }
530 al+=2; l=1;
531 while(l<llim) {
532 for (int j=0; j<NWAY; ++j) {
533V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
534 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
535 }
536 for (int j=0; j<NWAY; ++j) {
537Q qq[l-1] += y1[j] * rork[j];
538V ss[l-1] += dy1[j] * terk[j];
539V tt[l-1] -= dy1[j] * perk[j];
540 }
541 for (int j=0; j<NWAY; ++j) {
542V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
543 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
544 }
545 for (int j=0; j<NWAY; ++j) {
546Q qq[l] += y0[j] * rerk[j];
547V ss[l] += dy0[j] * tork[j];
548V tt[l] -= dy0[j] * pork[j];
549 }
550 al+=4; l+=2;
551 }
552 if (l==llim) {
553 for (int j=0; j<NWAY; ++j) {
554Q qq[l-1] += y1[j] * rork[j];
555V ss[l-1] += dy1[j] * terk[j];
556V tt[l-1] -= dy1[j] * perk[j];
557 }
558 }
559 k+=NWAY;
560 } while (k < nk);
561 for (l=1; l<=llim; ++l) {
562 #if _GCC_VEC_
563Q ((v2d*)Qlm)[l] = v2d_reduce(qq[l-1], vall(0));
564V ((v2d*)Slm)[l] = v2d_reduce(ss[l-1], vall(0)) * vdup(l_2[l]);
565V ((v2d*)Tlm)[l] = v2d_reduce(tt[l-1], vall(0)) * vdup(l_2[l]);
566 #else
567Q Qlm[l] = qq[l-1];
568V Slm[l] = ss[l-1]*l_2[l]; Tlm[l] = tt[l-1]*l_2[l];
569 #endif
570 }
571 #ifdef SHT_VAR_LTR
572 for (l=llim+1; l<= LMAX; ++l) {
573Q ((v2d*)Qlm)[l] = vdup(0.0);
574V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
575 }
576 #endif
577
578 } else { // im > 0
579
580 for (k=nk*VSIZE2; k<(nk-1+NWAY)*VSIZE2; ++k) {
581Q rei[k] = 0.0; roi[k] = 0.0;
582V tei[k] = 0.0; toi[k] = 0.0;
583V pei[k] = 0.0; poi[k] = 0.0;
584 }
585
586 m = im*MRES;
587 l = shtns->tm[im] / VSIZE2;
588 alm = shtns->blm + im*(2*LMAX -m+MRES);
589Q k = ((l*VSIZE2)>>1)*2; // k must be even here.
590Q do { // compute symmetric and antisymmetric parts.
5913 double sink = st[k];
592Q cplx n = Vr[k]; cplx s = Vr[NLAT-1-k];
5933 n *= sink; s *= sink;
594Q rer[k] = creal(n+s); rei[k] = cimag(n+s);
595Q ror[k] = creal(n-s); roi[k] = cimag(n-s);
596Q } while (++k<nk*VSIZE2);
597V k = ((l*VSIZE2)>>1)*2; // k must be even here.
598V do { // compute symmetric and antisymmetric parts.
599V cplx n = Vt[k]; cplx s = Vt[NLAT-1-k];
600V ter[k] = creal(n+s); tei[k] = cimag(n+s);
601V tor[k] = creal(n-s); toi[k] = cimag(n-s);
602V } while (++k<nk*VSIZE2);
603V k = ((l*VSIZE2)>>1)*2; // k must be even here.
604V do { // compute symmetric and antisymmetric parts.
605V cplx n = Vp[k]; cplx s = Vp[NLAT-1-k];
606V per[k] = creal(n+s); pei[k] = cimag(n+s);
607V por[k] = creal(n-s); poi[k] = cimag(n-s);
608V } while (++k<nk*VSIZE2);
609
610V m_1 = 1.0/m;
611 k=l;
612 #if _GCC_VEC_
613Q rnd* q = qq;
614V rnd* s = ss; rnd* t = tt;
615 #else
616Q double* q = (double *) Qlm;
617V double* s = (double *) Slm;
618V double* t = (double *) Tlm;
619 #endif
620 for (l=llim-m; l>=0; l--) {
621Q q[0] = vall(0.0); q[1] = vall(0.0); q+=2;
622V s[0] = vall(0.0); s[1] = vall(0.0); s+=2;
623V t[0] = vall(0.0); t[1] = vall(0.0); t+=2;
624 }
625 alm0_rescale = alm[0] * (shtns->nphi*2);
626 do {
627 #if _GCC_VEC_
628Q rnd* q = qq;
629V rnd* s = ss; rnd* t = tt;
630 #else
631Q double* q = (double *) Qlm;
632V double* s = (double *) Slm;
633V double* t = (double *) Tlm;
634 #endif
635 al = alm;
636 rnd cost[NWAY], y0[NWAY], y1[NWAY];
637V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
638Q rnd rerk[NWAY], reik[NWAY], rork[NWAY], roik[NWAY]; // help the compiler to cache into registers.
639V rnd terk[NWAY], teik[NWAY], tork[NWAY], toik[NWAY];
640V rnd perk[NWAY], peik[NWAY], pork[NWAY], poik[NWAY];
641 for (int j=0; j<NWAY; ++j) {
642 cost[j] = vread(st, k+j);
643 y0[j] = vall(0.5);
644V st2[j] = cost[j]*cost[j]*vall(-m_1);
645V y0[j] *= vall(m); // for the vector transform, compute ylm*m/sint
646 }
647Q l=m;
648V l=m-1;
649 long int ny = 0; // exponent to extend double precision range.
650 if ((int)llim <= SHT_L_RESCALE_FLY) {
651 do { // sin(theta)^m
652 if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
653 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
654 } while(l >>= 1);
655 } else {
656 long int nsint = 0;
657 do { // sin(theta)^m (use rescaling to avoid underflow)
658 if (l&1) {
659 for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
660 ny += nsint;
661 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
662 ny--;
663 for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
664 }
665 }
666 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
667 nsint += nsint;
668 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
669 nsint--;
670 for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
671 }
672 } while(l >>= 1);
673 }
674 for (int j=0; j<NWAY; ++j) {
675 y0[j] *= vall(alm0_rescale);
676 cost[j] = vread(ct, k+j);
677V dy0[j] = cost[j]*y0[j];
678 y1[j] = (vall(al[1])*y0[j]) *cost[j];
679V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]);
680 }
681 l=m; al+=2;
682 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
683 for (int j=0; j<NWAY; ++j) {
684V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
685 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
686 }
687 for (int j=0; j<NWAY; ++j) {
688V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
689 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
690 }
691 l+=2; al+=4;
692 if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
693 ++ny;
694 for (int j=0; j<NWAY; ++j) {
695 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
696V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
697 }
698 }
699 }
700 if (ny == 0) {
701Q q+=2*(l-m);
702V s+=2*(l-m); t+=2*(l-m);
703 for (int j=0; j<NWAY; ++j) { // prefetch
704 y0[j] *= vread(wg, k+j); y1[j] *= vread(wg, k+j); // weight appears here (must be after the previous accuracy loop).
705V dy0[j] *= vread(wg, k+j); dy1[j] *= vread(wg, k+j);
706Q rerk[j] = vread( rer, k+j); reik[j] = vread( rei, k+j); rork[j] = vread( ror, k+j); roik[j] = vread( roi, k+j);
707V terk[j] = vread( ter, k+j); teik[j] = vread( tei, k+j); tork[j] = vread( tor, k+j); toik[j] = vread( toi, k+j);
708V perk[j] = vread( per, k+j); peik[j] = vread( pei, k+j); pork[j] = vread( por, k+j); poik[j] = vread( poi, k+j);
709 }
710 while (l<llim) { // compute even and odd parts
711Q for (int j=0; j<NWAY; ++j) q[0] += y0[j] * rerk[j]; // real even
712Q for (int j=0; j<NWAY; ++j) q[1] += y0[j] * reik[j]; // imag even
713V for (int j=0; j<NWAY; ++j) s[0] += dy0[j] * tork[j] + y0[j] * peik[j];
714V for (int j=0; j<NWAY; ++j) s[1] += dy0[j] * toik[j] - y0[j] * perk[j];
715V for (int j=0; j<NWAY; ++j) t[0] -= dy0[j] * pork[j] - y0[j] * teik[j];
716V for (int j=0; j<NWAY; ++j) t[1] -= dy0[j] * poik[j] + y0[j] * terk[j];
717 for (int j=0; j<NWAY; ++j) {
718V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
719 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
720 }
721Q for (int j=0; j<NWAY; ++j) q[2] += y1[j] * rork[j]; // real odd
722Q for (int j=0; j<NWAY; ++j) q[3] += y1[j] * roik[j]; // imag odd
723V for (int j=0; j<NWAY; ++j) s[2] += dy1[j] * terk[j] + y1[j] * poik[j];
724V for (int j=0; j<NWAY; ++j) s[3] += dy1[j] * teik[j] - y1[j] * pork[j];
725V for (int j=0; j<NWAY; ++j) t[2] -= dy1[j] * perk[j] - y1[j] * toik[j];
726V for (int j=0; j<NWAY; ++j) t[3] -= dy1[j] * peik[j] + y1[j] * tork[j];
727Q q+=4;
728V s+=4; t+=4;
729 for (int j=0; j<NWAY; ++j) {
730V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
731 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
732 }
733 l+=2; al+=4;
734 }
735 if (l==llim) {
736Q for (int j=0; j<NWAY; ++j) q[0] += y0[j] * rerk[j]; // real even
737Q for (int j=0; j<NWAY; ++j) q[1] += y0[j] * reik[j]; // imag even
738V for (int j=0; j<NWAY; ++j) s[0] += dy0[j] * tork[j] + y0[j] * peik[j];
739V for (int j=0; j<NWAY; ++j) s[1] += dy0[j] * toik[j] - y0[j] * perk[j];
740V for (int j=0; j<NWAY; ++j) t[0] -= dy0[j] * pork[j] - y0[j] * teik[j];
741V for (int j=0; j<NWAY; ++j) t[1] -= dy0[j] * poik[j] + y0[j] * terk[j];
742 }
743 }
744 k+=NWAY;
745 } while (k < nk);
746 #if _GCC_VEC_
747 for (l=0; l<=llim-m; ++l) {
748QX ((v2d*)Qlm)[l] = v2d_reduce(qq[2*l], qq[2*l+1]);
7493 ((v2d*)Qlm)[l] = v2d_reduce(qq[2*l], qq[2*l+1]) * vdup(m_1);
750V ((v2d*)Slm)[l] = v2d_reduce(ss[2*l], ss[2*l+1]) * vdup(l_2[l+m]);
751V ((v2d*)Tlm)[l] = v2d_reduce(tt[2*l], tt[2*l+1]) * vdup(l_2[l+m]);
752 }
753 #else
754V for (l=0; l<=llim-m; ++l) {
7553 Qlm[l] *= m_1;
756V Slm[l] *= l_2[l+m];
757V Tlm[l] *= l_2[l+m];
758V }
759 #endif
760 #ifdef SHT_VAR_LTR
761 for (l=llim+1-m; l<=LMAX-m; ++l) {
762Q ((v2d*)Qlm)[l] = vdup(0.0);
763V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
764 }
765 #endif
766 }
767
768 }
769
770 #endif
Note: See TracBrowser for help on using the repository browser.