source: CIVL/examples/omp/shtns/SHT/mic_spat_to_SH.gen.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 18.1 KB
RevLine 
[2131108]1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18# This file is meta-code for SHT.c (spherical harmonic transform).
19# it is intended for "make" to generate C code for 3 similar SHT functions,
20# (namely spat_to_SH [Q tag]), spat_to_SHsphtor [V tag], spat_to_SH3 [both Q&V tags])
21# from one generic function + tags.
22# Basically, there are tags at the beginning of lines (Q,V) that are information
23# to keep or remove the line depending on the function to build. (Q for scalar, V for vector, # for comment)
24#
25//////////////////////////////////////////////////
26
27 static
28QX void GEN3(_an1,NWAY,SUFFIX)(shtns_cfg shtns, double *BrF, cplx *Qlm, const long int llim, const int imlim)
29VX void GEN3(_an2,NWAY,SUFFIX)(shtns_cfg shtns, double *BtF, double *BpF, cplx *Slm, cplx *Tlm, const long int llim, const int imlim)
303 void GEN3(_an3,NWAY,SUFFIX)(shtns_cfg shtns, double *BrF, double *BtF, double *BpF, cplx *Qlm, cplx *Slm, cplx *Tlm, const long int llim, const int imlim)
31 {
32 #define NW (NWAY*2)
33
34 double *alm, *al;
35 double *wg, *ct, *st;
36V double *l_2;
37 long int nk, k, l,m;
38 unsigned m0, mstep;
39 int k_inc, m_inc;
40 #ifndef SHT_AXISYM
41 unsigned im;
42V double m_1;
43 #endif
44Q v2d qq[llim];
45V v2d ss[llim];
46V v2d tt[llim];
47
48Q double rer[NLAT_2 + NW*VSIZE2] SSE;
49Q double ror[NLAT_2 + NW*VSIZE2] SSE;
50V double ter[NLAT_2 + NW*VSIZE2] SSE;
51V double tor[NLAT_2 + NW*VSIZE2] SSE;
52V double per[NLAT_2 + NW*VSIZE2] SSE;
53V double por[NLAT_2 + NW*VSIZE2] SSE;
54 #ifndef SHT_AXISYM
55Q double rei[NLAT_2 + NW*VSIZE2] SSE;
56Q double roi[NLAT_2 + NW*VSIZE2] SSE;
57V double tei[NLAT_2 + NW*VSIZE2] SSE;
58V double toi[NLAT_2 + NW*VSIZE2] SSE;
59V double pei[NLAT_2 + NW*VSIZE2] SSE;
60V double poi[NLAT_2 + NW*VSIZE2] SSE;
61 #endif
62
63 nk = NLAT_2; // copy NLAT_2 to a local variable for faster access (inner loop limit)
64 #if _GCC_VEC_
65 nk = ((unsigned) nk+(VSIZE2-1))/VSIZE2;
66 #endif
67 wg = shtns->wg; ct = shtns->ct; st = shtns->st;
68V l_2 = shtns->l_2;
69 for (k=nk*VSIZE2; k<(nk-1+NW)*VSIZE2; ++k) { // never written, so this is now done for all m's
70Q rer[k] = 0.0; ror[k] = 0.0;
71V ter[k] = 0.0; tor[k] = 0.0;
72V per[k] = 0.0; por[k] = 0.0;
73 #ifndef SHT_AXISYM
74Q rei[k] = 0.0; roi[k] = 0.0;
75V tei[k] = 0.0; toi[k] = 0.0;
76V pei[k] = 0.0; poi[k] = 0.0;
77 #endif
78 }
79
80 // ACCESS PATTERN
81 k_inc = shtns->k_stride_a; m_inc = shtns->m_stride_a;
82
83 #ifndef _OPENMP
84 m0 = 0; mstep = 1;
85 #else
86 m0 = omp_get_thread_num();
87 mstep = omp_get_num_threads();
88 if (m0 == 0)
89 #endif
90 { // im=0 : dzl.p = 0.0 and evrything is REAL
91 alm = shtns->blm;
92Q double r0 = 0.0;
93Q k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
94Q double an = BrF[k*k_inc]; double bn = BrF[k*k_inc +1];
95Q double bs = BrF[(NLAT-2-k)*k_inc]; double as = BrF[(NLAT-2-k)*k_inc +1];
96Q rer[k] = an+as; ror[k] = an-as;
97Q rer[k+1] = bn+bs; ror[k+1] = bn-bs;
98Q r0 += (an+as)*wg[k] + (bn+bs)*wg[k+1];
99Q k+=2;
100Q } while(k < nk*VSIZE2);
101V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
102V double an = BtF[k*k_inc]; double bn = BtF[k*k_inc +1];
103V double bs = BtF[(NLAT-2-k)*k_inc]; double as = BtF[(NLAT-2-k)*k_inc +1];
104V ter[k] = an+as; tor[k] = an-as;
105V ter[k+1] = bn+bs; tor[k+1] = bn-bs;
106V k+=2;
107V } while(k < nk*VSIZE2);
108V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
109V double an = BpF[k*k_inc]; double bn = BpF[k*k_inc +1];
110V double bs = BpF[(NLAT-2-k)*k_inc]; double as = BpF[(NLAT-2-k)*k_inc +1];
111V per[k] = an+as; por[k] = an-as;
112V per[k+1] = bn+bs; por[k+1] = bn-bs;
113V k+=2;
114V } while(k < nk*VSIZE2);
115Q Qlm[0] = r0 * alm[0]; // l=0 is done.
116V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform.
117 k = 0;
118Q double* q_ = (double*) qq;
119V double* s_ = (double*) ss; double* t_ = (double*) tt;
120 for (l=0;l<llim;++l) {
121Q q_[l] = 0.0;
122V s_[l] = 0.0; t_[l] = 0.0;
123 }
124 do {
125 al = alm;
126 rnd cost[NW], y0[NW], y1[NW];
127V rnd sint[NW], dy0[NW], dy1[NW];
128Q rnd rerk[NW], rork[NW]; // help the compiler to cache into registers.
129V rnd terk[NW], tork[NW], perk[NW], pork[NW];
130 for (int j=0; j<NW; ++j) {
131 cost[j] = vread(ct, k+j);
132 y0[j] = vall(al[0]) * vread(wg, k+j); // weight of Gauss quadrature appears here
133V dy0[j] = vall(0.0);
134V sint[j] = -vread(st, k+j);
135 y1[j] = (vall(al[1])*y0[j]) * cost[j];
136V dy1[j] = (vall(al[1])*y0[j]) * sint[j];
137Q rerk[j] = vread(rer, k+j); rork[j] = vread(ror, k+j); // cache into registers.
138V terk[j] = vread(ter, k+j); tork[j] = vread(tor, k+j);
139V perk[j] = vread(per, k+j); pork[j] = vread(por, k+j);
140 }
141 al+=2; l=1;
142 while(l<llim) {
143 for (int j=0; j<NW; ++j) {
144V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
145 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
146 }
147Q rnd q = y1[0] * rork[0];
148V rnd s = dy1[0] * terk[0];
149V rnd t = dy1[0] * perk[0];
150 for (int j=1; j<NW; ++j) {
151Q q += y1[j] * rork[j];
152V s += dy1[j] * terk[j];
153V t += dy1[j] * perk[j];
154 }
155Q q_[l-1] += reduce_add(q);
156V s_[l-1] += reduce_add(s);
157V t_[l-1] -= reduce_add(t);
158 for (int j=0; j<NW; ++j) {
159V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
160 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
161 }
162Q q = y0[0] * rerk[0];
163V s = dy0[0] * tork[0];
164V t = dy0[0] * pork[0];
165 for (int j=1; j<NW; ++j) {
166Q q += y0[j] * rerk[j];
167V s += dy0[j] * tork[j];
168V t += dy0[j] * pork[j];
169 }
170Q q_[l] += reduce_add(q);
171V s_[l] += reduce_add(s);
172V t_[l] -= reduce_add(t);
173 al+=4; l+=2;
174 }
175 if (l==llim) {
176Q rnd q = y1[0] * rork[0];
177V rnd s = dy1[0] * terk[0];
178V rnd t = dy1[0] * perk[0];
179 for (int j=1; j<NW; ++j) {
180Q q += y1[j] * rork[j];
181V s += dy1[j] * terk[j];
182V t += dy1[j] * perk[j];
183 }
184Q q_[l-1] += reduce_add(q);
185V s_[l-1] += reduce_add(s);
186V t_[l-1] -= reduce_add(t);
187 }
188 k+=NW;
189 } while (k < nk);
190 for (l=1; l<=llim; ++l) {
191Q Qlm[l] = q_[l-1];
192V Slm[l] = s_[l-1]*l_2[l]; Tlm[l] = t_[l-1]*l_2[l];
193 }
194 #ifdef SHT_VAR_LTR
195 for (l=llim+1; l<= LMAX; ++l) {
196Q ((v2d*)Qlm)[l] = vdup(0.0);
197V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
198 }
199 #ifndef SHT_AXISYM
200 if (imlim <= MMAX) { // zero out m >= imlim
201 l = LiM(shtns, imlim*MRES, imlim);
202 do {
203Q ((v2d*)Qlm)[l] = vdup(0.0);
204V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
205 } while(++l < shtns->nlm);
206 }
207 #endif
208 #endif
209 m0=mstep;
210 }
211
212 #ifndef SHT_AXISYM
213 for (im=m0; im<imlim; im+=mstep) {
214 m = im*MRES;
215 l = shtns->tm[im] / VSIZE2;
216 alm = shtns->blm + im*(2*LMAX -m+MRES);
217Q k = ((l*VSIZE2)>>1)*2; // k must be even here.
218Q do { // compute symmetric and antisymmetric parts, and reorganize data.
219Q double an, bn, ani, bni, bs, as, bsi, asi, t;
2203 double sina = st[k]; double sinb = st[k+1];
221Q ani = BrF[im*m_inc + k*k_inc]; bni = BrF[im*m_inc + k*k_inc +1]; // north
222Q an = BrF[(NPHI-im)*m_inc + k*k_inc]; bn = BrF[(NPHI-im)*m_inc + k*k_inc +1];
223Q t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
2243 an *= sina; ani*= sina; bn *= sinb; bni *= sinb;
225Q bsi = BrF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BrF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
226Q bs = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
227Q t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
2283 as *= sina; asi*= sina; bs *= sinb; bsi *= sinb;
229Q rer[k] = an+as; rei[k] = ani+asi; rer[k+1] = bn+bs; rei[k+1] = bni+bsi;
230Q ror[k] = an-as; roi[k] = ani-asi; ror[k+1] = bn-bs; roi[k+1] = bni-bsi;
231Q k+=2;
232Q } while (k<nk*VSIZE2);
233V k = ((l*VSIZE2)>>1)*2; // k must be even here.
234V do { // compute symmetric and antisymmetric parts, and reorganize data.
235V double an, bn, ani, bni, bs, as, bsi, asi, t;
236V ani = BtF[im*m_inc + k*k_inc]; bni = BtF[im*m_inc + k*k_inc +1]; // north
237V an = BtF[(NPHI-im)*m_inc + k*k_inc]; bn = BtF[(NPHI-im)*m_inc + k*k_inc +1];
238V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
239V bsi = BtF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BtF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
240V bs = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
241V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
242V ter[k] = an+as; tei[k] = ani+asi; ter[k+1] = bn+bs; tei[k+1] = bni+bsi;
243V tor[k] = an-as; toi[k] = ani-asi; tor[k+1] = bn-bs; toi[k+1] = bni-bsi;
244V k+=2;
245V } while (k<nk*VSIZE2);
246V k = ((l*VSIZE2)>>1)*2; // k must be even here.
247V do { // compute symmetric and antisymmetric parts, and reorganize data.
248V double an, bn, ani, bni, bs, as, bsi, asi, t;
249V ani = BpF[im*m_inc + k*k_inc]; bni = BpF[im*m_inc + k*k_inc +1]; // north
250V an = BpF[(NPHI-im)*m_inc + k*k_inc]; bn = BpF[(NPHI-im)*m_inc + k*k_inc +1];
251V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
252V bsi = BpF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BpF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
253V bs = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
254V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
255V per[k] = an+as; pei[k] = ani+asi; per[k+1] = bn+bs; pei[k+1] = bni+bsi;
256V por[k] = an-as; poi[k] = ani-asi; por[k+1] = bn-bs; poi[k+1] = bni-bsi;
257V k+=2;
258V } while (k<nk*VSIZE2);
259V m_1 = 1.0/m;
260 k=l;
261 for (l=0; l<=llim-m; l++) {
262Q qq[l] = vdup(0.0);
263V ss[l] = vdup(0.0); tt[l] = vdup(0.0);
264 }
265 do {
266Q v2d* q = qq;
267V v2d* s = ss; v2d* t = tt;
268 al = alm;
269 rnd cost[NW], y0[NW], y1[NW];
270V rnd st2[NW], dy0[NW], dy1[NW];
271Q rnd rerk[NW], reik[NW], rork[NW], roik[NW]; // help the compiler to cache into registers.
272V rnd terk[NW], teik[NW], tork[NW], toik[NW];
273V rnd perk[NW], peik[NW], pork[NW], poik[NW];
274 for (int j=0; j<NW; ++j) {
275 cost[j] = vread(st, k+j);
276 y0[j] = vall(0.5);
277V st2[j] = cost[j]*cost[j]*vall(-m_1);
278V y0[j] *= vall(m); // for the vector transform, compute ylm*m/sint
279 }
280Q l=m;
281V l=m-1;
282 long int ny = 0; // exponent to extend double precision range.
283 if ((int)llim <= SHT_L_RESCALE_FLY) {
284 do { // sin(theta)^m
285 if (l&1) for (int j=0; j<NW; ++j) y0[j] *= cost[j];
286 for (int j=0; j<NW; ++j) cost[j] *= cost[j];
287 } while(l >>= 1);
288 } else {
289 long int nsint = 0;
290 do { // sin(theta)^m (use rescaling to avoid underflow)
291 if (l&1) {
292 for (int j=0; j<NW; ++j) y0[j] *= cost[j];
293 ny += nsint;
294 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
295 ny--;
296 for (int j=0; j<NW; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
297 }
298 }
299 for (int j=0; j<NW; ++j) cost[j] *= cost[j];
300 nsint += nsint;
301 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
302 nsint--;
303 for (int j=0; j<NW; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
304 }
305 } while(l >>= 1);
306 }
307 for (int j=0; j<NW; ++j) {
308 y0[j] *= vall(al[0]);
309 cost[j] = vread(ct, k+j);
310V dy0[j] = cost[j]*y0[j];
311 y1[j] = (vall(al[1])*y0[j]) *cost[j];
312V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]);
313 }
314 l=m; al+=2;
315 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
316 for (int j=0; j<NW; ++j) {
317V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
318 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
319 }
320 for (int j=0; j<NW; ++j) {
321V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
322 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
323 }
324 l+=2; al+=4;
325 if (fabs(vlo(y0[NW-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
326 ++ny;
327 for (int j=0; j<NW; ++j) {
328 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
329V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
330 }
331 }
332 }
333 if (ny == 0) {
334Q q+=(l-m);
335V s+=(l-m); t+=(l-m);
336 for (int j=0; j<NW; ++j) { // prefetch
337 y0[j] *= vread(wg, k+j); y1[j] *= vread(wg, k+j); // weight appears here (must be after the previous accuracy loop).
338V dy0[j] *= vread(wg, k+j); dy1[j] *= vread(wg, k+j);
339Q rerk[j] = vread( rer, k+j); reik[j] = vread( rei, k+j); rork[j] = vread( ror, k+j); roik[j] = vread( roi, k+j);
340V terk[j] = vread( ter, k+j); teik[j] = vread( tei, k+j); tork[j] = vread( tor, k+j); toik[j] = vread( toi, k+j);
341V perk[j] = vread( per, k+j); peik[j] = vread( pei, k+j); pork[j] = vread( por, k+j); poik[j] = vread( poi, k+j);
342 }
343 while (l<llim) { // compute even and odd parts
344Q rnd qq0 = y0[0] * rerk[0];
345Q rnd qq1 = y0[0] * reik[0];
346V rnd ss0 = dy0[0] * tork[0] + y0[0] * peik[0];
347V rnd ss1 = dy0[0] * toik[0] - y0[0] * perk[0];
348V rnd tt0 = dy0[0] * pork[0] - y0[0] * teik[0];
349V rnd tt1 = dy0[0] * poik[0] + y0[0] * terk[0];
350Q for (int j=1; j<NW; ++j) qq0 += y0[j] * rerk[j]; // real even
351Q for (int j=1; j<NW; ++j) qq1 += y0[j] * reik[j]; // imag even
352V for (int j=1; j<NW; ++j) ss0 += dy0[j] * tork[j] + y0[j] * peik[j];
353V for (int j=1; j<NW; ++j) ss1 += dy0[j] * toik[j] - y0[j] * perk[j];
354V for (int j=1; j<NW; ++j) tt0 += dy0[j] * pork[j] - y0[j] * teik[j];
355V for (int j=1; j<NW; ++j) tt1 += dy0[j] * poik[j] + y0[j] * terk[j];
356Q q[0] += v2d_reduce(qq0, qq1);
357V s[0] += v2d_reduce(ss0, ss1);
358V t[0] -= v2d_reduce(tt0, tt1);
359 for (int j=0; j<NW; ++j) {
360V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
361 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
362 }
363Q qq0 = y1[0] * rork[0];
364Q qq1 = y1[0] * roik[0];
365V ss0 = dy1[0] * terk[0] + y1[0] * poik[0];
366V ss1 = dy1[0] * teik[0] - y1[0] * pork[0];
367V tt0 = dy1[0] * perk[0] - y1[0] * toik[0];
368V tt1 = dy1[0] * peik[0] + y1[0] * tork[0];
369Q for (int j=1; j<NW; ++j) qq0 += y1[j] * rork[j]; // real odd
370Q for (int j=1; j<NW; ++j) qq1 += y1[j] * roik[j]; // imag odd
371V for (int j=1; j<NW; ++j) ss0 += dy1[j] * terk[j] + y1[j] * poik[j];
372V for (int j=1; j<NW; ++j) ss1 += dy1[j] * teik[j] - y1[j] * pork[j];
373V for (int j=1; j<NW; ++j) tt0 += dy1[j] * perk[j] - y1[j] * toik[j];
374V for (int j=1; j<NW; ++j) tt1 += dy1[j] * peik[j] + y1[j] * tork[j];
375Q q[1] += v2d_reduce(qq0, qq1);
376V s[1] += v2d_reduce(ss0, ss1);
377V t[1] -= v2d_reduce(tt0, tt1);
378Q q+=2;
379V s+=2; t+=2;
380 for (int j=0; j<NW; ++j) {
381V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
382 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
383 }
384 l+=2; al+=4;
385 }
386 if (l==llim) {
387Q rnd qq0 = y0[0] * rerk[0];
388Q rnd qq1 = y0[0] * reik[0];
389V rnd ss0 = dy0[0] * tork[0] + y0[0] * peik[0];
390V rnd ss1 = dy0[0] * toik[0] - y0[0] * perk[0];
391V rnd tt0 = dy0[0] * pork[0] - y0[0] * teik[0];
392V rnd tt1 = dy0[0] * poik[0] + y0[0] * terk[0];
393Q for (int j=1; j<NW; ++j) qq0 += y0[j] * rerk[j]; // real even
394Q for (int j=1; j<NW; ++j) qq1 += y0[j] * reik[j]; // imag even
395V for (int j=1; j<NW; ++j) ss0 += dy0[j] * tork[j] + y0[j] * peik[j];
396V for (int j=1; j<NW; ++j) ss1 += dy0[j] * toik[j] - y0[j] * perk[j];
397V for (int j=1; j<NW; ++j) tt0 += dy0[j] * pork[j] - y0[j] * teik[j];
398V for (int j=1; j<NW; ++j) tt1 += dy0[j] * poik[j] + y0[j] * terk[j];
399Q q[0] += v2d_reduce(qq0, qq1);
400V s[0] += v2d_reduce(ss0, ss1);
401V t[0] -= v2d_reduce(tt0, tt1);
402 }
403 }
404 k+=NW;
405 } while (k < nk);
406 l = LiM(shtns, m, im);
407Q v2d *Ql = (v2d*) &Qlm[l];
408V v2d *Sl = (v2d*) &Slm[l];
409V v2d *Tl = (v2d*) &Tlm[l];
410 for (l=0; l<=llim-m; ++l) {
411QX Ql[l] = qq[l];
4123 Ql[l] = qq[l] * vdup(m_1);
413V Sl[l] = ss[l] * vdup(l_2[l+m]);
414V Tl[l] = tt[l] * vdup(l_2[l+m]);
415 }
416 #ifdef SHT_VAR_LTR
417 for (l=llim+1-m; l<=LMAX-m; ++l) {
418Q Ql[l] = vdup(0.0);
419V Sl[l] = vdup(0.0); Tl[l] = vdup(0.0);
420 }
421 #endif
422 }
423 #endif
424 }
425
426 static
427QX void GEN3(spat_to_SH_mic,NWAY,SUFFIX)(shtns_cfg shtns, double *Vr, cplx *Qlm, long int llim) {
428VX void GEN3(spat_to_SHsphtor_mic,NWAY,SUFFIX)(shtns_cfg shtns, double *Vt, double *Vp, cplx *Slm, cplx *Tlm, long int llim) {
4293 void GEN3(spat_to_SHqst_mic,NWAY,SUFFIX)(shtns_cfg shtns, double *Vr, double *Vt, double *Vp, cplx *Qlm, cplx *Slm, cplx *Tlm, long int llim) {
430
431Q double *BrF; // contains the Fourier transformed data
432V double *BtF, *BpF; // contains the Fourier transformed data
433 unsigned imlim=0;
434
435Q BrF = Vr;
436V BtF = Vt; BpF = Vp;
437 #ifndef SHT_AXISYM
438 imlim = MTR;
439 #ifdef SHT_VAR_LTR
440 if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
441 #endif
442 if (shtns->fftc_mode >= 0) {
443 if (shtns->fftc_mode == 0) { // in-place
444Q fftw_execute_dft(shtns->fftc,(cplx*)BrF, (cplx*)BrF);
445V fftw_execute_dft(shtns->fftc,(cplx*)BtF, (cplx*)BtF);
446V fftw_execute_dft(shtns->fftc,(cplx*)BpF, (cplx*)BpF);
447 } else { // alloc memory for the transpose FFT
448 unsigned long nv = shtns->nspat;
449QX BrF = (double*) VMALLOC( nv * sizeof(double) );
450VX BtF = (double*) VMALLOC( 2*nv * sizeof(double) );
451VX BpF = BtF + nv;
4523 BrF = (double*) VMALLOC( 3*nv * sizeof(double) );
4533 BtF = BrF + nv; BpF = BtF + nv;
454Q fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, BrF+1, BrF);
455V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, BtF+1, BtF);
456V fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, BpF+1, BpF);
457 }
458 }
459 #endif
460 imlim += 1;
461
462 #pragma omp parallel num_threads(shtns->nthreads)
463 {
464QX GEN3(_an1,NWAY,SUFFIX)(shtns, BrF, Qlm, llim, imlim);
465VX GEN3(_an2,NWAY,SUFFIX)(shtns, BtF, BpF, Slm, Tlm, llim, imlim);
4663 GEN3(_an3,NWAY,SUFFIX)(shtns, BrF, BtF, BpF, Qlm, Slm, Tlm, llim, imlim);
467 }
468
469 #ifndef SHT_AXISYM
470 if (shtns->fftc_mode > 0) { // free memory
471Q VFREE(BrF);
472VX VFREE(BtF); // this frees also BpF.
473 }
474 #endif
475
476 }
Note: See TracBrowser for help on using the repository browser.