source: CIVL/examples/omp/shtns/SHT/omp_spat_to_SH.gen.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 18.6 KB
Line 
1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18# This file is meta-code for SHT.c (spherical harmonic transform).
19# it is intended for "make" to generate C code for 3 similar SHT functions,
20# (namely spat_to_SH [Q tag]), spat_to_SHsphtor [V tag], spat_to_SH3 [both Q&V tags])
21# from one generic function + tags.
22# Basically, there are tags at the beginning of lines (Q,V) that are information
23# to keep or remove the line depending on the function to build. (Q for scalar, V for vector, # for comment)
24#
25//////////////////////////////////////////////////
26
27 static
28QX void GEN3(_an1,NWAY,SUFFIX)(shtns_cfg shtns, double *BrF, cplx *Qlm, const long int llim, const int imlim) {
29VX void GEN3(_an2,NWAY,SUFFIX)(shtns_cfg shtns, double *BtF, double *BpF, cplx *Slm, cplx *Tlm, const long int llim, const int imlim) {
303 void GEN3(_an3,NWAY,SUFFIX)(shtns_cfg shtns, double *BrF, double *BtF, double *BpF, cplx *Qlm, cplx *Slm, cplx *Tlm, const long int llim, const int imlim) {
31
32 double *alm, *al;
33 double *wg, *ct, *st;
34V double *l_2;
35 long int nk, k, l,m;
36 int k_inc, m_inc;
37 unsigned m0, mstep;
38 #ifndef SHT_AXISYM
39 unsigned im;
40V double m_1;
41 #endif
42 #if _GCC_VEC_
43Q rnd qq[2*llim];
44V rnd ss[2*llim];
45V rnd tt[2*llim];
46 #else
47Q double qq[llim];
48V double ss[llim];
49V double tt[llim];
50 #endif
51
52Q double rer[NLAT_2 + NWAY*VSIZE2] SSE;
53Q double ror[NLAT_2 + NWAY*VSIZE2] SSE;
54V double ter[NLAT_2 + NWAY*VSIZE2] SSE;
55V double tor[NLAT_2 + NWAY*VSIZE2] SSE;
56V double per[NLAT_2 + NWAY*VSIZE2] SSE;
57V double por[NLAT_2 + NWAY*VSIZE2] SSE;
58 #ifndef SHT_AXISYM
59Q double rei[NLAT_2 + NWAY*VSIZE2] SSE;
60Q double roi[NLAT_2 + NWAY*VSIZE2] SSE;
61V double tei[NLAT_2 + NWAY*VSIZE2] SSE;
62V double toi[NLAT_2 + NWAY*VSIZE2] SSE;
63V double pei[NLAT_2 + NWAY*VSIZE2] SSE;
64V double poi[NLAT_2 + NWAY*VSIZE2] SSE;
65 #endif
66
67 // ACCESS PATTERN
68 k_inc = shtns->k_stride_a;
69 m_inc = shtns->m_stride_a;
70
71 nk = NLAT_2; // copy NLAT_2 to a local variable for faster access (inner loop limit)
72 #if _GCC_VEC_
73 nk = ((unsigned) nk+(VSIZE2-1))/VSIZE2;
74 #endif
75 wg = shtns->wg; ct = shtns->ct; st = shtns->st;
76V l_2 = shtns->l_2;
77 for (k=nk*VSIZE2; k<(nk-1+NWAY)*VSIZE2; ++k) { // never written, so this is now done for all m's
78Q rer[k] = 0.0; ror[k] = 0.0;
79V ter[k] = 0.0; tor[k] = 0.0;
80V per[k] = 0.0; por[k] = 0.0;
81 #ifndef SHT_AXISYM
82Q rei[k] = 0.0; roi[k] = 0.0;
83V tei[k] = 0.0; toi[k] = 0.0;
84V pei[k] = 0.0; poi[k] = 0.0;
85 #endif
86 }
87
88 #ifndef _OPENMP
89 m0 = 0; mstep = 1;
90 #else
91 m0 = omp_get_thread_num();
92 mstep = omp_get_num_threads();
93 if (m0 == 0)
94 #endif
95 { // im=0 : dzl.p = 0.0 and evrything is REAL
96 alm = shtns->blm;
97V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
98V double an = BtF[k*k_inc]; double bn = BtF[k*k_inc +1];
99V double bs = BtF[(NLAT-2-k)*k_inc]; double as = BtF[(NLAT-2-k)*k_inc +1];
100V ter[k] = an+as; tor[k] = an-as;
101V ter[k+1] = bn+bs; tor[k+1] = bn-bs;
102V k+=2;
103V } while(k < nk*VSIZE2);
104V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
105V double an = BpF[k*k_inc]; double bn = BpF[k*k_inc +1];
106V double bs = BpF[(NLAT-2-k)*k_inc]; double as = BpF[(NLAT-2-k)*k_inc +1];
107V per[k] = an+as; por[k] = an-as;
108V per[k+1] = bn+bs; por[k+1] = bn-bs;
109V k+=2;
110V } while(k < nk*VSIZE2);
111Q double r0a = 0.0; double r0b = 0.0;
112Q k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
113Q double an = BrF[k*k_inc]; double bn = BrF[k*k_inc +1];
114Q double bs = BrF[(NLAT-2-k)*k_inc]; double as = BrF[(NLAT-2-k)*k_inc +1];
115Q rer[k] = an+as; ror[k] = an-as;
116Q rer[k+1] = bn+bs; ror[k+1] = bn-bs;
117Q r0a += (an+as)*wg[k]; r0b += (bn+bs)*wg[k+1];
118Q k+=2;
119Q } while(k < nk*VSIZE2);
120Q Qlm[0] = (r0a+r0b) * alm[0]; // l=0 is done.
121V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform.
122 k = 0;
123 for (l=0;l<llim;++l) {
124Q qq[l] = vall(0.0);
125V ss[l] = vall(0.0); tt[l] = vall(0.0);
126 }
127 do {
128 al = alm;
129 rnd cost[NWAY], y0[NWAY], y1[NWAY];
130V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
131Q rnd rerk[NWAY], rork[NWAY]; // help the compiler to cache into registers.
132V rnd terk[NWAY], tork[NWAY], perk[NWAY], pork[NWAY];
133 for (int j=0; j<NWAY; ++j) {
134 cost[j] = vread(ct, k+j);
135 y0[j] = vall(al[0]) * vread(wg, k+j); // weight of Gauss quadrature appears here
136V dy0[j] = vall(0.0);
137V sint[j] = -vread(st, k+j);
138 y1[j] = (vall(al[1])*y0[j]) * cost[j];
139V dy1[j] = (vall(al[1])*y0[j]) * sint[j];
140Q rerk[j] = vread(rer, k+j); rork[j] = vread(ror, k+j); // cache into registers.
141V terk[j] = vread(ter, k+j); tork[j] = vread(tor, k+j);
142V perk[j] = vread(per, k+j); pork[j] = vread(por, k+j);
143 }
144 al+=2; l=1;
145 while(l<llim) {
146 for (int j=0; j<NWAY; ++j) {
147V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
148 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
149 }
150 for (int j=0; j<NWAY; ++j) {
151Q qq[l-1] += y1[j] * rork[j];
152V ss[l-1] += dy1[j] * terk[j];
153V tt[l-1] -= dy1[j] * perk[j];
154 }
155 for (int j=0; j<NWAY; ++j) {
156V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
157 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
158 }
159 for (int j=0; j<NWAY; ++j) {
160Q qq[l] += y0[j] * rerk[j];
161V ss[l] += dy0[j] * tork[j];
162V tt[l] -= dy0[j] * pork[j];
163 }
164 al+=4; l+=2;
165 }
166 if (l==llim) {
167 for (int j=0; j<NWAY; ++j) {
168Q qq[l-1] += y1[j] * rork[j];
169V ss[l-1] += dy1[j] * terk[j];
170V tt[l-1] -= dy1[j] * perk[j];
171 }
172 }
173 k+=NWAY;
174 } while (k < nk);
175 for (l=1; l<=llim; ++l) {
176 #if _GCC_VEC_
177Q ((v2d*)Qlm)[l] = v2d_reduce(qq[l-1], vall(0));
178V ((v2d*)Slm)[l] = v2d_reduce(ss[l-1], vall(0)) * vdup(l_2[l]);
179V ((v2d*)Tlm)[l] = v2d_reduce(tt[l-1], vall(0)) * vdup(l_2[l]);
180 #else
181Q Qlm[l] = qq[l-1];
182V Slm[l] = ss[l-1]*l_2[l]; Tlm[l] = tt[l-1]*l_2[l];
183 #endif
184 }
185 #ifdef SHT_VAR_LTR
186 for (l=llim+1; l<= LMAX; ++l) {
187Q ((v2d*)Qlm)[l] = vdup(0.0);
188V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
189 }
190 #ifndef SHT_AXISYM
191 if (imlim <= MMAX) { // zero out m >= imlim
192 l = LiM(shtns, imlim*MRES, imlim);
193 do {
194Q ((v2d*)Qlm)[l] = vdup(0.0);
195V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
196 } while(++l < shtns->nlm);
197 }
198 #endif
199 #endif
200 m0=mstep;
201 }
202
203 #ifndef SHT_AXISYM
204 for (im=m0; im<imlim; im+=mstep) {
205 m = im*MRES;
206 l = shtns->tm[im] / VSIZE2;
207 //alm = shtns->blm[im];
208 alm = shtns->blm + im*(2*LMAX -m+MRES);
209Q k = ((l*VSIZE2)>>1)*2; // k must be even here.
210Q do { // compute symmetric and antisymmetric parts, and reorganize data.
211Q double an, bn, ani, bni, bs, as, bsi, asi, t;
2123 double sina = st[k]; double sinb = st[k+1];
213Q ani = BrF[im*m_inc + k*k_inc]; bni = BrF[im*m_inc + k*k_inc +1]; // north
214Q an = BrF[(NPHI-im)*m_inc + k*k_inc]; bn = BrF[(NPHI-im)*m_inc + k*k_inc +1];
215Q t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
2163 an *= sina; ani*= sina; bn *= sinb; bni *= sinb;
217Q bsi = BrF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BrF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
218Q bs = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
219Q t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
2203 as *= sina; asi*= sina; bs *= sinb; bsi *= sinb;
221Q rer[k] = an+as; rei[k] = ani+asi; rer[k+1] = bn+bs; rei[k+1] = bni+bsi;
222Q ror[k] = an-as; roi[k] = ani-asi; ror[k+1] = bn-bs; roi[k+1] = bni-bsi;
223Q k+=2;
224Q } while (k<nk*VSIZE2);
225V k = ((l*VSIZE2)>>1)*2; // k must be even here.
226V do { // compute symmetric and antisymmetric parts, and reorganize data.
227V double an, bn, ani, bni, bs, as, bsi, asi, t;
228V ani = BtF[im*m_inc + k*k_inc]; bni = BtF[im*m_inc + k*k_inc +1]; // north
229V an = BtF[(NPHI-im)*m_inc + k*k_inc]; bn = BtF[(NPHI-im)*m_inc + k*k_inc +1];
230V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
231V bsi = BtF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BtF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
232V bs = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
233V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
234V ter[k] = an+as; tei[k] = ani+asi; ter[k+1] = bn+bs; tei[k+1] = bni+bsi;
235V tor[k] = an-as; toi[k] = ani-asi; tor[k+1] = bn-bs; toi[k+1] = bni-bsi;
236V k+=2;
237V } while (k<nk*VSIZE2);
238V k = ((l*VSIZE2)>>1)*2; // k must be even here.
239V do { // compute symmetric and antisymmetric parts, and reorganize data.
240V double an, bn, ani, bni, bs, as, bsi, asi, t;
241V ani = BpF[im*m_inc + k*k_inc]; bni = BpF[im*m_inc + k*k_inc +1]; // north
242V an = BpF[(NPHI-im)*m_inc + k*k_inc]; bn = BpF[(NPHI-im)*m_inc + k*k_inc +1];
243V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
244V bsi = BpF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BpF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south
245V bs = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1];
246V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
247V per[k] = an+as; pei[k] = ani+asi; per[k+1] = bn+bs; pei[k+1] = bni+bsi;
248V por[k] = an-as; poi[k] = ani-asi; por[k+1] = bn-bs; poi[k+1] = bni-bsi;
249V k+=2;
250V } while (k<nk*VSIZE2);
251V m_1 = 1.0/m;
252 k=l;
253 #if _GCC_VEC_
254Q rnd* q = qq;
255V rnd* s = ss; rnd* t = tt;
256 #else
257 l = LiM(shtns, m, im);
258Q double* q = (double *) &Qlm[l];
259V double* s = (double *) &Slm[l];
260V double* t = (double *) &Tlm[l];
261 #endif
262 for (l=llim-m; l>=0; l--) {
263Q q[0] = vall(0.0); q[1] = vall(0.0); q+=2;
264V s[0] = vall(0.0); s[1] = vall(0.0); s+=2;
265V t[0] = vall(0.0); t[1] = vall(0.0); t+=2;
266 }
267 do {
268 #if _GCC_VEC_
269Q rnd* q = qq;
270V rnd* s = ss; rnd* t = tt;
271 #else
272 l = LiM(shtns, m, im);
273Q double* q = (double *) &Qlm[l];
274V double* s = (double *) &Slm[l];
275V double* t = (double *) &Tlm[l];
276 #endif
277 al = alm;
278 rnd cost[NWAY], y0[NWAY], y1[NWAY];
279V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
280Q rnd rerk[NWAY], reik[NWAY], rork[NWAY], roik[NWAY]; // help the compiler to cache into registers.
281V rnd terk[NWAY], teik[NWAY], tork[NWAY], toik[NWAY];
282V rnd perk[NWAY], peik[NWAY], pork[NWAY], poik[NWAY];
283 for (int j=0; j<NWAY; ++j) {
284 cost[j] = vread(st, k+j);
285 y0[j] = vall(0.5);
286V st2[j] = cost[j]*cost[j]*vall(-m_1);
287V y0[j] *= vall(m); // for the vector transform, compute ylm*m/sint
288 }
289Q l=m;
290V l=m-1;
291 long int ny = 0; // exponent to extend double precision range.
292 if ((int)llim <= SHT_L_RESCALE_FLY) {
293 do { // sin(theta)^m
294 if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
295 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
296 } while(l >>= 1);
297 } else {
298 long int nsint = 0;
299 do { // sin(theta)^m (use rescaling to avoid underflow)
300 if (l&1) {
301 for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
302 ny += nsint;
303 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
304 ny--;
305 for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
306 }
307 }
308 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
309 nsint += nsint;
310 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
311 nsint--;
312 for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
313 }
314 } while(l >>= 1);
315 }
316 for (int j=0; j<NWAY; ++j) {
317 y0[j] *= vall(al[0]);
318 cost[j] = vread(ct, k+j);
319V dy0[j] = cost[j]*y0[j];
320 y1[j] = (vall(al[1])*y0[j]) *cost[j];
321V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]);
322 }
323 l=m; al+=2;
324 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
325 for (int j=0; j<NWAY; ++j) {
326V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
327 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
328 }
329 for (int j=0; j<NWAY; ++j) {
330V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
331 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
332 }
333 l+=2; al+=4;
334 if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
335 ++ny;
336 for (int j=0; j<NWAY; ++j) {
337 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
338V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
339 }
340 }
341 }
342 if (ny == 0) {
343Q q+=2*(l-m);
344V s+=2*(l-m); t+=2*(l-m);
345 for (int j=0; j<NWAY; ++j) { // prefetch
346 y0[j] *= vread(wg, k+j); y1[j] *= vread(wg, k+j); // weight appears here (must be after the previous accuracy loop).
347V dy0[j] *= vread(wg, k+j); dy1[j] *= vread(wg, k+j);
348Q rerk[j] = vread( rer, k+j); reik[j] = vread( rei, k+j); rork[j] = vread( ror, k+j); roik[j] = vread( roi, k+j);
349V terk[j] = vread( ter, k+j); teik[j] = vread( tei, k+j); tork[j] = vread( tor, k+j); toik[j] = vread( toi, k+j);
350V perk[j] = vread( per, k+j); peik[j] = vread( pei, k+j); pork[j] = vread( por, k+j); poik[j] = vread( poi, k+j);
351 }
352 while (l<llim) { // compute even and odd parts
353Q for (int j=0; j<NWAY; ++j) q[0] += y0[j] * rerk[j]; // real even
354Q for (int j=0; j<NWAY; ++j) q[1] += y0[j] * reik[j]; // imag even
355V for (int j=0; j<NWAY; ++j) s[0] += dy0[j] * tork[j] + y0[j] * peik[j];
356V for (int j=0; j<NWAY; ++j) s[1] += dy0[j] * toik[j] - y0[j] * perk[j];
357V for (int j=0; j<NWAY; ++j) t[0] -= dy0[j] * pork[j] - y0[j] * teik[j];
358V for (int j=0; j<NWAY; ++j) t[1] -= dy0[j] * poik[j] + y0[j] * terk[j];
359 for (int j=0; j<NWAY; ++j) {
360V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
361 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
362 }
363Q for (int j=0; j<NWAY; ++j) q[2] += y1[j] * rork[j]; // real odd
364Q for (int j=0; j<NWAY; ++j) q[3] += y1[j] * roik[j]; // imag odd
365V for (int j=0; j<NWAY; ++j) s[2] += dy1[j] * terk[j] + y1[j] * poik[j];
366V for (int j=0; j<NWAY; ++j) s[3] += dy1[j] * teik[j] - y1[j] * pork[j];
367V for (int j=0; j<NWAY; ++j) t[2] -= dy1[j] * perk[j] - y1[j] * toik[j];
368V for (int j=0; j<NWAY; ++j) t[3] -= dy1[j] * peik[j] + y1[j] * tork[j];
369Q q+=4;
370V s+=4; t+=4;
371 for (int j=0; j<NWAY; ++j) {
372V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
373 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
374 }
375 l+=2; al+=4;
376 }
377 if (l==llim) {
378Q for (int j=0; j<NWAY; ++j) q[0] += y0[j] * rerk[j]; // real even
379Q for (int j=0; j<NWAY; ++j) q[1] += y0[j] * reik[j]; // imag even
380V for (int j=0; j<NWAY; ++j) s[0] += dy0[j] * tork[j] + y0[j] * peik[j];
381V for (int j=0; j<NWAY; ++j) s[1] += dy0[j] * toik[j] - y0[j] * perk[j];
382V for (int j=0; j<NWAY; ++j) t[0] -= dy0[j] * pork[j] - y0[j] * teik[j];
383V for (int j=0; j<NWAY; ++j) t[1] -= dy0[j] * poik[j] + y0[j] * terk[j];
384 }
385 }
386 k+=NWAY;
387 } while (k < nk);
388 l = LiM(shtns, m, im);
389Q v2d *Ql = (v2d*) &Qlm[l];
390V v2d *Sl = (v2d*) &Slm[l];
391V v2d *Tl = (v2d*) &Tlm[l];
392 #if _GCC_VEC_
393 for (l=0; l<=llim-m; ++l) {
394QX Ql[l] = v2d_reduce(qq[2*l], qq[2*l+1]);
3953 Ql[l] = v2d_reduce(qq[2*l], qq[2*l+1]) * vdup(m_1);
396V Sl[l] = v2d_reduce(ss[2*l], ss[2*l+1]) * vdup(l_2[l+m]);
397V Tl[l] = v2d_reduce(tt[2*l], tt[2*l+1]) * vdup(l_2[l+m]);
398 }
399 #else
400V for (l=0; l<=llim-m; ++l) {
4013 Ql[l] *= m_1;
402V Sl[l] *= l_2[l+m];
403V Tl[l] *= l_2[l+m];
404V }
405 #endif
406 #ifdef SHT_VAR_LTR
407 for (l=llim+1-m; l<=LMAX-m; ++l) {
408Q Ql[l] = vdup(0.0);
409V Sl[l] = vdup(0.0); Tl[l] = vdup(0.0);
410 }
411 #endif
412 }
413 #endif
414 }
415
416
417QX static void GEN3(spat_to_SH_omp,NWAY,SUFFIX)(shtns_cfg shtns, double *Vr, cplx *Qlm, long int llim) {
418VX static void GEN3(spat_to_SHsphtor_omp,NWAY,SUFFIX)(shtns_cfg shtns, double *Vt, double *Vp, cplx *Slm, cplx *Tlm, long int llim) {
4193 static void GEN3(spat_to_SHqst_omp,NWAY,SUFFIX)(shtns_cfg shtns, double *Vr, double *Vt, double *Vp, cplx *Qlm, cplx *Slm, cplx *Tlm, long int llim) {
420
421Q double *BrF; // contains the Fourier transformed data
422V double *BtF, *BpF; // contains the Fourier transformed data
423 unsigned imlim=0;
424
425Q BrF = Vr;
426V BtF = Vt; BpF = Vp;
427 #ifndef SHT_AXISYM
428 imlim = MTR;
429 #ifdef SHT_VAR_LTR
430 if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
431 #endif
432 if (shtns->fftc_mode >= 0) {
433 if (shtns->fftc_mode == 0) { // in-place
434V #ifdef HAVE_LIBFFTW3_OMP
435Q fftw_execute_dft(shtns->fftc,(cplx*)BrF, (cplx*)BrF);
436V fftw_execute_dft(shtns->fftc,(cplx*)BtF, (cplx*)BtF);
437V fftw_execute_dft(shtns->fftc,(cplx*)BpF, (cplx*)BpF);
438V #endif
439 } else { // alloc memory for the transpose FFT
440 unsigned long nv = shtns->nspat;
441QX BrF = (double*) VMALLOC( nv * sizeof(double) );
442VX BtF = (double*) VMALLOC( 2*nv * sizeof(double) );
443VX BpF = BtF + nv;
4443 BrF = (double*) VMALLOC( 3*nv * sizeof(double) );
4453 BtF = BrF + nv; BpF = BtF + nv;
446V #ifdef HAVE_LIBFFTW3_OMP
447Q fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, BrF+1, BrF);
448V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, BtF+1, BtF);
449V fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, BpF+1, BpF);
450V #endif
451 }
452 }
453 #endif
454 imlim += 1;
455
456 #pragma omp parallel num_threads(shtns->nthreads)
457 {
458 #ifndef SHT_AXISYM
459V #ifndef HAVE_LIBFFTW3_OMP
460V if (shtns->fftc_mode == 0) { // in-place
4613 #pragma omp single nowait
4623 fftw_execute_dft(shtns->fftc,(cplx*)BrF, (cplx*)BrF);
463V #pragma omp single nowait
464V fftw_execute_dft(shtns->fftc,(cplx*)BtF, (cplx*)BtF);
465V #pragma omp single nowait
466V fftw_execute_dft(shtns->fftc,(cplx*)BpF, (cplx*)BpF);
467V } else if (shtns->fftc_mode > 0) { // split out-of-place
4683 #pragma omp single nowait
4693 fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, ((double*)BrF)+1, ((double*)BrF));
470V #pragma omp single nowait
471V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, ((double*)BtF)+1, ((double*)BtF));
472V #pragma omp single nowait
473V fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, ((double*)BpF)+1, ((double*)BpF));
474V }
475V #pragma omp barrier
476V #endif
477 #endif
478QX GEN3(_an1,NWAY,SUFFIX)(shtns, BrF, Qlm, llim, imlim);
479VX GEN3(_an2,NWAY,SUFFIX)(shtns, BtF, BpF, Slm, Tlm, llim, imlim);
4803 GEN3(_an3,NWAY,SUFFIX)(shtns, BrF, BtF, BpF, Qlm, Slm, Tlm, llim, imlim);
481 }
482
483 #ifndef SHT_AXISYM
484 if (shtns->fftc_mode > 0) { // free memory
485Q VFREE(BrF);
486VX VFREE(BtF); // this frees also BpF.
487 }
488 #endif
489
490 }
Note: See TracBrowser for help on using the repository browser.