source: CIVL/examples/omp/shtns/SHT/mic_SH_to_spat.gen.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 16.4 KB
Line 
1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18# This file is meta-code for SHT.c (spherical harmonic transform).
19# it is intended for "make" to generate C code for similar SHT functions,
20# from one generic function + tags.
21# > See Makefile and SHT.c
22# Basically, there are tags at the beginning of lines that are information
23# to keep or remove the line depending on the function to build.
24# tags :
25# Q : line for scalar transform
26# V : line for vector transform (both spheroidal and toroidal)
27# S : line for vector transfrom, spheroidal component
28# T : line for vector transform, toroidal component.
29
30 static
313 void GEN3(_sy3,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, cplx *BrF, cplx *BtF, cplx *BpF, const long int llim, const int imlim)
32QX void GEN3(_sy1,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *BrF, const long int llim, const int imlim)
33 #ifndef SHT_GRAD
34VX void GEN3(_sy2,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, cplx *BtF, cplx *BpF, const long int llim, const int imlim)
35 #else
36S void GEN3(_sy1s,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *BtF, cplx *BpF, const long int llim, const int imlim)
37T void GEN3(_sy1t,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Tlm, cplx *BtF, cplx *BpF, const long int llim, const int imlim)
38 #endif
39 {
40 #ifndef SHT_AXISYM
41Q #define qr(l) vall(creal(Ql[l-1]))
42Q #define qi(l) vall(cimag(Ql[l-1]))
43S #define sr(l) vall(creal(Sl[l-1]))
44S #define si(l) vall(cimag(Sl[l-1]))
45T #define tr(l) vall(creal(Tl[l-1]))
46T #define ti(l) vall(cimag(Tl[l-1]))
47V double m_1;
48 unsigned im;
49 #endif
50 unsigned m0, mstep;
51 long int nk,k,l,m;
52 double *alm, *al;
53 double *ct, *st;
54Q cplx Ql[llim+1] SSE;
55S cplx Sl[llim] SSE;
56T cplx Tl[llim] SSE;
57
58Q double rnr[NLAT_2 + NWAY*VSIZE2 -1] SSE;
59Q double rsr[NLAT_2 + NWAY*VSIZE2 -1] SSE;
60V double tnr[NLAT_2 + NWAY*VSIZE2 -1] SSE;
61V double tsr[NLAT_2 + NWAY*VSIZE2 -1] SSE;
62V double pnr[NLAT_2 + NWAY*VSIZE2 -1] SSE;
63V double psr[NLAT_2 + NWAY*VSIZE2 -1] SSE;
64 #ifndef SHT_AXISYM
65Q double rni[NLAT_2 + NWAY*VSIZE2 -1] SSE;
66Q double rsi[NLAT_2 + NWAY*VSIZE2 -1] SSE;
67V double tni[NLAT_2 + NWAY*VSIZE2 -1] SSE;
68V double tsi[NLAT_2 + NWAY*VSIZE2 -1] SSE;
69V double pni[NLAT_2 + NWAY*VSIZE2 -1] SSE;
70V double psi[NLAT_2 + NWAY*VSIZE2 -1] SSE;
71 #endif
72
73
74 ct = shtns->ct; st = shtns->st;
75 nk = NLAT_2;
76 nk = ((unsigned)(nk+VSIZE2-1)) / VSIZE2;
77
78 // ACCESS PATTERN
79 const int k_inc = 1; const int m_inc = NLAT/2;
80
81 #ifndef _OPENMP
82 m0 = 0; mstep = 1;
83 #else
84 m0 = omp_get_thread_num();
85 mstep = omp_get_num_threads();
86 if (m0 == 0)
87 #endif
88 { // im=0;
89 #ifdef SHT_GRAD
90 #ifndef SHT_AXISYM
91S k=0; do { BpF[k]=0.0; } while(++k<NLAT_2);
92T k=0; do { BtF[k]=0.0; } while(++k<NLAT_2);
93 #else
94S if (BpF != NULL) { int k=0; do { BpF[k]=0.0; } while(++k<NLAT_2); }
95T if (BtF != NULL) { int k=0; do { BtF[k]=0.0; } while(++k<NLAT_2); }
96 #endif
97 #endif
98Q double* Ql0 = (double*) Ql;
99S double* Sl0 = (double*) Sl;
100T double* Tl0 = (double*) Tl;
101 l=1;
102 alm = shtns->alm;
103Q Ql0[0] = (double) Qlm[0]; // l=0
104 do { // for m=0, compress the complex Q,S,T to double
105Q Ql0[l] = creal( Qlm[l] ); // Ql[l+1] = (double) Qlm[l+1];
106S Sl0[l-1] = creal( Slm[l] ); // Sl[l] = (double) Slm[l+1];
107T Tl0[l-1] = creal( Tlm[l] ); // Tl[l] = (double) Tlm[l+1];
108 ++l;
109 } while(l<=llim);
110 k=0;
111 do {
112 l=0; al = alm;
113 rnd cost[NWAY], y0[NWAY], y1[NWAY];
114V rnd sint[NWAY], dy0[NWAY], dy1[NWAY];
115Q rnd re[NWAY], ro[NWAY];
116S rnd te[NWAY], to[NWAY];
117T rnd pe[NWAY], po[NWAY];
118 for (int j=0; j<NWAY; ++j) {
119 cost[j] = vread(ct, j+k);
120V sint[j] = -vread(st, j+k);
121 y0[j] = vall(al[0]);
122V dy0[j] = vall(0.0);
123Q re[j] = y0[j] * vall(Ql0[0]);
124S to[j] = dy0[j];
125T po[j] = dy0[j];
126 }
127 for (int j=0; j<NWAY; ++j) {
128 y1[j] = vall(al[0]*al[1]) * cost[j];
129V dy1[j] = vall(al[0]*al[1]) * sint[j];
130 }
131 for (int j=0; j<NWAY; ++j) {
132Q ro[j] = y1[j] * vall(Ql0[1]);
133S te[j] = dy1[j] * vall(Sl0[0]);
134T pe[j] = -dy1[j] * vall(Tl0[0]);
135 }
136 al+=2; l+=2;
137 while(l<llim) {
138 for (int j=0; j<NWAY; ++j) {
139V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
140 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
141 }
142 for (int j=0; j<NWAY; ++j) {
143Q re[j] += y0[j] * vall(Ql0[l]);
144S to[j] += dy0[j] * vall(Sl0[l-1]);
145T po[j] -= dy0[j] * vall(Tl0[l-1]);
146 }
147 for (int j=0; j<NWAY; ++j) {
148V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*sint[j]) + vall(al[2])*dy1[j];
149 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
150 }
151 for (int j=0; j<NWAY; ++j) {
152Q ro[j] += y1[j] * vall(Ql0[l+1]);
153S te[j] += dy1[j] * vall(Sl0[l]);
154T pe[j] -= dy1[j] * vall(Tl0[l]);
155 }
156 al+=4; l+=2;
157 }
158 if (l==llim) {
159 for (int j=0; j<NWAY; ++j) {
160V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*sint[j]) + vall(al[0])*dy0[j];
161 y0[j] = vall(al[1])*cost[j]*y1[j] + vall(al[0])*y0[j];
162 }
163 for (int j=0; j<NWAY; ++j) {
164Q re[j] += y0[j] * vall(Ql0[l]);
165S to[j] += dy0[j] * vall(Sl0[l-1]);
166T po[j] -= dy0[j] * vall(Tl0[l-1]);
167 }
168 }
169
170 for (int j=0; j<NWAY; ++j) {
171Q vstor(rnr, j+k, re[j]+ro[j]); vstor(rsr, j+k, re[j]-ro[j]);
172S vstor(tnr, j+k, te[j]+to[j]); vstor(tsr, j+k, te[j]-to[j]);
173T vstor(pnr, j+k, pe[j]+po[j]); vstor(psr, j+k, pe[j]-po[j]);
174 }
175 k+=NWAY;
176 } while (k < nk);
177
178 k=0; do { // merge symmetric and antisymmetric parts.
179Q BrF[(k/2)*k_inc] = rnr[k] + I*rnr[k+1];
180Q BrF[(NLAT_2-1 - k/2)*k_inc] = rsr[k+1] + I*rsr[k];
181S BtF[(k/2)*k_inc] = tnr[k] + I*tnr[k+1];
182S BtF[(NLAT_2-1 - k/2)*k_inc] = tsr[k+1] + I*tsr[k];
183T BpF[(k/2)*k_inc] = pnr[k] + I*pnr[k+1];
184T BpF[(NLAT_2-1 - k/2)*k_inc] = psr[k+1] + I*psr[k];
185 k+=2;
186 } while(k < NLAT_2);
187
188 m0=mstep;
189 }
190
191 #ifndef SHT_AXISYM
192 for (im=m0; im<imlim; im+=mstep) {
193 m = im*MRES;
194V m_1 = 1.0/m;
195 //alm = shtns->alm[im];
196 alm = shtns->alm + im*(2*LMAX -m+MRES);
197 l = m;
198 k = LiM(shtns, l,im);
199 //k = (im*(2*(LMAX+1)-(m+MRES)))>>1 + l;
200 do { // copy input coefficients to a local array.
201Q ((v2d*)Ql)[l-1] = ((v2d*)Qlm)[k];
202S ((v2d*)Sl)[l-1] = ((v2d*)Slm)[k];
203T ((v2d*)Tl)[l-1] = ((v2d*)Tlm)[k];
204 ++l; ++k;
205 } while(l<=llim);
206
207 k = shtns->tm[im] / VSIZE2; // stay on vector boundary
208 #if VSIZE2 == 1
209 k -= k&1; // we operate without vectors, but we still need complex alignement (2 doubles).
210 #endif
211 do {
212 al = alm;
213 rnd cost[NWAY], y0[NWAY], y1[NWAY];
214V rnd st2[NWAY], dy0[NWAY], dy1[NWAY];
215Q rnd rer[NWAY], rei[NWAY], ror[NWAY], roi[NWAY];
216V rnd ter[NWAY], tei[NWAY], tor[NWAY], toi[NWAY];
217V rnd per[NWAY], pei[NWAY], por[NWAY], poi[NWAY];
218 for (int j=0; j<NWAY; ++j) {
219 cost[j] = vread(st, k+j);
220 y0[j] = vall(1.0);
221V st2[j] = cost[j]*cost[j]*vall(-m_1);
222V y0[j] = vall(m); // for the vector transform, compute ylm*m/sint
223 }
224Q l=m;
225V l=m-1;
226 long int ny = 0;
227 if ((int)llim <= SHT_L_RESCALE_FLY) {
228 do { // sin(theta)^m
229 if (l&1) for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
230 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
231 } while(l >>= 1);
232 } else {
233 long int nsint = 0;
234 do { // sin(theta)^m (use rescaling to avoid underflow)
235 if (l&1) {
236 for (int j=0; j<NWAY; ++j) y0[j] *= cost[j];
237 ny += nsint;
238 if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
239 ny--;
240 for (int j=0; j<NWAY; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
241 }
242 }
243 for (int j=0; j<NWAY; ++j) cost[j] *= cost[j];
244 nsint += nsint;
245 if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
246 nsint--;
247 for (int j=0; j<NWAY; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
248 }
249 } while(l >>= 1);
250 }
251 for (int j=0; j<NWAY; ++j) {
252 y0[j] *= vall(al[0]);
253 cost[j] = vread(ct, j+k);
254V dy0[j] = cost[j]*y0[j];
255Q ror[j] = vall(0.0); roi[j] = vall(0.0);
256Q rer[j] = vall(0.0); rei[j] = vall(0.0);
257 }
258 for (int j=0; j<NWAY; ++j) {
259 y1[j] = (vall(al[1])*y0[j]) *cost[j]; // y1[j] = vall(al[1])*cost[j]*y0[j];
260V por[j] = vall(0.0); tei[j] = vall(0.0);
261V tor[j] = vall(0.0); pei[j] = vall(0.0);
262V dy1[j] = (vall(al[1])*y0[j]) *(cost[j]*cost[j] + st2[j]); // dy1[j] = vall(al[1])*(cost[j]*dy0[j] - y0[j]*st2[j]);
263V poi[j] = vall(0.0); ter[j] = vall(0.0);
264V toi[j] = vall(0.0); per[j] = vall(0.0);
265 }
266 l=m; al+=2;
267 while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
268 for (int j=0; j<NWAY; ++j) {
269 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
270V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
271 }
272 for (int j=0; j<NWAY; ++j) {
273 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
274V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
275 }
276 l+=2; al+=4;
277 if (fabs(vlo(y0[NWAY-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
278 ++ny;
279 for (int j=0; j<NWAY; ++j) {
280 y0[j] *= vall(1.0/SHT_SCALE_FACTOR); y1[j] *= vall(1.0/SHT_SCALE_FACTOR);
281V dy0[j] *= vall(1.0/SHT_SCALE_FACTOR); dy1[j] *= vall(1.0/SHT_SCALE_FACTOR);
282 }
283 }
284 }
285 if (ny == 0) {
286 while (l<llim) { // compute even and odd parts
287Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
288Q for (int j=0; j<NWAY; ++j) { ror[j] += y1[j] * qr(l+1); roi[j] += y1[j] * qi(l+1); }
289S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
290S for (int j=0; j<NWAY; ++j) { ter[j] += dy1[j] * sr(l+1); poi[j] += y1[j] * sr(l+1); }
291S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
292S for (int j=0; j<NWAY; ++j) { tei[j] += dy1[j] * si(l+1); por[j] -= y1[j] * si(l+1); }
293T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
294T for (int j=0; j<NWAY; ++j) { per[j] -= dy1[j] * tr(l+1); toi[j] += y1[j] * tr(l+1); }
295T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
296T for (int j=0; j<NWAY; ++j) { pei[j] -= dy1[j] * ti(l+1); tor[j] -= y1[j] * ti(l+1); }
297 for (int j=0; j<NWAY; ++j) {
298V dy0[j] = vall(al[1])*(cost[j]*dy1[j] + y1[j]*st2[j]) + vall(al[0])*dy0[j];
299 y0[j] = vall(al[1])*(cost[j]*y1[j]) + vall(al[0])*y0[j];
300 }
301 for (int j=0; j<NWAY; ++j) {
302V dy1[j] = vall(al[3])*(cost[j]*dy0[j] + y0[j]*st2[j]) + vall(al[2])*dy1[j];
303 y1[j] = vall(al[3])*(cost[j]*y0[j]) + vall(al[2])*y1[j];
304 }
305 l+=2; al+=4;
306 }
307 if (l==llim) {
308Q for (int j=0; j<NWAY; ++j) { rer[j] += y0[j] * qr(l); rei[j] += y0[j] * qi(l); }
309S for (int j=0; j<NWAY; ++j) { tor[j] += dy0[j] * sr(l); pei[j] += y0[j] * sr(l); }
310S for (int j=0; j<NWAY; ++j) { toi[j] += dy0[j] * si(l); per[j] -= y0[j] * si(l); }
311T for (int j=0; j<NWAY; ++j) { por[j] -= dy0[j] * tr(l); tei[j] += y0[j] * tr(l); }
312T for (int j=0; j<NWAY; ++j) { poi[j] -= dy0[j] * ti(l); ter[j] -= y0[j] * ti(l); }
313 }
3143 for (int j=0; j<NWAY; ++j) cost[j] = vread(st, k+j) * vall(m_1);
3153 for (int j=0; j<NWAY; ++j) { rer[j] *= cost[j]; ror[j] *= cost[j]; rei[j] *= cost[j]; roi[j] *= cost[j]; }
316 }
317
318 for (int j=0; j<NWAY; ++j) {
319Q vstor(rnr, j+k, rer[j]+ror[j]); vstor(rsr, j+k, rer[j]-ror[j]);
320Q vstor(rni, j+k, rei[j]+roi[j]); vstor(rsi, j+k, rei[j]-roi[j]);
321V vstor(tnr, j+k, ter[j]+tor[j]); vstor(tsr, j+k, ter[j]-tor[j]);
322V vstor(tni, j+k, tei[j]+toi[j]); vstor(tsi, j+k, tei[j]-toi[j]);
323V vstor(pnr, j+k, per[j]+por[j]); vstor(psr, j+k, per[j]-por[j]);
324V vstor(pni, j+k, pei[j]+poi[j]); vstor(psi, j+k, pei[j]-poi[j]);
325 }
326 k+=NWAY;
327 } while (k < nk);
328
329 l = shtns->tm[im] >> 1; // stay on a 16 byte boundary
330Q k=0; while (k<l) { // polar optimization
331Q BrF[im*m_inc + k*k_inc] = 0.0; BrF[(NPHI-im)*m_inc + k*k_inc] = 0.0;
332Q BrF[im*m_inc + (NLAT_2-l+k)*k_inc] = 0.0; BrF[(NPHI-im)*m_inc + (NLAT_2-l+k)*k_inc] = 0.0;
333Q ++k;
334Q }
335Q k*=2; do {
336Q BrF[im*m_inc + (k/2)*k_inc] = (rnr[k]-rni[k+1]) + I*(rnr[k+1]+rni[k]);
337Q BrF[(NPHI-im)*m_inc + (k/2)*k_inc] = (rnr[k]+rni[k+1]) + I*(rnr[k+1]-rni[k]);
338Q BrF[im*m_inc + (NLAT_2-1-k/2)*k_inc] = (rsr[k+1]-rsi[k]) + I*(rsr[k]+rsi[k+1]);
339Q BrF[(NPHI-im)*m_inc + (NLAT_2-1-k/2)*k_inc] = (rsr[k+1]+rsi[k]) + I*(rsr[k]-rsi[k+1]);
340Q k+=2;
341Q } while(k < NLAT_2);
342
343V k=0; while (k<l) { // polar optimization
344V BtF[im*m_inc + k*k_inc] = 0.0; BtF[(NPHI-im)*m_inc + k*k_inc] = 0.0;
345V BtF[im*m_inc + (NLAT_2-l+k)*k_inc] = 0.0; BtF[(NPHI-im)*m_inc + (NLAT_2-l+k)*k_inc] = 0.0;
346V ++k;
347V }
348V k*=2; do {
349V BtF[im*m_inc + (k/2)*k_inc] = (tnr[k]-tni[k+1]) + I*(tnr[k+1]+tni[k]);
350V BtF[(NPHI-im)*m_inc + (k/2)*k_inc] = (tnr[k]+tni[k+1]) + I*(tnr[k+1]-tni[k]);
351V BtF[im*m_inc + (NLAT_2-1-k/2)*k_inc] = (tsr[k+1]-tsi[k]) + I*(tsr[k]+tsi[k+1]);
352V BtF[(NPHI-im)*m_inc + (NLAT_2-1-k/2)*k_inc] = (tsr[k+1]+tsi[k]) + I*(tsr[k]-tsi[k+1]);
353V k+=2;
354V } while(k < NLAT_2);
355
356V k=0; while (k<l) { // polar optimization
357V BpF[im*m_inc + k*k_inc] = 0.0; BpF[(NPHI-im)*m_inc + k*k_inc] = 0.0;
358V BpF[im*m_inc + (NLAT_2-l+k)*k_inc] = 0.0; BpF[(NPHI-im)*m_inc + (NLAT_2-l+k)*k_inc] = 0.0;
359V ++k;
360V }
361V k*=2; do {
362V BpF[im*m_inc + (k/2)*k_inc] = (pnr[k]-pni[k+1]) + I*(pnr[k+1]+pni[k]);
363V BpF[(NPHI-im)*m_inc + (k/2)*k_inc] = (pnr[k]+pni[k+1]) + I*(pnr[k+1]-pni[k]);
364V BpF[im*m_inc + (NLAT_2-1-k/2)*k_inc] = (psr[k+1]-psi[k]) + I*(psr[k]+psi[k+1]);
365V BpF[(NPHI-im)*m_inc + (NLAT_2-1-k/2)*k_inc] = (psr[k+1]+psi[k]) + I*(psr[k]-psi[k+1]);
366V k+=2;
367V } while(k < NLAT_2);
368
369 }
370
371 while(im <= NPHI-imlim) { // padding for high m's
372 k=0;
373 do {
374Q BrF[im*m_inc + k*k_inc] = 0.0;
375V BtF[im*m_inc + k*k_inc] = 0.0;
376V BpF[im*m_inc + k*k_inc] = 0.0;
377 } while (++k < NLAT_2);
378 im+=mstep;
379 }
380 #endif
381 }
382
383Q #undef qr
384Q #undef qi
385S #undef sr
386S #undef si
387T #undef tr
388T #undef ti
389
390 static
3913 void GEN3(SHqst_to_spat_mic,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, double *Vr, double *Vt, double *Vp, long int llim) {
392QX void GEN3(SH_to_spat_mic,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, double *Vr, long int llim) {
393 #ifndef SHT_GRAD
394VX void GEN3(SHsphtor_to_spat_mic,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, double *Vt, double *Vp, long int llim) {
395 #else
396S void GEN3(SHsph_to_spat_mic,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, double *Vt, double *Vp, long int llim) {
397T void GEN3(SHtor_to_spat_mic,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Tlm, double *Vt, double *Vp, long int llim) {
398 #endif
399
400 int k;
401 unsigned imlim = 0;
402Q cplx* BrF = (cplx*) Vr;
403V cplx* BtF = (cplx*) Vt; cplx* BpF = (cplx*) Vp;
404
405 #ifndef SHT_AXISYM
406 imlim = MTR;
407 #ifdef SHT_VAR_LTR
408 if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
409 #endif
410 if (shtns->fftc_mode > 0) { // alloc memory for the FFT
411 unsigned long nv = shtns->nspat;
412QX BrF = (cplx*) VMALLOC( nv * sizeof(double) );
413VX BtF = (cplx*) VMALLOC( 2*nv * sizeof(double) );
414VX BpF = BtF + nv/2;
4153 BrF = (cplx*) VMALLOC( 3*nv * sizeof(double) );
4163 BtF = BrF + nv/2; BpF = BrF + nv;
417 }
418 #endif
419 imlim += 1;
420
421 #pragma omp parallel num_threads(shtns->nthreads)
422 {
4233 GEN3(_sy3,NWAY,SUFFIX)(shtns, Qlm, Slm, Tlm, BrF, BtF, BpF, llim, imlim);
424QX GEN3(_sy1,NWAY,SUFFIX)(shtns, Qlm, BrF, llim, imlim);
425 #ifndef SHT_GRAD
426VX GEN3(_sy2,NWAY,SUFFIX)(shtns, Slm, Tlm, BtF, BpF, llim, imlim);
427 #else
428S GEN3(_sy1s,NWAY,SUFFIX)(shtns, Slm, BtF, BpF, llim, imlim);
429T GEN3(_sy1t,NWAY,SUFFIX)(shtns, Tlm, BtF, BpF, llim, imlim);
430 #endif
431 }
432
433 #ifndef SHT_AXISYM
434 // NPHI > 1 as SHT_AXISYM is not defined.
435 if (shtns->fftc_mode >= 0) {
436 if (shtns->fftc_mode == 0) {
437Q fftw_execute_dft(shtns->ifftc, (cplx *) BrF, (cplx *) Vr);
438V fftw_execute_dft(shtns->ifftc, (cplx *) BtF, (cplx *) Vt);
439V fftw_execute_dft(shtns->ifftc, (cplx *) BpF, (cplx *) Vp);
440 } else { // split dft
441Q fftw_execute_split_dft(shtns->ifftc,((double*)BrF)+1, ((double*)BrF), Vr+NPHI, Vr);
442V fftw_execute_split_dft(shtns->ifftc,((double*)BtF)+1, ((double*)BtF), Vt+NPHI, Vt);
443V fftw_execute_split_dft(shtns->ifftc,((double*)BpF)+1, ((double*)BpF), Vp+NPHI, Vp);
444Q VFREE(BrF);
445VX VFREE(BtF); // this frees also BpF.
446 }
447 }
448 #endif
449
450 }
Note: See TracBrowser for help on using the repository browser.