source: CIVL/examples/omp/shtns/sht_init.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 88.5 KB
RevLine 
[2131108]1/*
2 * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3 * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4 *
5 * nathanael.schaeffer@ujf-grenoble.fr
6 *
7 * This software is governed by the CeCILL license under French law and
8 * abiding by the rules of distribution of free software. You can use,
9 * modify and/or redistribute the software under the terms of the CeCILL
10 * license as circulated by CEA, CNRS and INRIA at the following URL
11 * "http://www.cecill.info".
12 *
13 * The fact that you are presently reading this means that you have had
14 * knowledge of the CeCILL license and that you accept its terms.
15 *
16 */
17
18/********************************************************************
19 * SHTns : Spherical Harmonic Transform for numerical simulations. *
20 * written by Nathanael Schaeffer / CNRS *
21 ********************************************************************/
22
23/** \internal \file SHT.c
24 * \brief main source file for SHTns.
25 * This files contains initialization code and also some partial transforms (point or latitudinal evaluations)
26 */
27
28#include <stdio.h>
29#include <string.h>
30// global variables definitions
31#include "sht_private.h"
32
33// cycle counter from FFTW
34#include "fftw3/cycle.h"
35
36// chained list of sht_setup : start with NULL
37shtns_cfg sht_data = NULL;
38#ifdef _OPENMP
39 int omp_threads = 1; // multi-thread disabled by default.
40 #if HAVE_LIBFFTW3_OMP
41 #define OMP_FFTW
42 #endif
43#else
44 #define omp_threads 1
45#endif
46
47static int verbose = 0; // runtime verbosity control: 0 no output, 1 output, 2 debug (if compiled in)
48void shtns_verbose(int v) {
49 verbose = v;
50}
51
52
53/// \internal Abort program with error message.
54static void shtns_runerr(const char * error_text)
55{
56 printf("*** [" PACKAGE_NAME "] Run-time error : %s\n",error_text);
57 exit(1);
58}
59
60/* PUBLIC useful functions */
61
62/// returns the l=0, m=0 SH coefficient corresponding to a uniform value of 1.
63double sh00_1(shtns_cfg shtns) {
64 return shtns->Y00_1;
65}
66/// returns the l=1, m=0 SH coefficient corresponding to cos(theta).
67double sh10_ct(shtns_cfg shtns) {
68 return shtns->Y10_ct;
69}
70/// returns the l=1, m=1 SH coefficient corresponding to sin(theta).cos(phi).
71double sh11_st(shtns_cfg shtns) {
72 return shtns->Y11_st;
73}
74/// returns the l,m SH coefficient corresponding to unit energy.
75double shlm_e1(shtns_cfg shtns, int l, int m) {
76 double x = shtns->Y00_1/sqrt(4.*M_PI);
77 if (SHT_NORM == sht_schmidt) x *= sqrt(2*l+1);
78 if ((m!=0)&&((shtns->norm & SHT_REAL_NORM)==0)) x *= sqrt(0.5);
79 return(x);
80}
81
82/// \code return (mmax+1)*(lmax+1) - mres*(mmax*(mmax+1))/2; \endcode */
83/// \ingroup init
84long nlm_calc(long lmax, long mmax, long mres)
85{
86 if (mmax*mres > lmax) mmax = lmax/mres;
87 return (mmax+1)*(lmax+1) - ((mmax*mres)*(mmax+1))/2; // this is wrong if lmax < mmax*mres
88}
89
90
91/* LEGENDRE FUNCTIONS */
92#include "sht_legendre.c"
93
94
95/* SHT FUNCTIONS */
96#include "sht_func.c"
97
98#include "sht_com.c"
99
100/*
101 INTERNAL INITIALIZATION FUNCTIONS
102*/
103
104// sht algorithms (hyb, fly1, ...)
105enum sht_algos { SHT_DCT, SHT_MEM, SHT_SV,
106 SHT_FLY1, SHT_FLY2, SHT_FLY3, SHT_FLY4, SHT_FLY6, SHT_FLY8,
107 SHT_OMP1, SHT_OMP2, SHT_OMP3, SHT_OMP4, SHT_OMP6, SHT_OMP8,
108 SHT_NALG };
109
110char* sht_name[SHT_NALG] = {"dct", "mem", "s+v", "fly1", "fly2", "fly3", "fly4", "fly6", "fly8", "omp1", "omp2", "omp3", "omp4", "omp6", "omp8" };
111char* sht_type[SHT_NTYP] = {"syn", "ana", "vsy", "van", "gsp", "gto", "v3s", "v3a" };
112char* sht_var[SHT_NVAR] = {"std", "ltr", "m" };
113int sht_npar[SHT_NTYP] = {2, 2, 4, 4, 3, 3, 6, 6};
114
115#ifdef SHTNS_MEM
116extern void* fmem[SHT_NTYP];
117extern void* fmem_l[SHT_NTYP];
118extern void* fmem_m0[SHT_NTYP];
119extern void* fmem_m0l[SHT_NTYP];
120#endif
121#ifdef SHTNS_DCT
122extern void* fdct[SHT_NTYP];
123extern void* fdct_l[SHT_NTYP];
124extern void* fdct_m0[SHT_NTYP];
125extern void* fdct_m0l[SHT_NTYP];
126#endif
127extern void* ffly[6][SHT_NTYP];
128extern void* ffly_m[6][SHT_NTYP];
129extern void* ffly_m0[6][SHT_NTYP];
130#ifdef _OPENMP
131extern void* fomp[6][SHT_NTYP];
132#endif
133
134// big array holding all sht functions, variants and algorithms
135void* sht_func[SHT_NVAR][SHT_NALG][SHT_NTYP];
136
137/// \internal use on-the-fly alogorithm (guess without measuring)
138static void set_sht_fly(shtns_cfg shtns, int typ_start)
139{
140 int algo = SHT_FLY2;
141 if ((shtns->nthreads > 1) && (sht_func[0][SHT_OMP2][typ_start])) algo = SHT_OMP2;
142 for (int it=typ_start; it<SHT_NTYP; it++) {
143 for (int v=0; v<SHT_NVAR; v++)
144 shtns->ftable[v][it] = sht_func[v][algo][it];
145 }
146}
147
148#ifdef SHTNS_MEM
149/// \internal choose memory algorithm everywhere.
150static void set_sht_mem(shtns_cfg shtns) {
151 for (int it=0; it<SHT_NTYP; it++) {
152 for (int v=0; v<SHT_NVAR; v++)
153 shtns->ftable[v][it] = sht_func[v][SHT_MEM][it];
154 shtns->ftable[SHT_M][it] = sht_func[SHT_M][SHT_FLY2][it]; // there is no "mem" algo for SHT_M
155 }
156}
157#endif
158
159/// \internal copy all algos to sht_func array (should be called by set_grid before choosing variants).
160/// if nphi is 1, axisymmetric algorithms are used.
161static void init_sht_array_func(shtns_cfg shtns)
162{
163 int it, j;
164 int alg_lim = SHT_FLY8;
165
166 if (shtns->nlat_2 < 8*VSIZE2) { // limit available on-the-fly algorithm to avoid overflow (and segfaults).
167 it = shtns->nlat_2 / VSIZE2;
168 switch(it) {
169 case 0 : alg_lim = SHT_FLY1-1; break;
170 case 1 : alg_lim = SHT_FLY1; break;
171 case 2 : alg_lim = SHT_FLY2; break;
172 case 3 : alg_lim = SHT_FLY3; break;
173 case 4 : ;
174 case 5 : alg_lim = SHT_FLY4; break;
175 default : alg_lim = SHT_FLY6;
176 }
177 }
178 alg_lim -= SHT_FLY1;
179
180 memset(sht_func, 0, SHT_NVAR*SHT_NTYP*SHT_NALG*sizeof(void*) ); // zero out.
181 sht_func[SHT_STD][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2;
182 sht_func[SHT_LTR][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2l;
183 sht_func[SHT_STD][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2;
184 sht_func[SHT_LTR][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2l;
185 sht_func[SHT_M][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2ml;
186 sht_func[SHT_M][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2ml;
187
188 if (shtns->nphi==1) { // axisymmetric transform requested.
189 for (int j=0; j<=alg_lim; j++) {
190 memcpy(sht_func[SHT_STD][SHT_FLY1 + j], &ffly_m0[j], sizeof(void*)*SHT_NTYP);
191 memcpy(sht_func[SHT_LTR][SHT_FLY1 + j], &ffly_m0[j], sizeof(void*)*SHT_NTYP);
192 }
193 #ifdef SHTNS_DCT
194 memcpy(sht_func[SHT_STD][SHT_DCT], &fdct_m0, sizeof(void*)*SHT_NTYP);
195 memcpy(sht_func[SHT_LTR][SHT_DCT], &fdct_m0l, sizeof(void*)*SHT_NTYP);
196 #endif
197 #ifdef SHTNS_MEM
198 memcpy(sht_func[SHT_STD][SHT_MEM], &fmem_m0, sizeof(void*)*SHT_NTYP);
199 memcpy(sht_func[SHT_LTR][SHT_MEM], &fmem_m0l, sizeof(void*)*SHT_NTYP);
200 #endif
201 } else {
202 for (int j=0; j<=alg_lim; j++) {
203 memcpy(sht_func[SHT_STD][SHT_FLY1 + j], &ffly[j], sizeof(void*)*SHT_NTYP);
204 memcpy(sht_func[SHT_LTR][SHT_FLY1 + j], &ffly[j], sizeof(void*)*SHT_NTYP);
205 memcpy(sht_func[SHT_M][SHT_FLY1 + j], &ffly_m[j], sizeof(void*)*SHT_NTYP);
206 #ifdef _OPENMP
207 memcpy(sht_func[SHT_STD][SHT_OMP1 + j], &fomp[j], sizeof(void*)*SHT_NTYP);
208 memcpy(sht_func[SHT_LTR][SHT_OMP1 + j], &fomp[j], sizeof(void*)*SHT_NTYP);
209 memcpy(sht_func[SHT_M][SHT_OMP1 + j], &ffly_m[j], sizeof(void*)*SHT_NTYP); // no omp algo for SHT_M, use fly instead
210 #endif
211 }
212 #ifdef SHTNS_DCT
213 memcpy(sht_func[SHT_STD][SHT_DCT], &fdct, sizeof(void*)*SHT_NTYP);
214 memcpy(sht_func[SHT_LTR][SHT_DCT], &fdct_l, sizeof(void*)*SHT_NTYP);
215 #endif
216 #ifdef SHTNS_MEM
217 memcpy(sht_func[SHT_STD][SHT_MEM], &fmem, sizeof(void*)*SHT_NTYP);
218 memcpy(sht_func[SHT_LTR][SHT_MEM], &fmem_l, sizeof(void*)*SHT_NTYP);
219 #endif
220 }
221
222 #ifdef SHTNS_MEM
223 set_sht_mem(shtns); // default transform is MEM
224 #else
225 set_sht_fly(shtns, 0);
226 #endif
227}
228
229
230/// \internal return the smallest power of 2 larger than n.
231static int next_power_of_2(int n)
232{
233 int f = 1;
234 if ( (n<=0) || (n>(1<<(sizeof(int)*8-2))) ) return 0;
235 while (f<n) f*=2;
236 return f;
237}
238
239/// \internal find the closest integer that is larger than n and that contains only factors up to fmax.
240/// fmax is 7 for optimal FFTW fourier transforms.
241/// return only even integers for n>fmax.
242static int fft_int(int n, int fmax)
243{
244 int k,f;
245
246 if (n<=fmax) return n;
247 if (fmax<2) return 0;
248 if (fmax==2) return next_power_of_2(n);
249
250 n -= 2-(n&1); // only even n
251 do {
252 n+=2; f=2;
253 while ((2*f <= n) && ((n&f)==0)) f *= 2; // no divisions for factor 2.
254 k=3;
255 while ((k<=fmax) && (k*f <= n)) {
256 while ((k*f <= n) && (n%(k*f)==0)) f *= k;
257 k+=2;
258 }
259 } while (f != n);
260
261 k = next_power_of_2(n); // what is the closest power of 2 ?
262 if ((k-n)*33 < n) return k; // rather choose power of 2 if not too far (3%)
263
264 return n;
265}
266
267
268/// \internal returns an aproximation of the memory usage in mega bytes for the scalar matrices.
269/// \ingroup init
270static double sht_mem_size(int lmax, int mmax, int mres, int nlat)
271{
272 double s = 1./(1024*1024);
273 s *= ((nlat+1)/2) * sizeof(double) * nlm_calc(lmax, mmax, mres);
274 return s;
275}
276
277/// \internal return the number of shtns config that contain a reference to the memory location *pp.
278static int ref_count(shtns_cfg shtns, void* pp)
279{
280 shtns_cfg s2 = sht_data;
281 void* p;
282 long int nref, offset;
283
284 if ((pp==NULL) || (shtns==NULL)) return -1; // error.
285
286 p = *(void**)pp; // the pointer to memory location that we want to free
287 if (p == NULL) return 0; // nothing to do.
288
289 offset = (char*)pp - (char*)shtns; // relative location in shtns_info structure.
290 nref = 0; // reference count.
291 while (s2 != NULL) { // scan all configs.
292 if ( *(void**)(((char*)s2) + offset) == p ) nref++;
293 s2 = s2->next;
294 }
295 return nref; // will be >= 1 (as shtns is included in search)
296}
297
298/// \internal check if the memory location *pp is referenced by another sht config before freeing it.
299/// returns the number of other references, or -1 on error. If >=1, the memory location could not be freed.
300/// If the return value is 0, the ressource has been freed.
301static int free_unused(shtns_cfg shtns, void* pp)
302{
303 int n = ref_count(shtns, pp); // howmany shtns config do reference this memory location ?
304 if (n <= 0) return n; // nothing to free.
305 if (n == 1) { // no other reference found...
306 void** ap = (void**) pp;
307 free(*ap); // ...so we can free it...
308 *ap = NULL; // ...and mark as unaloccated.
309 }
310 return (n-1);
311}
312
313#ifdef SHTNS_MEM
314/// \internal Free matrices if on-the-fly has been selected.
315static void free_unused_matrices(shtns_cfg shtns)
316{
317 long int marray_size = (MMAX+1)*sizeof(double) + (MIN_ALIGNMENT-1);
318 int count[SHT_NTYP];
319 int it, iv, ia, iv2;
320
321 for (it=0; it<SHT_NTYP; it++) {
322 count[it] = 0;
323 for (iv=0; iv<SHT_NVAR; iv++) {
324 void* fptr = shtns->ftable[iv][it];
325 if (fptr != NULL) {
326 for (ia=0; ia<=SHT_MEM; ia++) // count occurences to mem algo
327 for (iv2=0; iv2<SHT_NVAR; iv2++)
328 if (sht_func[iv2][ia][it] == fptr) count[it]++;
329 }
330 }
331 #if SHT_VERBOSE > 1
332 if (verbose>1) printf(" %d ",count[it]);
333 #endif
334 }
335
336 if (shtns->dylm == NULL) { // no vector transform : do not try to free
337 count[SHT_TYP_VAN] = -1; count[SHT_TYP_VSY] = -1;
338 }
339
340 if (count[SHT_TYP_3AN] == 0) { // analysis may be freed.
341 if (count[SHT_TYP_VAN] == 0) {
342 PRINT_VERB("freeing vector analysis matrix\n");
343 free_unused(shtns, &shtns->dzlm);
344 }
345 if (count[SHT_TYP_SAN] == 0) {
346 PRINT_VERB("freeing scalar analysis matrix\n");
347 free_unused(shtns, &shtns->zlm);
348 }
349 } else if (shtns->mmax > 0) { // scalar may be reduced to m=0
350 if ((count[SHT_TYP_SAN] == 0) && (ref_count(shtns, &shtns->zlm) == 1)) {
351 PRINT_VERB("keeping scalar analysis matrix up to m=0\n");
352 shtns->zlm = realloc(shtns->zlm, ((LMAX+1)*NLAT_2 +3)*sizeof(double) + marray_size );
353 }
354 }
355 if (count[SHT_TYP_3SY] == 0) { // synthesis may be freed.
356 if (count[SHT_TYP_VSY] + count[SHT_TYP_GSP] + count[SHT_TYP_GTO] == 0) {
357 PRINT_VERB("freeing vector synthesis matrix\n");
358 free_unused(shtns, &shtns->dylm);
359 }
360 if (count[SHT_TYP_SSY] == 0) {
361 PRINT_VERB("freeing scalar synthesis matrix\n");
362 free_unused(shtns, &shtns->ylm);
363 }
364 } else if (shtns->mmax > 0) { // scalar may be reduced to m=0
365 if ((count[SHT_TYP_SSY] == 0) && (ref_count(shtns, &shtns->ylm) == 1)) {
366 PRINT_VERB("keeping scalar synthesis matrix up to m=0\n");
367 shtns->ylm = realloc(shtns->ylm, (LMAX+2)*NLAT_2*sizeof(double) + marray_size );
368 }
369 }
370}
371#endif
372
373/// \internal allocate arrays for SHT related to a given grid.
374static void alloc_SHTarrays(shtns_cfg shtns, int on_the_fly, int vect, int analys)
375{
376 long int im, l0;
377 long int size, marray_size, lstride;
378
379 im = (VSIZE2 > 2) ? VSIZE2 : 2;
380 l0 = ((NLAT+im-1)/im)*im; // align on vector
381 shtns->ct = (double *) VMALLOC( sizeof(double) * l0*3 ); /// ct[] (including st and st_1)
382 shtns->st = shtns->ct + l0; shtns->st_1 = shtns->ct + 2*l0;
383
384 shtns->ylm = NULL; shtns->dylm = NULL; // synthesis
385 shtns->zlm = NULL; shtns->dzlm = NULL; // analysis
386
387 if (on_the_fly == 0) { // Allocate legendre functions lookup tables.
388 marray_size = (MMAX+1)*sizeof(double*) + (MIN_ALIGNMENT-1); // for sse2 alignement
389
390 /* ylm */
391 lstride = (LMAX+1); lstride += (lstride&1); // even stride.
392 size = sizeof(double) * ((NLM-(LMAX+1)+lstride)*NLAT_2);
393 if (MMAX == 0) size += 3*sizeof(double); // some overflow needed.
394 shtns->ylm = (double **) malloc( marray_size + size );
395 shtns->ylm[0] = (double *) PTR_ALIGN( shtns->ylm + (MMAX+1) );
396 if (MMAX>0) shtns->ylm[1] = shtns->ylm[0] + NLAT_2*lstride;
397 for (im=1; im<MMAX; im++) shtns->ylm[im+1] = shtns->ylm[im] + NLAT_2*(LMAX+1-im*MRES);
398 /* dylm */
399 if (vect) {
400 lstride = LMAX; lstride += (lstride&1); // even stride.
401 size = sizeof(struct DtDp) * ((NLM-(LMAX+1) +lstride/2)*NLAT_2);
402 if (MMAX == 0) size += 2*sizeof(double); // some overflow needed.
403 shtns->dylm = (struct DtDp **) malloc( marray_size + size );
404 shtns->dylm[0] = (struct DtDp *) PTR_ALIGN( shtns->dylm + (MMAX+1) );
405 if (MMAX>0) shtns->dylm[1] = shtns->dylm[0] + (lstride/2)*NLAT_2; // phi-derivative is zero for m=0
406 for (im=1; im<MMAX; im++) shtns->dylm[im+1] = shtns->dylm[im] + NLAT_2*(LMAX+1-im*MRES);
407 }
408 if (analys) {
409 /* zlm */
410 size = sizeof(double) * (NLM*NLAT_2 + (NLAT_2 & 1));
411 if (MMAX == 0) size += 2*(NLAT_2 & 1)*((LMAX+1) & 1) * sizeof(double);
412 shtns->zlm = (double **) malloc( marray_size + size );
413 shtns->zlm[0] = (double *) PTR_ALIGN( shtns->zlm + (MMAX+1) );
414 if (MMAX>0) shtns->zlm[1] = shtns->zlm[0] + NLAT_2*(LMAX+1) + (NLAT_2&1);
415 for (im=1; im<MMAX; im++) shtns->zlm[im+1] = shtns->zlm[im] + NLAT_2*(LMAX+1-im*MRES);
416 /* dzlm */
417 if (vect) {
418 size = sizeof(struct DtDp)* (NLM-1)*NLAT_2; // remove l=0
419 shtns->dzlm = (struct DtDp **) malloc( marray_size + size );
420 shtns->dzlm[0] = (struct DtDp *) PTR_ALIGN( shtns->dzlm + (MMAX+1) );
421 if (MMAX>0) shtns->dzlm[1] = shtns->dzlm[0] + NLAT_2*(LMAX);
422 for (im=1; im<MMAX; im++) shtns->dzlm[im+1] = shtns->dzlm[im] + NLAT_2*(LMAX+1-im*MRES);
423 }
424 }
425 }
426 #if SHT_VERBOSE > 1
427 if (verbose>1) printf(" Memory used for Ylm and Zlm matrices = %.3f Mb x2\n",3.0*sizeof(double)*NLM*NLAT_2/(1024.*1024.));
428 #endif
429}
430
431
432/// \internal free arrays allocated by init_SH_dct
433static void free_SH_dct(shtns_cfg shtns)
434{
435 if (shtns->zlm_dct0 == NULL) return;
436
437 if (ref_count(shtns, &shtns->dzlm_dct0) == 1) VFREE(shtns->dzlm_dct0);
438 if (ref_count(shtns, &shtns->zlm_dct0) == 1) VFREE(shtns->zlm_dct0);
439 shtns->dzlm_dct0 = NULL; shtns->zlm_dct0 = NULL;
440
441 free_unused(shtns, &shtns->dykm_dct);
442 free_unused(shtns, &shtns->ykm_dct);
443
444 if (ref_count(shtns, &shtns->idct) == 1) fftw_destroy_plan(shtns->idct); // free unused dct plans
445 if (ref_count(shtns, &shtns->dct_m0) == 1) fftw_destroy_plan(shtns->dct_m0);
446 shtns->idct = NULL; shtns->dct_m0 = NULL;
447}
448
449/// \internal free arrays allocated by alloc_SHTarrays.
450static void free_SHTarrays(shtns_cfg shtns)
451{
452 free_SH_dct(shtns);
453 free_unused(shtns, &shtns->ylm);
454 free_unused(shtns, &shtns->dylm);
455 free_unused(shtns, &shtns->zlm);
456 free_unused(shtns, &shtns->dzlm);
457
458 if (ref_count(shtns, &shtns->ct) == 1) VFREE(shtns->ct);
459 shtns->ct = NULL; shtns->st = NULL;
460
461 if (ref_count(shtns, &shtns->fft) == 1) fftw_destroy_plan(shtns->fft);
462 if (ref_count(shtns, &shtns->ifft) == 1) fftw_destroy_plan(shtns->ifft);
463 shtns->fft = NULL; shtns->ifft = NULL; shtns->ncplx_fft = -1; // no fft
464}
465
466#ifndef HAVE_FFTW_COST
467 // substitute undefined symbol in mkl and fftw older than 3.3
468 #define fftw_cost(a) 0.0
469#endif
470
471/// \internal initialize FFTs using FFTW.
472/// \param[in] layout defines the spatial layout (see \ref spat).
473/// \param[in] on_the_fly is one, if only on-the-fly transform are considered.
474/// returns the number of double to be allocated for a spatial field.
475static void planFFT(shtns_cfg shtns, int layout, int on_the_fly)
476{
477 double cost_fft_ip, cost_fft_oop, cost_ifft_ip, cost_ifft_oop;
478 cplx *ShF;
479 double *Sh;
480 fftw_plan fft2, ifft2, fft, ifft;
481 int nfft, ncplx, nreal;
482 int theta_inc, phi_inc, phi_embed;
483 #ifdef HAVE_FFTW_COST
484 int in_place = 1; // try to use in-place real fft.
485 #else
486 int in_place = 0; // do not try to use in-place real fft if no timing data available.
487 #endif
488
489 if (NPHI <= 2*MMAX) shtns_runerr("the sampling condition Nphi > 2*Mmax is not met.");
490
491 #ifdef OMP_FFTW
492 if ((shtns->fftw_plan_mode & (FFTW_EXHAUSTIVE | FFTW_PATIENT)) && (omp_threads > 1)) {
493 shtns->fftw_plan_mode = FFTW_PATIENT;
494 fftw_plan_with_nthreads(omp_threads);
495 } else fftw_plan_with_nthreads(shtns->nthreads);
496 #endif
497
498 shtns->k_stride_a = 1; shtns->m_stride_a = NLAT; // default strides
499
500 shtns->fft = NULL; shtns->ifft = NULL;
501 shtns->dct_m0 = NULL; shtns->idct = NULL; // set dct plans to uninitialized.
502
503 shtns->nspat = NPHI * NLAT; // default spatial size
504
505 if (NPHI==1) // no FFT needed.
506 {
507 shtns->fftc_mode = -1; // no FFT
508 #if SHT_VERBOSE > 0
509 if (verbose) printf(" => no fft : Mmax=0, Nphi=1, Nlat=%d\n",NLAT);
510 #endif
511 shtns->ncplx_fft = -1; // no fft.
512 return;
513 }
514
515 /* NPHI > 1 */
516 theta_inc=1; phi_inc=NLAT; phi_embed=2*(NPHI/2+1); // SHT_NATIVE_LAYOUT is the default.
517 if (layout & SHT_THETA_CONTIGUOUS) { theta_inc=1; phi_inc=NLAT; phi_embed=NPHI; }
518 if (layout & SHT_PHI_CONTIGUOUS) { phi_inc=1; theta_inc=NPHI; phi_embed=NPHI; }
519 nfft = NPHI;
520 ncplx = NPHI/2 +1;
521 nreal = phi_embed;
522 if ((theta_inc != 1)||(phi_inc != NLAT)||(nreal < 2*ncplx)) in_place = 0; // we need to do the fft out-of-place.
523
524 #if SHT_VERBOSE > 0
525 if (verbose) {
526 printf(" => using FFTW : Mmax=%d, Nphi=%d, Nlat=%d (data layout : phi_inc=%d, theta_inc=%d, phi_embed=%d)\n",MMAX,NPHI,NLAT,phi_inc,theta_inc,phi_embed);
527 if (NPHI <= (SHT_NL_ORDER+1)*MMAX) printf(" !! Warning : anti-aliasing condition Nphi > %d*Mmax is not met !\n", SHT_NL_ORDER+1);
528 if (NPHI != fft_int(NPHI,7)) printf(" !! Warning : Nphi is not optimal for FFTW !\n");
529 }
530 #endif
531
532// Allocate dummy Spatial Fields.
533 ShF = (cplx *) VMALLOC(ncplx * NLAT * sizeof(cplx));
534 Sh = (double *) VMALLOC(ncplx * NLAT * sizeof(cplx));
535 fft = NULL; ifft = NULL; fft2 = NULL; ifft2 = NULL;
536
537// complex fft for fly transform is a bit different.
538 if (layout & SHT_PHI_CONTIGUOUS) { // out-of-place split dft
539 fftw_iodim dim, many;
540 shtns->fftc_mode = 1;
541 //default internal
542 dim.n = NPHI; dim.os = 1; dim.is = NLAT; // complex transpose
543 many.n = NLAT/2; many.os = 2*NPHI; many.is = 2;
544 shtns->ifftc = fftw_plan_guru_split_dft(1, &dim, 1, &many, ((double*)ShF)+1, (double*)ShF, Sh+NPHI, Sh, shtns->fftw_plan_mode);
545
546 // legacy analysis fft
547 //dim.n = NPHI; dim.is = 1; dim.os = NLAT;
548 //many.n = NLAT/2; many.is = 2*NPHI; many.os = 2;
549 // new internal
550 dim.n = NPHI; dim.is = 1; dim.os = 2; // split complex, but without global transpose (faster).
551 many.n = NLAT/2; many.is = 2*NPHI; many.os = 2*NPHI;
552 shtns->fftc = fftw_plan_guru_split_dft(1, &dim, 1, &many, Sh+NPHI, Sh, ((double*)ShF)+1, (double*)ShF, shtns->fftw_plan_mode);
553 shtns->k_stride_a = NPHI; shtns->m_stride_a = 2;
554
555 #if SHT_VERBOSE > 1
556 if (verbose>1) {
557 printf(" fftw cost ifftc=%lg, fftc=%lg ",fftw_cost(shtns->ifftc), fftw_cost(shtns->fftc)); fflush(stdout);
558 }
559 #endif
560 } else { //if (layout & SHT_THETA_CONTIGUOUS) { // use only in-place here, supposed to be faster.
561 shtns->fftc_mode = 0;
562 shtns->ifftc = fftw_plan_many_dft(1, &nfft, NLAT/2, ShF, &nfft, NLAT/2, 1, ShF, &nfft, NLAT/2, 1, FFTW_BACKWARD, shtns->fftw_plan_mode);
563 shtns->fftc = shtns->ifftc; // same thing, with m>0 and m<0 exchanged.
564 #if SHT_VERBOSE > 1
565 if (verbose>1) {
566 printf(" fftw cost ifftc=%lg ",fftw_cost(shtns->ifftc)); fflush(stdout);
567 }
568 #endif
569 }
570
571#if _GCC_VEC_
572 if (on_the_fly == 0)
573#endif
574 { // the real ffts are required if _GCC_VEC_ == 0 or if on_the_fly is zero.
575// IFFT : unnormalized. FFT : must be normalized.
576 cost_fft_ip = 0.0; cost_ifft_ip = 0.0; cost_fft_oop = 0.0; cost_ifft_oop = 0.0;
577 if (in_place) { // in-place FFT (if allowed)
578 ifft2 = fftw_plan_many_dft_c2r(1, &nfft, NLAT, ShF, &ncplx, NLAT, 1, (double*) ShF, &nreal, phi_inc, theta_inc, shtns->fftw_plan_mode);
579 #if SHT_VERBOSE > 1
580 if (verbose>1) {
581 printf(" in-place cost : ifft=%lg ",fftw_cost(ifft2)); fflush(stdout);
582 }
583 #endif
584 if (ifft2 != NULL) {
585 fft2 = fftw_plan_many_dft_r2c(1, &nfft, NLAT, (double*) ShF, &nreal, phi_inc, theta_inc, ShF, &ncplx, NLAT, 1, shtns->fftw_plan_mode);
586 #if SHT_VERBOSE > 1
587 if (verbose>1) {
588 printf("fft=%lg\n",fftw_cost(fft2)); fflush(stdout);
589 }
590 #endif
591 if (fft2 != NULL) {
592 cost_fft_ip = fftw_cost(fft2); cost_ifft_ip = fftw_cost(ifft2);
593 }
594 }
595 }
596 if ( (in_place == 0) || (cost_fft_ip * cost_ifft_ip > 0.0) )
597 { // out-of-place FFT
598 ifft = fftw_plan_many_dft_c2r(1, &nfft, NLAT, ShF, &ncplx, NLAT, 1, Sh, &nreal, phi_inc, theta_inc, shtns->fftw_plan_mode);
599 #if SHT_VERBOSE > 1
600 if (verbose>1) { printf(" oop cost : ifft=%lg ",fftw_cost(ifft)); fflush(stdout); }
601 #endif
602 if (ifft == NULL) shtns_runerr("[FFTW] ifft planning failed !");
603 fft = fftw_plan_many_dft_r2c(1, &nfft, NLAT, Sh, &nreal, phi_inc, theta_inc, ShF, &ncplx, NLAT, 1, shtns->fftw_plan_mode);
604 #if SHT_VERBOSE > 1
605 if (verbose>1) { printf("fft=%lg\n",fftw_cost(fft)); fflush(stdout); }
606 #endif
607 if (fft == NULL) shtns_runerr("[FFTW] fft planning failed !");
608 cost_fft_oop = fftw_cost(fft); cost_ifft_oop = fftw_cost(ifft);
609 }
610 if ( (cost_fft_ip * cost_ifft_ip > 0.0) && (cost_fft_oop * cost_ifft_oop > 0.0) ) { // both have been succesfully timed.
611 if ( cost_fft_oop + SHT_NL_ORDER*cost_ifft_oop < cost_fft_ip + SHT_NL_ORDER*cost_ifft_ip )
612 in_place = 0; // disable in-place, because out-of-place is faster.
613 }
614
615 if (in_place) {
616 /* IN-PLACE FFT */
617 if (fft != NULL) fftw_destroy_plan(fft);
618 if (ifft != NULL) fftw_destroy_plan(ifft);
619 fft = fft2; ifft = ifft2;
620 shtns->ncplx_fft = 0; // fft is done in-place, no allocation needed.
621 shtns->nspat = phi_embed * NLAT; // more space must be reserved for inplace ffts.
622 } else {
623 /* OUT-OF-PLACE FFT */
624 if (fft2 != NULL) fftw_destroy_plan(fft2); // use OUT-OF-PLACE FFT
625 if (ifft2 != NULL) fftw_destroy_plan(ifft2);
626 shtns->ncplx_fft = ncplx * NLAT; // fft is done out-of-place, store allocation size.
627 #if SHT_VERBOSE > 1
628 if (verbose>1) printf(" ** out-of-place fft **\n");
629 #endif
630 }
631 shtns->fft = fft; shtns->ifft = ifft;
632 }
633 VFREE(Sh); VFREE(ShF);
634
635 #if SHT_VERBOSE > 2
636 if (verbose>2) {
637 printf(" *** fft plan :\n");
638 fftw_print_plan(fft);
639 printf("\n *** ifft plan :\n");
640 fftw_print_plan(ifft);
641 printf("\n");
642 }
643 #endif
644}
645
646#ifdef SHTNS_DCT
647/// \internal initialize DCTs using FFTW. Must be called if MTR_DCT is changed.
648static int planDCT(shtns_cfg shtns)
649{
650 double *Sh;
651 int ndct = NLAT;
652 fftw_r2r_kind r2r_kind;
653 fftw_iodim dims, hdims[2];
654 double Sh0[NLAT] SSE; // temp storage on the stack, aligned.
655
656 #ifdef OMP_FFTW
657 if (shtns->fftw_plan_mode & (FFTW_EXHAUSTIVE | FFTW_PATIENT)) {
658 fftw_plan_with_nthreads(omp_threads);
659 } else fftw_plan_with_nthreads(1);
660 #endif
661 // Allocate dummy Spatial Fields.
662 Sh = (double *) VMALLOC((NPHI/2 +1) * NLAT*2 * sizeof(double));
663
664 if (shtns->dct_m0 == NULL) { // allocate only once since it does not change.
665 int stride = (NPHI>1) ? 2 : 1;
666 r2r_kind = FFTW_REDFT10; // out-of-place.
667 shtns->dct_m0 = fftw_plan_many_r2r(1, &ndct, 1, Sh, &ndct, stride, stride*NLAT, Sh0, &ndct, 1, NLAT, &r2r_kind, shtns->fftw_plan_mode); // out-of-place.
668 if (shtns->dct_m0 == NULL) return 0; // dct_m0 planning failed.
669 #if SHT_VERBOSE > 2
670 if (verbose>2) {
671 printf(" *** dct_m0 plan :\n"); fftw_print_plan(shtns->dct_m0); printf("\n");
672 }
673 #endif
674 }
675
676 if (NPHI > 1) { // complex data for NPHI>1, recompute as it does depend on MTR_DCT
677 if (shtns->idct != NULL) fftw_destroy_plan(shtns->idct);
678
679 dims.n = NLAT; dims.is = 2; dims.os = 2; // real and imaginary part.
680 hdims[0].n = MTR_DCT+1; hdims[0].is = 2*NLAT; hdims[0].os = 2*NLAT;
681 hdims[1].n = 2; hdims[1].is = 1; hdims[1].os = 1;
682 r2r_kind = FFTW_REDFT01;
683 shtns->idct = fftw_plan_guru_r2r(1, &dims, 2, hdims, Sh, Sh, &r2r_kind, shtns->fftw_plan_mode);
684 } else { // NPHI == 1
685 if (shtns->idct == NULL) {
686 r2r_kind = FFTW_REDFT01;
687 shtns->idct = fftw_plan_many_r2r(1, &ndct, 1, Sh, &ndct, 1, NLAT, Sh, &ndct, 1, NLAT, &r2r_kind, shtns->fftw_plan_mode);
688 }
689 }
690
691 VFREE(Sh);
692 if (shtns->idct == NULL) return 0; // idct planning failed.
693 #if SHT_VERBOSE > 2
694 if (verbose>2) {
695 printf(" *** idct plan :\n"); fftw_print_plan(shtns->idct); printf("\n");
696 }
697 #endif
698 return 1; // success.
699}
700
701/// \internal SET MTR_DCT and updates fftw_plan for DCT's
702static int Set_MTR_DCT(shtns_cfg shtns, int m)
703{
704 if ((shtns->zlm_dct0 == NULL)||(m == MTR_DCT)) return MTR_DCT;
705 if ( m < 0 ) { // don't use dct
706 shtns->mtr_dct = -1;
707 } else {
708 if (m>MMAX) m=MMAX;
709 shtns->mtr_dct = m;
710 if (planDCT(shtns) == 0)
711 shtns->mtr_dct = -1; // failure
712 }
713 return MTR_DCT;
714}
715
716/// \internal returns the m-truncation of DCT part of synthesis
717static int Get_MTR_DCT(shtns_cfg shtns) {
718 return MTR_DCT;
719}
720#endif /* SHTNS_DCT */
721
722
723/// \internal Sets the value tm[im] used for polar optimiation on-the-fly.
724static void PolarOptimize(shtns_cfg shtns, double eps)
725{
726 int im, m, l, it;
727 double v;
728 double y[LMAX+1];
729
730 for (im=0;im<=MMAX;im++) shtns->tm[im] = 0;
731
732 if (eps > 0.0) {
733 for (im=1;im<=MMAX;im++) {
734 m = im*MRES;
735 it = shtns->tm[im-1] -1; // tm[im] is monotonic.
736 do {
737 it++;
738 legendre_sphPlm_array(shtns, LMAX, im, shtns->ct[it], y+m);
739 v = 0.0;
740 for (l=m; l<=LMAX; l++) {
741 double ya = fabs(y[l]);
742 if ( v < ya ) v = ya;
743 }
744 } while (v < eps);
745 shtns->tm[im] = it;
746 }
747 #if SHT_VERBOSE > 0
748 if (verbose) printf(" + polar optimization threshold = %.1e\n",eps);
749 #endif
750 #if SHT_VERBOSE > 1
751 if (verbose>1) {
752 printf(" tm[im]=");
753 for (im=0;im<=MMAX;im++)
754 printf(" %d",shtns->tm[im]);
755 printf("\n");
756 }
757 #endif
758 }
759}
760
761#ifdef SHTNS_MEM
762
763/// \internal Perform some optimization on the SHT matrices.
764static void OptimizeMatrices(shtns_cfg shtns, double eps)
765{
766 unsigned short *tm;
767 double **ylm, **zlm;
768 struct DtDp** dylm;
769 struct DtDp** dzlm;
770 int im,m,l,it;
771 int vector = (shtns->dylm != NULL);
772
773 tm = shtns->tm;
774 ylm = shtns->ylm; dylm = shtns->dylm;
775 zlm = shtns->zlm; dzlm = shtns->dzlm;
776
777 for (im=0;im<=MMAX;im++) tm[im] = 0;
778/// POLAR OPTIMIZATION : analyzing coefficients, some can be safely neglected.
779 if (eps > 0.0) {
780 for (im=1;im<=MMAX;im++) {
781 m = im*MRES;
782 tm[im] = NLAT_2;
783 for (l=m;l<=LMAX;l++) {
784 it = tm[im-1]; // tm[im] is monotonic.
785 while( fabs(ylm[im][it*(LMAX-m+1) + (l-m)]) < eps ) { it++; }
786 if (tm[im] > it) tm[im] = it;
787 }
788 }
789#if SHT_VERBOSE > 0
790 if (verbose) printf(" + polar optimization threshold = %.1e\n",eps);
791#endif
792#if SHT_VERBOSE > 1
793 if (verbose>1) {
794 printf(" tm[im]=");
795 for (im=0;im<=MMAX;im++)
796 printf(" %d",tm[im]);
797 printf("\n");
798 }
799#endif
800
801 for (im=1; im<=MMAX; im++) { // im >= 1
802 if (tm[im] > 0) { // we can remove the data corresponding to polar values.
803 m = im*MRES;
804 ylm[im] += tm[im]*(LMAX-m+1); // shift pointers (still one block for each m)
805 if (vector) dylm[im] += tm[im]*(LMAX-m+1);
806 if (zlm[0] != NULL) {
807 for (l=m; l<LMAX; l+=2) {
808 for (it=0; it<NLAT_2-tm[im]; it++) { // copy data to avoid cache misses.
809 zlm[im][(l-m)*(NLAT_2-tm[im]) + it*2] = zlm[im][(l-m)*NLAT_2 + (it+tm[im])*2];
810 zlm[im][(l-m)*(NLAT_2-tm[im]) + it*2+1] = zlm[im][(l-m)*NLAT_2 + (it+tm[im])*2+1];
811 if (vector) {
812 dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2].t = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2].t;
813 dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2].p = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2].p;
814 dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2+1].t = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2+1].t;
815 dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2+1].p = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2+1].p;
816 }
817 }
818 }
819 if (l==LMAX) {
820 for (it=0; it<NLAT_2-tm[im]; it++) {
821 zlm[im][(l-m)*(NLAT_2-tm[im]) + it] = zlm[im][(l-m)*NLAT_2 + (it+tm[im])];
822 if (vector) {
823 dzlm[im][(l-m)*(NLAT_2-tm[im]) + it].t = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])].t;
824 dzlm[im][(l-m)*(NLAT_2-tm[im]) + it].p = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])].p;
825 }
826 }
827 }
828 }
829 }
830 }
831 }
832
833/// Compression of dzlm for m=0, as .p is 0
834 if ((vector) && (dzlm[0] != NULL)) { // for sht_reg_poles there is no dzlm defined.
835 im=0; m=0;
836 double* yg = (double *) dzlm[im];
837 for (l=1; l<LMAX; l+=2) { // l=0 is zero, so we start at l=1.
838 for (it=0; it<NLAT_2; it++) {
839 yg[(l-1)*NLAT_2 + it*2] = dzlm[im][(l-1)*NLAT_2 + it*2].t; // l
840 yg[(l-1)*NLAT_2 + it*2+1] = dzlm[im][(l-1)*NLAT_2 + it*2+1].t; // l+1
841 }
842 }
843 if (l==LMAX) { // last l is stored right away, without interleaving.
844 for (it=0; it<NLAT_2; it++) {
845 yg[(l-1)*NLAT_2 + it] = dzlm[im][(l-1)*NLAT_2 + it].t; // l (odd)
846 }
847 }
848 }
849 if ((zlm[0] != NULL) && (NLAT_2 & 1)) {
850 zlm[0][NLAT_2] = 0.0; // take care to write 0.0 for this sse2 padding value.
851 if ( (MMAX==0) && (!(LMAX & 1)) ) { // NLAT_2 odd + LMAX even
852 zlm[0][(LMAX+1)*NLAT_2 +1] = 0.0; // overflow for im=0
853 zlm[0][(LMAX+1)*NLAT_2 +2] = 0.0;
854 }
855 }
856}
857
858// how to access the ylm matrix for im=0
859#define YL0(it, l) ( shtns->ylm[0][ (it)*(((LMAX+2)>>1)*2) + (l) ] )
860#define DYL0(it, l) ( ((double*)(shtns->dylm[0]))[ (it)*(((LMAX+1)>>1)*2) + (l-1) ] )
861
862#define YLM(it, l, im) ( (im==0) ? YL0(it,l) : shtns->ylm[im][(it)*(LMAX-(im)*MRES+1) + ((l)-(im)*MRES)] )
863#define DTYLM(it, l, im) ( (im==0) ? ( (l==0) ? 0.0 : DYL0(it,l) ) : shtns->dylm[im][(it)*(LMAX-(im)*MRES+1) + ((l)-(im)*MRES)].t )
864#define DPYLM(it, l, im) ( (im==0) ? 0.0 : shtns->dylm[im][(it)*(LMAX-(im)*MRES+1) + ((l)-(im)*MRES)].p )
865
866/// \internal Precompute the matrix for SH synthesis.
867static void init_SH_synth(shtns_cfg shtns)
868{
869 double *yl, *dyl; // temp storage for derivative for legendre function values.
870 long int it,im,m,l;
871 double *ct = shtns->ct;
872 double *st = shtns->st;
873 int vector = (shtns->dylm != NULL);
874
875 yl = (double*) malloc( 2*sizeof(double) *(LMAX+1) );
876 dyl = yl + (LMAX+1);
877
878 { im=0; m=0;
879 for (it=0; it<NLAT_2; it++) {
880 legendre_sphPlm_deriv_array_hp(shtns, LMAX, im, ct[it], st[it], yl, dyl); // fixed im legendre functions lookup table.
881 for (l=0; l<=LMAX; l++) YL0(it, l) = yl[l];
882 for (l=LMAX+1; l<=LMAX+3; l++) YL0(it, l) = 0.0; // allow overflow.
883 if (vector) {
884 for (l=1; l<=LMAX; l++) DYL0(it, l) = dyl[l];
885 for (l=LMAX+1; l<=LMAX+2; l++) DYL0(it, l) = 0.0; // allow overflow.
886 }
887 }
888 }
889 for (im=1; im<=MMAX; im++) {
890 double *ylm = shtns->ylm[im];
891 struct DtDp* dylm = vector ? shtns->dylm[im] : NULL;
892 m = im*MRES;
893 for (it=0; it<NLAT_2; it++) {
894 legendre_sphPlm_deriv_array_hp(shtns, LMAX, im, ct[it], st[it], yl, dyl); // fixed im legendre functions lookup table.
895 for (l=m; l<=LMAX; l++) {
896 ylm[it*(LMAX-m+1) + (l-m)] = yl[l-m] * st[it];
897 if (vector) {
898 dylm[it*(LMAX-m+1) + (l-m)].t = dyl[l-m];
899 dylm[it*(LMAX-m+1) + (l-m)].p = yl[l-m] *m; // 1/sint(t) dYlm/dphi
900 }
901 }
902 }
903 }
904 free(yl);
905}
906
907
908/// \internal Precompute matrices for SH synthesis and analysis, on a Gauss-Legendre grid.
909static void init_SH_gauss(shtns_cfg shtns)
910{
911 long int it,im,m,l;
912 int vector = (shtns->dylm != NULL);
913
914 init_SH_synth(shtns);
915
916// for analysis (decomposition, direct transform) : transpose and multiply by gauss weight and other normalizations.
917// interleave l and l+1 : this stores data in the way it will be read.
918 double **zlm = shtns->zlm;
919 struct DtDp** dzlm = shtns->dzlm;
920 for (im=0; im<=MMAX; im++) {
921 m = im*MRES;
922 long int talign = 0;
923
924 for (it=0;it<NLAT_2;it++) {
925 double norm = shtns->wg[it];
926 if ( (m>0) && (shtns->norm & SHT_REAL_NORM) ) norm *= 2; // "Real" norm : zlm must be doubled for m>0
927 long int l0 = m;
928 if (m==0) {
929 zlm[im][it] = YL0(it, 0) * norm;
930 // les derivees sont nulles pour l=0
931 l0++;
932 talign = (NLAT_2&1);
933 }
934 for (l=l0; l<LMAX; l+=2) {
935 double nz0 = norm; double nz1 = norm;
936 if (SHT_NORM == sht_schmidt) {
937 nz0 *= (2*l+1); nz1 *= (2*l+3);
938 }
939 zlm[im][(l-m)*NLAT_2 + it*2 +talign] = YLM(it, l, im) * nz0;
940 zlm[im][(l-m)*NLAT_2 + it*2 +1 +talign] = YLM(it, l+1, im) * nz1;
941 if (vector) {
942 nz0 *= shtns->l_2[l]; nz1 *= shtns->l_2[l+1];
943 dzlm[im][(l-l0)*NLAT_2 + it*2].t = DTYLM(it, l, im) * nz0;
944 dzlm[im][(l-l0)*NLAT_2 + it*2].p = DPYLM(it, l, im) * nz0;
945 dzlm[im][(l-l0)*NLAT_2 + it*2+1].t = DTYLM(it, l+1, im) * nz1;
946 dzlm[im][(l-l0)*NLAT_2 + it*2+1].p = DPYLM(it, l+1, im) * nz1;
947 }
948 }
949 if (l==LMAX) { // last l is stored right away, without interleaving.
950 double nz0 = norm;
951 if (SHT_NORM == sht_schmidt) nz0 *= (2*l+1);
952 zlm[im][(l-m)*NLAT_2 + it +talign] = YLM(it, l, im) * nz0;
953 if (vector) {
954 nz0 *= shtns->l_2[l];
955 dzlm[im][(l-l0)*NLAT_2 + it].t = DTYLM(it, l, im) * nz0;
956 dzlm[im][(l-l0)*NLAT_2 + it].p = DPYLM(it, l, im) * nz0;
957 }
958 }
959 }
960 }
961}
962
963#endif
964
965#ifdef SHTNS_DCT
966
967static void init_SH_dct_m(shtns_cfg shtns, double* is1, fftw_plan dct, fftw_plan idct, int analysis, const int m0, const int mstep)
968{
969 double *yk, *yk0, *dyk0, *yg; // temp storage
970 struct DtDp *dyg, *dyk;
971 int it,im,m,l;
972 double Z[2*NLAT_2] SSE;
973 double dZt[2*NLAT_2] SSE;
974 double dZp[2*NLAT_2] SSE; // equally spaced theta points.
975
976 double *st = shtns->st;
977 double *st_1 = shtns->st_1;
978 const int vector = (shtns->dylm != NULL);
979 const int KMAX = LMAX+1;
980
981 real iylm_fft_norm = 1.0; // FFT/SHT normalization for zlm (4pi normalized)
982 if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) iylm_fft_norm = 4*M_PIl; // FFT/SHT normalization for zlm (orthonormalized)
983 iylm_fft_norm /= (2*NPHI*NLAT_2);
984
985// Even/Odd symmetry : ylm is even or odd across equator, as l-m is even or odd => only NLAT_2 points required.
986 // temp memory for ykm_dct.
987 yk = (double *) malloc( sizeof(double) * (KMAX+1)*(LMAX+1) );
988 dyk = (struct DtDp *) malloc( sizeof(struct DtDp)* (KMAX+1)*(LMAX+1) );
989 if ((m0==0)&&(analysis)) {
990 yk0 = (double *) malloc( sizeof(double) * (LMAX/2+1)*(2*NLAT_2) * 2 ); // temp for zlm_dct0
991 dyk0 = yk0 + (LMAX/2+1)*(2*NLAT_2);
992 }
993
994 for (im=m0; im<=MMAX; im+=mstep) {
995 m = im*MRES;
996 // go to DCT space
997 for (it=0;it<=KMAX;it+=2) {
998 for(l=m; l<=LMAX; l++) {
999 yk[(it/2)*(LMAX+1-m) + (l-m)] = 0.0;
1000 dyk[(it/2)*(LMAX+1-m) + (l-m)].t = 0.0;
1001 dyk[(it/2)*(LMAX+1-m) + (l-m)].p = 0.0;
1002 }
1003 }
1004 for (l=m; l<=LMAX; l++) {
1005 if (m & 1) { // m odd
1006 for (it=0; it<NLAT_2; it++) {
1007 Z[it] = YLM(it, l, im) * st[it]; // P[l+1](x) *st
1008 if (vector) {
1009 dZt[it] = DTYLM(it, l, im); // P[l](x) *1
1010 dZp[it] = DPYLM(it, l, im); // P[l-1](x) *1
1011 }
1012 }
1013 } else { // m even
1014 for (it=0; it<NLAT_2; it++) {
1015 Z[it] = YLM(it, l, im); // P[l](x) *1
1016 if (vector) {
1017 dZt[it] = DTYLM(it, l, im) *st[it]; // P[l+1](x) *st
1018 dZp[it] = YLM(it, l, im) * m; // P[l](x) *st
1019 }
1020 }
1021 }
1022 if ((l-m)&1) { // odd
1023 for (it=NLAT_2; it<2*NLAT_2; it++) {
1024 Z[it] = - Z[2*NLAT_2-it-1]; // reconstruct even/odd part
1025 dZt[it] = dZt[2*NLAT_2-it-1];
1026 dZp[it] = - dZp[2*NLAT_2-it-1];
1027 }
1028 } else { // even
1029 for (it=NLAT_2; it<2*NLAT_2; it++) {
1030 Z[it] = Z[2*NLAT_2-it-1]; // reconstruct even/odd part
1031 dZt[it] = - dZt[2*NLAT_2-it-1];
1032 dZp[it] = dZp[2*NLAT_2-it-1];
1033 }
1034 }
1035 fftw_execute_r2r(dct, Z, Z);
1036 fftw_execute_r2r(dct, dZt, dZt);
1037 fftw_execute_r2r(dct, dZp, dZp);
1038#if SHT_VERBOSE > 1
1039 if ((LMAX <= 12) && (verbose>1))
1040 #pragma omp critical
1041 {
1042 printf("\nl=%d, m=%d ::\t", l,m);
1043 for(it=0;it<2*NLAT_2;it++) printf("%e ",Z[it]/(2*NLAT));
1044 printf("\n dYt ::\t");
1045 for(it=0;it<2*NLAT_2;it++) printf("%e ",dZt[it]/(2*NLAT));
1046 printf("\n dYp ::\t");
1047 for(it=0;it<2*NLAT_2;it++) printf("%e ",dZp[it]/(2*NLAT));
1048 }
1049#endif
1050 for (it=(l-m)&1; it<=l+1; it+=2) {
1051 yk[(it/2)*(LMAX+1-m) + (l-m)] = Z[it]/(2*NLAT); // and transpose
1052 dyk[(it/2)*(LMAX+1-m) + (l-m)].p = dZp[it]/(2*NLAT);
1053 }
1054 for (it=(l+1-m)&1; it<=l+1; it+=2) {
1055 dyk[(it/2)*(LMAX+1-m) + (l-m)].t = dZt[it]/(2*NLAT);
1056 }
1057 }
1058
1059 /* compute analysis coefficients (fast way)
1060 * Wklm = int(Tk*Ylm) = int(Tk.sum(i,a_ilm*Ti)) = sum(i, a_ilm* int(Tk*Ti)) = sum(i, a_ilm*Jik)
1061 * with Jik = int(Tk*Ti) = 1/(1-(k-i)^2) + 1/(1-(k+i)^2)
1062 */
1063 if (analysis) {
1064 #if SHT_VERBOSE > 0
1065 if ((LMAX>126)&&(m0==0)&&(verbose)) { // only one thread prints this message.
1066 printf("computing weights m=%d\r",m); fflush(stdout);
1067 }
1068 #endif
1069 for (l=m; l<=LMAX; l++) {
1070 unsigned k0,k1, k,i,d;
1071 double Jik0, Jik1, yy, dyp, dyt;
1072 double lnorm = iylm_fft_norm;
1073
1074 if (SHT_NORM == sht_schmidt) lnorm *= (2*l+1); // Schmidt semi-normalization
1075 if ( (m>0) && (shtns->norm & SHT_REAL_NORM) ) lnorm *= 2; // "real" norm : zlm must be doubled for m>0
1076
1077 k0 = (l-m)&1; k1 = 1-k0;
1078 for(k=0; k<NLAT; k++) { Z[k] = 0.0; dZt[k] = 0.0; dZp[k] = 0.0; }
1079 for (i=0; i<=(l+1)/2; i++) {
1080 yy = yk[i*(LMAX+1-m) + (l-m)] * lnorm;
1081 dyp = dyk[i*(LMAX+1-m) + (l-m)].p * lnorm/(l*(l+1));
1082 dyt = dyk[i*(LMAX+1-m) + (l-m)].t * lnorm/(l*(l+1));
1083 if (i+k0==0) { yy*=0.5; dyp*=0.5; }
1084 if (i+k1==0) dyt*=0.5;
1085 for (k=0; k<NLAT_2; k++) {
1086 d = (k<i) ? i-k : k-i;
1087 Jik0 = is1[(i+k0)+k] + is1[d];
1088 Jik1 = is1[(i+k1)+k] + is1[d];
1089 Z[2*k+k0] += yy * Jik0;
1090 dZt[2*k+k1] += dyt * Jik1;
1091 if (m&1) dZp[2*k+k0] += dyp * Jik0;
1092 }
1093 }
1094#if SHT_VERBOSE > 1
1095 if ((LMAX <= 12)&&(verbose>1))
1096 #pragma omp critical
1097 {
1098 printf("\nl=%d, m=%d ::\t",l,m);
1099 for (k=0; k<(2*NLAT_2); k++) printf("%f ",Z[k]);
1100 printf("\n dZt ::\t");
1101 for (k=0; k<(2*NLAT_2); k++) printf("%f ",dZt[k]);
1102 if (m&1) {
1103 printf("\n dZp ::\t");
1104 for (k=0; k<(2*NLAT_2); k++) printf("%f ",dZp[k]);
1105 }
1106 }
1107#endif
1108
1109 if (m == 0) { // we store zlm in dct space for m=0
1110 if (k0==0) {
1111 yk0[((l-m)>>1)*(2*NLAT_2)] = Z[0]*0.5; // store zlm_dct (k=0)
1112 for (k=1; k<(2*NLAT_2); k++) yk0[((l-m)>>1)*(2*NLAT_2) +k] = 0.0; // zero out.
1113 k0=2;
1114 }
1115 for (k=k0; k<(2*NLAT_2); k+=2)
1116 yk0[((l-m)>>1)*(2*NLAT_2) +k] = Z[k]; // store zlm_dct
1117
1118 if (l>0) {
1119 if (k1==0) {
1120 dyk0[((l-1-m)>>1)*(2*NLAT_2)] = dZt[0]*0.5; // store dzlm_dct (k=0)
1121 for (k=1; k<(2*NLAT_2); k++) dyk0[((l-1-m)>>1)*(2*NLAT_2) +k] = 0.0; // zero out.
1122 k1=2;
1123 }
1124 for (k=k1; k<(2*NLAT_2); k+=2)
1125 dyk0[((l-1-m)>>1)*(2*NLAT_2) +k] = dZt[k]; // store dzlm_dct
1126 }
1127 }
1128
1129 fftw_execute_r2r(idct, Z, Z); fftw_execute_r2r(idct, dZt, dZt);
1130 if (m == 0) {
1131 for (it=0; it<NLAT; it++) { dZp[it] = 0.0; dZt[it] *= st_1[it]; }
1132 } else if (m & 1) { // m odd
1133 fftw_execute_r2r(idct, dZp, dZp);
1134 for (it=0; it<NLAT; it++) { Z[it] *= st_1[it]; }
1135 } else { // m even
1136 for (it=0; it<NLAT; it++) { dZp[it] = Z[it]*m/(l*(l+1)*st[it]); dZt[it] *= st_1[it]; }
1137 }
1138
1139 long int l0 = (m==0) ? 1 : m;
1140 long int talign = (m==0)*(NLAT_2 & 1);
1141 long int sk = (l-l0)&1;
1142 if (l==0) {
1143 for (it=0; it<NLAT_2; it++) {
1144 shtns->zlm[im][it] = Z[it];
1145 }
1146 } else if ((sk == 0)&&(l == LMAX)) {
1147 for (it=0; it<NLAT_2; it++) {
1148 shtns->zlm[im][(l-m)*NLAT_2 + it + talign] = Z[it];
1149 if (vector) {
1150 shtns->dzlm[im][(l-l0)*NLAT_2 + it].p = dZp[it];
1151 shtns->dzlm[im][(l-l0)*NLAT_2 + it].t = dZt[it];
1152 }
1153 }
1154 } else {
1155 for (it=0; it<NLAT_2; it++) {
1156 shtns->zlm[im][(l-m-sk)*NLAT_2 + it*2 +sk + talign] = Z[it];
1157 if (vector) {
1158 shtns->dzlm[im][(l-l0-sk)*NLAT_2 + it*2 +sk].p = dZp[it];
1159 shtns->dzlm[im][(l-l0-sk)*NLAT_2 + it*2 +sk].t = dZt[it];
1160 }
1161 }
1162 }
1163 }
1164 }
1165
1166 // Compact the coefficients for improved cache efficiency.
1167 yg = shtns->ykm_dct[im];
1168 for (it=0; it<= KMAX; it+=2) {
1169 l = (it < m) ? m : it-(m&1);
1170 while (l<=LMAX) {
1171 yg[0] = yk[(it/2)*(LMAX+1-m) + (l-m)];
1172 l++; yg++;
1173 }
1174 if ((m==0) && ((LMAX & 1) == 0)) { yg[0] = 0; yg++; } // SSE2 padding.
1175 }
1176 if (MMAX==0) for (int j=0; j<3; j++) yg[j] = 0; // allow some overflow.
1177 if (vector) {
1178 dyg = shtns->dykm_dct[im];
1179 for (it=0; it<= KMAX; it+=2) {
1180 l = (it-2 < m) ? m : it-2+(m&1);
1181 while (l<=LMAX) {
1182 dyg[0].t = dyk[(it/2)*(LMAX+1-m) + (l-m)].t;
1183 dyg[0].p = dyk[(it/2)*(LMAX+1-m) + (l-m)].p;
1184 l++; dyg++;
1185 }
1186 }
1187 if (im == 0) { // compact and reorder m=0 dylm because .p = 0 :
1188 dyg = shtns->dykm_dct[im];
1189 yg = (double *) shtns->dykm_dct[im];
1190 for (it=0; it<= KMAX; it+=2) {
1191 dyg++;
1192 for (l=it-1; l<=LMAX; l++) {
1193 if (l>0) {
1194 yg[0] = dyg[0].t;
1195 yg++; dyg++;
1196 }
1197 }
1198 if (LMAX&1) { // padding for SSE2 alignement.
1199 yg[0] = 0.0; yg++;
1200 }
1201 }
1202 }
1203 }
1204 }
1205
1206 // compact yk to zlm_dct0
1207 if ((m0==0)&&(analysis)) {
1208 long int klim = (LMAX * SHT_NL_ORDER) + 2; // max k needed for nl-terms...
1209 klim = (klim/2)*2; // must be even...
1210 if (klim > 2*NLAT_2) klim = 2*NLAT_2; // but no more than 2*NLAT_2.
1211 shtns->klim = klim; // store for use in codelets.
1212 yg = shtns->zlm_dct0;
1213 for (l=0; l<=LMAX; l+=2) {
1214 for (it=l; it<klim; it++) { // for m=0, zl coeff with i<l are zeros.
1215 *yg = yk0[it];
1216 yg++;
1217 }
1218 yk0 += 2*NLAT_2;
1219 }
1220 if (vector) {
1221 yg = shtns->dzlm_dct0;
1222 for (l=1; l<=LMAX; l+=2) {
1223 for (it=l-1; it<klim; it++) { // for m=0, dzl coeff with i<l-1 are zeros.
1224 *yg = dyk0[it];
1225 yg++;
1226 }
1227 dyk0 += 2*NLAT_2;
1228 }
1229 }
1230 free(yk0 - (2*NLAT_2)*(LMAX/2+1));
1231 }
1232
1233 free(dyk); free(yk);
1234}
1235
1236/// \internal Computes the matrices required for SH transform on a regular grid (with or without DCT).
1237/// \param analysis : 0 => synthesis only.
1238static void init_SH_dct(shtns_cfg shtns, int analysis)
1239{
1240 fftw_plan dct, idct;
1241 long int it,im,m,l;
1242 long int sk, dsk;
1243 double Z[2*NLAT_2] SSE;
1244 double is1[NLAT]; // tabulate values for integrals.
1245
1246 const long int marray_size = sizeof(void*)*(MMAX+1) + (MIN_ALIGNMENT-1);
1247 const int vector = (shtns->dylm != NULL);
1248 const int KMAX = LMAX+1;
1249
1250 for(im=0, sk=0, dsk=0; im<=MMAX; im++) { // how much memory to allocate for ykm_dct ?
1251 m = im*MRES;
1252 for (it=0; it<= KMAX; it+=2) {
1253 l = (it < m) ? m : it-(m&1);
1254 sk += LMAX+1 - l;
1255 if ((m==0) && ((LMAX & 1) ==0)) sk++; // SSE padding for m=0
1256 }
1257 for (it=0; it<= KMAX; it+=2) {
1258 l = (it-2 < m) ? m : it-2+(m&1);
1259 dsk += LMAX+1 - l;
1260 }
1261 }
1262 if (MMAX == 0) sk+=3; // allow some overflow.
1263 for (l=0, it=0; l<=LMAX; l+=2) // how much memory for zlm_dct0 ?
1264 it += (2*NLAT_2 -l);
1265 for (l=1, im=0; l<=LMAX; l+=2) // how much memory for dzlm_dct0 ?
1266 im += (2*NLAT_2 -l+1);
1267
1268#if SHT_VERBOSE > 1
1269 if (verbose>1) printf(" Memory used for Ykm_dct matrices = %.3f Mb\n",sizeof(double)*(sk + 2.*dsk + it)/(1024.*1024.));
1270#endif
1271 shtns->ykm_dct = (double **) malloc( marray_size + sizeof(double)*sk );
1272 shtns->ykm_dct[0] = (double *) PTR_ALIGN( shtns->ykm_dct + (MMAX+1) );
1273 if (vector) {
1274 shtns->dykm_dct = (struct DtDp **) malloc( marray_size + sizeof(struct DtDp)*dsk );
1275 shtns->dykm_dct[0] = (struct DtDp *) PTR_ALIGN( shtns->dykm_dct + (MMAX+1) );
1276 }
1277 shtns->zlm_dct0 = (double *) VMALLOC( sizeof(double)* it );
1278 if (vector) {
1279 shtns->dzlm_dct0 = (double *) VMALLOC( sizeof(double)* im );
1280 }
1281 for (im=0; im<MMAX; im++) {
1282 m = im*MRES;
1283 for (it=0, sk=0; it<= KMAX; it+=2) {
1284 l = (it < m) ? m : it-(m&1);
1285 sk += LMAX+1 - l;
1286 if ((m==0) && ((LMAX & 1) ==0)) sk++; // SSE padding for m=0
1287 }
1288 for (it=0, dsk=0; it<= KMAX; it+=2) {
1289 l = (it-2 < m) ? m : it-2+(m&1);
1290 dsk += LMAX+1 - l;
1291 }
1292 shtns->ykm_dct[im+1] = shtns->ykm_dct[im] + sk;
1293 if (vector) shtns->dykm_dct[im+1] = shtns->dykm_dct[im] + dsk;
1294 }
1295
1296#if SHT_VERBOSE > 1
1297 ticks tik0, tik1;
1298 tik0 = getticks();
1299#endif
1300
1301 #ifdef OMP_FFTW
1302 fftw_plan_with_nthreads(1);
1303 #endif
1304 dct = fftw_plan_r2r_1d( 2*NLAT_2, Z, Z, FFTW_REDFT10, FFTW_MEASURE ); // quick and dirty dct.
1305 idct = fftw_plan_r2r_1d( 2*NLAT_2, Z, Z, FFTW_REDFT01, FFTW_MEASURE ); // quick and dirty idct.
1306
1307 init_SH_synth(shtns);
1308
1309// precomputation for scalar product of Chebychev polynomials.
1310 for(it=0; it<NLAT; it++)
1311 is1[it] = 1./(1. - 4.*it*it);
1312
1313 #ifdef _OPENMP
1314 #pragma omp parallel
1315 {
1316 int n=omp_get_num_threads();
1317 int k=omp_get_thread_num();
1318 init_SH_dct_m(shtns, is1, dct, idct, analysis, k, n);
1319 }
1320 #else
1321 init_SH_dct_m(shtns, is1, dct, idct, analysis, 0, 1);
1322 #endif
1323
1324#if SHT_VERBOSE > 1
1325 tik1 = getticks();
1326 if (verbose>1) printf("\n ticks : %.3f\n", elapsed(tik1,tik0)/(NLM*NLAT*(MMAX+1)));
1327#endif
1328 fftw_destroy_plan(idct); fftw_destroy_plan(dct);
1329}
1330
1331#endif /* SHTNS_DCT */
1332
1333/// \internal Generate a gauss grid (including weights)
1334static void grid_gauss(shtns_cfg shtns, double latdir)
1335{
1336 long int it;
1337 real iylm_fft_norm;
1338 real xg[NLAT], wgl[NLAT]; // gauss points and weights.
1339 const int overflow = 8*VSIZE2-1;
1340
1341 shtns->grid = GRID_GAUSS;
1342 shtns->wg = VMALLOC((NLAT_2 +overflow) * sizeof(double)); // gauss weights, double precision.
1343
1344 iylm_fft_norm = 1.0; // FFT/SHT normalization for zlm (4pi normalized)
1345 if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) iylm_fft_norm = 4*M_PIl; // FFT/SHT normalization for zlm (orthonormalized)
1346 iylm_fft_norm /= (2*NPHI);
1347#if SHT_VERBOSE > 0
1348 if (verbose) {
1349 printf(" => using Gauss nodes\n");
1350 if (2*NLAT <= (SHT_NL_ORDER +1)*LMAX) printf(" !! Warning : Gauss-Legendre anti-aliasing condition 2*Nlat > %d*Lmax is not met.\n",SHT_NL_ORDER+1);
1351 }
1352#endif
1353 gauss_nodes(xg,wgl,NLAT); // generate gauss nodes and weights : ct = ]1,-1[ = cos(theta)
1354 for (it=0; it<NLAT; it++) {
1355 shtns->ct[it] = latdir * xg[it];
1356 shtns->st[it] = SQRT((1.-xg[it])*(1.+xg[it]));
1357 shtns->st_1[it] = 1.0/SQRT((1.-xg[it])*(1.+xg[it]));
1358 }
1359 for (it=0; it<NLAT_2; it++)
1360 shtns->wg[it] = wgl[it]*iylm_fft_norm; // faster double-precision computations.
1361 if (NLAT & 1) { // odd NLAT : adjust weigth of middle point.
1362 shtns->wg[NLAT_2-1] *= 0.5;
1363 }
1364 for (it=NLAT_2; it < NLAT_2 +overflow; it++) shtns->wg[it] = 0.0; // padding for multi-way algorithm.
1365
1366#if SHT_VERBOSE > 1
1367 if (verbose>1) {
1368 printf(" NLAT=%d, NLAT_2=%d\n",NLAT,NLAT_2);
1369 // TEST if gauss points are ok.
1370 double tmax = 0.0;
1371 for (it = 0; it<NLAT_2; it++) {
1372 double t = legendre_Pl(NLAT, shtns->ct[it]);
1373 if (t>tmax) tmax = t;
1374 // printf("i=%d, x=%12.12g, p=%12.12g\n",it,ct[it],t);
1375 }
1376 printf(" max zero at Gauss nodes for Pl[l=NLAT] : %g\n",tmax);
1377 if (NLAT_2 < 100) {
1378 printf(" Gauss nodes :");
1379 for (it=0;it<NLAT_2; it++)
1380 printf(" %g",shtns->ct[it]);
1381 printf("\n");
1382 }
1383 }
1384#endif
1385}
1386
1387#ifdef SHTNS_DCT
1388/// \internal Generate an equi-spaced theta grid (Chebychev points, excluding poles) for Féjer-DCT SHT.
1389static void grid_dct(shtns_cfg shtns, double latdir)
1390{
1391 long int it;
1392
1393 shtns->grid = GRID_REGULAR;
1394#if SHT_VERBOSE > 0
1395 if (verbose) {
1396 printf(" => using equaly spaced nodes with DCT acceleration\n");
1397 if (NLAT <= SHT_NL_ORDER *LMAX) printf(" !! Warning : DCT anti-aliasing condition Nlat > %d*Lmax is not met.\n",SHT_NL_ORDER);
1398 if (NLAT != fft_int(NLAT,7)) printf(" !! Warning : Nlat is not optimal for FFTW !\n");
1399 }
1400#endif
1401 if (NLAT & 1) shtns_runerr("NLAT must be even (DCT)");
1402 if (NLAT <= LMAX+1) shtns_runerr("NLAT should be at least LMAX+2 (DCT)");
1403
1404 for (it=0; it<NLAT; it++) { // Chebychev points : equaly spaced but skipping poles.
1405 real th = M_PIl; th = (th*(2*it+1))/(2*NLAT);
1406 shtns->ct[it] = latdir * COS(th);
1407 shtns->st[it] = SIN(th);
1408 shtns->st_1[it] = 1.0/SIN(th);
1409 }
1410
1411#if SHT_VERBOSE > 1
1412 if (verbose>1) {
1413 printf(" NLAT=%d, NLAT_2=%d\n",NLAT,NLAT_2);
1414 double tmax = 0.0;
1415 for (it=0;it<NLAT_2; it++) {
1416 double ct = shtns->ct[it]; double st = shtns->st[it];
1417 double t = fabs((ct*ct + st*st) -1.0);
1418 if (t > tmax) tmax=t;
1419 }
1420 printf(" max st^2 + ct^2 -1 = %g\n",tmax);
1421 if (NLAT_2 < 100) {
1422 printf(" DCT nodes :");
1423 for (it=0; it<NLAT_2; it++)
1424 printf(" %g",shtns->ct[it]);
1425 printf("\n");
1426 }
1427 }
1428#endif
1429}
1430#endif /* SHTNS_DCT */
1431
1432/// \internal Generate an equi-spaced theta grid including the poles, for synthesis only.
1433static void grid_equal_polar(shtns_cfg shtns, double latdir)
1434{
1435 long int j;
1436
1437 shtns->grid = GRID_POLES;
1438#if SHT_VERBOSE > 0
1439 if (verbose) printf(" => using Equaly Spaced Nodes including poles\n");
1440#endif
1441// cos theta of latidunal points (equaly spaced in theta)
1442 double f = M_PIl/(NLAT-1);
1443 for (j=0; j<NLAT; j++) {
1444 shtns->ct[j] = latdir * cos(f*j);
1445 shtns->st[j] = sin(f*j);
1446 shtns->st_1[j] = 1.0/sin(f*j);
1447 }
1448#if SHT_VERBOSE > 0
1449 if (verbose) printf(" !! Warning : only synthesis (inverse transform) supported for this grid !\n");
1450#endif
1451}
1452
1453
1454/* TEST AND TIMING FUNCTIONS */
1455
1456/// \internal return the max error for a back-and-forth SHT transform.
1457/// this function is used to internally measure the accuracy.
1458double SHT_error(shtns_cfg shtns, int vector)
1459{
1460 cplx *Tlm0=0, *Slm0=0, *Tlm=0, *Slm=0;
1461 double *Sh=0, *Th=0;
1462 double t, tmax, n2, err;
1463 long int i, jj, nlm_cplx;
1464
1465 srand( time(NULL) ); // init random numbers.
1466
1467 Slm0 = (cplx *) VMALLOC(sizeof(cplx)* NLM);
1468 Slm = (cplx *) VMALLOC(sizeof(cplx)* NLM);
1469 Sh = (double *) VMALLOC( NSPAT_ALLOC(shtns) * sizeof(double) );
1470 if ((Sh==0) || (Slm==0) || (Slm0==0)) shtns_runerr("not enough memory.");
1471 if (vector) {
1472 Tlm0 = (cplx *) VMALLOC(sizeof(cplx)* NLM);
1473 Tlm = (cplx *) VMALLOC(sizeof(cplx)* NLM);
1474 Th = (double *) VMALLOC( NSPAT_ALLOC(shtns) * sizeof(double) );
1475 if ((Th==0) || (Tlm==0) || (Tlm0==0)) vector=0;
1476 }
1477
1478// m = nphi/2 is also real if nphi is even.
1479 nlm_cplx = ( MMAX*2 == NPHI ) ? LiM(shtns, MRES*MMAX,MMAX) : NLM;
1480 t = 1.0 / (RAND_MAX/2);
1481 for (i=0; i<NLM; i++) {
1482 if ((i<=LMAX)||(i>=nlm_cplx)) { // m=0 or m*2=nphi : real random data
1483 Slm0[i] = t*((double) (rand() - RAND_MAX/2));
1484 if (vector) Tlm0[i] = t*((double) (rand() - RAND_MAX/2));
1485 } else { // m>0 : complex random data
1486 Slm0[i] = t*((double) (rand() - RAND_MAX/2)) + I*t*((double) (rand() - RAND_MAX/2));
1487 if (vector) Tlm0[i] = t*((double) (rand() - RAND_MAX/2)) + I*t*((double) (rand() - RAND_MAX/2));
1488 }
1489 }
1490
1491 SH_to_spat(shtns, Slm0,Sh); // scalar SHT
1492 spat_to_SH(shtns, Sh, Slm);
1493 for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
1494 t = cabs(Slm[i] - Slm0[i]);
1495 n2 += t*t;
1496 if (t>tmax) { tmax = t; jj = i; }
1497 }
1498 err = tmax;
1499#if SHT_VERBOSE > 1
1500 if (verbose>1) printf(" scalar SH - poloidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
1501#endif
1502
1503 if (vector) {
1504 Slm0[0] = 0.0; Tlm0[0] = 0.0; // l=0, m=0 n'a pas de signification sph/tor
1505 SHsphtor_to_spat(shtns, Slm0, Tlm0, Sh, Th); // vector SHT
1506 spat_to_SHsphtor(shtns, Sh, Th, Slm, Tlm);
1507 for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
1508 t = cabs(Slm[i] - Slm0[i]);
1509 n2 += t*t;
1510 if (t>tmax) { tmax = t; jj = i; }
1511 }
1512 if (tmax > err) err = tmax;
1513 #if SHT_VERBOSE > 1
1514 if (verbose>1) printf(" vector SH - spheroidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
1515 #endif
1516 for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
1517 t = cabs(Tlm[i] - Tlm0[i]);
1518 n2 += t*t;
1519 if (t>tmax) { tmax = t; jj = i; }
1520 }
1521 if (tmax > err) err = tmax;
1522 #if SHT_VERBOSE > 1
1523 if (verbose>1) printf(" - toroidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
1524 #endif
1525 }
1526
1527 if (Th) VFREE(Th); if (Tlm) VFREE(Tlm); if (Tlm0) VFREE(Tlm0);
1528 VFREE(Sh); VFREE(Slm); VFREE(Slm0);
1529 return(err); // return max error.
1530}
1531
1532
1533#if SHT_VERBOSE == 1
1534 #define PRINT_DOT if (verbose>=1) { printf("."); fflush(stdout); }
1535#else
1536 #define PRINT_DOT (0);
1537#endif
1538
1539/// \internal measure time used for a transform function
1540static double get_time(shtns_cfg shtns, int nloop, int npar, char* name, void *fptr, void *i1, void *i2, void *i3, void *o1, void *o2, void *o3, int l)
1541{
1542 double t;
1543 int i;
1544 ticks tik0, tik1;
1545
1546 if (fptr == NULL) return(0.0);
1547
1548 tik1 = getticks();
1549 for (i=0; i<nloop; i++) {
1550 switch(npar) {
1551 case 2: (*(pf2l)fptr)(shtns, i1,o1, l); break; // l may be discarded.
1552 case 3: (*(pf3l)fptr)(shtns, i1,o1,o2, l); break;
1553 case 4: (*(pf4l)fptr)(shtns, i1,i2,o1,o2, l); break;
1554 default: (*(pf6l)fptr)(shtns, i1,i2,i3, o1,o2,o3, l); break;
1555 }
1556 if (i==0) tik0 = getticks();
1557 }
1558 if (nloop == 1) {
1559 t = elapsed(tik0, tik1);
1560 } else {
1561 tik1 = getticks();
1562 t = elapsed(tik1, tik0)/(nloop-1); // discard first iteration.
1563 }
1564 #if SHT_VERBOSE > 1
1565 if (verbose>1) { printf(" t(%s) = %.3g",name,t); fflush(stdout); }
1566 #endif
1567 return t;
1568}
1569
1570
1571/// \internal choose fastest between on-the-fly and gauss algorithms.
1572/// *nlp is the number of loops. If zero, it is set to a good value.
1573/// on_the_fly : 1 = skip all memory algorithm. 0 = include memory and on-the-fly. -1 = test only DCT.
1574/// returns time without dct / best time with dct (or 0 if no dct available).
1575static double choose_best_sht(shtns_cfg shtns, int* nlp, int vector, int dct_mtr)
1576{
1577 cplx *Qlm=0, *Slm=0, *Tlm=0;
1578 double *Qh=0, *Sh=0, *Th=0;
1579 int m, i, i0, minc, nloop, alg_end;
1580 int typ_lim = SHT_NTYP; // time every type.
1581 double t0, t, tt, r;
1582 double tdct, tnodct;
1583 clock_t tcpu;
1584 int on_the_fly_only = (shtns->ylm == NULL); // only on-the-fly.
1585 int otf_analys = (shtns->wg != NULL); // on-the-fly analysis supported.
1586
1587 if (NLAT < VSIZE2*4) return(0.0); // on-the-fly not possible for NLAT_2 < 2*NWAY (overflow) and DCT not efficient for low NLAT.
1588 if ((dct_mtr != 0) && (shtns->ykm_dct == NULL)) return(0.0); // no dct available : do nothing.
1589
1590 size_t nspat = sizeof(double) * NSPAT_ALLOC(shtns);
1591 size_t nspec = sizeof(cplx)* NLM;
1592 if (nspec>nspat) nspat=nspec;
1593 Sh = (double *) VMALLOC(nspat); Slm = (cplx *) VMALLOC(nspec);
1594 if ((Sh==0) || (Slm==0)) shtns_runerr("not enough memory.");
1595 if (vector) {
1596 Th = (double *) VMALLOC(nspat); Qh = (double *) VMALLOC(nspat);
1597 Tlm = (cplx *) VMALLOC(nspec); Qlm = (cplx *) VMALLOC(nspec);
1598 if ( (Th==0) || (Qh==0) || (Tlm==0) || (Qlm==0) ) vector = 0;
1599 }
1600
1601 for (i=0;i<NLM;i++) {
1602 int l = shtns->li[i];
1603 Slm[i] = shtns->l_2[l] + 0.5*I*shtns->l_2[l];
1604 if (vector) {
1605 Tlm[i] = 0.5*shtns->l_2[l] + I*shtns->l_2[l];
1606 Qlm[i] = 3*shtns->l_2[l] + 2*I*shtns->l_2[l];
1607 }
1608 }
1609
1610 #if SHT_VERBOSE > 0
1611 if (verbose) {
1612 if (dct_mtr != 0) printf(" finding optimal m-truncation for DCT synthesis");
1613 else printf(" finding optimal algorithm");
1614 fflush(stdout);
1615 }
1616 #endif
1617
1618 if (*nlp <= 0) {
1619 // find good nloop by requiring less than 3% difference between 2 consecutive timings.
1620 m=0; nloop = 1; // number of loops to get timings.
1621 r = 0.0; tt = 1.0;
1622 do {
1623 if ((r > 0.03)||(tt<0.1)) {
1624 m = 0; nloop *= 3;
1625 } else m++;
1626 tcpu = clock();
1627 t0 = get_time(shtns, nloop, 2, "", sht_func[SHT_STD][SHT_FLY2][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
1628 tcpu = clock() - tcpu; tt = 1.e-6 * tcpu;
1629 if (tt >= SHT_TIME_LIMIT) break; // we should not exceed 1 second
1630 t = get_time(shtns, nloop, 2, "", sht_func[SHT_STD][SHT_FLY2][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
1631 r = fabs(2.0*(t-t0)/(t+t0));
1632 #if SHT_VERBOSE > 1
1633 if (verbose>1) printf(", nloop=%d, r=%g, m=%d (real time = %g s)\n",nloop,r,m,tt);
1634 if (tt >= 0.01) break; // faster timing in debug mode.
1635 #endif
1636 PRINT_DOT
1637 } while((nloop<10000)&&(m < 3));
1638 *nlp = nloop;
1639 } else {
1640 nloop = *nlp;
1641 }
1642 #if SHT_VERBOSE > 1
1643 if (verbose>1) printf(" => nloop=%d (takes %g s)\n",nloop, tt);
1644 #endif
1645 if (vector == 0) typ_lim = SHT_TYP_VSY; // time only scalar transforms.
1646// if (tt > 3.0) typ_lim = SHT_TYP_VSY; // time only scalar transforms.
1647// if (tt > 10.0) goto done; // timing this will be too slow...
1648
1649 int ityp = 0; do {
1650 if ((dct_mtr != 0) && (ityp >= 4)) break; // dct !=0 : only scalar and vector.
1651 if (ityp == 2) nloop = (nloop+1)/2; // scalar ar done.
1652 t0 = 1e100;
1653 i0 = 0;
1654 if (MTR_DCT < 0) i0 = SHT_MEM; // skip dct.
1655 if (on_the_fly_only) i0 = SHT_SV; // only on-the-fly (SV is then also on-the-fly)
1656 alg_end = SHT_NALG;
1657 if (shtns->nthreads <= 1) alg_end = SHT_OMP1; // no OpenMP with 1 thread.
1658 if ((ityp&1) && (otf_analys == 0)) alg_end = SHT_FLY1; // no on-the-fly analysis for regular grid.
1659 for (i=i0, m=0; i<alg_end; i++) {
1660 if (sht_func[0][i][ityp] != NULL) m++; // count number of algos
1661 }
1662 if (m >= 2) { // don't time if there is only 1 algo !
1663 #if SHT_VERBOSE > 1
1664 if (verbose>1) { printf("finding best %s ...",sht_type[ityp]); fflush(stdout); }
1665 #endif
1666 i = i0-1; i0 = -1;
1667 while (++i < alg_end) {
1668 void *pf = sht_func[0][i][ityp];
1669 if (pf != NULL) {
1670 if (ityp&1) { // analysis
1671 t = get_time(shtns, nloop, sht_npar[ityp], sht_name[i], pf, Sh, Th, Qh, Slm, Tlm, Qlm, LMAX);
1672 } else {
1673 t = get_time(shtns, nloop, sht_npar[ityp], sht_name[i], pf, Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
1674 }
1675 if (i < SHT_FLY1) t *= 1.03; // 3% penality for memory based transforms.
1676 #ifdef _OPENMP
1677 if ((i >= SHT_OMP1)||(i == SHT_SV)) t *= 1.3; // 30% penality for openmp transforms.
1678 #endif
1679 if (t < t0) { i0 = i; t0 = t; PRINT_VERB("*"); }
1680 }
1681 }
1682 if (i0 >= 0) {
1683 for (int iv=0; iv<SHT_NVAR; iv++) {
1684 if (sht_func[iv][i0][ityp]) shtns->ftable[iv][ityp] = sht_func[iv][i0][ityp];
1685 if (ityp == 4) { // only one timing for both gradients variants.
1686 if (sht_func[iv][i0][ityp+1]) shtns->ftable[iv][ityp+1] = sht_func[iv][i0][ityp+1];
1687 }
1688 }
1689 PRINT_DOT
1690 #if SHT_VERBOSE > 1
1691 if (verbose>1) printf(" => %s\n",sht_name[i0]);
1692 #endif
1693 }
1694 }
1695 if (ityp == 4) ityp++; // skip second gradient
1696 } while(++ityp < typ_lim);
1697
1698 #ifdef SHTNS_DCT
1699 if (dct_mtr != 0) { // find the best DCT timings...
1700 #if SHT_VERBOSE > 1
1701 if (verbose>1) { printf("finding best mtr_dct ..."); fflush(stdout); }
1702 #endif
1703 minc = MMAX/20 + 1; // don't test every single m.
1704 m = -1; i = -1; t0 = 0.0; // reference = no dct.
1705 if (sht_func[SHT_STD][SHT_DCT][SHT_TYP_SSY] != NULL)
1706 t0 += get_time(shtns, *nlp, 2, "s", shtns->ftable[SHT_STD][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
1707 if ( (sht_func[SHT_STD][SHT_DCT][SHT_TYP_VSY] != NULL) && (vector) )
1708 t0 += get_time(shtns, nloop, 4, "v", shtns->ftable[SHT_STD][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
1709 tnodct = t0;
1710 for (m=0; m<=MMAX; m+=minc) {
1711 #if SHT_VERBOSE > 1
1712 if (verbose>1) printf("\n\tm=%d :",m);
1713 #endif
1714 if (Set_MTR_DCT(shtns, m) >= 0) {
1715 t = get_time(shtns, *nlp, 2, "sdct", sht_func[SHT_STD][SHT_DCT][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
1716 if (vector)
1717 t += get_time(shtns, nloop, 4, "vdct", sht_func[SHT_STD][SHT_DCT][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
1718 if (t < t0) { t0 = t; i = m; PRINT_VERB("*"); }
1719 PRINT_DOT
1720 }
1721 }
1722 tdct = t0;
1723 Set_MTR_DCT(shtns, i); // the best DCT is chosen.
1724 #if SHT_VERBOSE > 0
1725 if (verbose) printf(" mtr_dct=%d (%.1f%% performance gain)", MTR_DCT*MRES, 100.*(tnodct/tdct-1.));
1726 #endif
1727 }
1728 #endif
1729
1730done:
1731 #if SHT_VERBOSE > 0
1732 if (verbose) printf("\n");
1733 #endif
1734 if (Qlm) VFREE(Qlm); if (Tlm) VFREE(Tlm);
1735 if (Qh) VFREE(Qh); if (Th) VFREE(Th);
1736 if (Slm) VFREE(Slm); if (Sh) VFREE(Sh);
1737 if (dct_mtr > 0) {
1738 return(tnodct/tdct);
1739 } else return(0.0);
1740}
1741
1742
1743void shtns_print_version() {
1744 printf("[" PACKAGE_STRING "] built " __DATE__ ", " __TIME__ ", id: " _SIMD_NAME_ "\n");
1745}
1746
1747void fprint_ftable(FILE* fp, void* ftable[SHT_NVAR][SHT_NTYP])
1748{
1749 for (int iv=0; iv<SHT_NVAR; iv++) {
1750 fprintf(fp, "\n %4s:",sht_var[iv]);
1751 void** f = ftable[iv];
1752 for (int it=0; it<SHT_NTYP; it++) {
1753 if (f[it] != NULL) {
1754 for (int ia=0; ia<SHT_NALG; ia++)
1755 if (sht_func[iv][ia][it] == f[it]) {
1756 fprintf(fp, "%5s ",sht_name[ia]); break;
1757 }
1758 } else fprintf(fp, " none ");
1759 }
1760 }
1761}
1762
1763void shtns_print_cfg(shtns_cfg shtns)
1764{
1765 printf("Lmax=%d, Mmax*Mres=%d, Mres=%d, Nlm=%d [%d threads, ",LMAX, MMAX*MRES, MRES, NLM, shtns->nthreads);
1766 if (shtns->norm & SHT_REAL_NORM) printf("'real' norm, ");
1767 if (shtns->norm & SHT_NO_CS_PHASE) printf("no Condon-Shortley phase, ");
1768 if (SHT_NORM == sht_fourpi) printf("4.pi normalized]\n");
1769 else if (SHT_NORM == sht_schmidt) printf("Schmidt semi-normalized]\n");
1770 else printf("orthonormalized]\n");
1771 if (shtns->ct == NULL) return; // no grid is set
1772
1773 switch(shtns->grid) {
1774 case GRID_GAUSS : printf("Gauss grid"); break;
1775 case GRID_REGULAR : printf("Regular grid (mtr_dct=%d)",shtns->mtr_dct); break;
1776 case GRID_POLES : printf("Regular grid including poles"); break;
1777 default : printf("Unknown grid");
1778 }
1779 printf(" : Nlat=%d, Nphi=%d\n", NLAT, NPHI);
1780 printf(" ");
1781 for (int it=0; it<SHT_NTYP; it++)
1782 printf("%5s ",sht_type[it]);
1783 fprint_ftable(stdout, shtns->ftable);
1784 printf("\n");
1785}
1786
1787
1788/// \internal saves config to a file for later restart.
1789int config_save(shtns_cfg shtns, int req_flags)
1790{
1791 int err = 0;
1792
1793 if (shtns->ct == NULL) return -1; // no grid set
1794
1795 if ((shtns->nphi > 1)||(shtns->mtr_dct >= 0)) {
1796 FILE* f = fopen("shtns_cfg_fftw","w");
1797 if (f != NULL) {
1798 fftw_export_wisdom_to_file(f);
1799 fclose(f);
1800 } else err -= 2;
1801 }
1802
1803 FILE *fcfg = fopen("shtns_cfg","a");
1804 if (fcfg != NULL) {
1805 fprintf(fcfg, "%s %s %d %d %d %d %d %d %d %d %d %d",PACKAGE_VERSION, _SIMD_NAME_, shtns->lmax, shtns->mmax, shtns->mres, shtns->nphi, shtns->nlat, shtns->grid, shtns->nthreads, req_flags, shtns->nlorder, shtns->mtr_dct);
1806 fprint_ftable(fcfg, shtns->ftable);
1807 fprintf(fcfg,"\n");
1808 fclose(fcfg);
1809 } else err -= 4;
1810
1811 #if SHT_VERBOSE > 0
1812 if (err < 0) fprintf(stderr,"! Warning ! SHTns could not save config\n");
1813 #endif
1814 return err;
1815}
1816
1817/// \internal try to load config from a file
1818int config_load(shtns_cfg shtns, int req_flags)
1819{
1820 void* ft2[SHT_NVAR][SHT_NTYP]; // pointers to transform functions.
1821 int lmax2, mmax2, mres2, nphi2, nlat2, grid2, nthreads2, req_flags2, nlorder2, mtr_dct2;
1822 int found = 0;
1823 char version[32], simd[8], alg[8];
1824
1825 if (shtns->ct == NULL) return -1; // no grid set
1826
1827 if ((req_flags & 255) == sht_quick_init) req_flags += sht_gauss - sht_quick_init; // quick_init uses gauss.
1828
1829 FILE *fcfg = fopen("shtns_cfg","r");
1830 if (fcfg != NULL) {
1831 int i=0;
1832 while(1) {
1833 fscanf(fcfg, "%30s %8s %d %d %d %d %d %d %d %d %d %d",version, simd, &lmax2, &mmax2, &mres2, &nphi2, &nlat2, &grid2, &nthreads2, &req_flags2, &nlorder2, &mtr_dct2);
1834 for (int iv=0; iv<SHT_NVAR; iv++) {
1835 fscanf(fcfg, "%7s", alg);
1836 for (int it=0; it<SHT_NTYP; it++) {
1837 fscanf(fcfg, "%7s", alg),
1838 ft2[iv][it] = 0;
1839 for (int ia=0; ia<SHT_NALG; ia++) {
1840 if (strcmp(alg, sht_name[ia]) == 0) {
1841 ft2[iv][it] = sht_func[iv][ia][it];
1842 break;
1843 }
1844 }
1845 }
1846 }
1847 if (feof(fcfg)) break;
1848 #ifndef SHTNS_DCT
1849 if (mtr_dct2 <= 0)
1850 #endif
1851 if ((shtns->lmax == lmax2) && (shtns->mmax == mmax2) && (shtns->mres == mres2) && (shtns->nthreads == nthreads2) &&
1852 (shtns->nphi == nphi2) && (shtns->nlat == nlat2) && (shtns->grid == grid2) && (req_flags == req_flags2) &&
1853 (shtns->nlorder == nlorder2) && (strcmp(simd, _SIMD_NAME_)==0)) {
1854 #if SHT_VERBOSE > 0
1855 if (verbose > 0) printf(" + using saved config\n");
1856 #endif
1857 #if SHT_VERBOSE > 1
1858 if (verbose > 1) {
1859 fprint_ftable(stdout, ft2);
1860 printf("\n");
1861 }
1862 #endif
1863 #ifdef SHTNS_DCT
1864 Set_MTR_DCT(shtns, mtr_dct2); // use loaded mtr_dct
1865 #endif
1866 for (int iv=0; iv<SHT_NVAR; iv++)
1867 for (int it=0; it<SHT_NTYP; it++)
1868 if (ft2[iv][it]) shtns->ftable[iv][it] = ft2[iv][it]; // accept only non-null pointer
1869 found = 1;
1870 break;
1871 }
1872 }
1873 fclose(fcfg);
1874 return found;
1875 } else {
1876 #if SHT_VERBOSE > 0
1877 if (verbose) fprintf(stderr,"! Warning ! SHTns could not load config\n");
1878 #endif
1879 return -2; // file not found
1880 }
1881}
1882
1883/// \internal returns 1 if val cannot fit in dest (unsigned)
1884#define IS_TOO_LARGE(val, dest) (sizeof(dest) >= sizeof(val)) ? 0 : ( ( val >= (1<<(8*sizeof(dest))) ) ? 1 : 0 )
1885
1886/// \internal returns the size that must be allocated for an shtns_info.
1887#define SIZEOF_SHTNS_INFO(mmax) ( sizeof(struct shtns_info) + (mmax+1)*( sizeof(int)+sizeof(unsigned short) ) )
1888
1889/* PUBLIC INITIALIZATION & DESTRUCTION */
1890
1891/** \addtogroup init Initialization functions.
1892*/
1893//@{
1894
1895/*! This sets the description of spherical harmonic coefficients.
1896 * It tells SHTns how to interpret spherical harmonic coefficient arrays, and it sets usefull arrays.
1897 * Returns the configuration to be passed to subsequent transform functions, which is basicaly a pointer to a \ref shtns_info struct.
1898 * \param lmax : maximum SH degree that we want to describe.
1899 * \param mmax : number of azimutal wave numbers.
1900 * \param mres : \c 2.pi/mres is the azimutal periodicity. \c mmax*mres is the maximum SH order.
1901 * \param norm : define the normalization of the spherical harmonics (\ref shtns_norm)
1902 * + optionaly disable Condon-Shortley phase (ex: \ref sht_schmidt | \ref SHT_NO_CS_PHASE)
1903 * + optionaly use a 'real' normalization (ex: \ref sht_fourpi | \ref SHT_REAL_NORM)
1904*/
1905shtns_cfg shtns_create(int lmax, int mmax, int mres, enum shtns_norm norm)
1906{
1907 shtns_cfg shtns, s2;
1908 int im, m, l, lm;
1909 int with_cs_phase = 1; /// Condon-Shortley phase (-1)^m is used by default.
1910 double mpos_renorm = 1.0; /// renormalization of m>0.
1911 int larrays_ok = 0;
1912 int legendre_ok = 0;
1913 int l_2_ok = 0;
1914
1915// if (lmax < 1) shtns_runerr("lmax must be larger than 1");
1916 if (lmax < 2) shtns_runerr("lmax must be at least 2");
1917 if (IS_TOO_LARGE(lmax, shtns->lmax)) shtns_runerr("lmax too large");
1918 if (mmax*mres > lmax) shtns_runerr("MMAX*MRES should not exceed LMAX");
1919 if (mres <= 0) shtns_runerr("MRES must be > 0");
1920
1921 // allocate new setup and initialize some variables (used as flags) :
1922 shtns = malloc( SIZEOF_SHTNS_INFO(mmax) );
1923 if (shtns == NULL) return shtns; // FAIL
1924 {
1925 void **p0 = (void**) &shtns->tm; // first pointer in struct.
1926 void **p1 = (void**) &shtns->Y00_1; // first non-pointer.
1927 while(p0 < p1) *p0++ = NULL; // write NULL to every pointer.
1928 shtns->lmidx = (int*) (shtns + 1); // lmidx is stored at the end of the struct...
1929 shtns->tm = (unsigned short*) (shtns->lmidx + (mmax+1)); // and tm just after.
1930 shtns->ct = NULL; shtns->st = NULL;
1931 shtns->nphi = 0; shtns->nlat = 0; shtns->nlat_2 = 0; shtns->nspat = 0; // public data
1932 }
1933
1934 // copy sizes.
1935 shtns->norm = norm;
1936 if (norm & SHT_NO_CS_PHASE)
1937 with_cs_phase = 0;
1938 if (norm & SHT_REAL_NORM)
1939 mpos_renorm = 0.5; // normalization for 'real' spherical harmonics.
1940
1941 shtns->mmax = mmax; shtns->mres = mres; shtns->lmax = lmax;
1942 shtns->nlm = nlm_calc(lmax, mmax, mres);
1943 shtns->nthreads = omp_threads;
1944 if (omp_threads > mmax+1) shtns->nthreads = mmax+1; // limit the number of threads to mmax+1
1945 #if SHT_VERBOSE > 0
1946 if (verbose) {
1947 shtns_print_version();
1948 printf(" "); shtns_print_cfg(shtns);
1949 }
1950 #endif
1951
1952 s2 = sht_data; // check if some data can be shared ...
1953 while(s2 != NULL) {
1954 if ((s2->mmax >= mmax) && (s2->mres == mres)) {
1955 if (s2->lmax == lmax) { // we can reuse the l-related arrays (li + copy lmidx)
1956 shtns->li = s2->li; shtns->mi = s2->mi;
1957 for (im=0; im<=mmax; im++) shtns->lmidx[im] = s2->lmidx[im];
1958 larrays_ok = 1;
1959 }
1960 if ( (s2->lmax >= lmax) && (s2->norm == norm) ) { // we can reuse the legendre tables.
1961 shtns->alm = s2->alm; shtns->blm = s2->blm;
1962 legendre_ok = 1;
1963 }
1964 }
1965 if (s2->lmax >= lmax) { // we can reuse l_2
1966 shtns->l_2 = s2->l_2;
1967 l_2_ok = 1;
1968 }
1969 s2 = s2->next;
1970 }
1971 if (larrays_ok == 0) {
1972 // alloc spectral arrays
1973 shtns->li = (unsigned short *) malloc( 2*NLM*sizeof(unsigned short) ); // NLM defined at runtime.
1974 shtns->mi = shtns->li + NLM;
1975 for (im=0, lm=0; im<=MMAX; im++) { // init l-related arrays.
1976 m = im*MRES;
1977 shtns->lmidx[im] = lm -m; // virtual pointer for l=0
1978 for (l=im*MRES;l<=LMAX;l++) {
1979 shtns->li[lm] = l; shtns->mi[lm] = m;
1980 lm++;
1981 }
1982 }
1983 if (lm != NLM) shtns_runerr("unexpected error");
1984 }
1985 if (legendre_ok == 0) { // this quickly precomputes some values for the legendre recursion.
1986 legendre_precomp(shtns, SHT_NORM, with_cs_phase, mpos_renorm);
1987 }
1988 if (l_2_ok == 0) {
1989 shtns->l_2 = (double *) malloc( (LMAX+1)*sizeof(double) );
1990 shtns->l_2[0] = 0.0; // undefined for l=0 => replace with 0.
1991 real one = 1.0;
1992 for (l=1; l<=LMAX; l++) shtns->l_2[l] = one/(l*(l+1));
1993 }
1994
1995 switch(SHT_NORM) {
1996 case sht_schmidt:
1997 shtns->Y00_1 = 1.0; shtns->Y10_ct = 1.0;
1998 break;
1999 case sht_fourpi:
2000 shtns->Y00_1 = 1.0; shtns->Y10_ct = sqrt(1./3.);
2001 break;
2002 case sht_orthonormal:
2003 default:
2004 shtns->Y00_1 = sqrt(4.*M_PI); shtns->Y10_ct = sqrt(4.*M_PI/3.);
2005// Y11_st = sqrt(2.*M_PI/3.); // orthonormal : \f$ \sin\theta\cos\phi/(Y_1^1 + Y_1^{-1}) = -\sqrt{2 \pi /3} \f$
2006 }
2007 shtns->Y11_st = shtns->Y10_ct * sqrt(0.5/mpos_renorm);
2008 if (with_cs_phase) shtns->Y11_st *= -1.0; // correct Condon-Shortley phase
2009
2010// save a pointer to this setup and return.
2011 shtns->next = sht_data; // reference of previous setup (may be NULL).
2012 sht_data = shtns; // keep track of new setup.
2013 return(shtns);
2014}
2015
2016/// Copy a given config but allow a different (smaller) mmax and the possibility to enable/disable fft (beta).
2017shtns_cfg shtns_create_with_grid(shtns_cfg base, int mmax, int nofft)
2018{
2019 shtns_cfg shtns;
2020
2021 if (mmax > base->mmax) return (NULL); // fail if mmax larger than source config.
2022
2023 shtns = malloc( SIZEOF_SHTNS_INFO(mmax) );
2024 memcpy(shtns, base, SIZEOF_SHTNS_INFO(mmax) ); // copy all
2025 shtns->lmidx = (int*) shtns+1; // lmidx is stored at the end of the struct...
2026 shtns->tm = (unsigned short*) (shtns->lmidx + (mmax+1)); // ...and tm just after.
2027
2028 if (mmax != shtns->mmax) {
2029 shtns->mmax = mmax;
2030 for (int im=0; im<=mmax; im++) {
2031 shtns->lmidx[im] = base->lmidx[im];
2032 shtns->tm[im] = base->tm[im];
2033 }
2034 #ifdef SHTNS_DCT
2035 if (mmax < shtns->mtr_dct) {
2036 shtns->idct = NULL; // do not destroy the plan of the source.
2037 Set_MTR_DCT(shtns, mmax); // adjut mtr_dct if required.
2038 }
2039 #endif
2040 if (mmax == 0) {
2041 // TODO we may disable fft and replace with a phi-averaging function ...
2042 // ... then switch to axisymmetric functions :
2043 // init_sht_array_func(shtns);
2044 // choose_best_sht(shtns, &nloop, 0);
2045 }
2046 }
2047 if (nofft != 0) {
2048 shtns->ncplx_fft = -1; // fft disabled.
2049 }
2050
2051// save a pointer to this setup and return.
2052 shtns->next = sht_data; // reference of previous setup (may be NULL).
2053 sht_data = shtns; // keep track of new setup.
2054 return(shtns);
2055}
2056
2057/// release all resources allocated by a grid.
2058void shtns_unset_grid(shtns_cfg shtns)
2059{
2060 if (ref_count(shtns, &shtns->wg) == 1) VFREE(shtns->wg);
2061 shtns->wg = NULL;
2062 free_SH_dct(shtns);
2063 free_SHTarrays(shtns);
2064 shtns->nlat = 0; shtns->nlat_2 = 0;
2065 shtns->nphi = 0; shtns->nspat = 0;
2066}
2067
2068/// release all resources allocated by a given shtns_cfg.
2069void shtns_destroy(shtns_cfg shtns)
2070{
2071 free_unused(shtns, &shtns->l_2);
2072 if (shtns->blm != shtns->alm)
2073 free_unused(shtns, &shtns->blm);
2074 free_unused(shtns, &shtns->alm);
2075 free_unused(shtns, &shtns->li);
2076
2077 shtns_unset_grid(shtns);
2078
2079 if (sht_data == shtns) {
2080 sht_data = shtns->next; // forget shtns
2081 } else {
2082 shtns_cfg s2 = sht_data;
2083 while (s2 != NULL) {
2084 if (s2->next == shtns) {
2085 s2->next = shtns->next; // forget shtns
2086 break;
2087 }
2088 s2 = s2->next;
2089 }
2090 }
2091 free(shtns);
2092}
2093
2094/// clear all allocated memory (hopefully) and go back to 0 state.
2095void shtns_reset()
2096{
2097 while (sht_data != NULL) {
2098 shtns_destroy(sht_data);
2099 }
2100}
2101
2102/*! Initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
2103 * <b>This function must be called after \ref shtns_create and before any SH transform.</b> and sets all global variables and internal data.
2104 * returns the required number of doubles to be allocated for a spatial field.
2105 * \param shtns is the config created by \ref shtns_create for which the grid will be set.
2106 * \param nlat,nphi pointers to the number of latitudinal and longitudinal grid points respectively. If 0, they are set to optimal values.
2107 * \param nl_order defines the maximum SH degree to be resolved by analysis : lmax_analysis = lmax*nl_order. It is used to set an optimal and anti-aliasing nlat. If 0, the default SHT_DEFAULT_NL_ORDER is used.
2108 * \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
2109 * \param eps polar optimization threshold : polar values of Legendre Polynomials below that threshold are neglected (for high m), leading to increased performance (a few percents)
2110 * 0 = no polar optimization; 1.e-14 = VERY safe; 1.e-10 = safe; 1.e-6 = aggresive, but still good accuracy.
2111*/
2112int shtns_set_grid_auto(shtns_cfg shtns, enum shtns_type flags, double eps, int nl_order, int *nlat, int *nphi)
2113{
2114 double t, mem;
2115 int im,m;
2116 int layout;
2117 int nloop = 0;
2118 int n_gauss = 0;
2119 int on_the_fly = 0;
2120 int quick_init = 0;
2121 int vector = !(flags & SHT_SCALAR_ONLY);
2122 int latdir = (flags & SHT_SOUTH_POLE_FIRST) ? -1 : 1; // choose latitudinal direction (change sign of ct)
2123 int cfg_loaded = 0;
2124 int analys = 1;
2125 const int req_flags = flags; // requested flags.
2126
2127 #if _GCC_VEC_
2128 if (*nlat & 1) shtns_runerr("Nlat must be even\n");
2129 #ifdef __MIC__
2130 if (*nlat % VSIZE2) shtns_runerr("Nlat must be a multiple of 8 for the MIC\n");
2131 #endif
2132 #endif
2133 shtns_unset_grid(shtns); // release grid if previously allocated.
2134 if (nl_order <= 0) nl_order = SHT_DEFAULT_NL_ORDER;
2135/* shtns.lshift = 0;
2136 if (nl_order == 0) nl_order = SHT_DEFAULT_NL_ORDER;
2137 if (nl_order < 0) { shtns.lshift = -nl_order; nl_order = 1; } // linear with a shift in l.
2138*/
2139 shtns->nspat = 0;
2140 shtns->nlorder = nl_order;
2141 shtns->mtr_dct = -1; // dct switched off completely.
2142 layout = flags & 0xFFFF00;
2143 flags = flags & 255; // clear higher bits.
2144
2145 switch (flags) {
2146 #ifndef SHTNS_DCT
2147 case sht_auto : flags = sht_gauss; break; // only gauss available.
2148 case sht_reg_fast:
2149 case sht_reg_dct: shtns_runerr("regular grid not available (DCT required)."); break;
2150 #endif
2151 case sht_gauss_fly : flags = sht_gauss; on_the_fly = 1; break;
2152 case sht_quick_init : flags = sht_gauss; quick_init = 1; break;
2153 case sht_reg_poles : analys = 0; quick_init = 1; break;
2154 default : break;
2155 }
2156 #ifndef SHTNS_MEM
2157 on_the_fly = 1;
2158 #endif
2159
2160 if (*nphi == 0) {
2161 *nphi = fft_int((nl_order+1)*MMAX+1, 7); // required fft nodes
2162 }
2163 if (*nlat == 0) {
2164 n_gauss = ((nl_order+1)*LMAX)/2 +1; // required gauss nodes
2165 n_gauss += (n_gauss&1); // even is better.
2166 n_gauss = ((n_gauss+(VSIZE2-1))/VSIZE2) * VSIZE2; // multiple of vector size
2167 if (flags != sht_gauss) {
2168 m = fft_int(nl_order*LMAX+2, 7); // required dct nodes
2169 *nlat = m + (m&1); // even is better.
2170 } else *nlat = n_gauss;
2171 }
2172
2173 mem = sht_mem_size(shtns->lmax, shtns->mmax, shtns->mres, *nlat);
2174 t=mem; if (analys) t*=2; if (vector) t*=3;
2175 #if SHT_VERBOSE > 1
2176 if (verbose>1) printf("Memory required for precomputed matrices (estimate) : %.3f Mb\n",t);
2177 #endif
2178 if ( t > SHTNS_MAX_MEMORY ) { // huge transform has been requested
2179 on_the_fly = 1;
2180 if ( (flags == sht_reg_dct) || (flags == sht_reg_fast) ) shtns_runerr("Memory limit exceeded, try using sht_gauss or increase SHTNS_MAX_MEMORY in sht_config.h");
2181 if (flags != sht_reg_poles) {
2182 flags = sht_gauss;
2183 if (n_gauss > 0) *nlat = n_gauss;
2184 }
2185// if (t > 10*SHTNS_MAX_MEMORY) quick_init =1; // do not time such large transforms.
2186 }
2187
2188 if (quick_init == 0) { // do not waste too much time finding optimal fftw.
2189 shtns->fftw_plan_mode = FFTW_EXHAUSTIVE; // defines the default FFTW planner mode.
2190 // fftw_set_timelimit(60.0); // do not search plans for more than 1 minute (does it work well ???)
2191 if (*nphi > 512) shtns->fftw_plan_mode = FFTW_PATIENT;
2192 if (*nphi > 1024) shtns->fftw_plan_mode = FFTW_MEASURE;
2193 } else {
2194 shtns->fftw_plan_mode = FFTW_ESTIMATE;
2195 if ((mem < 1.0) && (SHT_VERBOSE < 2)) shtns->nthreads = 1; // disable threads for small transforms (in quickinit mode).
2196 if ((VSIZE2 >= 4) && (*nlat >= VSIZE2*4)) on_the_fly = 1; // with AVX, on-the-fly should be the default (faster).
2197 if ((shtns->nthreads > 1) && (*nlat >= VSIZE2*16)) on_the_fly = 1; // force multi-thread transforms
2198 }
2199
2200 if (flags == sht_auto) {
2201 if ( ((nl_order>=2)&&(MMAX*MRES > LMAX/2)) || (*nlat < SHT_MIN_NLAT_DCT) || (*nlat & 1) || (*nlat <= LMAX+1) ) {
2202 flags = sht_gauss; // avoid computing DCT stuff when it is not expected to be faster.
2203 if (n_gauss > 0) *nlat = n_gauss;
2204 }
2205 }
2206
2207 if (*nlat <= shtns->lmax) shtns_runerr("Nlat must be larger than Lmax");
2208 if (IS_TOO_LARGE(*nlat, shtns->nlat)) shtns_runerr("Nlat too large");
2209 if (IS_TOO_LARGE(*nphi, shtns->nphi)) shtns_runerr("Nphi too large");
2210
2211 // copy to global variables.
2212 shtns->nphi = *nphi;
2213 shtns->nlat_2 = (*nlat+1)/2; shtns->nlat = *nlat;
2214
2215 if (layout & SHT_LOAD_SAVE_CFG) {
2216 FILE* f = fopen("shtns_cfg_fftw","r");
2217 if (f) {
2218 fftw_import_wisdom_from_file(f); // load fftw wisdom.
2219 fclose(f);
2220 }
2221 }
2222 planFFT(shtns, layout, on_the_fly); // initialize fftw
2223 shtns->zlm_dct0 = NULL; // used as a flag.
2224 init_sht_array_func(shtns); // array of SHT functions is now set.
2225
2226 #ifdef SHTNS_DCT
2227 if (flags == sht_reg_dct) { // pure dct.
2228 alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
2229 grid_dct(shtns, latdir); init_SH_dct(shtns, 1);
2230 OptimizeMatrices(shtns, eps);
2231 Set_MTR_DCT(shtns, MMAX);
2232 if (MTR_DCT != MMAX) shtns_runerr("DCT planning failed.");
2233 }
2234 if ((flags == sht_auto)||(flags == sht_reg_fast))
2235 {
2236 alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
2237 grid_dct(shtns, latdir); init_SH_dct(shtns, 1);
2238 OptimizeMatrices(shtns, eps);
2239 if (NLAT >= SHT_MIN_NLAT_DCT) { // dct requires large NLAT to perform well.
2240 if ((layout & SHT_LOAD_SAVE_CFG) && (flags == sht_reg_fast))
2241 cfg_loaded = (config_load(shtns, req_flags) > 0);
2242 if (!cfg_loaded) {
2243 t = choose_best_sht(shtns, &nloop, vector, 1); // find optimal MTR_DCT.
2244 if ((n_gauss > 0)&&(flags == sht_auto)) t *= ((double) n_gauss)/NLAT; // we can revert to gauss with a smaller nlat.
2245 if (t < MIN_PERF_IMPROVE_DCT) {
2246 Set_MTR_DCT(shtns, -1); // turn off DCT.
2247 } else {
2248 t = SHT_error(shtns, vector);
2249 if (t > MIN_ACCURACY_DCT) {
2250 #if SHT_VERBOSE > 0
2251 if (verbose) printf(" !! Not enough accuracy (%.3g) => DCT disabled.\n",t);
2252 #endif
2253 #if SHT_VERBOSE < 2
2254 Set_MTR_DCT(shtns, -1); // turn off DCT.
2255 #endif
2256 }
2257 }
2258 }
2259 }
2260 if (MTR_DCT < 0) { // free memory used by DCT and disables DCT.
2261 free_SH_dct(shtns); // free now useless arrays.
2262 if (flags == sht_auto) {
2263 flags = sht_gauss; // switch to gauss grid, even better accuracy.
2264 #if SHT_VERBOSE > 0
2265 if (verbose) printf(" => switching back to Gauss Grid\n");
2266 #endif
2267 for (im=1; im<=MMAX; im++) { // im >= 1
2268 m = im*MRES;
2269 shtns->ylm[im] -= shtns->tm[im]*(LMAX-m+1); // restore pointers altered by OptimizeMatrices().
2270 if (vector) shtns->dylm[im] -= shtns->tm[im]*(LMAX-m+1);
2271 }
2272 if (n_gauss > 0) { // we should use the optimal size for gauss-legendre
2273 free_SHTarrays(shtns);
2274 *nlat = n_gauss;
2275 shtns->nlat_2 = (*nlat+1)/2; shtns->nlat = *nlat;
2276 planFFT(shtns, layout, on_the_fly); // fft must be replanned because NLAT has changed.
2277 }
2278 }
2279 }
2280 }
2281 #endif /* SHTNS_DCT */
2282 if (flags == sht_gauss)
2283 {
2284 alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
2285 grid_gauss(shtns, latdir);
2286 #ifdef SHTNS_MEM
2287 if (on_the_fly == 0) {
2288 init_SH_gauss(shtns); // precompute matrices
2289 OptimizeMatrices(shtns, eps);
2290 }
2291 #endif
2292 }
2293 if (flags == sht_reg_poles)
2294 {
2295 alloc_SHTarrays(shtns, on_the_fly, vector, 0); // allocate dynamic arrays (no analysis)
2296 grid_equal_polar(shtns, latdir);
2297 #ifdef SHTNS_MEM
2298 if (on_the_fly == 0) {
2299 init_SH_synth(shtns);
2300 for (im=0; im<=MMAX; im++) shtns->tm[im] = 0; // avoid problems with tm[im] modified ????
2301 }
2302 #endif
2303 }
2304
2305 if (on_the_fly == 1) {
2306 #if SHT_VERBOSE > 0
2307 if (verbose) printf(" + using on-the-fly transforms.\n");
2308 #endif
2309 if (NLAT < VSIZE2*4) shtns_runerr("on-the-fly only available for nlat>=32"); // avoid overflow with NLAT_2 < VSIZE2*2
2310 PolarOptimize(shtns, eps);
2311 set_sht_fly(shtns, 0); // switch function pointers to "on-the-fly" functions.
2312 }
2313
2314 if ((layout & SHT_LOAD_SAVE_CFG) && (!cfg_loaded)) cfg_loaded = (config_load(shtns, req_flags) > 0);
2315 if (quick_init == 0) {
2316 if (!cfg_loaded) {
2317 choose_best_sht(shtns, &nloop, vector, 0);
2318 if (layout & SHT_LOAD_SAVE_CFG) config_save(shtns, req_flags);
2319 }
2320 #ifdef SHTNS_MEM
2321 if (on_the_fly == 0) free_unused_matrices(shtns);
2322 #endif
2323 t = SHT_error(shtns, vector); // compute SHT accuracy.
2324 #if SHT_VERBOSE > 0
2325 if (verbose) printf(" + SHT accuracy = %.3g\n",t);
2326 #endif
2327 #if SHT_VERBOSE < 2
2328 if (t > 1.e-3) {
2329 shtns_print_cfg(shtns);
2330 shtns_runerr("bad SHT accuracy"); // stop if something went wrong (but not in debug mode)
2331 }
2332 #endif
2333 }
2334
2335// set_sht_fly(shtns, SHT_TYP_VAN);
2336 #if SHT_VERBOSE > 1
2337 if ((omp_threads > 1)&&(verbose>1)) printf(" nthreads = %d\n",shtns->nthreads);
2338 #endif
2339 #if SHT_VERBOSE > 0
2340 if (verbose) printf(" => " PACKAGE_NAME " is ready.\n");
2341 #endif
2342 return(shtns->nspat); // returns the number of doubles to be allocated for a spatial field.
2343}
2344
2345
2346/*! Initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
2347 * <b>This function must be called after \ref shtns_create and before any SH transform.</b> and sets all global variables.
2348 * returns the required number of doubles to be allocated for a spatial field.
2349 * \param shtns is the config created by shtns_create for which the grid will be set.
2350 * \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
2351 * \param eps polar optimization threshold : polar values of Legendre Polynomials below that threshold are neglected (for high m), leading to increased performance (a few percents)
2352 * \param nlat,nphi respectively the number of latitudinal and longitudinal grid points.
2353 * 0 = no polar optimization; 1.e-14 = VERY safe; 1.e-10 = safe; 1.e-6 = aggresive, but still good accuracy.
2354*/
2355int shtns_set_grid(shtns_cfg shtns, enum shtns_type flags, double eps, int nlat, int nphi)
2356{
2357 if ((nlat == 0)||(nphi == 0)) shtns_runerr("nlat or nphi is zero !");
2358 return( shtns_set_grid_auto(shtns, flags, eps, 0, &nlat, &nphi) );
2359}
2360
2361/*! Simple initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
2362 * This function sets all global variables by calling \ref shtns_create followed by \ref shtns_set_grid, with the
2363 * default normalization and the default polar optimization (see \ref sht_config.h).
2364 * Returns the configuration to be passed to subsequent transform functions, which is basicaly a pointer to a \ref shtns_info struct.
2365 * \param lmax : maximum SH degree that we want to describe.
2366 * \param mmax : number of azimutal wave numbers.
2367 * \param mres : \c 2.pi/mres is the azimutal periodicity. \c mmax*mres is the maximum SH order.
2368 * \param nlat,nphi : respectively the number of latitudinal and longitudinal grid points.
2369 * \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
2370*/
2371shtns_cfg shtns_init(enum shtns_type flags, int lmax, int mmax, int mres, int nlat, int nphi)
2372{
2373 shtns_cfg shtns = shtns_create(lmax, mmax, mres, SHT_DEFAULT_NORM);
2374 if (shtns != NULL)
2375 shtns_set_grid(shtns, flags, SHT_DEFAULT_POLAR_OPT, nlat, nphi);
2376 return shtns;
2377}
2378
2379/** Enables OpenMP parallel transforms, if available (see \ref compil).
2380 Call before any initialization of shtns to use mutliple threads. Returns the actual number of threads.
2381 \li If num_threads > 0, specifies the maximum number of threads that should be used.
2382 \li If num_threads <= 0, maximum number of threads is automatically set to the number of processors.
2383 \li If num_threads == 1, openmp will be disabled. */
2384int shtns_use_threads(int num_threads)
2385{
2386#ifdef _OPENMP
2387 int procs = omp_get_num_procs();
2388 if (num_threads <= 0) num_threads = omp_get_max_threads();
2389 else if (num_threads > 4*procs) num_threads = 4*procs; // limit the number of threads
2390 omp_threads = num_threads;
2391#endif
2392#ifdef OMP_FFTW
2393 fftw_init_threads(); // enable threads for FFTW.
2394#endif
2395 return omp_threads;
2396}
2397
2398/// fill the given array with Gauss weights. returns the number of weights written, which
2399/// may be zero if the grid is not a Gauss grid.
2400int shtns_gauss_wts(shtns_cfg shtns, double *wts)
2401{
2402 int i = 0;
2403 if (shtns->wg) {
2404 double rescale = 2*NPHI; // weights are stored with a rescaling that depends on SHT_NORM.
2405 if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) rescale *= 0.25/M_PI;
2406
2407 do {
2408 wts[i] = shtns->wg[i] * rescale;
2409 } while(++i < shtns->nlat_2);
2410 }
2411 return i;
2412}
2413
2414//@}
2415
2416
2417#ifdef SHT_F77_API
2418
2419/* FORTRAN API */
2420
2421/** \addtogroup fortapi Fortran API.
2422* Call from fortran without the trailing '_'.
2423* see the \link SHT_example.f Fortran example \endlink for a simple usage of SHTns from Fortran language.
2424*/
2425//@{
2426
2427/// Set verbosity level
2428void shtns_verbose_(int *v)
2429{
2430 shtns_verbose(*v);
2431}
2432
2433/// Enable threads
2434void shtns_use_threads_(int *num_threads)
2435{
2436 shtns_use_threads(*num_threads);
2437}
2438
2439/// Print info
2440void shtns_print_cfg_()
2441{
2442 shtns_print_version();
2443 if (sht_data) shtns_print_cfg(sht_data);
2444}
2445
2446/// Initializes spherical harmonic transforms of given size using Gauss algorithm with default polar optimization.
2447void shtns_init_sh_gauss_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
2448{
2449 shtns_reset();
2450 shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
2451 shtns_set_grid(shtns, sht_gauss | *layout, SHT_DEFAULT_POLAR_OPT, *nlat, *nphi);
2452}
2453
2454/// Initializes spherical harmonic transforms of given size using Fastest available algorithm and polar optimization.
2455void shtns_init_sh_auto_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
2456{
2457 shtns_reset();
2458 shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
2459 shtns_set_grid(shtns, sht_auto | *layout, SHT_DEFAULT_POLAR_OPT, *nlat, *nphi);
2460}
2461
2462/// Initializes spherical harmonic transforms of given size using a regular grid and agressive optimizations.
2463void shtns_init_sh_reg_fast_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
2464{
2465 shtns_reset();
2466 shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
2467 shtns_set_grid(shtns, sht_reg_fast | *layout, 1.e-6, *nlat, *nphi);
2468}
2469
2470/// Initializes spherical harmonic transform SYNTHESIS ONLY of given size using a regular grid including poles.
2471void shtns_init_sh_poles_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
2472{
2473 shtns_reset();
2474 shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
2475 shtns_set_grid(shtns, sht_reg_poles | *layout, 0, *nlat, *nphi);
2476}
2477
2478/// Defines the size and convention of the transform.
2479/// Allow to choose the normalization and whether or not to include the Condon-Shortley phase.
2480/// \see shtns_create
2481void shtns_set_size_(int *lmax, int *mmax, int *mres, int *norm)
2482{
2483 shtns_reset();
2484 shtns_create(*lmax, *mmax, *mres, *norm);
2485}
2486
2487/// Precompute matrices for synthesis and analysis.
2488/// Allow to choose polar optimization threshold and algorithm type.
2489/// \see shtns_set_grid
2490void shtns_precompute_(int *type, int *layout, double *eps, int *nlat, int *nphi)
2491{
2492 shtns_set_grid(sht_data, *type | *layout, *eps, *nlat, *nphi);
2493}
2494
2495/// Same as shtns_precompute_ but choose optimal nlat and/or nphi.
2496/// \see shtns_set_grid_auto
2497void shtns_precompute_auto_(int *type, int *layout, double *eps, int *nl_order, int *nlat, int *nphi)
2498{
2499 shtns_set_grid_auto(sht_data, *type | *layout, *eps, *nl_order, nlat, nphi);
2500}
2501
2502/// Clear everything
2503void shtns_reset_() {
2504 shtns_reset();
2505}
2506
2507/// returns nlm, the number of complex*16 elements in an SH array.
2508/// call from fortran using \code call shtns_calc_nlm(nlm, lmax, mmax, mres) \endcode
2509void shtns_calc_nlm_(int *nlm, const int *const lmax, const int *const mmax, const int *const mres)
2510{
2511 *nlm = nlm_calc(*lmax, *mmax, *mres);
2512}
2513
2514/// returns lm, the index in an SH array of mode (l,m).
2515/// call from fortran using \code call shtns_lmidx(lm, l, m) \endcode
2516void shtns_lmidx_(int *lm, const int *const l, const int *const m)
2517{
2518 unsigned im = *m;
2519 unsigned mres = sht_data->mres;
2520 if (mres > 1) {
2521 unsigned k = im % mres;
2522 im = im / mres;
2523 if (k) printf("wrong m");
2524 }
2525 *lm = LiM(sht_data, *l, im) + 1; // convert to fortran convention index.
2526}
2527
2528/// returns l and m, degree and order of an index in SH array lm.
2529/// call from fortran using \code call shtns_l_m(l, m, lm) \endcode
2530void shtns_l_m_(int *l, int *m, const int *const lm)
2531{
2532 *l = sht_data->li[*lm -1]; // convert from fortran convention index.
2533 *m = sht_data->mi[*lm -1];
2534}
2535
2536/// fills the given array with the cosine of the co-latitude angle (NLAT real*8)
2537/// if no grid has been set, the first element will be set to zero.
2538void shtns_cos_array_(double *costh)
2539{
2540 if (sht_data->ct) {
2541 for (int i=0; i<sht_data->nlat; i++)
2542 costh[i] = sht_data->ct[i];
2543 } else costh[0] = 0.0; // mark as invalid.
2544}
2545
2546/// fills the given array with the gaussian quadrature weights ((NLAT+1)/2 real*8).
2547/// when there is no gaussian grid, the first element is set to zero.
2548void shtns_gauss_wts_(double *wts)
2549{
2550 int i = shtns_gauss_wts(sht_data, wts);
2551 if (i==0) wts[0] = 0; // mark as invalid.
2552}
2553
2554
2555/** \name Point evaluation of Spherical Harmonics
2556Evaluate at a given point (\f$cos(\theta)\f$ and \f$\phi\f$) a spherical harmonic representation.
2557*/
2558//@{
2559/// \see SH_to_point for argument description
2560void shtns_sh_to_point_(double *spat, cplx *Qlm, double *cost, double *phi)
2561{
2562 *spat = SH_to_point(sht_data, Qlm, *cost, *phi);
2563}
2564
2565/// \see SHqst_to_point for argument description
2566void shtns_qst_to_point_(double *vr, double *vt, double *vp,
2567 cplx *Qlm, cplx *Slm, cplx *Tlm, double *cost, double *phi)
2568{
2569 SHqst_to_point(sht_data, Qlm, Slm, Tlm, *cost, *phi, vr, vt, vp);
2570}
2571//@}
2572
2573
2574//@}
2575
2576#endif
Note: See TracBrowser for help on using the repository browser.