| 1 | /*
|
|---|
| 2 | * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
|
|---|
| 3 | * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
|
|---|
| 4 | *
|
|---|
| 5 | * nathanael.schaeffer@ujf-grenoble.fr
|
|---|
| 6 | *
|
|---|
| 7 | * This software is governed by the CeCILL license under French law and
|
|---|
| 8 | * abiding by the rules of distribution of free software. You can use,
|
|---|
| 9 | * modify and/or redistribute the software under the terms of the CeCILL
|
|---|
| 10 | * license as circulated by CEA, CNRS and INRIA at the following URL
|
|---|
| 11 | * "http://www.cecill.info".
|
|---|
| 12 | *
|
|---|
| 13 | * The fact that you are presently reading this means that you have had
|
|---|
| 14 | * knowledge of the CeCILL license and that you accept its terms.
|
|---|
| 15 | *
|
|---|
| 16 | */
|
|---|
| 17 |
|
|---|
| 18 | /********************************************************************
|
|---|
| 19 | * SHTns : Spherical Harmonic Transform for numerical simulations. *
|
|---|
| 20 | * written by Nathanael Schaeffer / CNRS *
|
|---|
| 21 | ********************************************************************/
|
|---|
| 22 |
|
|---|
| 23 | /** \internal \file SHT.c
|
|---|
| 24 | * \brief main source file for SHTns.
|
|---|
| 25 | * This files contains initialization code and also some partial transforms (point or latitudinal evaluations)
|
|---|
| 26 | */
|
|---|
| 27 |
|
|---|
| 28 | #include <stdio.h>
|
|---|
| 29 | #include <string.h>
|
|---|
| 30 | // global variables definitions
|
|---|
| 31 | #include "sht_private.h"
|
|---|
| 32 |
|
|---|
| 33 | // cycle counter from FFTW
|
|---|
| 34 | #include "fftw3/cycle.h"
|
|---|
| 35 |
|
|---|
| 36 | // chained list of sht_setup : start with NULL
|
|---|
| 37 | shtns_cfg sht_data = NULL;
|
|---|
| 38 | #ifdef _OPENMP
|
|---|
| 39 | int omp_threads = 1; // multi-thread disabled by default.
|
|---|
| 40 | #if HAVE_LIBFFTW3_OMP
|
|---|
| 41 | #define OMP_FFTW
|
|---|
| 42 | #endif
|
|---|
| 43 | #else
|
|---|
| 44 | #define omp_threads 1
|
|---|
| 45 | #endif
|
|---|
| 46 |
|
|---|
| 47 | static int verbose = 0; // runtime verbosity control: 0 no output, 1 output, 2 debug (if compiled in)
|
|---|
| 48 | void shtns_verbose(int v) {
|
|---|
| 49 | verbose = v;
|
|---|
| 50 | }
|
|---|
| 51 |
|
|---|
| 52 |
|
|---|
| 53 | /// \internal Abort program with error message.
|
|---|
| 54 | static void shtns_runerr(const char * error_text)
|
|---|
| 55 | {
|
|---|
| 56 | printf("*** [" PACKAGE_NAME "] Run-time error : %s\n",error_text);
|
|---|
| 57 | exit(1);
|
|---|
| 58 | }
|
|---|
| 59 |
|
|---|
| 60 | /* PUBLIC useful functions */
|
|---|
| 61 |
|
|---|
| 62 | /// returns the l=0, m=0 SH coefficient corresponding to a uniform value of 1.
|
|---|
| 63 | double sh00_1(shtns_cfg shtns) {
|
|---|
| 64 | return shtns->Y00_1;
|
|---|
| 65 | }
|
|---|
| 66 | /// returns the l=1, m=0 SH coefficient corresponding to cos(theta).
|
|---|
| 67 | double sh10_ct(shtns_cfg shtns) {
|
|---|
| 68 | return shtns->Y10_ct;
|
|---|
| 69 | }
|
|---|
| 70 | /// returns the l=1, m=1 SH coefficient corresponding to sin(theta).cos(phi).
|
|---|
| 71 | double sh11_st(shtns_cfg shtns) {
|
|---|
| 72 | return shtns->Y11_st;
|
|---|
| 73 | }
|
|---|
| 74 | /// returns the l,m SH coefficient corresponding to unit energy.
|
|---|
| 75 | double shlm_e1(shtns_cfg shtns, int l, int m) {
|
|---|
| 76 | double x = shtns->Y00_1/sqrt(4.*M_PI);
|
|---|
| 77 | if (SHT_NORM == sht_schmidt) x *= sqrt(2*l+1);
|
|---|
| 78 | if ((m!=0)&&((shtns->norm & SHT_REAL_NORM)==0)) x *= sqrt(0.5);
|
|---|
| 79 | return(x);
|
|---|
| 80 | }
|
|---|
| 81 |
|
|---|
| 82 | /// \code return (mmax+1)*(lmax+1) - mres*(mmax*(mmax+1))/2; \endcode */
|
|---|
| 83 | /// \ingroup init
|
|---|
| 84 | long nlm_calc(long lmax, long mmax, long mres)
|
|---|
| 85 | {
|
|---|
| 86 | if (mmax*mres > lmax) mmax = lmax/mres;
|
|---|
| 87 | return (mmax+1)*(lmax+1) - ((mmax*mres)*(mmax+1))/2; // this is wrong if lmax < mmax*mres
|
|---|
| 88 | }
|
|---|
| 89 |
|
|---|
| 90 |
|
|---|
| 91 | /* LEGENDRE FUNCTIONS */
|
|---|
| 92 | #include "sht_legendre.c"
|
|---|
| 93 |
|
|---|
| 94 |
|
|---|
| 95 | /* SHT FUNCTIONS */
|
|---|
| 96 | #include "sht_func.c"
|
|---|
| 97 |
|
|---|
| 98 | #include "sht_com.c"
|
|---|
| 99 |
|
|---|
| 100 | /*
|
|---|
| 101 | INTERNAL INITIALIZATION FUNCTIONS
|
|---|
| 102 | */
|
|---|
| 103 |
|
|---|
| 104 | // sht algorithms (hyb, fly1, ...)
|
|---|
| 105 | enum sht_algos { SHT_DCT, SHT_MEM, SHT_SV,
|
|---|
| 106 | SHT_FLY1, SHT_FLY2, SHT_FLY3, SHT_FLY4, SHT_FLY6, SHT_FLY8,
|
|---|
| 107 | SHT_OMP1, SHT_OMP2, SHT_OMP3, SHT_OMP4, SHT_OMP6, SHT_OMP8,
|
|---|
| 108 | SHT_NALG };
|
|---|
| 109 |
|
|---|
| 110 | char* sht_name[SHT_NALG] = {"dct", "mem", "s+v", "fly1", "fly2", "fly3", "fly4", "fly6", "fly8", "omp1", "omp2", "omp3", "omp4", "omp6", "omp8" };
|
|---|
| 111 | char* sht_type[SHT_NTYP] = {"syn", "ana", "vsy", "van", "gsp", "gto", "v3s", "v3a" };
|
|---|
| 112 | char* sht_var[SHT_NVAR] = {"std", "ltr", "m" };
|
|---|
| 113 | int sht_npar[SHT_NTYP] = {2, 2, 4, 4, 3, 3, 6, 6};
|
|---|
| 114 |
|
|---|
| 115 | #ifdef SHTNS_MEM
|
|---|
| 116 | extern void* fmem[SHT_NTYP];
|
|---|
| 117 | extern void* fmem_l[SHT_NTYP];
|
|---|
| 118 | extern void* fmem_m0[SHT_NTYP];
|
|---|
| 119 | extern void* fmem_m0l[SHT_NTYP];
|
|---|
| 120 | #endif
|
|---|
| 121 | #ifdef SHTNS_DCT
|
|---|
| 122 | extern void* fdct[SHT_NTYP];
|
|---|
| 123 | extern void* fdct_l[SHT_NTYP];
|
|---|
| 124 | extern void* fdct_m0[SHT_NTYP];
|
|---|
| 125 | extern void* fdct_m0l[SHT_NTYP];
|
|---|
| 126 | #endif
|
|---|
| 127 | extern void* ffly[6][SHT_NTYP];
|
|---|
| 128 | extern void* ffly_m[6][SHT_NTYP];
|
|---|
| 129 | extern void* ffly_m0[6][SHT_NTYP];
|
|---|
| 130 | #ifdef _OPENMP
|
|---|
| 131 | extern void* fomp[6][SHT_NTYP];
|
|---|
| 132 | #endif
|
|---|
| 133 |
|
|---|
| 134 | // big array holding all sht functions, variants and algorithms
|
|---|
| 135 | void* sht_func[SHT_NVAR][SHT_NALG][SHT_NTYP];
|
|---|
| 136 |
|
|---|
| 137 | /// \internal use on-the-fly alogorithm (guess without measuring)
|
|---|
| 138 | static void set_sht_fly(shtns_cfg shtns, int typ_start)
|
|---|
| 139 | {
|
|---|
| 140 | int algo = SHT_FLY2;
|
|---|
| 141 | if ((shtns->nthreads > 1) && (sht_func[0][SHT_OMP2][typ_start])) algo = SHT_OMP2;
|
|---|
| 142 | for (int it=typ_start; it<SHT_NTYP; it++) {
|
|---|
| 143 | for (int v=0; v<SHT_NVAR; v++)
|
|---|
| 144 | shtns->ftable[v][it] = sht_func[v][algo][it];
|
|---|
| 145 | }
|
|---|
| 146 | }
|
|---|
| 147 |
|
|---|
| 148 | #ifdef SHTNS_MEM
|
|---|
| 149 | /// \internal choose memory algorithm everywhere.
|
|---|
| 150 | static void set_sht_mem(shtns_cfg shtns) {
|
|---|
| 151 | for (int it=0; it<SHT_NTYP; it++) {
|
|---|
| 152 | for (int v=0; v<SHT_NVAR; v++)
|
|---|
| 153 | shtns->ftable[v][it] = sht_func[v][SHT_MEM][it];
|
|---|
| 154 | shtns->ftable[SHT_M][it] = sht_func[SHT_M][SHT_FLY2][it]; // there is no "mem" algo for SHT_M
|
|---|
| 155 | }
|
|---|
| 156 | }
|
|---|
| 157 | #endif
|
|---|
| 158 |
|
|---|
| 159 | /// \internal copy all algos to sht_func array (should be called by set_grid before choosing variants).
|
|---|
| 160 | /// if nphi is 1, axisymmetric algorithms are used.
|
|---|
| 161 | static void init_sht_array_func(shtns_cfg shtns)
|
|---|
| 162 | {
|
|---|
| 163 | int it, j;
|
|---|
| 164 | int alg_lim = SHT_FLY8;
|
|---|
| 165 |
|
|---|
| 166 | if (shtns->nlat_2 < 8*VSIZE2) { // limit available on-the-fly algorithm to avoid overflow (and segfaults).
|
|---|
| 167 | it = shtns->nlat_2 / VSIZE2;
|
|---|
| 168 | switch(it) {
|
|---|
| 169 | case 0 : alg_lim = SHT_FLY1-1; break;
|
|---|
| 170 | case 1 : alg_lim = SHT_FLY1; break;
|
|---|
| 171 | case 2 : alg_lim = SHT_FLY2; break;
|
|---|
| 172 | case 3 : alg_lim = SHT_FLY3; break;
|
|---|
| 173 | case 4 : ;
|
|---|
| 174 | case 5 : alg_lim = SHT_FLY4; break;
|
|---|
| 175 | default : alg_lim = SHT_FLY6;
|
|---|
| 176 | }
|
|---|
| 177 | }
|
|---|
| 178 | alg_lim -= SHT_FLY1;
|
|---|
| 179 |
|
|---|
| 180 | memset(sht_func, 0, SHT_NVAR*SHT_NTYP*SHT_NALG*sizeof(void*) ); // zero out.
|
|---|
| 181 | sht_func[SHT_STD][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2;
|
|---|
| 182 | sht_func[SHT_LTR][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2l;
|
|---|
| 183 | sht_func[SHT_STD][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2;
|
|---|
| 184 | sht_func[SHT_LTR][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2l;
|
|---|
| 185 | sht_func[SHT_M][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2ml;
|
|---|
| 186 | sht_func[SHT_M][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2ml;
|
|---|
| 187 |
|
|---|
| 188 | if (shtns->nphi==1) { // axisymmetric transform requested.
|
|---|
| 189 | for (int j=0; j<=alg_lim; j++) {
|
|---|
| 190 | memcpy(sht_func[SHT_STD][SHT_FLY1 + j], &ffly_m0[j], sizeof(void*)*SHT_NTYP);
|
|---|
| 191 | memcpy(sht_func[SHT_LTR][SHT_FLY1 + j], &ffly_m0[j], sizeof(void*)*SHT_NTYP);
|
|---|
| 192 | }
|
|---|
| 193 | #ifdef SHTNS_DCT
|
|---|
| 194 | memcpy(sht_func[SHT_STD][SHT_DCT], &fdct_m0, sizeof(void*)*SHT_NTYP);
|
|---|
| 195 | memcpy(sht_func[SHT_LTR][SHT_DCT], &fdct_m0l, sizeof(void*)*SHT_NTYP);
|
|---|
| 196 | #endif
|
|---|
| 197 | #ifdef SHTNS_MEM
|
|---|
| 198 | memcpy(sht_func[SHT_STD][SHT_MEM], &fmem_m0, sizeof(void*)*SHT_NTYP);
|
|---|
| 199 | memcpy(sht_func[SHT_LTR][SHT_MEM], &fmem_m0l, sizeof(void*)*SHT_NTYP);
|
|---|
| 200 | #endif
|
|---|
| 201 | } else {
|
|---|
| 202 | for (int j=0; j<=alg_lim; j++) {
|
|---|
| 203 | memcpy(sht_func[SHT_STD][SHT_FLY1 + j], &ffly[j], sizeof(void*)*SHT_NTYP);
|
|---|
| 204 | memcpy(sht_func[SHT_LTR][SHT_FLY1 + j], &ffly[j], sizeof(void*)*SHT_NTYP);
|
|---|
| 205 | memcpy(sht_func[SHT_M][SHT_FLY1 + j], &ffly_m[j], sizeof(void*)*SHT_NTYP);
|
|---|
| 206 | #ifdef _OPENMP
|
|---|
| 207 | memcpy(sht_func[SHT_STD][SHT_OMP1 + j], &fomp[j], sizeof(void*)*SHT_NTYP);
|
|---|
| 208 | memcpy(sht_func[SHT_LTR][SHT_OMP1 + j], &fomp[j], sizeof(void*)*SHT_NTYP);
|
|---|
| 209 | memcpy(sht_func[SHT_M][SHT_OMP1 + j], &ffly_m[j], sizeof(void*)*SHT_NTYP); // no omp algo for SHT_M, use fly instead
|
|---|
| 210 | #endif
|
|---|
| 211 | }
|
|---|
| 212 | #ifdef SHTNS_DCT
|
|---|
| 213 | memcpy(sht_func[SHT_STD][SHT_DCT], &fdct, sizeof(void*)*SHT_NTYP);
|
|---|
| 214 | memcpy(sht_func[SHT_LTR][SHT_DCT], &fdct_l, sizeof(void*)*SHT_NTYP);
|
|---|
| 215 | #endif
|
|---|
| 216 | #ifdef SHTNS_MEM
|
|---|
| 217 | memcpy(sht_func[SHT_STD][SHT_MEM], &fmem, sizeof(void*)*SHT_NTYP);
|
|---|
| 218 | memcpy(sht_func[SHT_LTR][SHT_MEM], &fmem_l, sizeof(void*)*SHT_NTYP);
|
|---|
| 219 | #endif
|
|---|
| 220 | }
|
|---|
| 221 |
|
|---|
| 222 | #ifdef SHTNS_MEM
|
|---|
| 223 | set_sht_mem(shtns); // default transform is MEM
|
|---|
| 224 | #else
|
|---|
| 225 | set_sht_fly(shtns, 0);
|
|---|
| 226 | #endif
|
|---|
| 227 | }
|
|---|
| 228 |
|
|---|
| 229 |
|
|---|
| 230 | /// \internal return the smallest power of 2 larger than n.
|
|---|
| 231 | static int next_power_of_2(int n)
|
|---|
| 232 | {
|
|---|
| 233 | int f = 1;
|
|---|
| 234 | if ( (n<=0) || (n>(1<<(sizeof(int)*8-2))) ) return 0;
|
|---|
| 235 | while (f<n) f*=2;
|
|---|
| 236 | return f;
|
|---|
| 237 | }
|
|---|
| 238 |
|
|---|
| 239 | /// \internal find the closest integer that is larger than n and that contains only factors up to fmax.
|
|---|
| 240 | /// fmax is 7 for optimal FFTW fourier transforms.
|
|---|
| 241 | /// return only even integers for n>fmax.
|
|---|
| 242 | static int fft_int(int n, int fmax)
|
|---|
| 243 | {
|
|---|
| 244 | int k,f;
|
|---|
| 245 |
|
|---|
| 246 | if (n<=fmax) return n;
|
|---|
| 247 | if (fmax<2) return 0;
|
|---|
| 248 | if (fmax==2) return next_power_of_2(n);
|
|---|
| 249 |
|
|---|
| 250 | n -= 2-(n&1); // only even n
|
|---|
| 251 | do {
|
|---|
| 252 | n+=2; f=2;
|
|---|
| 253 | while ((2*f <= n) && ((n&f)==0)) f *= 2; // no divisions for factor 2.
|
|---|
| 254 | k=3;
|
|---|
| 255 | while ((k<=fmax) && (k*f <= n)) {
|
|---|
| 256 | while ((k*f <= n) && (n%(k*f)==0)) f *= k;
|
|---|
| 257 | k+=2;
|
|---|
| 258 | }
|
|---|
| 259 | } while (f != n);
|
|---|
| 260 |
|
|---|
| 261 | k = next_power_of_2(n); // what is the closest power of 2 ?
|
|---|
| 262 | if ((k-n)*33 < n) return k; // rather choose power of 2 if not too far (3%)
|
|---|
| 263 |
|
|---|
| 264 | return n;
|
|---|
| 265 | }
|
|---|
| 266 |
|
|---|
| 267 |
|
|---|
| 268 | /// \internal returns an aproximation of the memory usage in mega bytes for the scalar matrices.
|
|---|
| 269 | /// \ingroup init
|
|---|
| 270 | static double sht_mem_size(int lmax, int mmax, int mres, int nlat)
|
|---|
| 271 | {
|
|---|
| 272 | double s = 1./(1024*1024);
|
|---|
| 273 | s *= ((nlat+1)/2) * sizeof(double) * nlm_calc(lmax, mmax, mres);
|
|---|
| 274 | return s;
|
|---|
| 275 | }
|
|---|
| 276 |
|
|---|
| 277 | /// \internal return the number of shtns config that contain a reference to the memory location *pp.
|
|---|
| 278 | static int ref_count(shtns_cfg shtns, void* pp)
|
|---|
| 279 | {
|
|---|
| 280 | shtns_cfg s2 = sht_data;
|
|---|
| 281 | void* p;
|
|---|
| 282 | long int nref, offset;
|
|---|
| 283 |
|
|---|
| 284 | if ((pp==NULL) || (shtns==NULL)) return -1; // error.
|
|---|
| 285 |
|
|---|
| 286 | p = *(void**)pp; // the pointer to memory location that we want to free
|
|---|
| 287 | if (p == NULL) return 0; // nothing to do.
|
|---|
| 288 |
|
|---|
| 289 | offset = (char*)pp - (char*)shtns; // relative location in shtns_info structure.
|
|---|
| 290 | nref = 0; // reference count.
|
|---|
| 291 | while (s2 != NULL) { // scan all configs.
|
|---|
| 292 | if ( *(void**)(((char*)s2) + offset) == p ) nref++;
|
|---|
| 293 | s2 = s2->next;
|
|---|
| 294 | }
|
|---|
| 295 | return nref; // will be >= 1 (as shtns is included in search)
|
|---|
| 296 | }
|
|---|
| 297 |
|
|---|
| 298 | /// \internal check if the memory location *pp is referenced by another sht config before freeing it.
|
|---|
| 299 | /// returns the number of other references, or -1 on error. If >=1, the memory location could not be freed.
|
|---|
| 300 | /// If the return value is 0, the ressource has been freed.
|
|---|
| 301 | static int free_unused(shtns_cfg shtns, void* pp)
|
|---|
| 302 | {
|
|---|
| 303 | int n = ref_count(shtns, pp); // howmany shtns config do reference this memory location ?
|
|---|
| 304 | if (n <= 0) return n; // nothing to free.
|
|---|
| 305 | if (n == 1) { // no other reference found...
|
|---|
| 306 | void** ap = (void**) pp;
|
|---|
| 307 | free(*ap); // ...so we can free it...
|
|---|
| 308 | *ap = NULL; // ...and mark as unaloccated.
|
|---|
| 309 | }
|
|---|
| 310 | return (n-1);
|
|---|
| 311 | }
|
|---|
| 312 |
|
|---|
| 313 | #ifdef SHTNS_MEM
|
|---|
| 314 | /// \internal Free matrices if on-the-fly has been selected.
|
|---|
| 315 | static void free_unused_matrices(shtns_cfg shtns)
|
|---|
| 316 | {
|
|---|
| 317 | long int marray_size = (MMAX+1)*sizeof(double) + (MIN_ALIGNMENT-1);
|
|---|
| 318 | int count[SHT_NTYP];
|
|---|
| 319 | int it, iv, ia, iv2;
|
|---|
| 320 |
|
|---|
| 321 | for (it=0; it<SHT_NTYP; it++) {
|
|---|
| 322 | count[it] = 0;
|
|---|
| 323 | for (iv=0; iv<SHT_NVAR; iv++) {
|
|---|
| 324 | void* fptr = shtns->ftable[iv][it];
|
|---|
| 325 | if (fptr != NULL) {
|
|---|
| 326 | for (ia=0; ia<=SHT_MEM; ia++) // count occurences to mem algo
|
|---|
| 327 | for (iv2=0; iv2<SHT_NVAR; iv2++)
|
|---|
| 328 | if (sht_func[iv2][ia][it] == fptr) count[it]++;
|
|---|
| 329 | }
|
|---|
| 330 | }
|
|---|
| 331 | #if SHT_VERBOSE > 1
|
|---|
| 332 | if (verbose>1) printf(" %d ",count[it]);
|
|---|
| 333 | #endif
|
|---|
| 334 | }
|
|---|
| 335 |
|
|---|
| 336 | if (shtns->dylm == NULL) { // no vector transform : do not try to free
|
|---|
| 337 | count[SHT_TYP_VAN] = -1; count[SHT_TYP_VSY] = -1;
|
|---|
| 338 | }
|
|---|
| 339 |
|
|---|
| 340 | if (count[SHT_TYP_3AN] == 0) { // analysis may be freed.
|
|---|
| 341 | if (count[SHT_TYP_VAN] == 0) {
|
|---|
| 342 | PRINT_VERB("freeing vector analysis matrix\n");
|
|---|
| 343 | free_unused(shtns, &shtns->dzlm);
|
|---|
| 344 | }
|
|---|
| 345 | if (count[SHT_TYP_SAN] == 0) {
|
|---|
| 346 | PRINT_VERB("freeing scalar analysis matrix\n");
|
|---|
| 347 | free_unused(shtns, &shtns->zlm);
|
|---|
| 348 | }
|
|---|
| 349 | } else if (shtns->mmax > 0) { // scalar may be reduced to m=0
|
|---|
| 350 | if ((count[SHT_TYP_SAN] == 0) && (ref_count(shtns, &shtns->zlm) == 1)) {
|
|---|
| 351 | PRINT_VERB("keeping scalar analysis matrix up to m=0\n");
|
|---|
| 352 | shtns->zlm = realloc(shtns->zlm, ((LMAX+1)*NLAT_2 +3)*sizeof(double) + marray_size );
|
|---|
| 353 | }
|
|---|
| 354 | }
|
|---|
| 355 | if (count[SHT_TYP_3SY] == 0) { // synthesis may be freed.
|
|---|
| 356 | if (count[SHT_TYP_VSY] + count[SHT_TYP_GSP] + count[SHT_TYP_GTO] == 0) {
|
|---|
| 357 | PRINT_VERB("freeing vector synthesis matrix\n");
|
|---|
| 358 | free_unused(shtns, &shtns->dylm);
|
|---|
| 359 | }
|
|---|
| 360 | if (count[SHT_TYP_SSY] == 0) {
|
|---|
| 361 | PRINT_VERB("freeing scalar synthesis matrix\n");
|
|---|
| 362 | free_unused(shtns, &shtns->ylm);
|
|---|
| 363 | }
|
|---|
| 364 | } else if (shtns->mmax > 0) { // scalar may be reduced to m=0
|
|---|
| 365 | if ((count[SHT_TYP_SSY] == 0) && (ref_count(shtns, &shtns->ylm) == 1)) {
|
|---|
| 366 | PRINT_VERB("keeping scalar synthesis matrix up to m=0\n");
|
|---|
| 367 | shtns->ylm = realloc(shtns->ylm, (LMAX+2)*NLAT_2*sizeof(double) + marray_size );
|
|---|
| 368 | }
|
|---|
| 369 | }
|
|---|
| 370 | }
|
|---|
| 371 | #endif
|
|---|
| 372 |
|
|---|
| 373 | /// \internal allocate arrays for SHT related to a given grid.
|
|---|
| 374 | static void alloc_SHTarrays(shtns_cfg shtns, int on_the_fly, int vect, int analys)
|
|---|
| 375 | {
|
|---|
| 376 | long int im, l0;
|
|---|
| 377 | long int size, marray_size, lstride;
|
|---|
| 378 |
|
|---|
| 379 | im = (VSIZE2 > 2) ? VSIZE2 : 2;
|
|---|
| 380 | l0 = ((NLAT+im-1)/im)*im; // align on vector
|
|---|
| 381 | shtns->ct = (double *) VMALLOC( sizeof(double) * l0*3 ); /// ct[] (including st and st_1)
|
|---|
| 382 | shtns->st = shtns->ct + l0; shtns->st_1 = shtns->ct + 2*l0;
|
|---|
| 383 |
|
|---|
| 384 | shtns->ylm = NULL; shtns->dylm = NULL; // synthesis
|
|---|
| 385 | shtns->zlm = NULL; shtns->dzlm = NULL; // analysis
|
|---|
| 386 |
|
|---|
| 387 | if (on_the_fly == 0) { // Allocate legendre functions lookup tables.
|
|---|
| 388 | marray_size = (MMAX+1)*sizeof(double*) + (MIN_ALIGNMENT-1); // for sse2 alignement
|
|---|
| 389 |
|
|---|
| 390 | /* ylm */
|
|---|
| 391 | lstride = (LMAX+1); lstride += (lstride&1); // even stride.
|
|---|
| 392 | size = sizeof(double) * ((NLM-(LMAX+1)+lstride)*NLAT_2);
|
|---|
| 393 | if (MMAX == 0) size += 3*sizeof(double); // some overflow needed.
|
|---|
| 394 | shtns->ylm = (double **) malloc( marray_size + size );
|
|---|
| 395 | shtns->ylm[0] = (double *) PTR_ALIGN( shtns->ylm + (MMAX+1) );
|
|---|
| 396 | if (MMAX>0) shtns->ylm[1] = shtns->ylm[0] + NLAT_2*lstride;
|
|---|
| 397 | for (im=1; im<MMAX; im++) shtns->ylm[im+1] = shtns->ylm[im] + NLAT_2*(LMAX+1-im*MRES);
|
|---|
| 398 | /* dylm */
|
|---|
| 399 | if (vect) {
|
|---|
| 400 | lstride = LMAX; lstride += (lstride&1); // even stride.
|
|---|
| 401 | size = sizeof(struct DtDp) * ((NLM-(LMAX+1) +lstride/2)*NLAT_2);
|
|---|
| 402 | if (MMAX == 0) size += 2*sizeof(double); // some overflow needed.
|
|---|
| 403 | shtns->dylm = (struct DtDp **) malloc( marray_size + size );
|
|---|
| 404 | shtns->dylm[0] = (struct DtDp *) PTR_ALIGN( shtns->dylm + (MMAX+1) );
|
|---|
| 405 | if (MMAX>0) shtns->dylm[1] = shtns->dylm[0] + (lstride/2)*NLAT_2; // phi-derivative is zero for m=0
|
|---|
| 406 | for (im=1; im<MMAX; im++) shtns->dylm[im+1] = shtns->dylm[im] + NLAT_2*(LMAX+1-im*MRES);
|
|---|
| 407 | }
|
|---|
| 408 | if (analys) {
|
|---|
| 409 | /* zlm */
|
|---|
| 410 | size = sizeof(double) * (NLM*NLAT_2 + (NLAT_2 & 1));
|
|---|
| 411 | if (MMAX == 0) size += 2*(NLAT_2 & 1)*((LMAX+1) & 1) * sizeof(double);
|
|---|
| 412 | shtns->zlm = (double **) malloc( marray_size + size );
|
|---|
| 413 | shtns->zlm[0] = (double *) PTR_ALIGN( shtns->zlm + (MMAX+1) );
|
|---|
| 414 | if (MMAX>0) shtns->zlm[1] = shtns->zlm[0] + NLAT_2*(LMAX+1) + (NLAT_2&1);
|
|---|
| 415 | for (im=1; im<MMAX; im++) shtns->zlm[im+1] = shtns->zlm[im] + NLAT_2*(LMAX+1-im*MRES);
|
|---|
| 416 | /* dzlm */
|
|---|
| 417 | if (vect) {
|
|---|
| 418 | size = sizeof(struct DtDp)* (NLM-1)*NLAT_2; // remove l=0
|
|---|
| 419 | shtns->dzlm = (struct DtDp **) malloc( marray_size + size );
|
|---|
| 420 | shtns->dzlm[0] = (struct DtDp *) PTR_ALIGN( shtns->dzlm + (MMAX+1) );
|
|---|
| 421 | if (MMAX>0) shtns->dzlm[1] = shtns->dzlm[0] + NLAT_2*(LMAX);
|
|---|
| 422 | for (im=1; im<MMAX; im++) shtns->dzlm[im+1] = shtns->dzlm[im] + NLAT_2*(LMAX+1-im*MRES);
|
|---|
| 423 | }
|
|---|
| 424 | }
|
|---|
| 425 | }
|
|---|
| 426 | #if SHT_VERBOSE > 1
|
|---|
| 427 | if (verbose>1) printf(" Memory used for Ylm and Zlm matrices = %.3f Mb x2\n",3.0*sizeof(double)*NLM*NLAT_2/(1024.*1024.));
|
|---|
| 428 | #endif
|
|---|
| 429 | }
|
|---|
| 430 |
|
|---|
| 431 |
|
|---|
| 432 | /// \internal free arrays allocated by init_SH_dct
|
|---|
| 433 | static void free_SH_dct(shtns_cfg shtns)
|
|---|
| 434 | {
|
|---|
| 435 | if (shtns->zlm_dct0 == NULL) return;
|
|---|
| 436 |
|
|---|
| 437 | if (ref_count(shtns, &shtns->dzlm_dct0) == 1) VFREE(shtns->dzlm_dct0);
|
|---|
| 438 | if (ref_count(shtns, &shtns->zlm_dct0) == 1) VFREE(shtns->zlm_dct0);
|
|---|
| 439 | shtns->dzlm_dct0 = NULL; shtns->zlm_dct0 = NULL;
|
|---|
| 440 |
|
|---|
| 441 | free_unused(shtns, &shtns->dykm_dct);
|
|---|
| 442 | free_unused(shtns, &shtns->ykm_dct);
|
|---|
| 443 |
|
|---|
| 444 | if (ref_count(shtns, &shtns->idct) == 1) fftw_destroy_plan(shtns->idct); // free unused dct plans
|
|---|
| 445 | if (ref_count(shtns, &shtns->dct_m0) == 1) fftw_destroy_plan(shtns->dct_m0);
|
|---|
| 446 | shtns->idct = NULL; shtns->dct_m0 = NULL;
|
|---|
| 447 | }
|
|---|
| 448 |
|
|---|
| 449 | /// \internal free arrays allocated by alloc_SHTarrays.
|
|---|
| 450 | static void free_SHTarrays(shtns_cfg shtns)
|
|---|
| 451 | {
|
|---|
| 452 | free_SH_dct(shtns);
|
|---|
| 453 | free_unused(shtns, &shtns->ylm);
|
|---|
| 454 | free_unused(shtns, &shtns->dylm);
|
|---|
| 455 | free_unused(shtns, &shtns->zlm);
|
|---|
| 456 | free_unused(shtns, &shtns->dzlm);
|
|---|
| 457 |
|
|---|
| 458 | if (ref_count(shtns, &shtns->ct) == 1) VFREE(shtns->ct);
|
|---|
| 459 | shtns->ct = NULL; shtns->st = NULL;
|
|---|
| 460 |
|
|---|
| 461 | if (ref_count(shtns, &shtns->fft) == 1) fftw_destroy_plan(shtns->fft);
|
|---|
| 462 | if (ref_count(shtns, &shtns->ifft) == 1) fftw_destroy_plan(shtns->ifft);
|
|---|
| 463 | shtns->fft = NULL; shtns->ifft = NULL; shtns->ncplx_fft = -1; // no fft
|
|---|
| 464 | }
|
|---|
| 465 |
|
|---|
| 466 | #ifndef HAVE_FFTW_COST
|
|---|
| 467 | // substitute undefined symbol in mkl and fftw older than 3.3
|
|---|
| 468 | #define fftw_cost(a) 0.0
|
|---|
| 469 | #endif
|
|---|
| 470 |
|
|---|
| 471 | /// \internal initialize FFTs using FFTW.
|
|---|
| 472 | /// \param[in] layout defines the spatial layout (see \ref spat).
|
|---|
| 473 | /// \param[in] on_the_fly is one, if only on-the-fly transform are considered.
|
|---|
| 474 | /// returns the number of double to be allocated for a spatial field.
|
|---|
| 475 | static void planFFT(shtns_cfg shtns, int layout, int on_the_fly)
|
|---|
| 476 | {
|
|---|
| 477 | double cost_fft_ip, cost_fft_oop, cost_ifft_ip, cost_ifft_oop;
|
|---|
| 478 | cplx *ShF;
|
|---|
| 479 | double *Sh;
|
|---|
| 480 | fftw_plan fft2, ifft2, fft, ifft;
|
|---|
| 481 | int nfft, ncplx, nreal;
|
|---|
| 482 | int theta_inc, phi_inc, phi_embed;
|
|---|
| 483 | #ifdef HAVE_FFTW_COST
|
|---|
| 484 | int in_place = 1; // try to use in-place real fft.
|
|---|
| 485 | #else
|
|---|
| 486 | int in_place = 0; // do not try to use in-place real fft if no timing data available.
|
|---|
| 487 | #endif
|
|---|
| 488 |
|
|---|
| 489 | if (NPHI <= 2*MMAX) shtns_runerr("the sampling condition Nphi > 2*Mmax is not met.");
|
|---|
| 490 |
|
|---|
| 491 | #ifdef OMP_FFTW
|
|---|
| 492 | if ((shtns->fftw_plan_mode & (FFTW_EXHAUSTIVE | FFTW_PATIENT)) && (omp_threads > 1)) {
|
|---|
| 493 | shtns->fftw_plan_mode = FFTW_PATIENT;
|
|---|
| 494 | fftw_plan_with_nthreads(omp_threads);
|
|---|
| 495 | } else fftw_plan_with_nthreads(shtns->nthreads);
|
|---|
| 496 | #endif
|
|---|
| 497 |
|
|---|
| 498 | shtns->k_stride_a = 1; shtns->m_stride_a = NLAT; // default strides
|
|---|
| 499 |
|
|---|
| 500 | shtns->fft = NULL; shtns->ifft = NULL;
|
|---|
| 501 | shtns->dct_m0 = NULL; shtns->idct = NULL; // set dct plans to uninitialized.
|
|---|
| 502 |
|
|---|
| 503 | shtns->nspat = NPHI * NLAT; // default spatial size
|
|---|
| 504 |
|
|---|
| 505 | if (NPHI==1) // no FFT needed.
|
|---|
| 506 | {
|
|---|
| 507 | shtns->fftc_mode = -1; // no FFT
|
|---|
| 508 | #if SHT_VERBOSE > 0
|
|---|
| 509 | if (verbose) printf(" => no fft : Mmax=0, Nphi=1, Nlat=%d\n",NLAT);
|
|---|
| 510 | #endif
|
|---|
| 511 | shtns->ncplx_fft = -1; // no fft.
|
|---|
| 512 | return;
|
|---|
| 513 | }
|
|---|
| 514 |
|
|---|
| 515 | /* NPHI > 1 */
|
|---|
| 516 | theta_inc=1; phi_inc=NLAT; phi_embed=2*(NPHI/2+1); // SHT_NATIVE_LAYOUT is the default.
|
|---|
| 517 | if (layout & SHT_THETA_CONTIGUOUS) { theta_inc=1; phi_inc=NLAT; phi_embed=NPHI; }
|
|---|
| 518 | if (layout & SHT_PHI_CONTIGUOUS) { phi_inc=1; theta_inc=NPHI; phi_embed=NPHI; }
|
|---|
| 519 | nfft = NPHI;
|
|---|
| 520 | ncplx = NPHI/2 +1;
|
|---|
| 521 | nreal = phi_embed;
|
|---|
| 522 | if ((theta_inc != 1)||(phi_inc != NLAT)||(nreal < 2*ncplx)) in_place = 0; // we need to do the fft out-of-place.
|
|---|
| 523 |
|
|---|
| 524 | #if SHT_VERBOSE > 0
|
|---|
| 525 | if (verbose) {
|
|---|
| 526 | printf(" => using FFTW : Mmax=%d, Nphi=%d, Nlat=%d (data layout : phi_inc=%d, theta_inc=%d, phi_embed=%d)\n",MMAX,NPHI,NLAT,phi_inc,theta_inc,phi_embed);
|
|---|
| 527 | if (NPHI <= (SHT_NL_ORDER+1)*MMAX) printf(" !! Warning : anti-aliasing condition Nphi > %d*Mmax is not met !\n", SHT_NL_ORDER+1);
|
|---|
| 528 | if (NPHI != fft_int(NPHI,7)) printf(" !! Warning : Nphi is not optimal for FFTW !\n");
|
|---|
| 529 | }
|
|---|
| 530 | #endif
|
|---|
| 531 |
|
|---|
| 532 | // Allocate dummy Spatial Fields.
|
|---|
| 533 | ShF = (cplx *) VMALLOC(ncplx * NLAT * sizeof(cplx));
|
|---|
| 534 | Sh = (double *) VMALLOC(ncplx * NLAT * sizeof(cplx));
|
|---|
| 535 | fft = NULL; ifft = NULL; fft2 = NULL; ifft2 = NULL;
|
|---|
| 536 |
|
|---|
| 537 | // complex fft for fly transform is a bit different.
|
|---|
| 538 | if (layout & SHT_PHI_CONTIGUOUS) { // out-of-place split dft
|
|---|
| 539 | fftw_iodim dim, many;
|
|---|
| 540 | shtns->fftc_mode = 1;
|
|---|
| 541 | //default internal
|
|---|
| 542 | dim.n = NPHI; dim.os = 1; dim.is = NLAT; // complex transpose
|
|---|
| 543 | many.n = NLAT/2; many.os = 2*NPHI; many.is = 2;
|
|---|
| 544 | shtns->ifftc = fftw_plan_guru_split_dft(1, &dim, 1, &many, ((double*)ShF)+1, (double*)ShF, Sh+NPHI, Sh, shtns->fftw_plan_mode);
|
|---|
| 545 |
|
|---|
| 546 | // legacy analysis fft
|
|---|
| 547 | //dim.n = NPHI; dim.is = 1; dim.os = NLAT;
|
|---|
| 548 | //many.n = NLAT/2; many.is = 2*NPHI; many.os = 2;
|
|---|
| 549 | // new internal
|
|---|
| 550 | dim.n = NPHI; dim.is = 1; dim.os = 2; // split complex, but without global transpose (faster).
|
|---|
| 551 | many.n = NLAT/2; many.is = 2*NPHI; many.os = 2*NPHI;
|
|---|
| 552 | shtns->fftc = fftw_plan_guru_split_dft(1, &dim, 1, &many, Sh+NPHI, Sh, ((double*)ShF)+1, (double*)ShF, shtns->fftw_plan_mode);
|
|---|
| 553 | shtns->k_stride_a = NPHI; shtns->m_stride_a = 2;
|
|---|
| 554 |
|
|---|
| 555 | #if SHT_VERBOSE > 1
|
|---|
| 556 | if (verbose>1) {
|
|---|
| 557 | printf(" fftw cost ifftc=%lg, fftc=%lg ",fftw_cost(shtns->ifftc), fftw_cost(shtns->fftc)); fflush(stdout);
|
|---|
| 558 | }
|
|---|
| 559 | #endif
|
|---|
| 560 | } else { //if (layout & SHT_THETA_CONTIGUOUS) { // use only in-place here, supposed to be faster.
|
|---|
| 561 | shtns->fftc_mode = 0;
|
|---|
| 562 | shtns->ifftc = fftw_plan_many_dft(1, &nfft, NLAT/2, ShF, &nfft, NLAT/2, 1, ShF, &nfft, NLAT/2, 1, FFTW_BACKWARD, shtns->fftw_plan_mode);
|
|---|
| 563 | shtns->fftc = shtns->ifftc; // same thing, with m>0 and m<0 exchanged.
|
|---|
| 564 | #if SHT_VERBOSE > 1
|
|---|
| 565 | if (verbose>1) {
|
|---|
| 566 | printf(" fftw cost ifftc=%lg ",fftw_cost(shtns->ifftc)); fflush(stdout);
|
|---|
| 567 | }
|
|---|
| 568 | #endif
|
|---|
| 569 | }
|
|---|
| 570 |
|
|---|
| 571 | #if _GCC_VEC_
|
|---|
| 572 | if (on_the_fly == 0)
|
|---|
| 573 | #endif
|
|---|
| 574 | { // the real ffts are required if _GCC_VEC_ == 0 or if on_the_fly is zero.
|
|---|
| 575 | // IFFT : unnormalized. FFT : must be normalized.
|
|---|
| 576 | cost_fft_ip = 0.0; cost_ifft_ip = 0.0; cost_fft_oop = 0.0; cost_ifft_oop = 0.0;
|
|---|
| 577 | if (in_place) { // in-place FFT (if allowed)
|
|---|
| 578 | ifft2 = fftw_plan_many_dft_c2r(1, &nfft, NLAT, ShF, &ncplx, NLAT, 1, (double*) ShF, &nreal, phi_inc, theta_inc, shtns->fftw_plan_mode);
|
|---|
| 579 | #if SHT_VERBOSE > 1
|
|---|
| 580 | if (verbose>1) {
|
|---|
| 581 | printf(" in-place cost : ifft=%lg ",fftw_cost(ifft2)); fflush(stdout);
|
|---|
| 582 | }
|
|---|
| 583 | #endif
|
|---|
| 584 | if (ifft2 != NULL) {
|
|---|
| 585 | fft2 = fftw_plan_many_dft_r2c(1, &nfft, NLAT, (double*) ShF, &nreal, phi_inc, theta_inc, ShF, &ncplx, NLAT, 1, shtns->fftw_plan_mode);
|
|---|
| 586 | #if SHT_VERBOSE > 1
|
|---|
| 587 | if (verbose>1) {
|
|---|
| 588 | printf("fft=%lg\n",fftw_cost(fft2)); fflush(stdout);
|
|---|
| 589 | }
|
|---|
| 590 | #endif
|
|---|
| 591 | if (fft2 != NULL) {
|
|---|
| 592 | cost_fft_ip = fftw_cost(fft2); cost_ifft_ip = fftw_cost(ifft2);
|
|---|
| 593 | }
|
|---|
| 594 | }
|
|---|
| 595 | }
|
|---|
| 596 | if ( (in_place == 0) || (cost_fft_ip * cost_ifft_ip > 0.0) )
|
|---|
| 597 | { // out-of-place FFT
|
|---|
| 598 | ifft = fftw_plan_many_dft_c2r(1, &nfft, NLAT, ShF, &ncplx, NLAT, 1, Sh, &nreal, phi_inc, theta_inc, shtns->fftw_plan_mode);
|
|---|
| 599 | #if SHT_VERBOSE > 1
|
|---|
| 600 | if (verbose>1) { printf(" oop cost : ifft=%lg ",fftw_cost(ifft)); fflush(stdout); }
|
|---|
| 601 | #endif
|
|---|
| 602 | if (ifft == NULL) shtns_runerr("[FFTW] ifft planning failed !");
|
|---|
| 603 | fft = fftw_plan_many_dft_r2c(1, &nfft, NLAT, Sh, &nreal, phi_inc, theta_inc, ShF, &ncplx, NLAT, 1, shtns->fftw_plan_mode);
|
|---|
| 604 | #if SHT_VERBOSE > 1
|
|---|
| 605 | if (verbose>1) { printf("fft=%lg\n",fftw_cost(fft)); fflush(stdout); }
|
|---|
| 606 | #endif
|
|---|
| 607 | if (fft == NULL) shtns_runerr("[FFTW] fft planning failed !");
|
|---|
| 608 | cost_fft_oop = fftw_cost(fft); cost_ifft_oop = fftw_cost(ifft);
|
|---|
| 609 | }
|
|---|
| 610 | if ( (cost_fft_ip * cost_ifft_ip > 0.0) && (cost_fft_oop * cost_ifft_oop > 0.0) ) { // both have been succesfully timed.
|
|---|
| 611 | if ( cost_fft_oop + SHT_NL_ORDER*cost_ifft_oop < cost_fft_ip + SHT_NL_ORDER*cost_ifft_ip )
|
|---|
| 612 | in_place = 0; // disable in-place, because out-of-place is faster.
|
|---|
| 613 | }
|
|---|
| 614 |
|
|---|
| 615 | if (in_place) {
|
|---|
| 616 | /* IN-PLACE FFT */
|
|---|
| 617 | if (fft != NULL) fftw_destroy_plan(fft);
|
|---|
| 618 | if (ifft != NULL) fftw_destroy_plan(ifft);
|
|---|
| 619 | fft = fft2; ifft = ifft2;
|
|---|
| 620 | shtns->ncplx_fft = 0; // fft is done in-place, no allocation needed.
|
|---|
| 621 | shtns->nspat = phi_embed * NLAT; // more space must be reserved for inplace ffts.
|
|---|
| 622 | } else {
|
|---|
| 623 | /* OUT-OF-PLACE FFT */
|
|---|
| 624 | if (fft2 != NULL) fftw_destroy_plan(fft2); // use OUT-OF-PLACE FFT
|
|---|
| 625 | if (ifft2 != NULL) fftw_destroy_plan(ifft2);
|
|---|
| 626 | shtns->ncplx_fft = ncplx * NLAT; // fft is done out-of-place, store allocation size.
|
|---|
| 627 | #if SHT_VERBOSE > 1
|
|---|
| 628 | if (verbose>1) printf(" ** out-of-place fft **\n");
|
|---|
| 629 | #endif
|
|---|
| 630 | }
|
|---|
| 631 | shtns->fft = fft; shtns->ifft = ifft;
|
|---|
| 632 | }
|
|---|
| 633 | VFREE(Sh); VFREE(ShF);
|
|---|
| 634 |
|
|---|
| 635 | #if SHT_VERBOSE > 2
|
|---|
| 636 | if (verbose>2) {
|
|---|
| 637 | printf(" *** fft plan :\n");
|
|---|
| 638 | fftw_print_plan(fft);
|
|---|
| 639 | printf("\n *** ifft plan :\n");
|
|---|
| 640 | fftw_print_plan(ifft);
|
|---|
| 641 | printf("\n");
|
|---|
| 642 | }
|
|---|
| 643 | #endif
|
|---|
| 644 | }
|
|---|
| 645 |
|
|---|
| 646 | #ifdef SHTNS_DCT
|
|---|
| 647 | /// \internal initialize DCTs using FFTW. Must be called if MTR_DCT is changed.
|
|---|
| 648 | static int planDCT(shtns_cfg shtns)
|
|---|
| 649 | {
|
|---|
| 650 | double *Sh;
|
|---|
| 651 | int ndct = NLAT;
|
|---|
| 652 | fftw_r2r_kind r2r_kind;
|
|---|
| 653 | fftw_iodim dims, hdims[2];
|
|---|
| 654 | double Sh0[NLAT] SSE; // temp storage on the stack, aligned.
|
|---|
| 655 |
|
|---|
| 656 | #ifdef OMP_FFTW
|
|---|
| 657 | if (shtns->fftw_plan_mode & (FFTW_EXHAUSTIVE | FFTW_PATIENT)) {
|
|---|
| 658 | fftw_plan_with_nthreads(omp_threads);
|
|---|
| 659 | } else fftw_plan_with_nthreads(1);
|
|---|
| 660 | #endif
|
|---|
| 661 | // Allocate dummy Spatial Fields.
|
|---|
| 662 | Sh = (double *) VMALLOC((NPHI/2 +1) * NLAT*2 * sizeof(double));
|
|---|
| 663 |
|
|---|
| 664 | if (shtns->dct_m0 == NULL) { // allocate only once since it does not change.
|
|---|
| 665 | int stride = (NPHI>1) ? 2 : 1;
|
|---|
| 666 | r2r_kind = FFTW_REDFT10; // out-of-place.
|
|---|
| 667 | shtns->dct_m0 = fftw_plan_many_r2r(1, &ndct, 1, Sh, &ndct, stride, stride*NLAT, Sh0, &ndct, 1, NLAT, &r2r_kind, shtns->fftw_plan_mode); // out-of-place.
|
|---|
| 668 | if (shtns->dct_m0 == NULL) return 0; // dct_m0 planning failed.
|
|---|
| 669 | #if SHT_VERBOSE > 2
|
|---|
| 670 | if (verbose>2) {
|
|---|
| 671 | printf(" *** dct_m0 plan :\n"); fftw_print_plan(shtns->dct_m0); printf("\n");
|
|---|
| 672 | }
|
|---|
| 673 | #endif
|
|---|
| 674 | }
|
|---|
| 675 |
|
|---|
| 676 | if (NPHI > 1) { // complex data for NPHI>1, recompute as it does depend on MTR_DCT
|
|---|
| 677 | if (shtns->idct != NULL) fftw_destroy_plan(shtns->idct);
|
|---|
| 678 |
|
|---|
| 679 | dims.n = NLAT; dims.is = 2; dims.os = 2; // real and imaginary part.
|
|---|
| 680 | hdims[0].n = MTR_DCT+1; hdims[0].is = 2*NLAT; hdims[0].os = 2*NLAT;
|
|---|
| 681 | hdims[1].n = 2; hdims[1].is = 1; hdims[1].os = 1;
|
|---|
| 682 | r2r_kind = FFTW_REDFT01;
|
|---|
| 683 | shtns->idct = fftw_plan_guru_r2r(1, &dims, 2, hdims, Sh, Sh, &r2r_kind, shtns->fftw_plan_mode);
|
|---|
| 684 | } else { // NPHI == 1
|
|---|
| 685 | if (shtns->idct == NULL) {
|
|---|
| 686 | r2r_kind = FFTW_REDFT01;
|
|---|
| 687 | shtns->idct = fftw_plan_many_r2r(1, &ndct, 1, Sh, &ndct, 1, NLAT, Sh, &ndct, 1, NLAT, &r2r_kind, shtns->fftw_plan_mode);
|
|---|
| 688 | }
|
|---|
| 689 | }
|
|---|
| 690 |
|
|---|
| 691 | VFREE(Sh);
|
|---|
| 692 | if (shtns->idct == NULL) return 0; // idct planning failed.
|
|---|
| 693 | #if SHT_VERBOSE > 2
|
|---|
| 694 | if (verbose>2) {
|
|---|
| 695 | printf(" *** idct plan :\n"); fftw_print_plan(shtns->idct); printf("\n");
|
|---|
| 696 | }
|
|---|
| 697 | #endif
|
|---|
| 698 | return 1; // success.
|
|---|
| 699 | }
|
|---|
| 700 |
|
|---|
| 701 | /// \internal SET MTR_DCT and updates fftw_plan for DCT's
|
|---|
| 702 | static int Set_MTR_DCT(shtns_cfg shtns, int m)
|
|---|
| 703 | {
|
|---|
| 704 | if ((shtns->zlm_dct0 == NULL)||(m == MTR_DCT)) return MTR_DCT;
|
|---|
| 705 | if ( m < 0 ) { // don't use dct
|
|---|
| 706 | shtns->mtr_dct = -1;
|
|---|
| 707 | } else {
|
|---|
| 708 | if (m>MMAX) m=MMAX;
|
|---|
| 709 | shtns->mtr_dct = m;
|
|---|
| 710 | if (planDCT(shtns) == 0)
|
|---|
| 711 | shtns->mtr_dct = -1; // failure
|
|---|
| 712 | }
|
|---|
| 713 | return MTR_DCT;
|
|---|
| 714 | }
|
|---|
| 715 |
|
|---|
| 716 | /// \internal returns the m-truncation of DCT part of synthesis
|
|---|
| 717 | static int Get_MTR_DCT(shtns_cfg shtns) {
|
|---|
| 718 | return MTR_DCT;
|
|---|
| 719 | }
|
|---|
| 720 | #endif /* SHTNS_DCT */
|
|---|
| 721 |
|
|---|
| 722 |
|
|---|
| 723 | /// \internal Sets the value tm[im] used for polar optimiation on-the-fly.
|
|---|
| 724 | static void PolarOptimize(shtns_cfg shtns, double eps)
|
|---|
| 725 | {
|
|---|
| 726 | int im, m, l, it;
|
|---|
| 727 | double v;
|
|---|
| 728 | double y[LMAX+1];
|
|---|
| 729 |
|
|---|
| 730 | for (im=0;im<=MMAX;im++) shtns->tm[im] = 0;
|
|---|
| 731 |
|
|---|
| 732 | if (eps > 0.0) {
|
|---|
| 733 | for (im=1;im<=MMAX;im++) {
|
|---|
| 734 | m = im*MRES;
|
|---|
| 735 | it = shtns->tm[im-1] -1; // tm[im] is monotonic.
|
|---|
| 736 | do {
|
|---|
| 737 | it++;
|
|---|
| 738 | legendre_sphPlm_array(shtns, LMAX, im, shtns->ct[it], y+m);
|
|---|
| 739 | v = 0.0;
|
|---|
| 740 | for (l=m; l<=LMAX; l++) {
|
|---|
| 741 | double ya = fabs(y[l]);
|
|---|
| 742 | if ( v < ya ) v = ya;
|
|---|
| 743 | }
|
|---|
| 744 | } while (v < eps);
|
|---|
| 745 | shtns->tm[im] = it;
|
|---|
| 746 | }
|
|---|
| 747 | #if SHT_VERBOSE > 0
|
|---|
| 748 | if (verbose) printf(" + polar optimization threshold = %.1e\n",eps);
|
|---|
| 749 | #endif
|
|---|
| 750 | #if SHT_VERBOSE > 1
|
|---|
| 751 | if (verbose>1) {
|
|---|
| 752 | printf(" tm[im]=");
|
|---|
| 753 | for (im=0;im<=MMAX;im++)
|
|---|
| 754 | printf(" %d",shtns->tm[im]);
|
|---|
| 755 | printf("\n");
|
|---|
| 756 | }
|
|---|
| 757 | #endif
|
|---|
| 758 | }
|
|---|
| 759 | }
|
|---|
| 760 |
|
|---|
| 761 | #ifdef SHTNS_MEM
|
|---|
| 762 |
|
|---|
| 763 | /// \internal Perform some optimization on the SHT matrices.
|
|---|
| 764 | static void OptimizeMatrices(shtns_cfg shtns, double eps)
|
|---|
| 765 | {
|
|---|
| 766 | unsigned short *tm;
|
|---|
| 767 | double **ylm, **zlm;
|
|---|
| 768 | struct DtDp** dylm;
|
|---|
| 769 | struct DtDp** dzlm;
|
|---|
| 770 | int im,m,l,it;
|
|---|
| 771 | int vector = (shtns->dylm != NULL);
|
|---|
| 772 |
|
|---|
| 773 | tm = shtns->tm;
|
|---|
| 774 | ylm = shtns->ylm; dylm = shtns->dylm;
|
|---|
| 775 | zlm = shtns->zlm; dzlm = shtns->dzlm;
|
|---|
| 776 |
|
|---|
| 777 | for (im=0;im<=MMAX;im++) tm[im] = 0;
|
|---|
| 778 | /// POLAR OPTIMIZATION : analyzing coefficients, some can be safely neglected.
|
|---|
| 779 | if (eps > 0.0) {
|
|---|
| 780 | for (im=1;im<=MMAX;im++) {
|
|---|
| 781 | m = im*MRES;
|
|---|
| 782 | tm[im] = NLAT_2;
|
|---|
| 783 | for (l=m;l<=LMAX;l++) {
|
|---|
| 784 | it = tm[im-1]; // tm[im] is monotonic.
|
|---|
| 785 | while( fabs(ylm[im][it*(LMAX-m+1) + (l-m)]) < eps ) { it++; }
|
|---|
| 786 | if (tm[im] > it) tm[im] = it;
|
|---|
| 787 | }
|
|---|
| 788 | }
|
|---|
| 789 | #if SHT_VERBOSE > 0
|
|---|
| 790 | if (verbose) printf(" + polar optimization threshold = %.1e\n",eps);
|
|---|
| 791 | #endif
|
|---|
| 792 | #if SHT_VERBOSE > 1
|
|---|
| 793 | if (verbose>1) {
|
|---|
| 794 | printf(" tm[im]=");
|
|---|
| 795 | for (im=0;im<=MMAX;im++)
|
|---|
| 796 | printf(" %d",tm[im]);
|
|---|
| 797 | printf("\n");
|
|---|
| 798 | }
|
|---|
| 799 | #endif
|
|---|
| 800 |
|
|---|
| 801 | for (im=1; im<=MMAX; im++) { // im >= 1
|
|---|
| 802 | if (tm[im] > 0) { // we can remove the data corresponding to polar values.
|
|---|
| 803 | m = im*MRES;
|
|---|
| 804 | ylm[im] += tm[im]*(LMAX-m+1); // shift pointers (still one block for each m)
|
|---|
| 805 | if (vector) dylm[im] += tm[im]*(LMAX-m+1);
|
|---|
| 806 | if (zlm[0] != NULL) {
|
|---|
| 807 | for (l=m; l<LMAX; l+=2) {
|
|---|
| 808 | for (it=0; it<NLAT_2-tm[im]; it++) { // copy data to avoid cache misses.
|
|---|
| 809 | zlm[im][(l-m)*(NLAT_2-tm[im]) + it*2] = zlm[im][(l-m)*NLAT_2 + (it+tm[im])*2];
|
|---|
| 810 | zlm[im][(l-m)*(NLAT_2-tm[im]) + it*2+1] = zlm[im][(l-m)*NLAT_2 + (it+tm[im])*2+1];
|
|---|
| 811 | if (vector) {
|
|---|
| 812 | dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2].t = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2].t;
|
|---|
| 813 | dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2].p = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2].p;
|
|---|
| 814 | dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2+1].t = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2+1].t;
|
|---|
| 815 | dzlm[im][(l-m)*(NLAT_2-tm[im]) + it*2+1].p = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])*2+1].p;
|
|---|
| 816 | }
|
|---|
| 817 | }
|
|---|
| 818 | }
|
|---|
| 819 | if (l==LMAX) {
|
|---|
| 820 | for (it=0; it<NLAT_2-tm[im]; it++) {
|
|---|
| 821 | zlm[im][(l-m)*(NLAT_2-tm[im]) + it] = zlm[im][(l-m)*NLAT_2 + (it+tm[im])];
|
|---|
| 822 | if (vector) {
|
|---|
| 823 | dzlm[im][(l-m)*(NLAT_2-tm[im]) + it].t = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])].t;
|
|---|
| 824 | dzlm[im][(l-m)*(NLAT_2-tm[im]) + it].p = dzlm[im][(l-m)*NLAT_2 + (it+tm[im])].p;
|
|---|
| 825 | }
|
|---|
| 826 | }
|
|---|
| 827 | }
|
|---|
| 828 | }
|
|---|
| 829 | }
|
|---|
| 830 | }
|
|---|
| 831 | }
|
|---|
| 832 |
|
|---|
| 833 | /// Compression of dzlm for m=0, as .p is 0
|
|---|
| 834 | if ((vector) && (dzlm[0] != NULL)) { // for sht_reg_poles there is no dzlm defined.
|
|---|
| 835 | im=0; m=0;
|
|---|
| 836 | double* yg = (double *) dzlm[im];
|
|---|
| 837 | for (l=1; l<LMAX; l+=2) { // l=0 is zero, so we start at l=1.
|
|---|
| 838 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 839 | yg[(l-1)*NLAT_2 + it*2] = dzlm[im][(l-1)*NLAT_2 + it*2].t; // l
|
|---|
| 840 | yg[(l-1)*NLAT_2 + it*2+1] = dzlm[im][(l-1)*NLAT_2 + it*2+1].t; // l+1
|
|---|
| 841 | }
|
|---|
| 842 | }
|
|---|
| 843 | if (l==LMAX) { // last l is stored right away, without interleaving.
|
|---|
| 844 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 845 | yg[(l-1)*NLAT_2 + it] = dzlm[im][(l-1)*NLAT_2 + it].t; // l (odd)
|
|---|
| 846 | }
|
|---|
| 847 | }
|
|---|
| 848 | }
|
|---|
| 849 | if ((zlm[0] != NULL) && (NLAT_2 & 1)) {
|
|---|
| 850 | zlm[0][NLAT_2] = 0.0; // take care to write 0.0 for this sse2 padding value.
|
|---|
| 851 | if ( (MMAX==0) && (!(LMAX & 1)) ) { // NLAT_2 odd + LMAX even
|
|---|
| 852 | zlm[0][(LMAX+1)*NLAT_2 +1] = 0.0; // overflow for im=0
|
|---|
| 853 | zlm[0][(LMAX+1)*NLAT_2 +2] = 0.0;
|
|---|
| 854 | }
|
|---|
| 855 | }
|
|---|
| 856 | }
|
|---|
| 857 |
|
|---|
| 858 | // how to access the ylm matrix for im=0
|
|---|
| 859 | #define YL0(it, l) ( shtns->ylm[0][ (it)*(((LMAX+2)>>1)*2) + (l) ] )
|
|---|
| 860 | #define DYL0(it, l) ( ((double*)(shtns->dylm[0]))[ (it)*(((LMAX+1)>>1)*2) + (l-1) ] )
|
|---|
| 861 |
|
|---|
| 862 | #define YLM(it, l, im) ( (im==0) ? YL0(it,l) : shtns->ylm[im][(it)*(LMAX-(im)*MRES+1) + ((l)-(im)*MRES)] )
|
|---|
| 863 | #define DTYLM(it, l, im) ( (im==0) ? ( (l==0) ? 0.0 : DYL0(it,l) ) : shtns->dylm[im][(it)*(LMAX-(im)*MRES+1) + ((l)-(im)*MRES)].t )
|
|---|
| 864 | #define DPYLM(it, l, im) ( (im==0) ? 0.0 : shtns->dylm[im][(it)*(LMAX-(im)*MRES+1) + ((l)-(im)*MRES)].p )
|
|---|
| 865 |
|
|---|
| 866 | /// \internal Precompute the matrix for SH synthesis.
|
|---|
| 867 | static void init_SH_synth(shtns_cfg shtns)
|
|---|
| 868 | {
|
|---|
| 869 | double *yl, *dyl; // temp storage for derivative for legendre function values.
|
|---|
| 870 | long int it,im,m,l;
|
|---|
| 871 | double *ct = shtns->ct;
|
|---|
| 872 | double *st = shtns->st;
|
|---|
| 873 | int vector = (shtns->dylm != NULL);
|
|---|
| 874 |
|
|---|
| 875 | yl = (double*) malloc( 2*sizeof(double) *(LMAX+1) );
|
|---|
| 876 | dyl = yl + (LMAX+1);
|
|---|
| 877 |
|
|---|
| 878 | { im=0; m=0;
|
|---|
| 879 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 880 | legendre_sphPlm_deriv_array_hp(shtns, LMAX, im, ct[it], st[it], yl, dyl); // fixed im legendre functions lookup table.
|
|---|
| 881 | for (l=0; l<=LMAX; l++) YL0(it, l) = yl[l];
|
|---|
| 882 | for (l=LMAX+1; l<=LMAX+3; l++) YL0(it, l) = 0.0; // allow overflow.
|
|---|
| 883 | if (vector) {
|
|---|
| 884 | for (l=1; l<=LMAX; l++) DYL0(it, l) = dyl[l];
|
|---|
| 885 | for (l=LMAX+1; l<=LMAX+2; l++) DYL0(it, l) = 0.0; // allow overflow.
|
|---|
| 886 | }
|
|---|
| 887 | }
|
|---|
| 888 | }
|
|---|
| 889 | for (im=1; im<=MMAX; im++) {
|
|---|
| 890 | double *ylm = shtns->ylm[im];
|
|---|
| 891 | struct DtDp* dylm = vector ? shtns->dylm[im] : NULL;
|
|---|
| 892 | m = im*MRES;
|
|---|
| 893 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 894 | legendre_sphPlm_deriv_array_hp(shtns, LMAX, im, ct[it], st[it], yl, dyl); // fixed im legendre functions lookup table.
|
|---|
| 895 | for (l=m; l<=LMAX; l++) {
|
|---|
| 896 | ylm[it*(LMAX-m+1) + (l-m)] = yl[l-m] * st[it];
|
|---|
| 897 | if (vector) {
|
|---|
| 898 | dylm[it*(LMAX-m+1) + (l-m)].t = dyl[l-m];
|
|---|
| 899 | dylm[it*(LMAX-m+1) + (l-m)].p = yl[l-m] *m; // 1/sint(t) dYlm/dphi
|
|---|
| 900 | }
|
|---|
| 901 | }
|
|---|
| 902 | }
|
|---|
| 903 | }
|
|---|
| 904 | free(yl);
|
|---|
| 905 | }
|
|---|
| 906 |
|
|---|
| 907 |
|
|---|
| 908 | /// \internal Precompute matrices for SH synthesis and analysis, on a Gauss-Legendre grid.
|
|---|
| 909 | static void init_SH_gauss(shtns_cfg shtns)
|
|---|
| 910 | {
|
|---|
| 911 | long int it,im,m,l;
|
|---|
| 912 | int vector = (shtns->dylm != NULL);
|
|---|
| 913 |
|
|---|
| 914 | init_SH_synth(shtns);
|
|---|
| 915 |
|
|---|
| 916 | // for analysis (decomposition, direct transform) : transpose and multiply by gauss weight and other normalizations.
|
|---|
| 917 | // interleave l and l+1 : this stores data in the way it will be read.
|
|---|
| 918 | double **zlm = shtns->zlm;
|
|---|
| 919 | struct DtDp** dzlm = shtns->dzlm;
|
|---|
| 920 | for (im=0; im<=MMAX; im++) {
|
|---|
| 921 | m = im*MRES;
|
|---|
| 922 | long int talign = 0;
|
|---|
| 923 |
|
|---|
| 924 | for (it=0;it<NLAT_2;it++) {
|
|---|
| 925 | double norm = shtns->wg[it];
|
|---|
| 926 | if ( (m>0) && (shtns->norm & SHT_REAL_NORM) ) norm *= 2; // "Real" norm : zlm must be doubled for m>0
|
|---|
| 927 | long int l0 = m;
|
|---|
| 928 | if (m==0) {
|
|---|
| 929 | zlm[im][it] = YL0(it, 0) * norm;
|
|---|
| 930 | // les derivees sont nulles pour l=0
|
|---|
| 931 | l0++;
|
|---|
| 932 | talign = (NLAT_2&1);
|
|---|
| 933 | }
|
|---|
| 934 | for (l=l0; l<LMAX; l+=2) {
|
|---|
| 935 | double nz0 = norm; double nz1 = norm;
|
|---|
| 936 | if (SHT_NORM == sht_schmidt) {
|
|---|
| 937 | nz0 *= (2*l+1); nz1 *= (2*l+3);
|
|---|
| 938 | }
|
|---|
| 939 | zlm[im][(l-m)*NLAT_2 + it*2 +talign] = YLM(it, l, im) * nz0;
|
|---|
| 940 | zlm[im][(l-m)*NLAT_2 + it*2 +1 +talign] = YLM(it, l+1, im) * nz1;
|
|---|
| 941 | if (vector) {
|
|---|
| 942 | nz0 *= shtns->l_2[l]; nz1 *= shtns->l_2[l+1];
|
|---|
| 943 | dzlm[im][(l-l0)*NLAT_2 + it*2].t = DTYLM(it, l, im) * nz0;
|
|---|
| 944 | dzlm[im][(l-l0)*NLAT_2 + it*2].p = DPYLM(it, l, im) * nz0;
|
|---|
| 945 | dzlm[im][(l-l0)*NLAT_2 + it*2+1].t = DTYLM(it, l+1, im) * nz1;
|
|---|
| 946 | dzlm[im][(l-l0)*NLAT_2 + it*2+1].p = DPYLM(it, l+1, im) * nz1;
|
|---|
| 947 | }
|
|---|
| 948 | }
|
|---|
| 949 | if (l==LMAX) { // last l is stored right away, without interleaving.
|
|---|
| 950 | double nz0 = norm;
|
|---|
| 951 | if (SHT_NORM == sht_schmidt) nz0 *= (2*l+1);
|
|---|
| 952 | zlm[im][(l-m)*NLAT_2 + it +talign] = YLM(it, l, im) * nz0;
|
|---|
| 953 | if (vector) {
|
|---|
| 954 | nz0 *= shtns->l_2[l];
|
|---|
| 955 | dzlm[im][(l-l0)*NLAT_2 + it].t = DTYLM(it, l, im) * nz0;
|
|---|
| 956 | dzlm[im][(l-l0)*NLAT_2 + it].p = DPYLM(it, l, im) * nz0;
|
|---|
| 957 | }
|
|---|
| 958 | }
|
|---|
| 959 | }
|
|---|
| 960 | }
|
|---|
| 961 | }
|
|---|
| 962 |
|
|---|
| 963 | #endif
|
|---|
| 964 |
|
|---|
| 965 | #ifdef SHTNS_DCT
|
|---|
| 966 |
|
|---|
| 967 | static void init_SH_dct_m(shtns_cfg shtns, double* is1, fftw_plan dct, fftw_plan idct, int analysis, const int m0, const int mstep)
|
|---|
| 968 | {
|
|---|
| 969 | double *yk, *yk0, *dyk0, *yg; // temp storage
|
|---|
| 970 | struct DtDp *dyg, *dyk;
|
|---|
| 971 | int it,im,m,l;
|
|---|
| 972 | double Z[2*NLAT_2] SSE;
|
|---|
| 973 | double dZt[2*NLAT_2] SSE;
|
|---|
| 974 | double dZp[2*NLAT_2] SSE; // equally spaced theta points.
|
|---|
| 975 |
|
|---|
| 976 | double *st = shtns->st;
|
|---|
| 977 | double *st_1 = shtns->st_1;
|
|---|
| 978 | const int vector = (shtns->dylm != NULL);
|
|---|
| 979 | const int KMAX = LMAX+1;
|
|---|
| 980 |
|
|---|
| 981 | real iylm_fft_norm = 1.0; // FFT/SHT normalization for zlm (4pi normalized)
|
|---|
| 982 | if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) iylm_fft_norm = 4*M_PIl; // FFT/SHT normalization for zlm (orthonormalized)
|
|---|
| 983 | iylm_fft_norm /= (2*NPHI*NLAT_2);
|
|---|
| 984 |
|
|---|
| 985 | // Even/Odd symmetry : ylm is even or odd across equator, as l-m is even or odd => only NLAT_2 points required.
|
|---|
| 986 | // temp memory for ykm_dct.
|
|---|
| 987 | yk = (double *) malloc( sizeof(double) * (KMAX+1)*(LMAX+1) );
|
|---|
| 988 | dyk = (struct DtDp *) malloc( sizeof(struct DtDp)* (KMAX+1)*(LMAX+1) );
|
|---|
| 989 | if ((m0==0)&&(analysis)) {
|
|---|
| 990 | yk0 = (double *) malloc( sizeof(double) * (LMAX/2+1)*(2*NLAT_2) * 2 ); // temp for zlm_dct0
|
|---|
| 991 | dyk0 = yk0 + (LMAX/2+1)*(2*NLAT_2);
|
|---|
| 992 | }
|
|---|
| 993 |
|
|---|
| 994 | for (im=m0; im<=MMAX; im+=mstep) {
|
|---|
| 995 | m = im*MRES;
|
|---|
| 996 | // go to DCT space
|
|---|
| 997 | for (it=0;it<=KMAX;it+=2) {
|
|---|
| 998 | for(l=m; l<=LMAX; l++) {
|
|---|
| 999 | yk[(it/2)*(LMAX+1-m) + (l-m)] = 0.0;
|
|---|
| 1000 | dyk[(it/2)*(LMAX+1-m) + (l-m)].t = 0.0;
|
|---|
| 1001 | dyk[(it/2)*(LMAX+1-m) + (l-m)].p = 0.0;
|
|---|
| 1002 | }
|
|---|
| 1003 | }
|
|---|
| 1004 | for (l=m; l<=LMAX; l++) {
|
|---|
| 1005 | if (m & 1) { // m odd
|
|---|
| 1006 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 1007 | Z[it] = YLM(it, l, im) * st[it]; // P[l+1](x) *st
|
|---|
| 1008 | if (vector) {
|
|---|
| 1009 | dZt[it] = DTYLM(it, l, im); // P[l](x) *1
|
|---|
| 1010 | dZp[it] = DPYLM(it, l, im); // P[l-1](x) *1
|
|---|
| 1011 | }
|
|---|
| 1012 | }
|
|---|
| 1013 | } else { // m even
|
|---|
| 1014 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 1015 | Z[it] = YLM(it, l, im); // P[l](x) *1
|
|---|
| 1016 | if (vector) {
|
|---|
| 1017 | dZt[it] = DTYLM(it, l, im) *st[it]; // P[l+1](x) *st
|
|---|
| 1018 | dZp[it] = YLM(it, l, im) * m; // P[l](x) *st
|
|---|
| 1019 | }
|
|---|
| 1020 | }
|
|---|
| 1021 | }
|
|---|
| 1022 | if ((l-m)&1) { // odd
|
|---|
| 1023 | for (it=NLAT_2; it<2*NLAT_2; it++) {
|
|---|
| 1024 | Z[it] = - Z[2*NLAT_2-it-1]; // reconstruct even/odd part
|
|---|
| 1025 | dZt[it] = dZt[2*NLAT_2-it-1];
|
|---|
| 1026 | dZp[it] = - dZp[2*NLAT_2-it-1];
|
|---|
| 1027 | }
|
|---|
| 1028 | } else { // even
|
|---|
| 1029 | for (it=NLAT_2; it<2*NLAT_2; it++) {
|
|---|
| 1030 | Z[it] = Z[2*NLAT_2-it-1]; // reconstruct even/odd part
|
|---|
| 1031 | dZt[it] = - dZt[2*NLAT_2-it-1];
|
|---|
| 1032 | dZp[it] = dZp[2*NLAT_2-it-1];
|
|---|
| 1033 | }
|
|---|
| 1034 | }
|
|---|
| 1035 | fftw_execute_r2r(dct, Z, Z);
|
|---|
| 1036 | fftw_execute_r2r(dct, dZt, dZt);
|
|---|
| 1037 | fftw_execute_r2r(dct, dZp, dZp);
|
|---|
| 1038 | #if SHT_VERBOSE > 1
|
|---|
| 1039 | if ((LMAX <= 12) && (verbose>1))
|
|---|
| 1040 | #pragma omp critical
|
|---|
| 1041 | {
|
|---|
| 1042 | printf("\nl=%d, m=%d ::\t", l,m);
|
|---|
| 1043 | for(it=0;it<2*NLAT_2;it++) printf("%e ",Z[it]/(2*NLAT));
|
|---|
| 1044 | printf("\n dYt ::\t");
|
|---|
| 1045 | for(it=0;it<2*NLAT_2;it++) printf("%e ",dZt[it]/(2*NLAT));
|
|---|
| 1046 | printf("\n dYp ::\t");
|
|---|
| 1047 | for(it=0;it<2*NLAT_2;it++) printf("%e ",dZp[it]/(2*NLAT));
|
|---|
| 1048 | }
|
|---|
| 1049 | #endif
|
|---|
| 1050 | for (it=(l-m)&1; it<=l+1; it+=2) {
|
|---|
| 1051 | yk[(it/2)*(LMAX+1-m) + (l-m)] = Z[it]/(2*NLAT); // and transpose
|
|---|
| 1052 | dyk[(it/2)*(LMAX+1-m) + (l-m)].p = dZp[it]/(2*NLAT);
|
|---|
| 1053 | }
|
|---|
| 1054 | for (it=(l+1-m)&1; it<=l+1; it+=2) {
|
|---|
| 1055 | dyk[(it/2)*(LMAX+1-m) + (l-m)].t = dZt[it]/(2*NLAT);
|
|---|
| 1056 | }
|
|---|
| 1057 | }
|
|---|
| 1058 |
|
|---|
| 1059 | /* compute analysis coefficients (fast way)
|
|---|
| 1060 | * Wklm = int(Tk*Ylm) = int(Tk.sum(i,a_ilm*Ti)) = sum(i, a_ilm* int(Tk*Ti)) = sum(i, a_ilm*Jik)
|
|---|
| 1061 | * with Jik = int(Tk*Ti) = 1/(1-(k-i)^2) + 1/(1-(k+i)^2)
|
|---|
| 1062 | */
|
|---|
| 1063 | if (analysis) {
|
|---|
| 1064 | #if SHT_VERBOSE > 0
|
|---|
| 1065 | if ((LMAX>126)&&(m0==0)&&(verbose)) { // only one thread prints this message.
|
|---|
| 1066 | printf("computing weights m=%d\r",m); fflush(stdout);
|
|---|
| 1067 | }
|
|---|
| 1068 | #endif
|
|---|
| 1069 | for (l=m; l<=LMAX; l++) {
|
|---|
| 1070 | unsigned k0,k1, k,i,d;
|
|---|
| 1071 | double Jik0, Jik1, yy, dyp, dyt;
|
|---|
| 1072 | double lnorm = iylm_fft_norm;
|
|---|
| 1073 |
|
|---|
| 1074 | if (SHT_NORM == sht_schmidt) lnorm *= (2*l+1); // Schmidt semi-normalization
|
|---|
| 1075 | if ( (m>0) && (shtns->norm & SHT_REAL_NORM) ) lnorm *= 2; // "real" norm : zlm must be doubled for m>0
|
|---|
| 1076 |
|
|---|
| 1077 | k0 = (l-m)&1; k1 = 1-k0;
|
|---|
| 1078 | for(k=0; k<NLAT; k++) { Z[k] = 0.0; dZt[k] = 0.0; dZp[k] = 0.0; }
|
|---|
| 1079 | for (i=0; i<=(l+1)/2; i++) {
|
|---|
| 1080 | yy = yk[i*(LMAX+1-m) + (l-m)] * lnorm;
|
|---|
| 1081 | dyp = dyk[i*(LMAX+1-m) + (l-m)].p * lnorm/(l*(l+1));
|
|---|
| 1082 | dyt = dyk[i*(LMAX+1-m) + (l-m)].t * lnorm/(l*(l+1));
|
|---|
| 1083 | if (i+k0==0) { yy*=0.5; dyp*=0.5; }
|
|---|
| 1084 | if (i+k1==0) dyt*=0.5;
|
|---|
| 1085 | for (k=0; k<NLAT_2; k++) {
|
|---|
| 1086 | d = (k<i) ? i-k : k-i;
|
|---|
| 1087 | Jik0 = is1[(i+k0)+k] + is1[d];
|
|---|
| 1088 | Jik1 = is1[(i+k1)+k] + is1[d];
|
|---|
| 1089 | Z[2*k+k0] += yy * Jik0;
|
|---|
| 1090 | dZt[2*k+k1] += dyt * Jik1;
|
|---|
| 1091 | if (m&1) dZp[2*k+k0] += dyp * Jik0;
|
|---|
| 1092 | }
|
|---|
| 1093 | }
|
|---|
| 1094 | #if SHT_VERBOSE > 1
|
|---|
| 1095 | if ((LMAX <= 12)&&(verbose>1))
|
|---|
| 1096 | #pragma omp critical
|
|---|
| 1097 | {
|
|---|
| 1098 | printf("\nl=%d, m=%d ::\t",l,m);
|
|---|
| 1099 | for (k=0; k<(2*NLAT_2); k++) printf("%f ",Z[k]);
|
|---|
| 1100 | printf("\n dZt ::\t");
|
|---|
| 1101 | for (k=0; k<(2*NLAT_2); k++) printf("%f ",dZt[k]);
|
|---|
| 1102 | if (m&1) {
|
|---|
| 1103 | printf("\n dZp ::\t");
|
|---|
| 1104 | for (k=0; k<(2*NLAT_2); k++) printf("%f ",dZp[k]);
|
|---|
| 1105 | }
|
|---|
| 1106 | }
|
|---|
| 1107 | #endif
|
|---|
| 1108 |
|
|---|
| 1109 | if (m == 0) { // we store zlm in dct space for m=0
|
|---|
| 1110 | if (k0==0) {
|
|---|
| 1111 | yk0[((l-m)>>1)*(2*NLAT_2)] = Z[0]*0.5; // store zlm_dct (k=0)
|
|---|
| 1112 | for (k=1; k<(2*NLAT_2); k++) yk0[((l-m)>>1)*(2*NLAT_2) +k] = 0.0; // zero out.
|
|---|
| 1113 | k0=2;
|
|---|
| 1114 | }
|
|---|
| 1115 | for (k=k0; k<(2*NLAT_2); k+=2)
|
|---|
| 1116 | yk0[((l-m)>>1)*(2*NLAT_2) +k] = Z[k]; // store zlm_dct
|
|---|
| 1117 |
|
|---|
| 1118 | if (l>0) {
|
|---|
| 1119 | if (k1==0) {
|
|---|
| 1120 | dyk0[((l-1-m)>>1)*(2*NLAT_2)] = dZt[0]*0.5; // store dzlm_dct (k=0)
|
|---|
| 1121 | for (k=1; k<(2*NLAT_2); k++) dyk0[((l-1-m)>>1)*(2*NLAT_2) +k] = 0.0; // zero out.
|
|---|
| 1122 | k1=2;
|
|---|
| 1123 | }
|
|---|
| 1124 | for (k=k1; k<(2*NLAT_2); k+=2)
|
|---|
| 1125 | dyk0[((l-1-m)>>1)*(2*NLAT_2) +k] = dZt[k]; // store dzlm_dct
|
|---|
| 1126 | }
|
|---|
| 1127 | }
|
|---|
| 1128 |
|
|---|
| 1129 | fftw_execute_r2r(idct, Z, Z); fftw_execute_r2r(idct, dZt, dZt);
|
|---|
| 1130 | if (m == 0) {
|
|---|
| 1131 | for (it=0; it<NLAT; it++) { dZp[it] = 0.0; dZt[it] *= st_1[it]; }
|
|---|
| 1132 | } else if (m & 1) { // m odd
|
|---|
| 1133 | fftw_execute_r2r(idct, dZp, dZp);
|
|---|
| 1134 | for (it=0; it<NLAT; it++) { Z[it] *= st_1[it]; }
|
|---|
| 1135 | } else { // m even
|
|---|
| 1136 | for (it=0; it<NLAT; it++) { dZp[it] = Z[it]*m/(l*(l+1)*st[it]); dZt[it] *= st_1[it]; }
|
|---|
| 1137 | }
|
|---|
| 1138 |
|
|---|
| 1139 | long int l0 = (m==0) ? 1 : m;
|
|---|
| 1140 | long int talign = (m==0)*(NLAT_2 & 1);
|
|---|
| 1141 | long int sk = (l-l0)&1;
|
|---|
| 1142 | if (l==0) {
|
|---|
| 1143 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 1144 | shtns->zlm[im][it] = Z[it];
|
|---|
| 1145 | }
|
|---|
| 1146 | } else if ((sk == 0)&&(l == LMAX)) {
|
|---|
| 1147 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 1148 | shtns->zlm[im][(l-m)*NLAT_2 + it + talign] = Z[it];
|
|---|
| 1149 | if (vector) {
|
|---|
| 1150 | shtns->dzlm[im][(l-l0)*NLAT_2 + it].p = dZp[it];
|
|---|
| 1151 | shtns->dzlm[im][(l-l0)*NLAT_2 + it].t = dZt[it];
|
|---|
| 1152 | }
|
|---|
| 1153 | }
|
|---|
| 1154 | } else {
|
|---|
| 1155 | for (it=0; it<NLAT_2; it++) {
|
|---|
| 1156 | shtns->zlm[im][(l-m-sk)*NLAT_2 + it*2 +sk + talign] = Z[it];
|
|---|
| 1157 | if (vector) {
|
|---|
| 1158 | shtns->dzlm[im][(l-l0-sk)*NLAT_2 + it*2 +sk].p = dZp[it];
|
|---|
| 1159 | shtns->dzlm[im][(l-l0-sk)*NLAT_2 + it*2 +sk].t = dZt[it];
|
|---|
| 1160 | }
|
|---|
| 1161 | }
|
|---|
| 1162 | }
|
|---|
| 1163 | }
|
|---|
| 1164 | }
|
|---|
| 1165 |
|
|---|
| 1166 | // Compact the coefficients for improved cache efficiency.
|
|---|
| 1167 | yg = shtns->ykm_dct[im];
|
|---|
| 1168 | for (it=0; it<= KMAX; it+=2) {
|
|---|
| 1169 | l = (it < m) ? m : it-(m&1);
|
|---|
| 1170 | while (l<=LMAX) {
|
|---|
| 1171 | yg[0] = yk[(it/2)*(LMAX+1-m) + (l-m)];
|
|---|
| 1172 | l++; yg++;
|
|---|
| 1173 | }
|
|---|
| 1174 | if ((m==0) && ((LMAX & 1) == 0)) { yg[0] = 0; yg++; } // SSE2 padding.
|
|---|
| 1175 | }
|
|---|
| 1176 | if (MMAX==0) for (int j=0; j<3; j++) yg[j] = 0; // allow some overflow.
|
|---|
| 1177 | if (vector) {
|
|---|
| 1178 | dyg = shtns->dykm_dct[im];
|
|---|
| 1179 | for (it=0; it<= KMAX; it+=2) {
|
|---|
| 1180 | l = (it-2 < m) ? m : it-2+(m&1);
|
|---|
| 1181 | while (l<=LMAX) {
|
|---|
| 1182 | dyg[0].t = dyk[(it/2)*(LMAX+1-m) + (l-m)].t;
|
|---|
| 1183 | dyg[0].p = dyk[(it/2)*(LMAX+1-m) + (l-m)].p;
|
|---|
| 1184 | l++; dyg++;
|
|---|
| 1185 | }
|
|---|
| 1186 | }
|
|---|
| 1187 | if (im == 0) { // compact and reorder m=0 dylm because .p = 0 :
|
|---|
| 1188 | dyg = shtns->dykm_dct[im];
|
|---|
| 1189 | yg = (double *) shtns->dykm_dct[im];
|
|---|
| 1190 | for (it=0; it<= KMAX; it+=2) {
|
|---|
| 1191 | dyg++;
|
|---|
| 1192 | for (l=it-1; l<=LMAX; l++) {
|
|---|
| 1193 | if (l>0) {
|
|---|
| 1194 | yg[0] = dyg[0].t;
|
|---|
| 1195 | yg++; dyg++;
|
|---|
| 1196 | }
|
|---|
| 1197 | }
|
|---|
| 1198 | if (LMAX&1) { // padding for SSE2 alignement.
|
|---|
| 1199 | yg[0] = 0.0; yg++;
|
|---|
| 1200 | }
|
|---|
| 1201 | }
|
|---|
| 1202 | }
|
|---|
| 1203 | }
|
|---|
| 1204 | }
|
|---|
| 1205 |
|
|---|
| 1206 | // compact yk to zlm_dct0
|
|---|
| 1207 | if ((m0==0)&&(analysis)) {
|
|---|
| 1208 | long int klim = (LMAX * SHT_NL_ORDER) + 2; // max k needed for nl-terms...
|
|---|
| 1209 | klim = (klim/2)*2; // must be even...
|
|---|
| 1210 | if (klim > 2*NLAT_2) klim = 2*NLAT_2; // but no more than 2*NLAT_2.
|
|---|
| 1211 | shtns->klim = klim; // store for use in codelets.
|
|---|
| 1212 | yg = shtns->zlm_dct0;
|
|---|
| 1213 | for (l=0; l<=LMAX; l+=2) {
|
|---|
| 1214 | for (it=l; it<klim; it++) { // for m=0, zl coeff with i<l are zeros.
|
|---|
| 1215 | *yg = yk0[it];
|
|---|
| 1216 | yg++;
|
|---|
| 1217 | }
|
|---|
| 1218 | yk0 += 2*NLAT_2;
|
|---|
| 1219 | }
|
|---|
| 1220 | if (vector) {
|
|---|
| 1221 | yg = shtns->dzlm_dct0;
|
|---|
| 1222 | for (l=1; l<=LMAX; l+=2) {
|
|---|
| 1223 | for (it=l-1; it<klim; it++) { // for m=0, dzl coeff with i<l-1 are zeros.
|
|---|
| 1224 | *yg = dyk0[it];
|
|---|
| 1225 | yg++;
|
|---|
| 1226 | }
|
|---|
| 1227 | dyk0 += 2*NLAT_2;
|
|---|
| 1228 | }
|
|---|
| 1229 | }
|
|---|
| 1230 | free(yk0 - (2*NLAT_2)*(LMAX/2+1));
|
|---|
| 1231 | }
|
|---|
| 1232 |
|
|---|
| 1233 | free(dyk); free(yk);
|
|---|
| 1234 | }
|
|---|
| 1235 |
|
|---|
| 1236 | /// \internal Computes the matrices required for SH transform on a regular grid (with or without DCT).
|
|---|
| 1237 | /// \param analysis : 0 => synthesis only.
|
|---|
| 1238 | static void init_SH_dct(shtns_cfg shtns, int analysis)
|
|---|
| 1239 | {
|
|---|
| 1240 | fftw_plan dct, idct;
|
|---|
| 1241 | long int it,im,m,l;
|
|---|
| 1242 | long int sk, dsk;
|
|---|
| 1243 | double Z[2*NLAT_2] SSE;
|
|---|
| 1244 | double is1[NLAT]; // tabulate values for integrals.
|
|---|
| 1245 |
|
|---|
| 1246 | const long int marray_size = sizeof(void*)*(MMAX+1) + (MIN_ALIGNMENT-1);
|
|---|
| 1247 | const int vector = (shtns->dylm != NULL);
|
|---|
| 1248 | const int KMAX = LMAX+1;
|
|---|
| 1249 |
|
|---|
| 1250 | for(im=0, sk=0, dsk=0; im<=MMAX; im++) { // how much memory to allocate for ykm_dct ?
|
|---|
| 1251 | m = im*MRES;
|
|---|
| 1252 | for (it=0; it<= KMAX; it+=2) {
|
|---|
| 1253 | l = (it < m) ? m : it-(m&1);
|
|---|
| 1254 | sk += LMAX+1 - l;
|
|---|
| 1255 | if ((m==0) && ((LMAX & 1) ==0)) sk++; // SSE padding for m=0
|
|---|
| 1256 | }
|
|---|
| 1257 | for (it=0; it<= KMAX; it+=2) {
|
|---|
| 1258 | l = (it-2 < m) ? m : it-2+(m&1);
|
|---|
| 1259 | dsk += LMAX+1 - l;
|
|---|
| 1260 | }
|
|---|
| 1261 | }
|
|---|
| 1262 | if (MMAX == 0) sk+=3; // allow some overflow.
|
|---|
| 1263 | for (l=0, it=0; l<=LMAX; l+=2) // how much memory for zlm_dct0 ?
|
|---|
| 1264 | it += (2*NLAT_2 -l);
|
|---|
| 1265 | for (l=1, im=0; l<=LMAX; l+=2) // how much memory for dzlm_dct0 ?
|
|---|
| 1266 | im += (2*NLAT_2 -l+1);
|
|---|
| 1267 |
|
|---|
| 1268 | #if SHT_VERBOSE > 1
|
|---|
| 1269 | if (verbose>1) printf(" Memory used for Ykm_dct matrices = %.3f Mb\n",sizeof(double)*(sk + 2.*dsk + it)/(1024.*1024.));
|
|---|
| 1270 | #endif
|
|---|
| 1271 | shtns->ykm_dct = (double **) malloc( marray_size + sizeof(double)*sk );
|
|---|
| 1272 | shtns->ykm_dct[0] = (double *) PTR_ALIGN( shtns->ykm_dct + (MMAX+1) );
|
|---|
| 1273 | if (vector) {
|
|---|
| 1274 | shtns->dykm_dct = (struct DtDp **) malloc( marray_size + sizeof(struct DtDp)*dsk );
|
|---|
| 1275 | shtns->dykm_dct[0] = (struct DtDp *) PTR_ALIGN( shtns->dykm_dct + (MMAX+1) );
|
|---|
| 1276 | }
|
|---|
| 1277 | shtns->zlm_dct0 = (double *) VMALLOC( sizeof(double)* it );
|
|---|
| 1278 | if (vector) {
|
|---|
| 1279 | shtns->dzlm_dct0 = (double *) VMALLOC( sizeof(double)* im );
|
|---|
| 1280 | }
|
|---|
| 1281 | for (im=0; im<MMAX; im++) {
|
|---|
| 1282 | m = im*MRES;
|
|---|
| 1283 | for (it=0, sk=0; it<= KMAX; it+=2) {
|
|---|
| 1284 | l = (it < m) ? m : it-(m&1);
|
|---|
| 1285 | sk += LMAX+1 - l;
|
|---|
| 1286 | if ((m==0) && ((LMAX & 1) ==0)) sk++; // SSE padding for m=0
|
|---|
| 1287 | }
|
|---|
| 1288 | for (it=0, dsk=0; it<= KMAX; it+=2) {
|
|---|
| 1289 | l = (it-2 < m) ? m : it-2+(m&1);
|
|---|
| 1290 | dsk += LMAX+1 - l;
|
|---|
| 1291 | }
|
|---|
| 1292 | shtns->ykm_dct[im+1] = shtns->ykm_dct[im] + sk;
|
|---|
| 1293 | if (vector) shtns->dykm_dct[im+1] = shtns->dykm_dct[im] + dsk;
|
|---|
| 1294 | }
|
|---|
| 1295 |
|
|---|
| 1296 | #if SHT_VERBOSE > 1
|
|---|
| 1297 | ticks tik0, tik1;
|
|---|
| 1298 | tik0 = getticks();
|
|---|
| 1299 | #endif
|
|---|
| 1300 |
|
|---|
| 1301 | #ifdef OMP_FFTW
|
|---|
| 1302 | fftw_plan_with_nthreads(1);
|
|---|
| 1303 | #endif
|
|---|
| 1304 | dct = fftw_plan_r2r_1d( 2*NLAT_2, Z, Z, FFTW_REDFT10, FFTW_MEASURE ); // quick and dirty dct.
|
|---|
| 1305 | idct = fftw_plan_r2r_1d( 2*NLAT_2, Z, Z, FFTW_REDFT01, FFTW_MEASURE ); // quick and dirty idct.
|
|---|
| 1306 |
|
|---|
| 1307 | init_SH_synth(shtns);
|
|---|
| 1308 |
|
|---|
| 1309 | // precomputation for scalar product of Chebychev polynomials.
|
|---|
| 1310 | for(it=0; it<NLAT; it++)
|
|---|
| 1311 | is1[it] = 1./(1. - 4.*it*it);
|
|---|
| 1312 |
|
|---|
| 1313 | #ifdef _OPENMP
|
|---|
| 1314 | #pragma omp parallel
|
|---|
| 1315 | {
|
|---|
| 1316 | int n=omp_get_num_threads();
|
|---|
| 1317 | int k=omp_get_thread_num();
|
|---|
| 1318 | init_SH_dct_m(shtns, is1, dct, idct, analysis, k, n);
|
|---|
| 1319 | }
|
|---|
| 1320 | #else
|
|---|
| 1321 | init_SH_dct_m(shtns, is1, dct, idct, analysis, 0, 1);
|
|---|
| 1322 | #endif
|
|---|
| 1323 |
|
|---|
| 1324 | #if SHT_VERBOSE > 1
|
|---|
| 1325 | tik1 = getticks();
|
|---|
| 1326 | if (verbose>1) printf("\n ticks : %.3f\n", elapsed(tik1,tik0)/(NLM*NLAT*(MMAX+1)));
|
|---|
| 1327 | #endif
|
|---|
| 1328 | fftw_destroy_plan(idct); fftw_destroy_plan(dct);
|
|---|
| 1329 | }
|
|---|
| 1330 |
|
|---|
| 1331 | #endif /* SHTNS_DCT */
|
|---|
| 1332 |
|
|---|
| 1333 | /// \internal Generate a gauss grid (including weights)
|
|---|
| 1334 | static void grid_gauss(shtns_cfg shtns, double latdir)
|
|---|
| 1335 | {
|
|---|
| 1336 | long int it;
|
|---|
| 1337 | real iylm_fft_norm;
|
|---|
| 1338 | real xg[NLAT], wgl[NLAT]; // gauss points and weights.
|
|---|
| 1339 | const int overflow = 8*VSIZE2-1;
|
|---|
| 1340 |
|
|---|
| 1341 | shtns->grid = GRID_GAUSS;
|
|---|
| 1342 | shtns->wg = VMALLOC((NLAT_2 +overflow) * sizeof(double)); // gauss weights, double precision.
|
|---|
| 1343 |
|
|---|
| 1344 | iylm_fft_norm = 1.0; // FFT/SHT normalization for zlm (4pi normalized)
|
|---|
| 1345 | if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) iylm_fft_norm = 4*M_PIl; // FFT/SHT normalization for zlm (orthonormalized)
|
|---|
| 1346 | iylm_fft_norm /= (2*NPHI);
|
|---|
| 1347 | #if SHT_VERBOSE > 0
|
|---|
| 1348 | if (verbose) {
|
|---|
| 1349 | printf(" => using Gauss nodes\n");
|
|---|
| 1350 | if (2*NLAT <= (SHT_NL_ORDER +1)*LMAX) printf(" !! Warning : Gauss-Legendre anti-aliasing condition 2*Nlat > %d*Lmax is not met.\n",SHT_NL_ORDER+1);
|
|---|
| 1351 | }
|
|---|
| 1352 | #endif
|
|---|
| 1353 | gauss_nodes(xg,wgl,NLAT); // generate gauss nodes and weights : ct = ]1,-1[ = cos(theta)
|
|---|
| 1354 | for (it=0; it<NLAT; it++) {
|
|---|
| 1355 | shtns->ct[it] = latdir * xg[it];
|
|---|
| 1356 | shtns->st[it] = SQRT((1.-xg[it])*(1.+xg[it]));
|
|---|
| 1357 | shtns->st_1[it] = 1.0/SQRT((1.-xg[it])*(1.+xg[it]));
|
|---|
| 1358 | }
|
|---|
| 1359 | for (it=0; it<NLAT_2; it++)
|
|---|
| 1360 | shtns->wg[it] = wgl[it]*iylm_fft_norm; // faster double-precision computations.
|
|---|
| 1361 | if (NLAT & 1) { // odd NLAT : adjust weigth of middle point.
|
|---|
| 1362 | shtns->wg[NLAT_2-1] *= 0.5;
|
|---|
| 1363 | }
|
|---|
| 1364 | for (it=NLAT_2; it < NLAT_2 +overflow; it++) shtns->wg[it] = 0.0; // padding for multi-way algorithm.
|
|---|
| 1365 |
|
|---|
| 1366 | #if SHT_VERBOSE > 1
|
|---|
| 1367 | if (verbose>1) {
|
|---|
| 1368 | printf(" NLAT=%d, NLAT_2=%d\n",NLAT,NLAT_2);
|
|---|
| 1369 | // TEST if gauss points are ok.
|
|---|
| 1370 | double tmax = 0.0;
|
|---|
| 1371 | for (it = 0; it<NLAT_2; it++) {
|
|---|
| 1372 | double t = legendre_Pl(NLAT, shtns->ct[it]);
|
|---|
| 1373 | if (t>tmax) tmax = t;
|
|---|
| 1374 | // printf("i=%d, x=%12.12g, p=%12.12g\n",it,ct[it],t);
|
|---|
| 1375 | }
|
|---|
| 1376 | printf(" max zero at Gauss nodes for Pl[l=NLAT] : %g\n",tmax);
|
|---|
| 1377 | if (NLAT_2 < 100) {
|
|---|
| 1378 | printf(" Gauss nodes :");
|
|---|
| 1379 | for (it=0;it<NLAT_2; it++)
|
|---|
| 1380 | printf(" %g",shtns->ct[it]);
|
|---|
| 1381 | printf("\n");
|
|---|
| 1382 | }
|
|---|
| 1383 | }
|
|---|
| 1384 | #endif
|
|---|
| 1385 | }
|
|---|
| 1386 |
|
|---|
| 1387 | #ifdef SHTNS_DCT
|
|---|
| 1388 | /// \internal Generate an equi-spaced theta grid (Chebychev points, excluding poles) for Féjer-DCT SHT.
|
|---|
| 1389 | static void grid_dct(shtns_cfg shtns, double latdir)
|
|---|
| 1390 | {
|
|---|
| 1391 | long int it;
|
|---|
| 1392 |
|
|---|
| 1393 | shtns->grid = GRID_REGULAR;
|
|---|
| 1394 | #if SHT_VERBOSE > 0
|
|---|
| 1395 | if (verbose) {
|
|---|
| 1396 | printf(" => using equaly spaced nodes with DCT acceleration\n");
|
|---|
| 1397 | if (NLAT <= SHT_NL_ORDER *LMAX) printf(" !! Warning : DCT anti-aliasing condition Nlat > %d*Lmax is not met.\n",SHT_NL_ORDER);
|
|---|
| 1398 | if (NLAT != fft_int(NLAT,7)) printf(" !! Warning : Nlat is not optimal for FFTW !\n");
|
|---|
| 1399 | }
|
|---|
| 1400 | #endif
|
|---|
| 1401 | if (NLAT & 1) shtns_runerr("NLAT must be even (DCT)");
|
|---|
| 1402 | if (NLAT <= LMAX+1) shtns_runerr("NLAT should be at least LMAX+2 (DCT)");
|
|---|
| 1403 |
|
|---|
| 1404 | for (it=0; it<NLAT; it++) { // Chebychev points : equaly spaced but skipping poles.
|
|---|
| 1405 | real th = M_PIl; th = (th*(2*it+1))/(2*NLAT);
|
|---|
| 1406 | shtns->ct[it] = latdir * COS(th);
|
|---|
| 1407 | shtns->st[it] = SIN(th);
|
|---|
| 1408 | shtns->st_1[it] = 1.0/SIN(th);
|
|---|
| 1409 | }
|
|---|
| 1410 |
|
|---|
| 1411 | #if SHT_VERBOSE > 1
|
|---|
| 1412 | if (verbose>1) {
|
|---|
| 1413 | printf(" NLAT=%d, NLAT_2=%d\n",NLAT,NLAT_2);
|
|---|
| 1414 | double tmax = 0.0;
|
|---|
| 1415 | for (it=0;it<NLAT_2; it++) {
|
|---|
| 1416 | double ct = shtns->ct[it]; double st = shtns->st[it];
|
|---|
| 1417 | double t = fabs((ct*ct + st*st) -1.0);
|
|---|
| 1418 | if (t > tmax) tmax=t;
|
|---|
| 1419 | }
|
|---|
| 1420 | printf(" max st^2 + ct^2 -1 = %g\n",tmax);
|
|---|
| 1421 | if (NLAT_2 < 100) {
|
|---|
| 1422 | printf(" DCT nodes :");
|
|---|
| 1423 | for (it=0; it<NLAT_2; it++)
|
|---|
| 1424 | printf(" %g",shtns->ct[it]);
|
|---|
| 1425 | printf("\n");
|
|---|
| 1426 | }
|
|---|
| 1427 | }
|
|---|
| 1428 | #endif
|
|---|
| 1429 | }
|
|---|
| 1430 | #endif /* SHTNS_DCT */
|
|---|
| 1431 |
|
|---|
| 1432 | /// \internal Generate an equi-spaced theta grid including the poles, for synthesis only.
|
|---|
| 1433 | static void grid_equal_polar(shtns_cfg shtns, double latdir)
|
|---|
| 1434 | {
|
|---|
| 1435 | long int j;
|
|---|
| 1436 |
|
|---|
| 1437 | shtns->grid = GRID_POLES;
|
|---|
| 1438 | #if SHT_VERBOSE > 0
|
|---|
| 1439 | if (verbose) printf(" => using Equaly Spaced Nodes including poles\n");
|
|---|
| 1440 | #endif
|
|---|
| 1441 | // cos theta of latidunal points (equaly spaced in theta)
|
|---|
| 1442 | double f = M_PIl/(NLAT-1);
|
|---|
| 1443 | for (j=0; j<NLAT; j++) {
|
|---|
| 1444 | shtns->ct[j] = latdir * cos(f*j);
|
|---|
| 1445 | shtns->st[j] = sin(f*j);
|
|---|
| 1446 | shtns->st_1[j] = 1.0/sin(f*j);
|
|---|
| 1447 | }
|
|---|
| 1448 | #if SHT_VERBOSE > 0
|
|---|
| 1449 | if (verbose) printf(" !! Warning : only synthesis (inverse transform) supported for this grid !\n");
|
|---|
| 1450 | #endif
|
|---|
| 1451 | }
|
|---|
| 1452 |
|
|---|
| 1453 |
|
|---|
| 1454 | /* TEST AND TIMING FUNCTIONS */
|
|---|
| 1455 |
|
|---|
| 1456 | /// \internal return the max error for a back-and-forth SHT transform.
|
|---|
| 1457 | /// this function is used to internally measure the accuracy.
|
|---|
| 1458 | double SHT_error(shtns_cfg shtns, int vector)
|
|---|
| 1459 | {
|
|---|
| 1460 | cplx *Tlm0=0, *Slm0=0, *Tlm=0, *Slm=0;
|
|---|
| 1461 | double *Sh=0, *Th=0;
|
|---|
| 1462 | double t, tmax, n2, err;
|
|---|
| 1463 | long int i, jj, nlm_cplx;
|
|---|
| 1464 |
|
|---|
| 1465 | srand( time(NULL) ); // init random numbers.
|
|---|
| 1466 |
|
|---|
| 1467 | Slm0 = (cplx *) VMALLOC(sizeof(cplx)* NLM);
|
|---|
| 1468 | Slm = (cplx *) VMALLOC(sizeof(cplx)* NLM);
|
|---|
| 1469 | Sh = (double *) VMALLOC( NSPAT_ALLOC(shtns) * sizeof(double) );
|
|---|
| 1470 | if ((Sh==0) || (Slm==0) || (Slm0==0)) shtns_runerr("not enough memory.");
|
|---|
| 1471 | if (vector) {
|
|---|
| 1472 | Tlm0 = (cplx *) VMALLOC(sizeof(cplx)* NLM);
|
|---|
| 1473 | Tlm = (cplx *) VMALLOC(sizeof(cplx)* NLM);
|
|---|
| 1474 | Th = (double *) VMALLOC( NSPAT_ALLOC(shtns) * sizeof(double) );
|
|---|
| 1475 | if ((Th==0) || (Tlm==0) || (Tlm0==0)) vector=0;
|
|---|
| 1476 | }
|
|---|
| 1477 |
|
|---|
| 1478 | // m = nphi/2 is also real if nphi is even.
|
|---|
| 1479 | nlm_cplx = ( MMAX*2 == NPHI ) ? LiM(shtns, MRES*MMAX,MMAX) : NLM;
|
|---|
| 1480 | t = 1.0 / (RAND_MAX/2);
|
|---|
| 1481 | for (i=0; i<NLM; i++) {
|
|---|
| 1482 | if ((i<=LMAX)||(i>=nlm_cplx)) { // m=0 or m*2=nphi : real random data
|
|---|
| 1483 | Slm0[i] = t*((double) (rand() - RAND_MAX/2));
|
|---|
| 1484 | if (vector) Tlm0[i] = t*((double) (rand() - RAND_MAX/2));
|
|---|
| 1485 | } else { // m>0 : complex random data
|
|---|
| 1486 | Slm0[i] = t*((double) (rand() - RAND_MAX/2)) + I*t*((double) (rand() - RAND_MAX/2));
|
|---|
| 1487 | if (vector) Tlm0[i] = t*((double) (rand() - RAND_MAX/2)) + I*t*((double) (rand() - RAND_MAX/2));
|
|---|
| 1488 | }
|
|---|
| 1489 | }
|
|---|
| 1490 |
|
|---|
| 1491 | SH_to_spat(shtns, Slm0,Sh); // scalar SHT
|
|---|
| 1492 | spat_to_SH(shtns, Sh, Slm);
|
|---|
| 1493 | for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
|
|---|
| 1494 | t = cabs(Slm[i] - Slm0[i]);
|
|---|
| 1495 | n2 += t*t;
|
|---|
| 1496 | if (t>tmax) { tmax = t; jj = i; }
|
|---|
| 1497 | }
|
|---|
| 1498 | err = tmax;
|
|---|
| 1499 | #if SHT_VERBOSE > 1
|
|---|
| 1500 | if (verbose>1) printf(" scalar SH - poloidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
|
|---|
| 1501 | #endif
|
|---|
| 1502 |
|
|---|
| 1503 | if (vector) {
|
|---|
| 1504 | Slm0[0] = 0.0; Tlm0[0] = 0.0; // l=0, m=0 n'a pas de signification sph/tor
|
|---|
| 1505 | SHsphtor_to_spat(shtns, Slm0, Tlm0, Sh, Th); // vector SHT
|
|---|
| 1506 | spat_to_SHsphtor(shtns, Sh, Th, Slm, Tlm);
|
|---|
| 1507 | for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
|
|---|
| 1508 | t = cabs(Slm[i] - Slm0[i]);
|
|---|
| 1509 | n2 += t*t;
|
|---|
| 1510 | if (t>tmax) { tmax = t; jj = i; }
|
|---|
| 1511 | }
|
|---|
| 1512 | if (tmax > err) err = tmax;
|
|---|
| 1513 | #if SHT_VERBOSE > 1
|
|---|
| 1514 | if (verbose>1) printf(" vector SH - spheroidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
|
|---|
| 1515 | #endif
|
|---|
| 1516 | for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
|
|---|
| 1517 | t = cabs(Tlm[i] - Tlm0[i]);
|
|---|
| 1518 | n2 += t*t;
|
|---|
| 1519 | if (t>tmax) { tmax = t; jj = i; }
|
|---|
| 1520 | }
|
|---|
| 1521 | if (tmax > err) err = tmax;
|
|---|
| 1522 | #if SHT_VERBOSE > 1
|
|---|
| 1523 | if (verbose>1) printf(" - toroidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
|
|---|
| 1524 | #endif
|
|---|
| 1525 | }
|
|---|
| 1526 |
|
|---|
| 1527 | if (Th) VFREE(Th); if (Tlm) VFREE(Tlm); if (Tlm0) VFREE(Tlm0);
|
|---|
| 1528 | VFREE(Sh); VFREE(Slm); VFREE(Slm0);
|
|---|
| 1529 | return(err); // return max error.
|
|---|
| 1530 | }
|
|---|
| 1531 |
|
|---|
| 1532 |
|
|---|
| 1533 | #if SHT_VERBOSE == 1
|
|---|
| 1534 | #define PRINT_DOT if (verbose>=1) { printf("."); fflush(stdout); }
|
|---|
| 1535 | #else
|
|---|
| 1536 | #define PRINT_DOT (0);
|
|---|
| 1537 | #endif
|
|---|
| 1538 |
|
|---|
| 1539 | /// \internal measure time used for a transform function
|
|---|
| 1540 | static double get_time(shtns_cfg shtns, int nloop, int npar, char* name, void *fptr, void *i1, void *i2, void *i3, void *o1, void *o2, void *o3, int l)
|
|---|
| 1541 | {
|
|---|
| 1542 | double t;
|
|---|
| 1543 | int i;
|
|---|
| 1544 | ticks tik0, tik1;
|
|---|
| 1545 |
|
|---|
| 1546 | if (fptr == NULL) return(0.0);
|
|---|
| 1547 |
|
|---|
| 1548 | tik1 = getticks();
|
|---|
| 1549 | for (i=0; i<nloop; i++) {
|
|---|
| 1550 | switch(npar) {
|
|---|
| 1551 | case 2: (*(pf2l)fptr)(shtns, i1,o1, l); break; // l may be discarded.
|
|---|
| 1552 | case 3: (*(pf3l)fptr)(shtns, i1,o1,o2, l); break;
|
|---|
| 1553 | case 4: (*(pf4l)fptr)(shtns, i1,i2,o1,o2, l); break;
|
|---|
| 1554 | default: (*(pf6l)fptr)(shtns, i1,i2,i3, o1,o2,o3, l); break;
|
|---|
| 1555 | }
|
|---|
| 1556 | if (i==0) tik0 = getticks();
|
|---|
| 1557 | }
|
|---|
| 1558 | if (nloop == 1) {
|
|---|
| 1559 | t = elapsed(tik0, tik1);
|
|---|
| 1560 | } else {
|
|---|
| 1561 | tik1 = getticks();
|
|---|
| 1562 | t = elapsed(tik1, tik0)/(nloop-1); // discard first iteration.
|
|---|
| 1563 | }
|
|---|
| 1564 | #if SHT_VERBOSE > 1
|
|---|
| 1565 | if (verbose>1) { printf(" t(%s) = %.3g",name,t); fflush(stdout); }
|
|---|
| 1566 | #endif
|
|---|
| 1567 | return t;
|
|---|
| 1568 | }
|
|---|
| 1569 |
|
|---|
| 1570 |
|
|---|
| 1571 | /// \internal choose fastest between on-the-fly and gauss algorithms.
|
|---|
| 1572 | /// *nlp is the number of loops. If zero, it is set to a good value.
|
|---|
| 1573 | /// on_the_fly : 1 = skip all memory algorithm. 0 = include memory and on-the-fly. -1 = test only DCT.
|
|---|
| 1574 | /// returns time without dct / best time with dct (or 0 if no dct available).
|
|---|
| 1575 | static double choose_best_sht(shtns_cfg shtns, int* nlp, int vector, int dct_mtr)
|
|---|
| 1576 | {
|
|---|
| 1577 | cplx *Qlm=0, *Slm=0, *Tlm=0;
|
|---|
| 1578 | double *Qh=0, *Sh=0, *Th=0;
|
|---|
| 1579 | int m, i, i0, minc, nloop, alg_end;
|
|---|
| 1580 | int typ_lim = SHT_NTYP; // time every type.
|
|---|
| 1581 | double t0, t, tt, r;
|
|---|
| 1582 | double tdct, tnodct;
|
|---|
| 1583 | clock_t tcpu;
|
|---|
| 1584 | int on_the_fly_only = (shtns->ylm == NULL); // only on-the-fly.
|
|---|
| 1585 | int otf_analys = (shtns->wg != NULL); // on-the-fly analysis supported.
|
|---|
| 1586 |
|
|---|
| 1587 | if (NLAT < VSIZE2*4) return(0.0); // on-the-fly not possible for NLAT_2 < 2*NWAY (overflow) and DCT not efficient for low NLAT.
|
|---|
| 1588 | if ((dct_mtr != 0) && (shtns->ykm_dct == NULL)) return(0.0); // no dct available : do nothing.
|
|---|
| 1589 |
|
|---|
| 1590 | size_t nspat = sizeof(double) * NSPAT_ALLOC(shtns);
|
|---|
| 1591 | size_t nspec = sizeof(cplx)* NLM;
|
|---|
| 1592 | if (nspec>nspat) nspat=nspec;
|
|---|
| 1593 | Sh = (double *) VMALLOC(nspat); Slm = (cplx *) VMALLOC(nspec);
|
|---|
| 1594 | if ((Sh==0) || (Slm==0)) shtns_runerr("not enough memory.");
|
|---|
| 1595 | if (vector) {
|
|---|
| 1596 | Th = (double *) VMALLOC(nspat); Qh = (double *) VMALLOC(nspat);
|
|---|
| 1597 | Tlm = (cplx *) VMALLOC(nspec); Qlm = (cplx *) VMALLOC(nspec);
|
|---|
| 1598 | if ( (Th==0) || (Qh==0) || (Tlm==0) || (Qlm==0) ) vector = 0;
|
|---|
| 1599 | }
|
|---|
| 1600 |
|
|---|
| 1601 | for (i=0;i<NLM;i++) {
|
|---|
| 1602 | int l = shtns->li[i];
|
|---|
| 1603 | Slm[i] = shtns->l_2[l] + 0.5*I*shtns->l_2[l];
|
|---|
| 1604 | if (vector) {
|
|---|
| 1605 | Tlm[i] = 0.5*shtns->l_2[l] + I*shtns->l_2[l];
|
|---|
| 1606 | Qlm[i] = 3*shtns->l_2[l] + 2*I*shtns->l_2[l];
|
|---|
| 1607 | }
|
|---|
| 1608 | }
|
|---|
| 1609 |
|
|---|
| 1610 | #if SHT_VERBOSE > 0
|
|---|
| 1611 | if (verbose) {
|
|---|
| 1612 | if (dct_mtr != 0) printf(" finding optimal m-truncation for DCT synthesis");
|
|---|
| 1613 | else printf(" finding optimal algorithm");
|
|---|
| 1614 | fflush(stdout);
|
|---|
| 1615 | }
|
|---|
| 1616 | #endif
|
|---|
| 1617 |
|
|---|
| 1618 | if (*nlp <= 0) {
|
|---|
| 1619 | // find good nloop by requiring less than 3% difference between 2 consecutive timings.
|
|---|
| 1620 | m=0; nloop = 1; // number of loops to get timings.
|
|---|
| 1621 | r = 0.0; tt = 1.0;
|
|---|
| 1622 | do {
|
|---|
| 1623 | if ((r > 0.03)||(tt<0.1)) {
|
|---|
| 1624 | m = 0; nloop *= 3;
|
|---|
| 1625 | } else m++;
|
|---|
| 1626 | tcpu = clock();
|
|---|
| 1627 | t0 = get_time(shtns, nloop, 2, "", sht_func[SHT_STD][SHT_FLY2][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
|
|---|
| 1628 | tcpu = clock() - tcpu; tt = 1.e-6 * tcpu;
|
|---|
| 1629 | if (tt >= SHT_TIME_LIMIT) break; // we should not exceed 1 second
|
|---|
| 1630 | t = get_time(shtns, nloop, 2, "", sht_func[SHT_STD][SHT_FLY2][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
|
|---|
| 1631 | r = fabs(2.0*(t-t0)/(t+t0));
|
|---|
| 1632 | #if SHT_VERBOSE > 1
|
|---|
| 1633 | if (verbose>1) printf(", nloop=%d, r=%g, m=%d (real time = %g s)\n",nloop,r,m,tt);
|
|---|
| 1634 | if (tt >= 0.01) break; // faster timing in debug mode.
|
|---|
| 1635 | #endif
|
|---|
| 1636 | PRINT_DOT
|
|---|
| 1637 | } while((nloop<10000)&&(m < 3));
|
|---|
| 1638 | *nlp = nloop;
|
|---|
| 1639 | } else {
|
|---|
| 1640 | nloop = *nlp;
|
|---|
| 1641 | }
|
|---|
| 1642 | #if SHT_VERBOSE > 1
|
|---|
| 1643 | if (verbose>1) printf(" => nloop=%d (takes %g s)\n",nloop, tt);
|
|---|
| 1644 | #endif
|
|---|
| 1645 | if (vector == 0) typ_lim = SHT_TYP_VSY; // time only scalar transforms.
|
|---|
| 1646 | // if (tt > 3.0) typ_lim = SHT_TYP_VSY; // time only scalar transforms.
|
|---|
| 1647 | // if (tt > 10.0) goto done; // timing this will be too slow...
|
|---|
| 1648 |
|
|---|
| 1649 | int ityp = 0; do {
|
|---|
| 1650 | if ((dct_mtr != 0) && (ityp >= 4)) break; // dct !=0 : only scalar and vector.
|
|---|
| 1651 | if (ityp == 2) nloop = (nloop+1)/2; // scalar ar done.
|
|---|
| 1652 | t0 = 1e100;
|
|---|
| 1653 | i0 = 0;
|
|---|
| 1654 | if (MTR_DCT < 0) i0 = SHT_MEM; // skip dct.
|
|---|
| 1655 | if (on_the_fly_only) i0 = SHT_SV; // only on-the-fly (SV is then also on-the-fly)
|
|---|
| 1656 | alg_end = SHT_NALG;
|
|---|
| 1657 | if (shtns->nthreads <= 1) alg_end = SHT_OMP1; // no OpenMP with 1 thread.
|
|---|
| 1658 | if ((ityp&1) && (otf_analys == 0)) alg_end = SHT_FLY1; // no on-the-fly analysis for regular grid.
|
|---|
| 1659 | for (i=i0, m=0; i<alg_end; i++) {
|
|---|
| 1660 | if (sht_func[0][i][ityp] != NULL) m++; // count number of algos
|
|---|
| 1661 | }
|
|---|
| 1662 | if (m >= 2) { // don't time if there is only 1 algo !
|
|---|
| 1663 | #if SHT_VERBOSE > 1
|
|---|
| 1664 | if (verbose>1) { printf("finding best %s ...",sht_type[ityp]); fflush(stdout); }
|
|---|
| 1665 | #endif
|
|---|
| 1666 | i = i0-1; i0 = -1;
|
|---|
| 1667 | while (++i < alg_end) {
|
|---|
| 1668 | void *pf = sht_func[0][i][ityp];
|
|---|
| 1669 | if (pf != NULL) {
|
|---|
| 1670 | if (ityp&1) { // analysis
|
|---|
| 1671 | t = get_time(shtns, nloop, sht_npar[ityp], sht_name[i], pf, Sh, Th, Qh, Slm, Tlm, Qlm, LMAX);
|
|---|
| 1672 | } else {
|
|---|
| 1673 | t = get_time(shtns, nloop, sht_npar[ityp], sht_name[i], pf, Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
|
|---|
| 1674 | }
|
|---|
| 1675 | if (i < SHT_FLY1) t *= 1.03; // 3% penality for memory based transforms.
|
|---|
| 1676 | #ifdef _OPENMP
|
|---|
| 1677 | if ((i >= SHT_OMP1)||(i == SHT_SV)) t *= 1.3; // 30% penality for openmp transforms.
|
|---|
| 1678 | #endif
|
|---|
| 1679 | if (t < t0) { i0 = i; t0 = t; PRINT_VERB("*"); }
|
|---|
| 1680 | }
|
|---|
| 1681 | }
|
|---|
| 1682 | if (i0 >= 0) {
|
|---|
| 1683 | for (int iv=0; iv<SHT_NVAR; iv++) {
|
|---|
| 1684 | if (sht_func[iv][i0][ityp]) shtns->ftable[iv][ityp] = sht_func[iv][i0][ityp];
|
|---|
| 1685 | if (ityp == 4) { // only one timing for both gradients variants.
|
|---|
| 1686 | if (sht_func[iv][i0][ityp+1]) shtns->ftable[iv][ityp+1] = sht_func[iv][i0][ityp+1];
|
|---|
| 1687 | }
|
|---|
| 1688 | }
|
|---|
| 1689 | PRINT_DOT
|
|---|
| 1690 | #if SHT_VERBOSE > 1
|
|---|
| 1691 | if (verbose>1) printf(" => %s\n",sht_name[i0]);
|
|---|
| 1692 | #endif
|
|---|
| 1693 | }
|
|---|
| 1694 | }
|
|---|
| 1695 | if (ityp == 4) ityp++; // skip second gradient
|
|---|
| 1696 | } while(++ityp < typ_lim);
|
|---|
| 1697 |
|
|---|
| 1698 | #ifdef SHTNS_DCT
|
|---|
| 1699 | if (dct_mtr != 0) { // find the best DCT timings...
|
|---|
| 1700 | #if SHT_VERBOSE > 1
|
|---|
| 1701 | if (verbose>1) { printf("finding best mtr_dct ..."); fflush(stdout); }
|
|---|
| 1702 | #endif
|
|---|
| 1703 | minc = MMAX/20 + 1; // don't test every single m.
|
|---|
| 1704 | m = -1; i = -1; t0 = 0.0; // reference = no dct.
|
|---|
| 1705 | if (sht_func[SHT_STD][SHT_DCT][SHT_TYP_SSY] != NULL)
|
|---|
| 1706 | t0 += get_time(shtns, *nlp, 2, "s", shtns->ftable[SHT_STD][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
|
|---|
| 1707 | if ( (sht_func[SHT_STD][SHT_DCT][SHT_TYP_VSY] != NULL) && (vector) )
|
|---|
| 1708 | t0 += get_time(shtns, nloop, 4, "v", shtns->ftable[SHT_STD][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
|
|---|
| 1709 | tnodct = t0;
|
|---|
| 1710 | for (m=0; m<=MMAX; m+=minc) {
|
|---|
| 1711 | #if SHT_VERBOSE > 1
|
|---|
| 1712 | if (verbose>1) printf("\n\tm=%d :",m);
|
|---|
| 1713 | #endif
|
|---|
| 1714 | if (Set_MTR_DCT(shtns, m) >= 0) {
|
|---|
| 1715 | t = get_time(shtns, *nlp, 2, "sdct", sht_func[SHT_STD][SHT_DCT][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
|
|---|
| 1716 | if (vector)
|
|---|
| 1717 | t += get_time(shtns, nloop, 4, "vdct", sht_func[SHT_STD][SHT_DCT][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
|
|---|
| 1718 | if (t < t0) { t0 = t; i = m; PRINT_VERB("*"); }
|
|---|
| 1719 | PRINT_DOT
|
|---|
| 1720 | }
|
|---|
| 1721 | }
|
|---|
| 1722 | tdct = t0;
|
|---|
| 1723 | Set_MTR_DCT(shtns, i); // the best DCT is chosen.
|
|---|
| 1724 | #if SHT_VERBOSE > 0
|
|---|
| 1725 | if (verbose) printf(" mtr_dct=%d (%.1f%% performance gain)", MTR_DCT*MRES, 100.*(tnodct/tdct-1.));
|
|---|
| 1726 | #endif
|
|---|
| 1727 | }
|
|---|
| 1728 | #endif
|
|---|
| 1729 |
|
|---|
| 1730 | done:
|
|---|
| 1731 | #if SHT_VERBOSE > 0
|
|---|
| 1732 | if (verbose) printf("\n");
|
|---|
| 1733 | #endif
|
|---|
| 1734 | if (Qlm) VFREE(Qlm); if (Tlm) VFREE(Tlm);
|
|---|
| 1735 | if (Qh) VFREE(Qh); if (Th) VFREE(Th);
|
|---|
| 1736 | if (Slm) VFREE(Slm); if (Sh) VFREE(Sh);
|
|---|
| 1737 | if (dct_mtr > 0) {
|
|---|
| 1738 | return(tnodct/tdct);
|
|---|
| 1739 | } else return(0.0);
|
|---|
| 1740 | }
|
|---|
| 1741 |
|
|---|
| 1742 |
|
|---|
| 1743 | void shtns_print_version() {
|
|---|
| 1744 | printf("[" PACKAGE_STRING "] built " __DATE__ ", " __TIME__ ", id: " _SIMD_NAME_ "\n");
|
|---|
| 1745 | }
|
|---|
| 1746 |
|
|---|
| 1747 | void fprint_ftable(FILE* fp, void* ftable[SHT_NVAR][SHT_NTYP])
|
|---|
| 1748 | {
|
|---|
| 1749 | for (int iv=0; iv<SHT_NVAR; iv++) {
|
|---|
| 1750 | fprintf(fp, "\n %4s:",sht_var[iv]);
|
|---|
| 1751 | void** f = ftable[iv];
|
|---|
| 1752 | for (int it=0; it<SHT_NTYP; it++) {
|
|---|
| 1753 | if (f[it] != NULL) {
|
|---|
| 1754 | for (int ia=0; ia<SHT_NALG; ia++)
|
|---|
| 1755 | if (sht_func[iv][ia][it] == f[it]) {
|
|---|
| 1756 | fprintf(fp, "%5s ",sht_name[ia]); break;
|
|---|
| 1757 | }
|
|---|
| 1758 | } else fprintf(fp, " none ");
|
|---|
| 1759 | }
|
|---|
| 1760 | }
|
|---|
| 1761 | }
|
|---|
| 1762 |
|
|---|
| 1763 | void shtns_print_cfg(shtns_cfg shtns)
|
|---|
| 1764 | {
|
|---|
| 1765 | printf("Lmax=%d, Mmax*Mres=%d, Mres=%d, Nlm=%d [%d threads, ",LMAX, MMAX*MRES, MRES, NLM, shtns->nthreads);
|
|---|
| 1766 | if (shtns->norm & SHT_REAL_NORM) printf("'real' norm, ");
|
|---|
| 1767 | if (shtns->norm & SHT_NO_CS_PHASE) printf("no Condon-Shortley phase, ");
|
|---|
| 1768 | if (SHT_NORM == sht_fourpi) printf("4.pi normalized]\n");
|
|---|
| 1769 | else if (SHT_NORM == sht_schmidt) printf("Schmidt semi-normalized]\n");
|
|---|
| 1770 | else printf("orthonormalized]\n");
|
|---|
| 1771 | if (shtns->ct == NULL) return; // no grid is set
|
|---|
| 1772 |
|
|---|
| 1773 | switch(shtns->grid) {
|
|---|
| 1774 | case GRID_GAUSS : printf("Gauss grid"); break;
|
|---|
| 1775 | case GRID_REGULAR : printf("Regular grid (mtr_dct=%d)",shtns->mtr_dct); break;
|
|---|
| 1776 | case GRID_POLES : printf("Regular grid including poles"); break;
|
|---|
| 1777 | default : printf("Unknown grid");
|
|---|
| 1778 | }
|
|---|
| 1779 | printf(" : Nlat=%d, Nphi=%d\n", NLAT, NPHI);
|
|---|
| 1780 | printf(" ");
|
|---|
| 1781 | for (int it=0; it<SHT_NTYP; it++)
|
|---|
| 1782 | printf("%5s ",sht_type[it]);
|
|---|
| 1783 | fprint_ftable(stdout, shtns->ftable);
|
|---|
| 1784 | printf("\n");
|
|---|
| 1785 | }
|
|---|
| 1786 |
|
|---|
| 1787 |
|
|---|
| 1788 | /// \internal saves config to a file for later restart.
|
|---|
| 1789 | int config_save(shtns_cfg shtns, int req_flags)
|
|---|
| 1790 | {
|
|---|
| 1791 | int err = 0;
|
|---|
| 1792 |
|
|---|
| 1793 | if (shtns->ct == NULL) return -1; // no grid set
|
|---|
| 1794 |
|
|---|
| 1795 | if ((shtns->nphi > 1)||(shtns->mtr_dct >= 0)) {
|
|---|
| 1796 | FILE* f = fopen("shtns_cfg_fftw","w");
|
|---|
| 1797 | if (f != NULL) {
|
|---|
| 1798 | fftw_export_wisdom_to_file(f);
|
|---|
| 1799 | fclose(f);
|
|---|
| 1800 | } else err -= 2;
|
|---|
| 1801 | }
|
|---|
| 1802 |
|
|---|
| 1803 | FILE *fcfg = fopen("shtns_cfg","a");
|
|---|
| 1804 | if (fcfg != NULL) {
|
|---|
| 1805 | fprintf(fcfg, "%s %s %d %d %d %d %d %d %d %d %d %d",PACKAGE_VERSION, _SIMD_NAME_, shtns->lmax, shtns->mmax, shtns->mres, shtns->nphi, shtns->nlat, shtns->grid, shtns->nthreads, req_flags, shtns->nlorder, shtns->mtr_dct);
|
|---|
| 1806 | fprint_ftable(fcfg, shtns->ftable);
|
|---|
| 1807 | fprintf(fcfg,"\n");
|
|---|
| 1808 | fclose(fcfg);
|
|---|
| 1809 | } else err -= 4;
|
|---|
| 1810 |
|
|---|
| 1811 | #if SHT_VERBOSE > 0
|
|---|
| 1812 | if (err < 0) fprintf(stderr,"! Warning ! SHTns could not save config\n");
|
|---|
| 1813 | #endif
|
|---|
| 1814 | return err;
|
|---|
| 1815 | }
|
|---|
| 1816 |
|
|---|
| 1817 | /// \internal try to load config from a file
|
|---|
| 1818 | int config_load(shtns_cfg shtns, int req_flags)
|
|---|
| 1819 | {
|
|---|
| 1820 | void* ft2[SHT_NVAR][SHT_NTYP]; // pointers to transform functions.
|
|---|
| 1821 | int lmax2, mmax2, mres2, nphi2, nlat2, grid2, nthreads2, req_flags2, nlorder2, mtr_dct2;
|
|---|
| 1822 | int found = 0;
|
|---|
| 1823 | char version[32], simd[8], alg[8];
|
|---|
| 1824 |
|
|---|
| 1825 | if (shtns->ct == NULL) return -1; // no grid set
|
|---|
| 1826 |
|
|---|
| 1827 | if ((req_flags & 255) == sht_quick_init) req_flags += sht_gauss - sht_quick_init; // quick_init uses gauss.
|
|---|
| 1828 |
|
|---|
| 1829 | FILE *fcfg = fopen("shtns_cfg","r");
|
|---|
| 1830 | if (fcfg != NULL) {
|
|---|
| 1831 | int i=0;
|
|---|
| 1832 | while(1) {
|
|---|
| 1833 | fscanf(fcfg, "%30s %8s %d %d %d %d %d %d %d %d %d %d",version, simd, &lmax2, &mmax2, &mres2, &nphi2, &nlat2, &grid2, &nthreads2, &req_flags2, &nlorder2, &mtr_dct2);
|
|---|
| 1834 | for (int iv=0; iv<SHT_NVAR; iv++) {
|
|---|
| 1835 | fscanf(fcfg, "%7s", alg);
|
|---|
| 1836 | for (int it=0; it<SHT_NTYP; it++) {
|
|---|
| 1837 | fscanf(fcfg, "%7s", alg),
|
|---|
| 1838 | ft2[iv][it] = 0;
|
|---|
| 1839 | for (int ia=0; ia<SHT_NALG; ia++) {
|
|---|
| 1840 | if (strcmp(alg, sht_name[ia]) == 0) {
|
|---|
| 1841 | ft2[iv][it] = sht_func[iv][ia][it];
|
|---|
| 1842 | break;
|
|---|
| 1843 | }
|
|---|
| 1844 | }
|
|---|
| 1845 | }
|
|---|
| 1846 | }
|
|---|
| 1847 | if (feof(fcfg)) break;
|
|---|
| 1848 | #ifndef SHTNS_DCT
|
|---|
| 1849 | if (mtr_dct2 <= 0)
|
|---|
| 1850 | #endif
|
|---|
| 1851 | if ((shtns->lmax == lmax2) && (shtns->mmax == mmax2) && (shtns->mres == mres2) && (shtns->nthreads == nthreads2) &&
|
|---|
| 1852 | (shtns->nphi == nphi2) && (shtns->nlat == nlat2) && (shtns->grid == grid2) && (req_flags == req_flags2) &&
|
|---|
| 1853 | (shtns->nlorder == nlorder2) && (strcmp(simd, _SIMD_NAME_)==0)) {
|
|---|
| 1854 | #if SHT_VERBOSE > 0
|
|---|
| 1855 | if (verbose > 0) printf(" + using saved config\n");
|
|---|
| 1856 | #endif
|
|---|
| 1857 | #if SHT_VERBOSE > 1
|
|---|
| 1858 | if (verbose > 1) {
|
|---|
| 1859 | fprint_ftable(stdout, ft2);
|
|---|
| 1860 | printf("\n");
|
|---|
| 1861 | }
|
|---|
| 1862 | #endif
|
|---|
| 1863 | #ifdef SHTNS_DCT
|
|---|
| 1864 | Set_MTR_DCT(shtns, mtr_dct2); // use loaded mtr_dct
|
|---|
| 1865 | #endif
|
|---|
| 1866 | for (int iv=0; iv<SHT_NVAR; iv++)
|
|---|
| 1867 | for (int it=0; it<SHT_NTYP; it++)
|
|---|
| 1868 | if (ft2[iv][it]) shtns->ftable[iv][it] = ft2[iv][it]; // accept only non-null pointer
|
|---|
| 1869 | found = 1;
|
|---|
| 1870 | break;
|
|---|
| 1871 | }
|
|---|
| 1872 | }
|
|---|
| 1873 | fclose(fcfg);
|
|---|
| 1874 | return found;
|
|---|
| 1875 | } else {
|
|---|
| 1876 | #if SHT_VERBOSE > 0
|
|---|
| 1877 | if (verbose) fprintf(stderr,"! Warning ! SHTns could not load config\n");
|
|---|
| 1878 | #endif
|
|---|
| 1879 | return -2; // file not found
|
|---|
| 1880 | }
|
|---|
| 1881 | }
|
|---|
| 1882 |
|
|---|
| 1883 | /// \internal returns 1 if val cannot fit in dest (unsigned)
|
|---|
| 1884 | #define IS_TOO_LARGE(val, dest) (sizeof(dest) >= sizeof(val)) ? 0 : ( ( val >= (1<<(8*sizeof(dest))) ) ? 1 : 0 )
|
|---|
| 1885 |
|
|---|
| 1886 | /// \internal returns the size that must be allocated for an shtns_info.
|
|---|
| 1887 | #define SIZEOF_SHTNS_INFO(mmax) ( sizeof(struct shtns_info) + (mmax+1)*( sizeof(int)+sizeof(unsigned short) ) )
|
|---|
| 1888 |
|
|---|
| 1889 | /* PUBLIC INITIALIZATION & DESTRUCTION */
|
|---|
| 1890 |
|
|---|
| 1891 | /** \addtogroup init Initialization functions.
|
|---|
| 1892 | */
|
|---|
| 1893 | //@{
|
|---|
| 1894 |
|
|---|
| 1895 | /*! This sets the description of spherical harmonic coefficients.
|
|---|
| 1896 | * It tells SHTns how to interpret spherical harmonic coefficient arrays, and it sets usefull arrays.
|
|---|
| 1897 | * Returns the configuration to be passed to subsequent transform functions, which is basicaly a pointer to a \ref shtns_info struct.
|
|---|
| 1898 | * \param lmax : maximum SH degree that we want to describe.
|
|---|
| 1899 | * \param mmax : number of azimutal wave numbers.
|
|---|
| 1900 | * \param mres : \c 2.pi/mres is the azimutal periodicity. \c mmax*mres is the maximum SH order.
|
|---|
| 1901 | * \param norm : define the normalization of the spherical harmonics (\ref shtns_norm)
|
|---|
| 1902 | * + optionaly disable Condon-Shortley phase (ex: \ref sht_schmidt | \ref SHT_NO_CS_PHASE)
|
|---|
| 1903 | * + optionaly use a 'real' normalization (ex: \ref sht_fourpi | \ref SHT_REAL_NORM)
|
|---|
| 1904 | */
|
|---|
| 1905 | shtns_cfg shtns_create(int lmax, int mmax, int mres, enum shtns_norm norm)
|
|---|
| 1906 | {
|
|---|
| 1907 | shtns_cfg shtns, s2;
|
|---|
| 1908 | int im, m, l, lm;
|
|---|
| 1909 | int with_cs_phase = 1; /// Condon-Shortley phase (-1)^m is used by default.
|
|---|
| 1910 | double mpos_renorm = 1.0; /// renormalization of m>0.
|
|---|
| 1911 | int larrays_ok = 0;
|
|---|
| 1912 | int legendre_ok = 0;
|
|---|
| 1913 | int l_2_ok = 0;
|
|---|
| 1914 |
|
|---|
| 1915 | // if (lmax < 1) shtns_runerr("lmax must be larger than 1");
|
|---|
| 1916 | if (lmax < 2) shtns_runerr("lmax must be at least 2");
|
|---|
| 1917 | if (IS_TOO_LARGE(lmax, shtns->lmax)) shtns_runerr("lmax too large");
|
|---|
| 1918 | if (mmax*mres > lmax) shtns_runerr("MMAX*MRES should not exceed LMAX");
|
|---|
| 1919 | if (mres <= 0) shtns_runerr("MRES must be > 0");
|
|---|
| 1920 |
|
|---|
| 1921 | // allocate new setup and initialize some variables (used as flags) :
|
|---|
| 1922 | shtns = malloc( SIZEOF_SHTNS_INFO(mmax) );
|
|---|
| 1923 | if (shtns == NULL) return shtns; // FAIL
|
|---|
| 1924 | {
|
|---|
| 1925 | void **p0 = (void**) &shtns->tm; // first pointer in struct.
|
|---|
| 1926 | void **p1 = (void**) &shtns->Y00_1; // first non-pointer.
|
|---|
| 1927 | while(p0 < p1) *p0++ = NULL; // write NULL to every pointer.
|
|---|
| 1928 | shtns->lmidx = (int*) (shtns + 1); // lmidx is stored at the end of the struct...
|
|---|
| 1929 | shtns->tm = (unsigned short*) (shtns->lmidx + (mmax+1)); // and tm just after.
|
|---|
| 1930 | shtns->ct = NULL; shtns->st = NULL;
|
|---|
| 1931 | shtns->nphi = 0; shtns->nlat = 0; shtns->nlat_2 = 0; shtns->nspat = 0; // public data
|
|---|
| 1932 | }
|
|---|
| 1933 |
|
|---|
| 1934 | // copy sizes.
|
|---|
| 1935 | shtns->norm = norm;
|
|---|
| 1936 | if (norm & SHT_NO_CS_PHASE)
|
|---|
| 1937 | with_cs_phase = 0;
|
|---|
| 1938 | if (norm & SHT_REAL_NORM)
|
|---|
| 1939 | mpos_renorm = 0.5; // normalization for 'real' spherical harmonics.
|
|---|
| 1940 |
|
|---|
| 1941 | shtns->mmax = mmax; shtns->mres = mres; shtns->lmax = lmax;
|
|---|
| 1942 | shtns->nlm = nlm_calc(lmax, mmax, mres);
|
|---|
| 1943 | shtns->nthreads = omp_threads;
|
|---|
| 1944 | if (omp_threads > mmax+1) shtns->nthreads = mmax+1; // limit the number of threads to mmax+1
|
|---|
| 1945 | #if SHT_VERBOSE > 0
|
|---|
| 1946 | if (verbose) {
|
|---|
| 1947 | shtns_print_version();
|
|---|
| 1948 | printf(" "); shtns_print_cfg(shtns);
|
|---|
| 1949 | }
|
|---|
| 1950 | #endif
|
|---|
| 1951 |
|
|---|
| 1952 | s2 = sht_data; // check if some data can be shared ...
|
|---|
| 1953 | while(s2 != NULL) {
|
|---|
| 1954 | if ((s2->mmax >= mmax) && (s2->mres == mres)) {
|
|---|
| 1955 | if (s2->lmax == lmax) { // we can reuse the l-related arrays (li + copy lmidx)
|
|---|
| 1956 | shtns->li = s2->li; shtns->mi = s2->mi;
|
|---|
| 1957 | for (im=0; im<=mmax; im++) shtns->lmidx[im] = s2->lmidx[im];
|
|---|
| 1958 | larrays_ok = 1;
|
|---|
| 1959 | }
|
|---|
| 1960 | if ( (s2->lmax >= lmax) && (s2->norm == norm) ) { // we can reuse the legendre tables.
|
|---|
| 1961 | shtns->alm = s2->alm; shtns->blm = s2->blm;
|
|---|
| 1962 | legendre_ok = 1;
|
|---|
| 1963 | }
|
|---|
| 1964 | }
|
|---|
| 1965 | if (s2->lmax >= lmax) { // we can reuse l_2
|
|---|
| 1966 | shtns->l_2 = s2->l_2;
|
|---|
| 1967 | l_2_ok = 1;
|
|---|
| 1968 | }
|
|---|
| 1969 | s2 = s2->next;
|
|---|
| 1970 | }
|
|---|
| 1971 | if (larrays_ok == 0) {
|
|---|
| 1972 | // alloc spectral arrays
|
|---|
| 1973 | shtns->li = (unsigned short *) malloc( 2*NLM*sizeof(unsigned short) ); // NLM defined at runtime.
|
|---|
| 1974 | shtns->mi = shtns->li + NLM;
|
|---|
| 1975 | for (im=0, lm=0; im<=MMAX; im++) { // init l-related arrays.
|
|---|
| 1976 | m = im*MRES;
|
|---|
| 1977 | shtns->lmidx[im] = lm -m; // virtual pointer for l=0
|
|---|
| 1978 | for (l=im*MRES;l<=LMAX;l++) {
|
|---|
| 1979 | shtns->li[lm] = l; shtns->mi[lm] = m;
|
|---|
| 1980 | lm++;
|
|---|
| 1981 | }
|
|---|
| 1982 | }
|
|---|
| 1983 | if (lm != NLM) shtns_runerr("unexpected error");
|
|---|
| 1984 | }
|
|---|
| 1985 | if (legendre_ok == 0) { // this quickly precomputes some values for the legendre recursion.
|
|---|
| 1986 | legendre_precomp(shtns, SHT_NORM, with_cs_phase, mpos_renorm);
|
|---|
| 1987 | }
|
|---|
| 1988 | if (l_2_ok == 0) {
|
|---|
| 1989 | shtns->l_2 = (double *) malloc( (LMAX+1)*sizeof(double) );
|
|---|
| 1990 | shtns->l_2[0] = 0.0; // undefined for l=0 => replace with 0.
|
|---|
| 1991 | real one = 1.0;
|
|---|
| 1992 | for (l=1; l<=LMAX; l++) shtns->l_2[l] = one/(l*(l+1));
|
|---|
| 1993 | }
|
|---|
| 1994 |
|
|---|
| 1995 | switch(SHT_NORM) {
|
|---|
| 1996 | case sht_schmidt:
|
|---|
| 1997 | shtns->Y00_1 = 1.0; shtns->Y10_ct = 1.0;
|
|---|
| 1998 | break;
|
|---|
| 1999 | case sht_fourpi:
|
|---|
| 2000 | shtns->Y00_1 = 1.0; shtns->Y10_ct = sqrt(1./3.);
|
|---|
| 2001 | break;
|
|---|
| 2002 | case sht_orthonormal:
|
|---|
| 2003 | default:
|
|---|
| 2004 | shtns->Y00_1 = sqrt(4.*M_PI); shtns->Y10_ct = sqrt(4.*M_PI/3.);
|
|---|
| 2005 | // Y11_st = sqrt(2.*M_PI/3.); // orthonormal : \f$ \sin\theta\cos\phi/(Y_1^1 + Y_1^{-1}) = -\sqrt{2 \pi /3} \f$
|
|---|
| 2006 | }
|
|---|
| 2007 | shtns->Y11_st = shtns->Y10_ct * sqrt(0.5/mpos_renorm);
|
|---|
| 2008 | if (with_cs_phase) shtns->Y11_st *= -1.0; // correct Condon-Shortley phase
|
|---|
| 2009 |
|
|---|
| 2010 | // save a pointer to this setup and return.
|
|---|
| 2011 | shtns->next = sht_data; // reference of previous setup (may be NULL).
|
|---|
| 2012 | sht_data = shtns; // keep track of new setup.
|
|---|
| 2013 | return(shtns);
|
|---|
| 2014 | }
|
|---|
| 2015 |
|
|---|
| 2016 | /// Copy a given config but allow a different (smaller) mmax and the possibility to enable/disable fft (beta).
|
|---|
| 2017 | shtns_cfg shtns_create_with_grid(shtns_cfg base, int mmax, int nofft)
|
|---|
| 2018 | {
|
|---|
| 2019 | shtns_cfg shtns;
|
|---|
| 2020 |
|
|---|
| 2021 | if (mmax > base->mmax) return (NULL); // fail if mmax larger than source config.
|
|---|
| 2022 |
|
|---|
| 2023 | shtns = malloc( SIZEOF_SHTNS_INFO(mmax) );
|
|---|
| 2024 | memcpy(shtns, base, SIZEOF_SHTNS_INFO(mmax) ); // copy all
|
|---|
| 2025 | shtns->lmidx = (int*) shtns+1; // lmidx is stored at the end of the struct...
|
|---|
| 2026 | shtns->tm = (unsigned short*) (shtns->lmidx + (mmax+1)); // ...and tm just after.
|
|---|
| 2027 |
|
|---|
| 2028 | if (mmax != shtns->mmax) {
|
|---|
| 2029 | shtns->mmax = mmax;
|
|---|
| 2030 | for (int im=0; im<=mmax; im++) {
|
|---|
| 2031 | shtns->lmidx[im] = base->lmidx[im];
|
|---|
| 2032 | shtns->tm[im] = base->tm[im];
|
|---|
| 2033 | }
|
|---|
| 2034 | #ifdef SHTNS_DCT
|
|---|
| 2035 | if (mmax < shtns->mtr_dct) {
|
|---|
| 2036 | shtns->idct = NULL; // do not destroy the plan of the source.
|
|---|
| 2037 | Set_MTR_DCT(shtns, mmax); // adjut mtr_dct if required.
|
|---|
| 2038 | }
|
|---|
| 2039 | #endif
|
|---|
| 2040 | if (mmax == 0) {
|
|---|
| 2041 | // TODO we may disable fft and replace with a phi-averaging function ...
|
|---|
| 2042 | // ... then switch to axisymmetric functions :
|
|---|
| 2043 | // init_sht_array_func(shtns);
|
|---|
| 2044 | // choose_best_sht(shtns, &nloop, 0);
|
|---|
| 2045 | }
|
|---|
| 2046 | }
|
|---|
| 2047 | if (nofft != 0) {
|
|---|
| 2048 | shtns->ncplx_fft = -1; // fft disabled.
|
|---|
| 2049 | }
|
|---|
| 2050 |
|
|---|
| 2051 | // save a pointer to this setup and return.
|
|---|
| 2052 | shtns->next = sht_data; // reference of previous setup (may be NULL).
|
|---|
| 2053 | sht_data = shtns; // keep track of new setup.
|
|---|
| 2054 | return(shtns);
|
|---|
| 2055 | }
|
|---|
| 2056 |
|
|---|
| 2057 | /// release all resources allocated by a grid.
|
|---|
| 2058 | void shtns_unset_grid(shtns_cfg shtns)
|
|---|
| 2059 | {
|
|---|
| 2060 | if (ref_count(shtns, &shtns->wg) == 1) VFREE(shtns->wg);
|
|---|
| 2061 | shtns->wg = NULL;
|
|---|
| 2062 | free_SH_dct(shtns);
|
|---|
| 2063 | free_SHTarrays(shtns);
|
|---|
| 2064 | shtns->nlat = 0; shtns->nlat_2 = 0;
|
|---|
| 2065 | shtns->nphi = 0; shtns->nspat = 0;
|
|---|
| 2066 | }
|
|---|
| 2067 |
|
|---|
| 2068 | /// release all resources allocated by a given shtns_cfg.
|
|---|
| 2069 | void shtns_destroy(shtns_cfg shtns)
|
|---|
| 2070 | {
|
|---|
| 2071 | free_unused(shtns, &shtns->l_2);
|
|---|
| 2072 | if (shtns->blm != shtns->alm)
|
|---|
| 2073 | free_unused(shtns, &shtns->blm);
|
|---|
| 2074 | free_unused(shtns, &shtns->alm);
|
|---|
| 2075 | free_unused(shtns, &shtns->li);
|
|---|
| 2076 |
|
|---|
| 2077 | shtns_unset_grid(shtns);
|
|---|
| 2078 |
|
|---|
| 2079 | if (sht_data == shtns) {
|
|---|
| 2080 | sht_data = shtns->next; // forget shtns
|
|---|
| 2081 | } else {
|
|---|
| 2082 | shtns_cfg s2 = sht_data;
|
|---|
| 2083 | while (s2 != NULL) {
|
|---|
| 2084 | if (s2->next == shtns) {
|
|---|
| 2085 | s2->next = shtns->next; // forget shtns
|
|---|
| 2086 | break;
|
|---|
| 2087 | }
|
|---|
| 2088 | s2 = s2->next;
|
|---|
| 2089 | }
|
|---|
| 2090 | }
|
|---|
| 2091 | free(shtns);
|
|---|
| 2092 | }
|
|---|
| 2093 |
|
|---|
| 2094 | /// clear all allocated memory (hopefully) and go back to 0 state.
|
|---|
| 2095 | void shtns_reset()
|
|---|
| 2096 | {
|
|---|
| 2097 | while (sht_data != NULL) {
|
|---|
| 2098 | shtns_destroy(sht_data);
|
|---|
| 2099 | }
|
|---|
| 2100 | }
|
|---|
| 2101 |
|
|---|
| 2102 | /*! Initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
|
|---|
| 2103 | * <b>This function must be called after \ref shtns_create and before any SH transform.</b> and sets all global variables and internal data.
|
|---|
| 2104 | * returns the required number of doubles to be allocated for a spatial field.
|
|---|
| 2105 | * \param shtns is the config created by \ref shtns_create for which the grid will be set.
|
|---|
| 2106 | * \param nlat,nphi pointers to the number of latitudinal and longitudinal grid points respectively. If 0, they are set to optimal values.
|
|---|
| 2107 | * \param nl_order defines the maximum SH degree to be resolved by analysis : lmax_analysis = lmax*nl_order. It is used to set an optimal and anti-aliasing nlat. If 0, the default SHT_DEFAULT_NL_ORDER is used.
|
|---|
| 2108 | * \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
|
|---|
| 2109 | * \param eps polar optimization threshold : polar values of Legendre Polynomials below that threshold are neglected (for high m), leading to increased performance (a few percents)
|
|---|
| 2110 | * 0 = no polar optimization; 1.e-14 = VERY safe; 1.e-10 = safe; 1.e-6 = aggresive, but still good accuracy.
|
|---|
| 2111 | */
|
|---|
| 2112 | int shtns_set_grid_auto(shtns_cfg shtns, enum shtns_type flags, double eps, int nl_order, int *nlat, int *nphi)
|
|---|
| 2113 | {
|
|---|
| 2114 | double t, mem;
|
|---|
| 2115 | int im,m;
|
|---|
| 2116 | int layout;
|
|---|
| 2117 | int nloop = 0;
|
|---|
| 2118 | int n_gauss = 0;
|
|---|
| 2119 | int on_the_fly = 0;
|
|---|
| 2120 | int quick_init = 0;
|
|---|
| 2121 | int vector = !(flags & SHT_SCALAR_ONLY);
|
|---|
| 2122 | int latdir = (flags & SHT_SOUTH_POLE_FIRST) ? -1 : 1; // choose latitudinal direction (change sign of ct)
|
|---|
| 2123 | int cfg_loaded = 0;
|
|---|
| 2124 | int analys = 1;
|
|---|
| 2125 | const int req_flags = flags; // requested flags.
|
|---|
| 2126 |
|
|---|
| 2127 | #if _GCC_VEC_
|
|---|
| 2128 | if (*nlat & 1) shtns_runerr("Nlat must be even\n");
|
|---|
| 2129 | #ifdef __MIC__
|
|---|
| 2130 | if (*nlat % VSIZE2) shtns_runerr("Nlat must be a multiple of 8 for the MIC\n");
|
|---|
| 2131 | #endif
|
|---|
| 2132 | #endif
|
|---|
| 2133 | shtns_unset_grid(shtns); // release grid if previously allocated.
|
|---|
| 2134 | if (nl_order <= 0) nl_order = SHT_DEFAULT_NL_ORDER;
|
|---|
| 2135 | /* shtns.lshift = 0;
|
|---|
| 2136 | if (nl_order == 0) nl_order = SHT_DEFAULT_NL_ORDER;
|
|---|
| 2137 | if (nl_order < 0) { shtns.lshift = -nl_order; nl_order = 1; } // linear with a shift in l.
|
|---|
| 2138 | */
|
|---|
| 2139 | shtns->nspat = 0;
|
|---|
| 2140 | shtns->nlorder = nl_order;
|
|---|
| 2141 | shtns->mtr_dct = -1; // dct switched off completely.
|
|---|
| 2142 | layout = flags & 0xFFFF00;
|
|---|
| 2143 | flags = flags & 255; // clear higher bits.
|
|---|
| 2144 |
|
|---|
| 2145 | switch (flags) {
|
|---|
| 2146 | #ifndef SHTNS_DCT
|
|---|
| 2147 | case sht_auto : flags = sht_gauss; break; // only gauss available.
|
|---|
| 2148 | case sht_reg_fast:
|
|---|
| 2149 | case sht_reg_dct: shtns_runerr("regular grid not available (DCT required)."); break;
|
|---|
| 2150 | #endif
|
|---|
| 2151 | case sht_gauss_fly : flags = sht_gauss; on_the_fly = 1; break;
|
|---|
| 2152 | case sht_quick_init : flags = sht_gauss; quick_init = 1; break;
|
|---|
| 2153 | case sht_reg_poles : analys = 0; quick_init = 1; break;
|
|---|
| 2154 | default : break;
|
|---|
| 2155 | }
|
|---|
| 2156 | #ifndef SHTNS_MEM
|
|---|
| 2157 | on_the_fly = 1;
|
|---|
| 2158 | #endif
|
|---|
| 2159 |
|
|---|
| 2160 | if (*nphi == 0) {
|
|---|
| 2161 | *nphi = fft_int((nl_order+1)*MMAX+1, 7); // required fft nodes
|
|---|
| 2162 | }
|
|---|
| 2163 | if (*nlat == 0) {
|
|---|
| 2164 | n_gauss = ((nl_order+1)*LMAX)/2 +1; // required gauss nodes
|
|---|
| 2165 | n_gauss += (n_gauss&1); // even is better.
|
|---|
| 2166 | n_gauss = ((n_gauss+(VSIZE2-1))/VSIZE2) * VSIZE2; // multiple of vector size
|
|---|
| 2167 | if (flags != sht_gauss) {
|
|---|
| 2168 | m = fft_int(nl_order*LMAX+2, 7); // required dct nodes
|
|---|
| 2169 | *nlat = m + (m&1); // even is better.
|
|---|
| 2170 | } else *nlat = n_gauss;
|
|---|
| 2171 | }
|
|---|
| 2172 |
|
|---|
| 2173 | mem = sht_mem_size(shtns->lmax, shtns->mmax, shtns->mres, *nlat);
|
|---|
| 2174 | t=mem; if (analys) t*=2; if (vector) t*=3;
|
|---|
| 2175 | #if SHT_VERBOSE > 1
|
|---|
| 2176 | if (verbose>1) printf("Memory required for precomputed matrices (estimate) : %.3f Mb\n",t);
|
|---|
| 2177 | #endif
|
|---|
| 2178 | if ( t > SHTNS_MAX_MEMORY ) { // huge transform has been requested
|
|---|
| 2179 | on_the_fly = 1;
|
|---|
| 2180 | if ( (flags == sht_reg_dct) || (flags == sht_reg_fast) ) shtns_runerr("Memory limit exceeded, try using sht_gauss or increase SHTNS_MAX_MEMORY in sht_config.h");
|
|---|
| 2181 | if (flags != sht_reg_poles) {
|
|---|
| 2182 | flags = sht_gauss;
|
|---|
| 2183 | if (n_gauss > 0) *nlat = n_gauss;
|
|---|
| 2184 | }
|
|---|
| 2185 | // if (t > 10*SHTNS_MAX_MEMORY) quick_init =1; // do not time such large transforms.
|
|---|
| 2186 | }
|
|---|
| 2187 |
|
|---|
| 2188 | if (quick_init == 0) { // do not waste too much time finding optimal fftw.
|
|---|
| 2189 | shtns->fftw_plan_mode = FFTW_EXHAUSTIVE; // defines the default FFTW planner mode.
|
|---|
| 2190 | // fftw_set_timelimit(60.0); // do not search plans for more than 1 minute (does it work well ???)
|
|---|
| 2191 | if (*nphi > 512) shtns->fftw_plan_mode = FFTW_PATIENT;
|
|---|
| 2192 | if (*nphi > 1024) shtns->fftw_plan_mode = FFTW_MEASURE;
|
|---|
| 2193 | } else {
|
|---|
| 2194 | shtns->fftw_plan_mode = FFTW_ESTIMATE;
|
|---|
| 2195 | if ((mem < 1.0) && (SHT_VERBOSE < 2)) shtns->nthreads = 1; // disable threads for small transforms (in quickinit mode).
|
|---|
| 2196 | if ((VSIZE2 >= 4) && (*nlat >= VSIZE2*4)) on_the_fly = 1; // with AVX, on-the-fly should be the default (faster).
|
|---|
| 2197 | if ((shtns->nthreads > 1) && (*nlat >= VSIZE2*16)) on_the_fly = 1; // force multi-thread transforms
|
|---|
| 2198 | }
|
|---|
| 2199 |
|
|---|
| 2200 | if (flags == sht_auto) {
|
|---|
| 2201 | if ( ((nl_order>=2)&&(MMAX*MRES > LMAX/2)) || (*nlat < SHT_MIN_NLAT_DCT) || (*nlat & 1) || (*nlat <= LMAX+1) ) {
|
|---|
| 2202 | flags = sht_gauss; // avoid computing DCT stuff when it is not expected to be faster.
|
|---|
| 2203 | if (n_gauss > 0) *nlat = n_gauss;
|
|---|
| 2204 | }
|
|---|
| 2205 | }
|
|---|
| 2206 |
|
|---|
| 2207 | if (*nlat <= shtns->lmax) shtns_runerr("Nlat must be larger than Lmax");
|
|---|
| 2208 | if (IS_TOO_LARGE(*nlat, shtns->nlat)) shtns_runerr("Nlat too large");
|
|---|
| 2209 | if (IS_TOO_LARGE(*nphi, shtns->nphi)) shtns_runerr("Nphi too large");
|
|---|
| 2210 |
|
|---|
| 2211 | // copy to global variables.
|
|---|
| 2212 | shtns->nphi = *nphi;
|
|---|
| 2213 | shtns->nlat_2 = (*nlat+1)/2; shtns->nlat = *nlat;
|
|---|
| 2214 |
|
|---|
| 2215 | if (layout & SHT_LOAD_SAVE_CFG) {
|
|---|
| 2216 | FILE* f = fopen("shtns_cfg_fftw","r");
|
|---|
| 2217 | if (f) {
|
|---|
| 2218 | fftw_import_wisdom_from_file(f); // load fftw wisdom.
|
|---|
| 2219 | fclose(f);
|
|---|
| 2220 | }
|
|---|
| 2221 | }
|
|---|
| 2222 | planFFT(shtns, layout, on_the_fly); // initialize fftw
|
|---|
| 2223 | shtns->zlm_dct0 = NULL; // used as a flag.
|
|---|
| 2224 | init_sht_array_func(shtns); // array of SHT functions is now set.
|
|---|
| 2225 |
|
|---|
| 2226 | #ifdef SHTNS_DCT
|
|---|
| 2227 | if (flags == sht_reg_dct) { // pure dct.
|
|---|
| 2228 | alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
|
|---|
| 2229 | grid_dct(shtns, latdir); init_SH_dct(shtns, 1);
|
|---|
| 2230 | OptimizeMatrices(shtns, eps);
|
|---|
| 2231 | Set_MTR_DCT(shtns, MMAX);
|
|---|
| 2232 | if (MTR_DCT != MMAX) shtns_runerr("DCT planning failed.");
|
|---|
| 2233 | }
|
|---|
| 2234 | if ((flags == sht_auto)||(flags == sht_reg_fast))
|
|---|
| 2235 | {
|
|---|
| 2236 | alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
|
|---|
| 2237 | grid_dct(shtns, latdir); init_SH_dct(shtns, 1);
|
|---|
| 2238 | OptimizeMatrices(shtns, eps);
|
|---|
| 2239 | if (NLAT >= SHT_MIN_NLAT_DCT) { // dct requires large NLAT to perform well.
|
|---|
| 2240 | if ((layout & SHT_LOAD_SAVE_CFG) && (flags == sht_reg_fast))
|
|---|
| 2241 | cfg_loaded = (config_load(shtns, req_flags) > 0);
|
|---|
| 2242 | if (!cfg_loaded) {
|
|---|
| 2243 | t = choose_best_sht(shtns, &nloop, vector, 1); // find optimal MTR_DCT.
|
|---|
| 2244 | if ((n_gauss > 0)&&(flags == sht_auto)) t *= ((double) n_gauss)/NLAT; // we can revert to gauss with a smaller nlat.
|
|---|
| 2245 | if (t < MIN_PERF_IMPROVE_DCT) {
|
|---|
| 2246 | Set_MTR_DCT(shtns, -1); // turn off DCT.
|
|---|
| 2247 | } else {
|
|---|
| 2248 | t = SHT_error(shtns, vector);
|
|---|
| 2249 | if (t > MIN_ACCURACY_DCT) {
|
|---|
| 2250 | #if SHT_VERBOSE > 0
|
|---|
| 2251 | if (verbose) printf(" !! Not enough accuracy (%.3g) => DCT disabled.\n",t);
|
|---|
| 2252 | #endif
|
|---|
| 2253 | #if SHT_VERBOSE < 2
|
|---|
| 2254 | Set_MTR_DCT(shtns, -1); // turn off DCT.
|
|---|
| 2255 | #endif
|
|---|
| 2256 | }
|
|---|
| 2257 | }
|
|---|
| 2258 | }
|
|---|
| 2259 | }
|
|---|
| 2260 | if (MTR_DCT < 0) { // free memory used by DCT and disables DCT.
|
|---|
| 2261 | free_SH_dct(shtns); // free now useless arrays.
|
|---|
| 2262 | if (flags == sht_auto) {
|
|---|
| 2263 | flags = sht_gauss; // switch to gauss grid, even better accuracy.
|
|---|
| 2264 | #if SHT_VERBOSE > 0
|
|---|
| 2265 | if (verbose) printf(" => switching back to Gauss Grid\n");
|
|---|
| 2266 | #endif
|
|---|
| 2267 | for (im=1; im<=MMAX; im++) { // im >= 1
|
|---|
| 2268 | m = im*MRES;
|
|---|
| 2269 | shtns->ylm[im] -= shtns->tm[im]*(LMAX-m+1); // restore pointers altered by OptimizeMatrices().
|
|---|
| 2270 | if (vector) shtns->dylm[im] -= shtns->tm[im]*(LMAX-m+1);
|
|---|
| 2271 | }
|
|---|
| 2272 | if (n_gauss > 0) { // we should use the optimal size for gauss-legendre
|
|---|
| 2273 | free_SHTarrays(shtns);
|
|---|
| 2274 | *nlat = n_gauss;
|
|---|
| 2275 | shtns->nlat_2 = (*nlat+1)/2; shtns->nlat = *nlat;
|
|---|
| 2276 | planFFT(shtns, layout, on_the_fly); // fft must be replanned because NLAT has changed.
|
|---|
| 2277 | }
|
|---|
| 2278 | }
|
|---|
| 2279 | }
|
|---|
| 2280 | }
|
|---|
| 2281 | #endif /* SHTNS_DCT */
|
|---|
| 2282 | if (flags == sht_gauss)
|
|---|
| 2283 | {
|
|---|
| 2284 | alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
|
|---|
| 2285 | grid_gauss(shtns, latdir);
|
|---|
| 2286 | #ifdef SHTNS_MEM
|
|---|
| 2287 | if (on_the_fly == 0) {
|
|---|
| 2288 | init_SH_gauss(shtns); // precompute matrices
|
|---|
| 2289 | OptimizeMatrices(shtns, eps);
|
|---|
| 2290 | }
|
|---|
| 2291 | #endif
|
|---|
| 2292 | }
|
|---|
| 2293 | if (flags == sht_reg_poles)
|
|---|
| 2294 | {
|
|---|
| 2295 | alloc_SHTarrays(shtns, on_the_fly, vector, 0); // allocate dynamic arrays (no analysis)
|
|---|
| 2296 | grid_equal_polar(shtns, latdir);
|
|---|
| 2297 | #ifdef SHTNS_MEM
|
|---|
| 2298 | if (on_the_fly == 0) {
|
|---|
| 2299 | init_SH_synth(shtns);
|
|---|
| 2300 | for (im=0; im<=MMAX; im++) shtns->tm[im] = 0; // avoid problems with tm[im] modified ????
|
|---|
| 2301 | }
|
|---|
| 2302 | #endif
|
|---|
| 2303 | }
|
|---|
| 2304 |
|
|---|
| 2305 | if (on_the_fly == 1) {
|
|---|
| 2306 | #if SHT_VERBOSE > 0
|
|---|
| 2307 | if (verbose) printf(" + using on-the-fly transforms.\n");
|
|---|
| 2308 | #endif
|
|---|
| 2309 | if (NLAT < VSIZE2*4) shtns_runerr("on-the-fly only available for nlat>=32"); // avoid overflow with NLAT_2 < VSIZE2*2
|
|---|
| 2310 | PolarOptimize(shtns, eps);
|
|---|
| 2311 | set_sht_fly(shtns, 0); // switch function pointers to "on-the-fly" functions.
|
|---|
| 2312 | }
|
|---|
| 2313 |
|
|---|
| 2314 | if ((layout & SHT_LOAD_SAVE_CFG) && (!cfg_loaded)) cfg_loaded = (config_load(shtns, req_flags) > 0);
|
|---|
| 2315 | if (quick_init == 0) {
|
|---|
| 2316 | if (!cfg_loaded) {
|
|---|
| 2317 | choose_best_sht(shtns, &nloop, vector, 0);
|
|---|
| 2318 | if (layout & SHT_LOAD_SAVE_CFG) config_save(shtns, req_flags);
|
|---|
| 2319 | }
|
|---|
| 2320 | #ifdef SHTNS_MEM
|
|---|
| 2321 | if (on_the_fly == 0) free_unused_matrices(shtns);
|
|---|
| 2322 | #endif
|
|---|
| 2323 | t = SHT_error(shtns, vector); // compute SHT accuracy.
|
|---|
| 2324 | #if SHT_VERBOSE > 0
|
|---|
| 2325 | if (verbose) printf(" + SHT accuracy = %.3g\n",t);
|
|---|
| 2326 | #endif
|
|---|
| 2327 | #if SHT_VERBOSE < 2
|
|---|
| 2328 | if (t > 1.e-3) {
|
|---|
| 2329 | shtns_print_cfg(shtns);
|
|---|
| 2330 | shtns_runerr("bad SHT accuracy"); // stop if something went wrong (but not in debug mode)
|
|---|
| 2331 | }
|
|---|
| 2332 | #endif
|
|---|
| 2333 | }
|
|---|
| 2334 |
|
|---|
| 2335 | // set_sht_fly(shtns, SHT_TYP_VAN);
|
|---|
| 2336 | #if SHT_VERBOSE > 1
|
|---|
| 2337 | if ((omp_threads > 1)&&(verbose>1)) printf(" nthreads = %d\n",shtns->nthreads);
|
|---|
| 2338 | #endif
|
|---|
| 2339 | #if SHT_VERBOSE > 0
|
|---|
| 2340 | if (verbose) printf(" => " PACKAGE_NAME " is ready.\n");
|
|---|
| 2341 | #endif
|
|---|
| 2342 | return(shtns->nspat); // returns the number of doubles to be allocated for a spatial field.
|
|---|
| 2343 | }
|
|---|
| 2344 |
|
|---|
| 2345 |
|
|---|
| 2346 | /*! Initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
|
|---|
| 2347 | * <b>This function must be called after \ref shtns_create and before any SH transform.</b> and sets all global variables.
|
|---|
| 2348 | * returns the required number of doubles to be allocated for a spatial field.
|
|---|
| 2349 | * \param shtns is the config created by shtns_create for which the grid will be set.
|
|---|
| 2350 | * \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
|
|---|
| 2351 | * \param eps polar optimization threshold : polar values of Legendre Polynomials below that threshold are neglected (for high m), leading to increased performance (a few percents)
|
|---|
| 2352 | * \param nlat,nphi respectively the number of latitudinal and longitudinal grid points.
|
|---|
| 2353 | * 0 = no polar optimization; 1.e-14 = VERY safe; 1.e-10 = safe; 1.e-6 = aggresive, but still good accuracy.
|
|---|
| 2354 | */
|
|---|
| 2355 | int shtns_set_grid(shtns_cfg shtns, enum shtns_type flags, double eps, int nlat, int nphi)
|
|---|
| 2356 | {
|
|---|
| 2357 | if ((nlat == 0)||(nphi == 0)) shtns_runerr("nlat or nphi is zero !");
|
|---|
| 2358 | return( shtns_set_grid_auto(shtns, flags, eps, 0, &nlat, &nphi) );
|
|---|
| 2359 | }
|
|---|
| 2360 |
|
|---|
| 2361 | /*! Simple initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
|
|---|
| 2362 | * This function sets all global variables by calling \ref shtns_create followed by \ref shtns_set_grid, with the
|
|---|
| 2363 | * default normalization and the default polar optimization (see \ref sht_config.h).
|
|---|
| 2364 | * Returns the configuration to be passed to subsequent transform functions, which is basicaly a pointer to a \ref shtns_info struct.
|
|---|
| 2365 | * \param lmax : maximum SH degree that we want to describe.
|
|---|
| 2366 | * \param mmax : number of azimutal wave numbers.
|
|---|
| 2367 | * \param mres : \c 2.pi/mres is the azimutal periodicity. \c mmax*mres is the maximum SH order.
|
|---|
| 2368 | * \param nlat,nphi : respectively the number of latitudinal and longitudinal grid points.
|
|---|
| 2369 | * \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
|
|---|
| 2370 | */
|
|---|
| 2371 | shtns_cfg shtns_init(enum shtns_type flags, int lmax, int mmax, int mres, int nlat, int nphi)
|
|---|
| 2372 | {
|
|---|
| 2373 | shtns_cfg shtns = shtns_create(lmax, mmax, mres, SHT_DEFAULT_NORM);
|
|---|
| 2374 | if (shtns != NULL)
|
|---|
| 2375 | shtns_set_grid(shtns, flags, SHT_DEFAULT_POLAR_OPT, nlat, nphi);
|
|---|
| 2376 | return shtns;
|
|---|
| 2377 | }
|
|---|
| 2378 |
|
|---|
| 2379 | /** Enables OpenMP parallel transforms, if available (see \ref compil).
|
|---|
| 2380 | Call before any initialization of shtns to use mutliple threads. Returns the actual number of threads.
|
|---|
| 2381 | \li If num_threads > 0, specifies the maximum number of threads that should be used.
|
|---|
| 2382 | \li If num_threads <= 0, maximum number of threads is automatically set to the number of processors.
|
|---|
| 2383 | \li If num_threads == 1, openmp will be disabled. */
|
|---|
| 2384 | int shtns_use_threads(int num_threads)
|
|---|
| 2385 | {
|
|---|
| 2386 | #ifdef _OPENMP
|
|---|
| 2387 | int procs = omp_get_num_procs();
|
|---|
| 2388 | if (num_threads <= 0) num_threads = omp_get_max_threads();
|
|---|
| 2389 | else if (num_threads > 4*procs) num_threads = 4*procs; // limit the number of threads
|
|---|
| 2390 | omp_threads = num_threads;
|
|---|
| 2391 | #endif
|
|---|
| 2392 | #ifdef OMP_FFTW
|
|---|
| 2393 | fftw_init_threads(); // enable threads for FFTW.
|
|---|
| 2394 | #endif
|
|---|
| 2395 | return omp_threads;
|
|---|
| 2396 | }
|
|---|
| 2397 |
|
|---|
| 2398 | /// fill the given array with Gauss weights. returns the number of weights written, which
|
|---|
| 2399 | /// may be zero if the grid is not a Gauss grid.
|
|---|
| 2400 | int shtns_gauss_wts(shtns_cfg shtns, double *wts)
|
|---|
| 2401 | {
|
|---|
| 2402 | int i = 0;
|
|---|
| 2403 | if (shtns->wg) {
|
|---|
| 2404 | double rescale = 2*NPHI; // weights are stored with a rescaling that depends on SHT_NORM.
|
|---|
| 2405 | if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) rescale *= 0.25/M_PI;
|
|---|
| 2406 |
|
|---|
| 2407 | do {
|
|---|
| 2408 | wts[i] = shtns->wg[i] * rescale;
|
|---|
| 2409 | } while(++i < shtns->nlat_2);
|
|---|
| 2410 | }
|
|---|
| 2411 | return i;
|
|---|
| 2412 | }
|
|---|
| 2413 |
|
|---|
| 2414 | //@}
|
|---|
| 2415 |
|
|---|
| 2416 |
|
|---|
| 2417 | #ifdef SHT_F77_API
|
|---|
| 2418 |
|
|---|
| 2419 | /* FORTRAN API */
|
|---|
| 2420 |
|
|---|
| 2421 | /** \addtogroup fortapi Fortran API.
|
|---|
| 2422 | * Call from fortran without the trailing '_'.
|
|---|
| 2423 | * see the \link SHT_example.f Fortran example \endlink for a simple usage of SHTns from Fortran language.
|
|---|
| 2424 | */
|
|---|
| 2425 | //@{
|
|---|
| 2426 |
|
|---|
| 2427 | /// Set verbosity level
|
|---|
| 2428 | void shtns_verbose_(int *v)
|
|---|
| 2429 | {
|
|---|
| 2430 | shtns_verbose(*v);
|
|---|
| 2431 | }
|
|---|
| 2432 |
|
|---|
| 2433 | /// Enable threads
|
|---|
| 2434 | void shtns_use_threads_(int *num_threads)
|
|---|
| 2435 | {
|
|---|
| 2436 | shtns_use_threads(*num_threads);
|
|---|
| 2437 | }
|
|---|
| 2438 |
|
|---|
| 2439 | /// Print info
|
|---|
| 2440 | void shtns_print_cfg_()
|
|---|
| 2441 | {
|
|---|
| 2442 | shtns_print_version();
|
|---|
| 2443 | if (sht_data) shtns_print_cfg(sht_data);
|
|---|
| 2444 | }
|
|---|
| 2445 |
|
|---|
| 2446 | /// Initializes spherical harmonic transforms of given size using Gauss algorithm with default polar optimization.
|
|---|
| 2447 | void shtns_init_sh_gauss_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
|
|---|
| 2448 | {
|
|---|
| 2449 | shtns_reset();
|
|---|
| 2450 | shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
|
|---|
| 2451 | shtns_set_grid(shtns, sht_gauss | *layout, SHT_DEFAULT_POLAR_OPT, *nlat, *nphi);
|
|---|
| 2452 | }
|
|---|
| 2453 |
|
|---|
| 2454 | /// Initializes spherical harmonic transforms of given size using Fastest available algorithm and polar optimization.
|
|---|
| 2455 | void shtns_init_sh_auto_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
|
|---|
| 2456 | {
|
|---|
| 2457 | shtns_reset();
|
|---|
| 2458 | shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
|
|---|
| 2459 | shtns_set_grid(shtns, sht_auto | *layout, SHT_DEFAULT_POLAR_OPT, *nlat, *nphi);
|
|---|
| 2460 | }
|
|---|
| 2461 |
|
|---|
| 2462 | /// Initializes spherical harmonic transforms of given size using a regular grid and agressive optimizations.
|
|---|
| 2463 | void shtns_init_sh_reg_fast_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
|
|---|
| 2464 | {
|
|---|
| 2465 | shtns_reset();
|
|---|
| 2466 | shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
|
|---|
| 2467 | shtns_set_grid(shtns, sht_reg_fast | *layout, 1.e-6, *nlat, *nphi);
|
|---|
| 2468 | }
|
|---|
| 2469 |
|
|---|
| 2470 | /// Initializes spherical harmonic transform SYNTHESIS ONLY of given size using a regular grid including poles.
|
|---|
| 2471 | void shtns_init_sh_poles_(int *layout, int *lmax, int *mmax, int *mres, int *nlat, int *nphi)
|
|---|
| 2472 | {
|
|---|
| 2473 | shtns_reset();
|
|---|
| 2474 | shtns_cfg shtns = shtns_create(*lmax, *mmax, *mres, SHT_DEFAULT_NORM);
|
|---|
| 2475 | shtns_set_grid(shtns, sht_reg_poles | *layout, 0, *nlat, *nphi);
|
|---|
| 2476 | }
|
|---|
| 2477 |
|
|---|
| 2478 | /// Defines the size and convention of the transform.
|
|---|
| 2479 | /// Allow to choose the normalization and whether or not to include the Condon-Shortley phase.
|
|---|
| 2480 | /// \see shtns_create
|
|---|
| 2481 | void shtns_set_size_(int *lmax, int *mmax, int *mres, int *norm)
|
|---|
| 2482 | {
|
|---|
| 2483 | shtns_reset();
|
|---|
| 2484 | shtns_create(*lmax, *mmax, *mres, *norm);
|
|---|
| 2485 | }
|
|---|
| 2486 |
|
|---|
| 2487 | /// Precompute matrices for synthesis and analysis.
|
|---|
| 2488 | /// Allow to choose polar optimization threshold and algorithm type.
|
|---|
| 2489 | /// \see shtns_set_grid
|
|---|
| 2490 | void shtns_precompute_(int *type, int *layout, double *eps, int *nlat, int *nphi)
|
|---|
| 2491 | {
|
|---|
| 2492 | shtns_set_grid(sht_data, *type | *layout, *eps, *nlat, *nphi);
|
|---|
| 2493 | }
|
|---|
| 2494 |
|
|---|
| 2495 | /// Same as shtns_precompute_ but choose optimal nlat and/or nphi.
|
|---|
| 2496 | /// \see shtns_set_grid_auto
|
|---|
| 2497 | void shtns_precompute_auto_(int *type, int *layout, double *eps, int *nl_order, int *nlat, int *nphi)
|
|---|
| 2498 | {
|
|---|
| 2499 | shtns_set_grid_auto(sht_data, *type | *layout, *eps, *nl_order, nlat, nphi);
|
|---|
| 2500 | }
|
|---|
| 2501 |
|
|---|
| 2502 | /// Clear everything
|
|---|
| 2503 | void shtns_reset_() {
|
|---|
| 2504 | shtns_reset();
|
|---|
| 2505 | }
|
|---|
| 2506 |
|
|---|
| 2507 | /// returns nlm, the number of complex*16 elements in an SH array.
|
|---|
| 2508 | /// call from fortran using \code call shtns_calc_nlm(nlm, lmax, mmax, mres) \endcode
|
|---|
| 2509 | void shtns_calc_nlm_(int *nlm, const int *const lmax, const int *const mmax, const int *const mres)
|
|---|
| 2510 | {
|
|---|
| 2511 | *nlm = nlm_calc(*lmax, *mmax, *mres);
|
|---|
| 2512 | }
|
|---|
| 2513 |
|
|---|
| 2514 | /// returns lm, the index in an SH array of mode (l,m).
|
|---|
| 2515 | /// call from fortran using \code call shtns_lmidx(lm, l, m) \endcode
|
|---|
| 2516 | void shtns_lmidx_(int *lm, const int *const l, const int *const m)
|
|---|
| 2517 | {
|
|---|
| 2518 | unsigned im = *m;
|
|---|
| 2519 | unsigned mres = sht_data->mres;
|
|---|
| 2520 | if (mres > 1) {
|
|---|
| 2521 | unsigned k = im % mres;
|
|---|
| 2522 | im = im / mres;
|
|---|
| 2523 | if (k) printf("wrong m");
|
|---|
| 2524 | }
|
|---|
| 2525 | *lm = LiM(sht_data, *l, im) + 1; // convert to fortran convention index.
|
|---|
| 2526 | }
|
|---|
| 2527 |
|
|---|
| 2528 | /// returns l and m, degree and order of an index in SH array lm.
|
|---|
| 2529 | /// call from fortran using \code call shtns_l_m(l, m, lm) \endcode
|
|---|
| 2530 | void shtns_l_m_(int *l, int *m, const int *const lm)
|
|---|
| 2531 | {
|
|---|
| 2532 | *l = sht_data->li[*lm -1]; // convert from fortran convention index.
|
|---|
| 2533 | *m = sht_data->mi[*lm -1];
|
|---|
| 2534 | }
|
|---|
| 2535 |
|
|---|
| 2536 | /// fills the given array with the cosine of the co-latitude angle (NLAT real*8)
|
|---|
| 2537 | /// if no grid has been set, the first element will be set to zero.
|
|---|
| 2538 | void shtns_cos_array_(double *costh)
|
|---|
| 2539 | {
|
|---|
| 2540 | if (sht_data->ct) {
|
|---|
| 2541 | for (int i=0; i<sht_data->nlat; i++)
|
|---|
| 2542 | costh[i] = sht_data->ct[i];
|
|---|
| 2543 | } else costh[0] = 0.0; // mark as invalid.
|
|---|
| 2544 | }
|
|---|
| 2545 |
|
|---|
| 2546 | /// fills the given array with the gaussian quadrature weights ((NLAT+1)/2 real*8).
|
|---|
| 2547 | /// when there is no gaussian grid, the first element is set to zero.
|
|---|
| 2548 | void shtns_gauss_wts_(double *wts)
|
|---|
| 2549 | {
|
|---|
| 2550 | int i = shtns_gauss_wts(sht_data, wts);
|
|---|
| 2551 | if (i==0) wts[0] = 0; // mark as invalid.
|
|---|
| 2552 | }
|
|---|
| 2553 |
|
|---|
| 2554 |
|
|---|
| 2555 | /** \name Point evaluation of Spherical Harmonics
|
|---|
| 2556 | Evaluate at a given point (\f$cos(\theta)\f$ and \f$\phi\f$) a spherical harmonic representation.
|
|---|
| 2557 | */
|
|---|
| 2558 | //@{
|
|---|
| 2559 | /// \see SH_to_point for argument description
|
|---|
| 2560 | void shtns_sh_to_point_(double *spat, cplx *Qlm, double *cost, double *phi)
|
|---|
| 2561 | {
|
|---|
| 2562 | *spat = SH_to_point(sht_data, Qlm, *cost, *phi);
|
|---|
| 2563 | }
|
|---|
| 2564 |
|
|---|
| 2565 | /// \see SHqst_to_point for argument description
|
|---|
| 2566 | void shtns_qst_to_point_(double *vr, double *vt, double *vp,
|
|---|
| 2567 | cplx *Qlm, cplx *Slm, cplx *Tlm, double *cost, double *phi)
|
|---|
| 2568 | {
|
|---|
| 2569 | SHqst_to_point(sht_data, Qlm, Slm, Tlm, *cost, *phi, vr, vt, vp);
|
|---|
| 2570 | }
|
|---|
| 2571 | //@}
|
|---|
| 2572 |
|
|---|
| 2573 |
|
|---|
| 2574 | //@}
|
|---|
| 2575 |
|
|---|
| 2576 | #endif
|
|---|