/* * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique. * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France). * * nathanael.schaeffer@ujf-grenoble.fr * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. * */ # This file is meta-code for SHT.c (spherical harmonic transform). # it is intended for "make" to generate C code for similar SHT functions, # from one generic function + tags. # > See Makefile and SHT.c # Basically, there are tags at the beginning of lines that are information # to keep or remove the line depending on the function to build. # tags : # Q : line for scalar transform # V : line for vector transform (both spheroidal and toroidal) # S : line for vector transfrom, spheroidal component # T : line for vector transform, toroidal component. static 3 void GEN3(_sy3,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, v2d *BrF, v2d *BtF, v2d *BpF, const long int llim, const int imlim) { QX void GEN3(_sy1,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Qlm, v2d *BrF, const long int llim, const int imlim) { #ifndef SHT_GRAD VX void GEN3(_sy2,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, v2d *BtF, v2d *BpF, const long int llim, const int imlim) { #else S void GEN3(_sy1s,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Slm, v2d *BtF, v2d *BpF, const long int llim, const int imlim) { T void GEN3(_sy1t,NWAY,SUFFIX)(shtns_cfg shtns, cplx *Tlm, v2d *BtF, v2d *BpF, const long int llim, const int imlim) { #endif #ifndef SHT_AXISYM Q #define BR0(i) ((double *)BrF)[2*(i)] V #define BT0(i) ((double *)BtF)[2*(i)] V #define BP0(i) ((double *)BpF)[2*(i)] Q #define qr(l) vall(creal(Ql[l])) Q #define qi(l) vall(cimag(Ql[l])) S #define sr(l) vall(creal(Sl[l])) S #define si(l) vall(cimag(Sl[l])) T #define tr(l) vall(creal(Tl[l])) T #define ti(l) vall(cimag(Tl[l])) V double m_1; unsigned im; #else Q #define BR0(i) ((double *)BrF)[i] S #define BT0(i) ((double *)BtF)[i] T #define BP0(i) ((double *)BpF)[i] #endif unsigned m0, mstep; long int nk,k,l,m; double *alm, *al; double *ct, *st; Q double Ql0[llim+1]; S double Sl0[llim]; T double Tl0[llim]; ct = shtns->ct; st = shtns->st; nk = NLAT_2; #if _GCC_VEC_ nk = ((unsigned)(nk+VSIZE2-1)) / VSIZE2; #endif #ifndef _OPENMP m0 = 0; mstep = 1; #else m0 = omp_get_thread_num(); mstep = omp_get_num_threads(); if (m0 == 0) #endif { // im=0; #ifdef SHT_GRAD #ifndef SHT_AXISYM #ifdef _GCC_VEC_ S k=0; do { BpF[k]=vdup(0.0); } while(++kalm; Q Ql0[0] = (double) Qlm[0]; // l=0 do { // for m=0, compress the complex Q,S,T to double Q Ql0[l] = (double) Qlm[l]; // Ql[l+1] = (double) Qlm[l+1]; S Sl0[l-1] = (double) Slm[l]; // Sl[l] = (double) Slm[l+1]; T Tl0[l-1] = (double) Tlm[l]; // Tl[l] = (double) Tlm[l+1]; ++l; } while(l<=llim); k=0; do { l=0; al = alm; rnd cost[NWAY], y0[NWAY], y1[NWAY]; V rnd sint[NWAY], dy0[NWAY], dy1[NWAY]; Q rnd re[NWAY], ro[NWAY]; S rnd te[NWAY], to[NWAY]; T rnd pe[NWAY], po[NWAY]; for (int j=0; j>1; V m_1 = 1.0/m; //alm = shtns->alm[im]; alm = shtns->alm + im*(2*LMAX -m+MRES); Q cplx* Ql = &Qlm[l]; // virtual pointer for l=0 and im S cplx* Sl = &Slm[l]; // virtual pointer for l=0 and im T cplx* Tl = &Tlm[l]; k=0; l=shtns->tm[im]; #if _GCC_VEC_ l>>=1; // stay on a 16 byte boundary while (k>= 1); } else { long int nsint = 0; do { // sin(theta)^m (use rescaling to avoid underflow) if (l&1) { for (int j=0; j>= 1); } for (int j=0; j SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant ++ny; for (int j=0; j (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster #endif #ifdef _GCC_VEC_ if (shtns->fftc_mode > 0) { // alloc memory for the FFT unsigned long nv = shtns->nspat; QX BrF = (v2d*) VMALLOC( nv * sizeof(double) ); VX BtF = (v2d*) VMALLOC( 2*nv * sizeof(double) ); VX BpF = BtF + nv/2; 3 BrF = (v2d*) VMALLOC( 3*nv * sizeof(double) ); 3 BtF = BrF + nv/2; BpF = BrF + nv; } #else if (shtns->ncplx_fft > 0) { // alloc memory for the FFT QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(cplx) ); VX BtF = VMALLOC( 2* shtns->ncplx_fft * sizeof(cplx) ); VX BpF = BtF + shtns->ncplx_fft; 3 BrF = VMALLOC( 3* shtns->ncplx_fft * sizeof(cplx) ); 3 BtF = BrF + shtns->ncplx_fft; BpF = BtF + shtns->ncplx_fft; } #endif #endif imlim += 1; #pragma omp parallel num_threads(shtns->nthreads) { 3 GEN3(_sy3,NWAY,SUFFIX)(shtns, Qlm, Slm, Tlm, BrF, BtF, BpF, llim, imlim); QX GEN3(_sy1,NWAY,SUFFIX)(shtns, Qlm, BrF, llim, imlim); #ifndef SHT_GRAD VX GEN3(_sy2,NWAY,SUFFIX)(shtns, Slm, Tlm, BtF, BpF, llim, imlim); #else S GEN3(_sy1s,NWAY,SUFFIX)(shtns, Slm, BtF, BpF, llim, imlim); T GEN3(_sy1t,NWAY,SUFFIX)(shtns, Tlm, BtF, BpF, llim, imlim); #endif #ifndef SHT_AXISYM V #ifndef HAVE_LIBFFTW3_OMP V #pragma omp barrier V #if _GCC_VEC_ V if (shtns->fftc_mode == 0) { 3 #pragma omp single nowait 3 fftw_execute_dft(shtns->ifftc, (cplx *) BrF, (cplx *) Vr); V #pragma omp single nowait V fftw_execute_dft(shtns->ifftc, (cplx *) BtF, (cplx *) Vt); V #pragma omp single nowait V fftw_execute_dft(shtns->ifftc, (cplx *) BpF, (cplx *) Vp); V } else if (shtns->fftc_mode > 0) { // split dft 3 #pragma omp single nowait 3 fftw_execute_split_dft(shtns->ifftc,((double*)BrF)+1, ((double*)BrF), Vr+NPHI, Vr); V #pragma omp single nowait V fftw_execute_split_dft(shtns->ifftc,((double*)BtF)+1, ((double*)BtF), Vt+NPHI, Vt); V #pragma omp single nowait V fftw_execute_split_dft(shtns->ifftc,((double*)BpF)+1, ((double*)BpF), Vp+NPHI, Vp); V } V #else 3 #pragma omp single nowait 3 fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr); V #pragma omp single nowait V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt); V #pragma omp single nowait V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp); V #endif V #endif #endif } #ifndef SHT_AXISYM // NPHI > 1 as SHT_AXISYM is not defined. #if _GCC_VEC_ if (shtns->fftc_mode >= 0) { if (shtns->fftc_mode == 0) { V #ifdef HAVE_LIBFFTW3_OMP Q fftw_execute_dft(shtns->ifftc, (cplx *) BrF, (cplx *) Vr); V fftw_execute_dft(shtns->ifftc, (cplx *) BtF, (cplx *) Vt); V fftw_execute_dft(shtns->ifftc, (cplx *) BpF, (cplx *) Vp); V #endif } else { // split dft V #ifdef HAVE_LIBFFTW3_OMP Q fftw_execute_split_dft(shtns->ifftc,((double*)BrF)+1, ((double*)BrF), Vr+NPHI, Vr); V fftw_execute_split_dft(shtns->ifftc,((double*)BtF)+1, ((double*)BtF), Vt+NPHI, Vt); V fftw_execute_split_dft(shtns->ifftc,((double*)BpF)+1, ((double*)BpF), Vp+NPHI, Vp); V #endif Q VFREE(BrF); VX VFREE(BtF); // this frees also BpF. } } #else if (shtns->ncplx_fft >= 0) { V #ifdef HAVE_LIBFFTW3_OMP Q fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr); V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt); V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp); V #endif if (shtns->ncplx_fft > 0) { // free memory Q VFREE(BrF); VX VFREE(BtF); // this frees also BpF. } } #endif #endif }