/* * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique. * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France). * * nathanael.schaeffer@ujf-grenoble.fr * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. * */ # This file is meta-code for SHT.c (spherical harmonic transform). # it is intended for "make" to generate C code for similar SHT functions, # from one generic function + tags. # > See Makefile and SHT.c # Basically, there are tags at the beginning of lines that are information # to keep or remove the line depending on the function to build. # tags : # Q : line for scalar transform # V : line for vector transform (both spheroidal and toroidal) # S : line for vector transfrom, spheroidal component # T : line for vector transform, toroidal component. ///////////////////////////////////////////////////// #ifdef SHT_AXISYM /// Only the \b axisymmetric (m=0) component is transformed, and the resulting spatial fields have size NLAT. #endif /// Truncation and spatial discretization are defined by \ref shtns_create and \ref shtns_set_grid_* /// \param[in] shtns = a configuration created by \ref shtns_create with a grid set by shtns_set_grid_* Q/// \param[in] Qlm = spherical harmonics coefficients : Q/// complex double arrays of size shtns->nlm [unmodified]. S/// \param[in] Slm = spherical harmonics coefficients of \b Spheroidal scalar : S/// complex double array of size shtns->nlm [unmodified]. T/// \param[in] Tlm = spherical harmonics coefficients of \b Toroidal scalar : T/// complex double array of size shtns->nlm [unmodified]. Q/// \param[out] Vr = spatial scalar field : double array. #ifndef SHT_AXISYM V/// \param[out] Vt, Vp = (theta,phi)-components of spatial vector : double arrays. #else S/// \param[out] Vt = theta-component of spatial vector : double array. T/// \param[out] Vp = phi-component of spatial vector : double array. #endif #ifdef SHT_VAR_LTR /// \param[in] llim = specify maximum degree of spherical harmonic. llim must be at most LMAX, and all spherical harmonic degree higher than llim are ignored. #else /// \param[in] llim MUST be shtns->lmax. #endif // MTR_DCT : <0 => SHT_NO_DCT must be defined !!! mem only transform // 0 => dct for m=0 only // m => dct up to m, (!!! MTR_DCT <= MTR !!!) 3 static void GEN3(SHqst_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Qlm, cplx *Slm, cplx *Tlm, double *Vr, double *Vt, double *Vp, long int llim) { QX static void GEN3(SH_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Qlm, double *Vr, long int llim) { #ifndef SHT_GRAD VX static void GEN3(SHsphtor_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Slm, cplx *Tlm, double *Vt, double *Vp, long int llim) { #else S static void GEN3(SHsph_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Slm, double *Vt, double *Vp, long int llim) { T static void GEN3(SHtor_to_spat_,ID_NME,SUFFIX)(shtns_cfg shtns, cplx *Tlm, double *Vt, double *Vp, long int llim) { #endif Q v2d *BrF; #ifndef SHT_AXISYM // with m>0, 3 components not possible with DCT acceleration 3 #define SHT_NO_DCT V v2d *BtF, *BpF; V struct DtDp *dyl; Q cplx re,ro; V cplx te,to, pe,po; V cplx dte,dto, dpe,dpo; Q #define BR0(i) ((double *)BrF)[2*(i)] V #define BT0(i) ((double *)BtF)[2*(i)] V #define BP0(i) ((double *)BpF)[2*(i)] unsigned im, imlim; int imlim_dct; #else S v2d *BtF; T v2d *BpF; Q double re,ro; S double te,to; T double pe,po; Q #define BR0(i) ((double *)BrF)[i] S #define BT0(i) ((double *)BtF)[i] T #define BP0(i) ((double *)BpF)[i] #endif long int k,m,l; #ifdef _GCC_VEC_ QX s2d Ql0[(llim+5)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!) 3 s2d Ql0[(llim+4)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!). S s2d Sl0[(llim+3)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!). T s2d Tl0[(llim+3)>>1]; // we need some zero-padding (icc 14 bug: needs 1 more for SHT_AXISYM only!). #endif #ifndef SHT_AXISYM Q BrF = (v2d *) Vr; V BtF = (v2d *) Vt; BpF = (v2d *) Vp; if (shtns->ncplx_fft > 0) { // alloc memory for the FFT QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(cplx) ); VX BtF = VMALLOC( 2* shtns->ncplx_fft * sizeof(cplx) ); VX BpF = BtF + shtns->ncplx_fft; 3 BrF = VMALLOC( 3* shtns->ncplx_fft * sizeof(cplx) ); 3 BtF = BrF + shtns->ncplx_fft; BpF = BtF + shtns->ncplx_fft; } imlim = MTR; #ifdef SHT_VAR_LTR if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster #endif #ifndef SHT_NO_DCT imlim_dct = MTR_DCT; #ifdef SHT_VAR_LTR if (imlim_dct > imlim) imlim_dct = imlim; #endif #endif #else #ifdef SHT_GRAD S if (Vp != NULL) { k=0; do { ((s2d*)Vp)[k]=vdup(0.0); } while(++kykm_dct[0]; V s2d* dyl0 = (s2d*) shtns->dykm_dct[0]; // only theta derivative (d/dphi = 0 for m=0) #ifndef _GCC_VEC_ Q double* Ql = (double*) Qlm; S double* Sl = (double*) Slm; T double* Tl = (double*) Tlm; Q re = 0.0; ro = 0.0; QE re = yl[0] * Ql[0]; QO ro = yl[1] * Ql[2]; Q yl+=2; k=0; l = 2; do { S te = 0.0; to = 0.0; T pe = 0.0; po = 0.0; SO te = dyl0[0] * Sl[2*l-2]; // l-1 TE pe = -dyl0[0] * Tl[2*l-2]; // l-1 V ++dyl0; while(l>1) - (llim>>1))*2; V dyl0 += (((LMAX+1)>>1) - ((llim+1)>>1))*2; #endif k+=2; l = k; } while (k<=llim+1); #else Q s2d r = yl[0] * Ql0[0]; // first Q [l=0 and 1] l=0; k=0; do { l >>= 1; // l = l/2; S s2d t = vdup(0.0); T s2d p = vdup(0.0); QX s2d r1 = vdup(0.0); do { Q r += yl[l+1] * Ql0[l+1]; // { re, ro } S t += dyl0[l] * Sl0[l]; // { te, to } T p -= dyl0[l] * Tl0[l]; // { pe, po } V ++l; QX r1 += yl[l+2] * Ql0[l+2]; // { re, ro } QX l+=2; QX } while(2*l < llim-1); // 2*(l+1) <= llim V } while(2*l < llim); QX r += r1; Q yl += (LMAX -k)>>1; V dyl0 += (LMAX+1 -k)>>1; #ifndef SHT_AXISYM Q BR0(k) = vlo_to_dbl(r); BR0(k+1) = vhi_to_dbl(r); VX BT0(k) = 0.0; BT0(k+1) = 0.0; S BT0(k) = vlo_to_dbl(t); BT0(k+1) = vhi_to_dbl(t); VX BP0(k) = 0.0; BP0(k+1) = 0.0; T BP0(k) = vlo_to_dbl(p); BP0(k+1) = vhi_to_dbl(p); #else Q *((s2d*)(((double*)BrF)+k)) = r; S *((s2d*)(((double*)BtF)+k)) = t; T *((s2d*)(((double*)BpF)+k)) = p; #endif l = k+1; k+=2; Q r = vdup(0.0); QX } while (lidct,Vr, Vr); // iDCT m=0 S fftw_execute_r2r(shtns->idct,Vt, Vt); // iDCT m=0 T fftw_execute_r2r(shtns->idct,Vp, Vp); // iDCT m=0 V double* st_1 = shtns->st_1; V k=0; do { V #ifdef _GCC_VEC_ V v2d sin_1 = ((v2d *)st_1)[k]; S ((v2d *)Vt)[k] *= sin_1; T ((v2d *)Vp)[k] *= sin_1; V #else V double sin_1 = st_1[2*k]; double sin_2 = st_1[2*k+1]; S Vt[2*k] *= sin_1; Vt[2*k+1] *= sin_2; T Vp[2*k] *= sin_1; Vp[2*k+1] *= sin_2; V #endif V } while (++kylm[0]; V s2d* dyl0 = (s2d*) shtns->dylm[0]; // only theta derivative (d/dphi = 0 for m=0) do { // ops : NLAT_2 * [ (lmax-m+1) + 4] : almost twice as fast. #ifndef _GCC_VEC_ l=1; Q re = yl[0] * Ql[0]; // re += ylm[im][k*(LMAX-m+1) + (l-m)] * Qlm[LiM(l,im)]; Q ++yl; Q ro = 0.0; S te = 0.0; to = 0.0; T pe = 0.0; po = 0.0; while (l>1) - (llim>>1))*2; V dyl0 += (((LMAX+1)>>1) - ((llim+1)>>1))*2; #endif Q BR0(k) = re + ro; V #ifndef SHT_AXISYM V BT0(k) = 0.0; BP0(k) = 0.0; // required for partial tor or sph transform V #endif S BT0(k) = te + to; // Bt = dS/dt T BP0(k) = pe + po; // Bp = - dT/dt QB BR0(NLAT-1-k) = re - ro; VB #ifndef SHT_AXISYM VB BT0(NLAT-1-k) = 0.0; BP0(NLAT-1-k) = 0.0; // required for partial tor or sph transform VB #endif SB BT0(NLAT-1-k) = te - to; TB BP0(NLAT-1-k) = pe - po; #else l=0; S s2d t = vdup(0.0); T s2d p = vdup(0.0); Q s2d r = vdup(0.0); QX s2d r1 = vdup(0.0); do { S t += dyl0[l] * Sl0[l]; // { te, to } T p -= dyl0[l] * Tl0[l]; // { pe, po } Q r += yl[l] * Ql0[l]; // { re, ro } ++l; QX r1 += yl[l] * Ql0[l]; // reduce dependency QX ++l; Q } while (2*l <= llim); VX } while (2*l < llim); QX r += r1; Q yl += (LMAX+2)>>1; V dyl0 += (LMAX+1)>>1; #if __SSE3__ /* alternate code, which may be faster (slightly) on SSE3. */ Q r = addi(r,r); // { re-ro , re+ro } S t = addi(t,t); // { te-to , te+to } T p = addi(p,p); // { pe-po , pe+po } Q BR0(k) = vhi_to_dbl(r); V #ifndef SHT_AXISYM V BT0(k) = 0.0; BP0(k) = 0.0; // required for partial tor or sph transform V #endif S BT0(k) = vhi_to_dbl(t); // Bt = dS/dt T BP0(k) = vhi_to_dbl(p); // Bp = - dT/dt QB BR0(NLAT-1-k) = vlo_to_dbl(r); VB #ifndef SHT_AXISYM VB BT0(NLAT-1-k) = 0.0; BP0(NLAT-1-k) = 0.0; // required for partial tor or sph transform VB #endif SB BT0(NLAT-1-k) = vlo_to_dbl(t); TB BP0(NLAT-1-k) = vlo_to_dbl(p); #else Q BR0(k) = vhi_to_dbl(r) + vlo_to_dbl(r); V #ifndef SHT_AXISYM V BT0(k) = 0.0; BP0(k) = 0.0; // required for partial tor or sph transform V #endif S BT0(k) = vhi_to_dbl(t) + vlo_to_dbl(t); // Bt = dS/dt T BP0(k) = vhi_to_dbl(p) + vlo_to_dbl(p); // Bp = - dT/dt QB BR0(NLAT-1-k) = vlo_to_dbl(r) - vhi_to_dbl(r); VB #ifndef SHT_AXISYM VB BT0(NLAT-1-k) = 0.0; BP0(NLAT-1-k) = 0.0; // required for partial tor or sph transform VB #endif SB BT0(NLAT-1-k) = vlo_to_dbl(t) - vhi_to_dbl(t); TB BP0(NLAT-1-k) = vlo_to_dbl(p) - vhi_to_dbl(p); #endif #endif } while (++k < NLAT_2); } #endif // ifndef SHT_NO_DCT #ifndef SHT_AXISYM im=1; Q BrF += NLAT; V BtF += NLAT; BpF += NLAT; #ifndef SHT_NO_DCT while(im<=imlim_dct) { // dct for im <= MTR_DCT m=im*MRES; l = LiM(shtns, 0,im); Q v2d* Ql = (v2d*) &Qlm[l]; // virtual pointer for l=0 and im S v2d* Sl = (v2d*) &Slm[l]; T v2d* Tl = (v2d*) &Tlm[l]; Q double* yl = shtns->ykm_dct[im]; V dyl = shtns->dykm_dct[im]; k=0; l=m; do { Q v2d re = vdup(0.0); v2d ro = vdup(0.0); V v2d te = vdup(0.0); v2d to = vdup(0.0); v2d pe = vdup(0.0); v2d po = vdup(0.0); V v2d dte = vdup(0.0); v2d dto = vdup(0.0); v2d dpe = vdup(0.0); v2d dpo = vdup(0.0); while(ltm[im]; while (k BrF[im*NLAT + NLAT-(k+1)] = 0.0; V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0); VB BtF[NLAT-l + k] = vdup(0.0); BpF[NLAT-l + k] = vdup(0.0); // south pole zeroes ++k; } Q double* yl = shtns->ylm[im]; V dyl = shtns->dylm[im]; do { // ops : NLAT_2 * [ (lmax-m+1)*2 + 4] : almost twice as fast. l=m; Q v2d re = vdup(0.0); v2d ro = vdup(0.0); V v2d dte = vdup(0.0); v2d dto = vdup(0.0); v2d pe = vdup(0.0); v2d po = vdup(0.0); V v2d dpe = vdup(0.0); v2d dpo = vdup(0.0); v2d te = vdup(0.0); v2d to = vdup(0.0); while (lst[k]*m_1); V BtF[k] = addi(dte+dto, te+to); // Bt = dS/dt + I.m/sint *T VB BtF[NLAT-1-k] = addi(dte-dto, te-to); 3 re *= qv; ro *= qv; V BpF[k] = addi(dpe+dpo, pe+po); // Bp = I.m/sint * S - dT/dt VB BpF[NLAT-1-k] = addi(dpe-dpo, pe-po); Q BrF[k] = re + ro; QB BrF[NLAT-1-k] = re - ro; #ifdef SHT_VAR_LTR Q yl += (LMAX-llim); V dyl += (LMAX-llim); #endif } while (++k < NLAT_2); ++im; Q BrF += NLAT; V BtF += NLAT; BpF += NLAT; } for (k=0; k < NLAT*((NPHI>>1) -imlim); ++k) { // padding for high m's Q BrF[k] = vdup(0.0); V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0); } Q BrF -= NLAT*(imlim+1); // restore original pointer V BtF -= NLAT*(imlim+1); BpF -= NLAT*(imlim+1); // restore original pointer // NPHI > 1 as SHT_AXISYM is not defined. #ifndef SHT_NO_DCT Q fftw_execute_r2r(shtns->idct,(double *) BrF, (double *) BrF); // iDCT V fftw_execute_r2r(shtns->idct,(double *) BtF, (double *) BtF); // iDCT V fftw_execute_r2r(shtns->idct,(double *) BpF, (double *) BpF); // iDCT V double* st_1 = shtns->st_1; V k=0; do { // m=0 V double sin_1 = st_1[k]; double sin_2 = st_1[k+1]; V ((double *)BtF)[2*k] *= sin_1; ((double *)BpF)[2*k] *= sin_1; V ((double *)BtF)[2*k+2] *= sin_2; ((double *)BpF)[2*k+2] *= sin_2; V k+=2; V } while(kst_1; Q for (im=1; im<=MTR_DCT; im+=2) { // odd m's Q k=0; do { Q ((v2d *)BrF)[im*NLAT + k] *= vdup(st_1[k]); ((v2d *)BrF)[im*NLAT + k+1] *= vdup(st_1[k+1]); Q k+=2; Q } while(kncplx_fft >= 0) { Q fftw_execute_dft_c2r(shtns->ifft, (cplx *) BrF, Vr); V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BtF, Vt); V fftw_execute_dft_c2r(shtns->ifft, (cplx *) BpF, Vp); if (shtns->ncplx_fft > 0) { // free memory Q VFREE(BrF); VX VFREE(BtF); // this frees also BpF. } } #endif Q #undef BR0 V #undef BT0 V #undef BP0 }