/* * Copyright (c) 2010-2015 Centre National de la Recherche Scientifique. * written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France). * * nathanael.schaeffer@ujf-grenoble.fr * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. * */ # This file is meta-code for SHT.c (spherical harmonic transform). # it is intended for "make" to generate C code for 3 similar SHT functions, # (namely spat_to_SH [Q tag], spat_to_SHsphtor [V tag], spat_to_SH3 [both Q&V tags]) # from one generic function + tags. # Basically, there are tags at the beginning of lines (Q,V) that are information # to keep or remove the line depending on the function to build. 
(Q for scalar, V for vector, # for comment) # ////////////////////////////////////////////////// static QX void GEN3(_an1,NWAY,SUFFIX)(shtns_cfg shtns, double *BrF, cplx *Qlm, const long int llim, const int imlim) { VX void GEN3(_an2,NWAY,SUFFIX)(shtns_cfg shtns, double *BtF, double *BpF, cplx *Slm, cplx *Tlm, const long int llim, const int imlim) { 3 void GEN3(_an3,NWAY,SUFFIX)(shtns_cfg shtns, double *BrF, double *BtF, double *BpF, cplx *Qlm, cplx *Slm, cplx *Tlm, const long int llim, const int imlim) { double *alm, *al; double *wg, *ct, *st; V double *l_2; long int nk, k, l,m; int k_inc, m_inc; unsigned m0, mstep; #ifndef SHT_AXISYM unsigned im; V double m_1; #endif #if _GCC_VEC_ Q rnd qq[2*llim]; V rnd ss[2*llim]; V rnd tt[2*llim]; #else Q double qq[llim]; V double ss[llim]; V double tt[llim]; #endif Q double rer[NLAT_2 + NWAY*VSIZE2] SSE; Q double ror[NLAT_2 + NWAY*VSIZE2] SSE; V double ter[NLAT_2 + NWAY*VSIZE2] SSE; V double tor[NLAT_2 + NWAY*VSIZE2] SSE; V double per[NLAT_2 + NWAY*VSIZE2] SSE; V double por[NLAT_2 + NWAY*VSIZE2] SSE; #ifndef SHT_AXISYM Q double rei[NLAT_2 + NWAY*VSIZE2] SSE; Q double roi[NLAT_2 + NWAY*VSIZE2] SSE; V double tei[NLAT_2 + NWAY*VSIZE2] SSE; V double toi[NLAT_2 + NWAY*VSIZE2] SSE; V double pei[NLAT_2 + NWAY*VSIZE2] SSE; V double poi[NLAT_2 + NWAY*VSIZE2] SSE; #endif // ACCESS PATTERN k_inc = shtns->k_stride_a; m_inc = shtns->m_stride_a; nk = NLAT_2; // copy NLAT_2 to a local variable for faster access (inner loop limit) #if _GCC_VEC_ nk = ((unsigned) nk+(VSIZE2-1))/VSIZE2; #endif wg = shtns->wg; ct = shtns->ct; st = shtns->st; V l_2 = shtns->l_2; for (k=nk*VSIZE2; k<(nk-1+NWAY)*VSIZE2; ++k) { // never written, so this is now done for all m's Q rer[k] = 0.0; ror[k] = 0.0; V ter[k] = 0.0; tor[k] = 0.0; V per[k] = 0.0; por[k] = 0.0; #ifndef SHT_AXISYM Q rei[k] = 0.0; roi[k] = 0.0; V tei[k] = 0.0; toi[k] = 0.0; V pei[k] = 0.0; poi[k] = 0.0; #endif } #ifndef _OPENMP m0 = 0; mstep = 1; #else m0 = omp_get_thread_num(); mstep = 
omp_get_num_threads(); if (m0 == 0) #endif { // im=0 : dzl.p = 0.0 and evrything is REAL alm = shtns->blm; V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0) V double an = BtF[k*k_inc]; double bn = BtF[k*k_inc +1]; V double bs = BtF[(NLAT-2-k)*k_inc]; double as = BtF[(NLAT-2-k)*k_inc +1]; V ter[k] = an+as; tor[k] = an-as; V ter[k+1] = bn+bs; tor[k+1] = bn-bs; V k+=2; V } while(k < nk*VSIZE2); V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0) V double an = BpF[k*k_inc]; double bn = BpF[k*k_inc +1]; V double bs = BpF[(NLAT-2-k)*k_inc]; double as = BpF[(NLAT-2-k)*k_inc +1]; V per[k] = an+as; por[k] = an-as; V per[k+1] = bn+bs; por[k+1] = bn-bs; V k+=2; V } while(k < nk*VSIZE2); Q double r0a = 0.0; double r0b = 0.0; Q k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0) Q double an = BrF[k*k_inc]; double bn = BrF[k*k_inc +1]; Q double bs = BrF[(NLAT-2-k)*k_inc]; double as = BrF[(NLAT-2-k)*k_inc +1]; Q rer[k] = an+as; ror[k] = an-as; Q rer[k+1] = bn+bs; ror[k+1] = bn-bs; Q r0a += (an+as)*wg[k]; r0b += (bn+bs)*wg[k+1]; Q k+=2; Q } while(k < nk*VSIZE2); Q Qlm[0] = (r0a+r0b) * alm[0]; // l=0 is done. V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform. k = 0; for (l=0;l= imlim l = LiM(shtns, imlim*MRES, imlim); do { Q ((v2d*)Qlm)[l] = vdup(0.0); V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0); } while(++l < shtns->nlm); } #endif #endif m0=mstep; } #ifndef SHT_AXISYM for (im=m0; imtm[im] / VSIZE2; //alm = shtns->blm[im]; alm = shtns->blm + im*(2*LMAX -m+MRES); Q k = ((l*VSIZE2)>>1)*2; // k must be even here. Q do { // compute symmetric and antisymmetric parts, and reorganize data. 
Q double an, bn, ani, bni, bs, as, bsi, asi, t; 3 double sina = st[k]; double sinb = st[k+1]; Q ani = BrF[im*m_inc + k*k_inc]; bni = BrF[im*m_inc + k*k_inc +1]; // north Q an = BrF[(NPHI-im)*m_inc + k*k_inc]; bn = BrF[(NPHI-im)*m_inc + k*k_inc +1]; Q t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t; 3 an *= sina; ani*= sina; bn *= sinb; bni *= sinb; Q bsi = BrF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BrF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south Q bs = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BrF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1]; Q t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t; 3 as *= sina; asi*= sina; bs *= sinb; bsi *= sinb; Q rer[k] = an+as; rei[k] = ani+asi; rer[k+1] = bn+bs; rei[k+1] = bni+bsi; Q ror[k] = an-as; roi[k] = ani-asi; ror[k+1] = bn-bs; roi[k+1] = bni-bsi; Q k+=2; Q } while (k>1)*2; // k must be even here. V do { // compute symmetric and antisymmetric parts, and reorganize data. V double an, bn, ani, bni, bs, as, bsi, asi, t; V ani = BtF[im*m_inc + k*k_inc]; bni = BtF[im*m_inc + k*k_inc +1]; // north V an = BtF[(NPHI-im)*m_inc + k*k_inc]; bn = BtF[(NPHI-im)*m_inc + k*k_inc +1]; V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t; V bsi = BtF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BtF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south V bs = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BtF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1]; V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t; V ter[k] = an+as; tei[k] = ani+asi; ter[k+1] = bn+bs; tei[k+1] = bni+bsi; V tor[k] = an-as; toi[k] = ani-asi; tor[k+1] = bn-bs; toi[k+1] = bni-bsi; V k+=2; V } while (k>1)*2; // k must be even here. V do { // compute symmetric and antisymmetric parts, and reorganize data. 
V double an, bn, ani, bni, bs, as, bsi, asi, t; V ani = BpF[im*m_inc + k*k_inc]; bni = BpF[im*m_inc + k*k_inc +1]; // north V an = BpF[(NPHI-im)*m_inc + k*k_inc]; bn = BpF[(NPHI-im)*m_inc + k*k_inc +1]; V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t; V bsi = BpF[im*m_inc + (NLAT-2 -k)*k_inc]; asi = BpF[im*m_inc + (NLAT-2-k)*k_inc + 1]; // south V bs = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc]; as = BpF[(NPHI-im)*m_inc +(NLAT-2-k)*k_inc +1]; V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t; V per[k] = an+as; pei[k] = ani+asi; per[k+1] = bn+bs; pei[k+1] = bni+bsi; V por[k] = an-as; poi[k] = ani-asi; por[k+1] = bn-bs; poi[k+1] = bni-bsi; V k+=2; V } while (k=0; l--) { Q q[0] = vall(0.0); q[1] = vall(0.0); q+=2; V s[0] = vall(0.0); s[1] = vall(0.0); s+=2; V t[0] = vall(0.0); t[1] = vall(0.0); t+=2; } do { #if _GCC_VEC_ Q rnd* q = qq; V rnd* s = ss; rnd* t = tt; #else l = LiM(shtns, m, im); Q double* q = (double *) &Qlm[l]; V double* s = (double *) &Slm[l]; V double* t = (double *) &Tlm[l]; #endif al = alm; rnd cost[NWAY], y0[NWAY], y1[NWAY]; V rnd st2[NWAY], dy0[NWAY], dy1[NWAY]; Q rnd rerk[NWAY], reik[NWAY], rork[NWAY], roik[NWAY]; // help the compiler to cache into registers. 
# ------------------------------------------------------------------
# NOTE(review): tail of the per-m accumulation loop of the _an kernel
# above (register-cached NWAY-wide Legendre recurrence, with
# sin(theta)^m computed by rescaling to avoid underflow, per the
# surviving comments), followed by the public entry point generated
# from this template.  The wrapper performs the phi -> m Fourier
# transform first: either in-place fftw_execute_dft, or a split
# out-of-place fftw_execute_split_dft into a transpose buffer obtained
# with VMALLOC; it then runs the _an kernel inside an
# "#pragma omp parallel" region sized by shtns->nthreads.
# NOTE(review): buffer ownership as shown by the code: in the [3]
# (combined) variant BrF owns one 3*nv allocation with BtF/BpF
# pointing into it, and the single VFREE(BrF) / VFREE(BtF) at the end
# releases everything ("this frees also BpF").
# NOTE(review): same extraction damage as the kernel above -- loop
# conditions and comparisons appear truncated (e.g.
# "for (int j=0; j>= 1)" and "if (shtns->mres (unsigned) llim)").
# Kept byte-identical; TODO restore from upstream SHTns before
# regenerating.
# ------------------------------------------------------------------
V rnd terk[NWAY], teik[NWAY], tork[NWAY], toik[NWAY]; V rnd perk[NWAY], peik[NWAY], pork[NWAY], poik[NWAY]; for (int j=0; j>= 1); } else { long int nsint = 0; do { // sin(theta)^m (use rescaling to avoid underflow) if (l&1) { for (int j=0; j>= 1); } for (int j=0; j SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant ++ny; for (int j=0; j (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster #endif if (shtns->fftc_mode >= 0) { if (shtns->fftc_mode == 0) { // in-place V #ifdef HAVE_LIBFFTW3_OMP Q fftw_execute_dft(shtns->fftc,(cplx*)BrF, (cplx*)BrF); V fftw_execute_dft(shtns->fftc,(cplx*)BtF, (cplx*)BtF); V fftw_execute_dft(shtns->fftc,(cplx*)BpF, (cplx*)BpF); V #endif } else { // alloc memory for the transpose FFT unsigned long nv = shtns->nspat; QX BrF = (double*) VMALLOC( nv * sizeof(double) ); VX BtF = (double*) VMALLOC( 2*nv * sizeof(double) ); VX BpF = BtF + nv; 3 BrF = (double*) VMALLOC( 3*nv * sizeof(double) ); 3 BtF = BrF + nv; BpF = BtF + nv; V #ifdef HAVE_LIBFFTW3_OMP Q fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, BrF+1, BrF); V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, BtF+1, BtF); V fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, BpF+1, BpF); V #endif } } #endif imlim += 1; #pragma omp parallel num_threads(shtns->nthreads) { #ifndef SHT_AXISYM V #ifndef HAVE_LIBFFTW3_OMP V if (shtns->fftc_mode == 0) { // in-place 3 #pragma omp single nowait 3 fftw_execute_dft(shtns->fftc,(cplx*)BrF, (cplx*)BrF); V #pragma omp single nowait V fftw_execute_dft(shtns->fftc,(cplx*)BtF, (cplx*)BtF); V #pragma omp single nowait V fftw_execute_dft(shtns->fftc,(cplx*)BpF, (cplx*)BpF); V } else if (shtns->fftc_mode > 0) { // split out-of-place 3 #pragma omp single nowait 3 fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, ((double*)BrF)+1, ((double*)BrF)); V #pragma omp single nowait V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, ((double*)BtF)+1, ((double*)BtF)); V #pragma omp single nowait V 
fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, ((double*)BpF)+1, ((double*)BpF)); V } V #pragma omp barrier V #endif #endif QX GEN3(_an1,NWAY,SUFFIX)(shtns, BrF, Qlm, llim, imlim); VX GEN3(_an2,NWAY,SUFFIX)(shtns, BtF, BpF, Slm, Tlm, llim, imlim); 3 GEN3(_an3,NWAY,SUFFIX)(shtns, BrF, BtF, BpF, Qlm, Slm, Tlm, llim, imlim); } #ifndef SHT_AXISYM if (shtns->fftc_mode > 0) { // free memory Q VFREE(BrF); VX VFREE(BtF); // this frees also BpF. } #endif }