Context Navigation

sht_init.c

main

Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 88.5 KB

Rev	Line
[2131108]	1	/*
	2	* Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
	3	* written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
	4	*
	5	* nathanael.schaeffer@ujf-grenoble.fr
	6	*
	7	* This software is governed by the CeCILL license under French law and
	8	* abiding by the rules of distribution of free software. You can use,
	9	* modify and/or redistribute the software under the terms of the CeCILL
	10	* license as circulated by CEA, CNRS and INRIA at the following URL
	11	* "http://www.cecill.info".
	12	*
	13	* The fact that you are presently reading this means that you have had
	14	* knowledge of the CeCILL license and that you accept its terms.
	15	*
	16	*/
	17
	18	/********************************************************************
	19	* SHTns : Spherical Harmonic Transform for numerical simulations. *
	20	* written by Nathanael Schaeffer / CNRS *
	21	********************************************************************/
	22
	23	/** \internal \file SHT.c
	24	* \brief main source file for SHTns.
	25	* This files contains initialization code and also some partial transforms (point or latitudinal evaluations)
	26	*/
	27
	28	#include <stdio.h>
	29	#include <string.h>
	30	// global variables definitions
	31	#include "sht_private.h"
	32
	33	// cycle counter from FFTW
	34	#include "fftw3/cycle.h"
	35
	36	// chained list of sht_setup : start with NULL
	37	shtns_cfg sht_data = NULL;
	38	#ifdef _OPENMP
	39	int omp_threads = 1; // multi-thread disabled by default.
	40	#if HAVE_LIBFFTW3_OMP
	41	#define OMP_FFTW
	42	#endif
	43	#else
	44	#define omp_threads 1
	45	#endif
	46
	47	static int verbose = 0; // runtime verbosity control: 0 no output, 1 output, 2 debug (if compiled in)
	48	void shtns_verbose(int v) {
	49	verbose = v;
	50	}
	51
	52
	53	/// \internal Abort program with error message.
	54	static void shtns_runerr(const char * error_text)
	55	{
	56	printf("*** [" PACKAGE_NAME "] Run-time error : %s\n",error_text);
	57	exit(1);
	58	}
	59
	60	/* PUBLIC useful functions */
	61
	62	/// returns the l=0, m=0 SH coefficient corresponding to a uniform value of 1.
	63	double sh00_1(shtns_cfg shtns) {
	64	return shtns->Y00_1;
	65	}
	66	/// returns the l=1, m=0 SH coefficient corresponding to cos(theta).
	67	double sh10_ct(shtns_cfg shtns) {
	68	return shtns->Y10_ct;
	69	}
	70	/// returns the l=1, m=1 SH coefficient corresponding to sin(theta).cos(phi).
	71	double sh11_st(shtns_cfg shtns) {
	72	return shtns->Y11_st;
	73	}
	74	/// returns the l,m SH coefficient corresponding to unit energy.
	75	double shlm_e1(shtns_cfg shtns, int l, int m) {
	76	double x = shtns->Y00_1/sqrt(4.*M_PI);
	77	if (SHT_NORM == sht_schmidt) x = sqrt(2l+1);
	78	if ((m!=0)&&((shtns->norm & SHT_REAL_NORM)==0)) x *= sqrt(0.5);
	79	return(x);
	80	}
	81
	82	/// \code return (mmax+1)(lmax+1) - mres(mmax(mmax+1))/2; \endcode /
	83	/// \ingroup init
	84	long nlm_calc(long lmax, long mmax, long mres)
	85	{
	86	if (mmax*mres > lmax) mmax = lmax/mres;
	87	return (mmax+1)(lmax+1) - ((mmaxmres)(mmax+1))/2; // this is wrong if lmax < mmaxmres
	88	}
	89
	90
	91	/* LEGENDRE FUNCTIONS */
	92	#include "sht_legendre.c"
	93
	94
	95	/* SHT FUNCTIONS */
	96	#include "sht_func.c"
	97
	98	#include "sht_com.c"
	99
	100	/*
	101	INTERNAL INITIALIZATION FUNCTIONS
	102	*/
	103
	104	// sht algorithms (hyb, fly1, ...)
	105	enum sht_algos { SHT_DCT, SHT_MEM, SHT_SV,
	106	SHT_FLY1, SHT_FLY2, SHT_FLY3, SHT_FLY4, SHT_FLY6, SHT_FLY8,
	107	SHT_OMP1, SHT_OMP2, SHT_OMP3, SHT_OMP4, SHT_OMP6, SHT_OMP8,
	108	SHT_NALG };
	109
	110	char* sht_name[SHT_NALG] = {"dct", "mem", "s+v", "fly1", "fly2", "fly3", "fly4", "fly6", "fly8", "omp1", "omp2", "omp3", "omp4", "omp6", "omp8" };
	111	char* sht_type[SHT_NTYP] = {"syn", "ana", "vsy", "van", "gsp", "gto", "v3s", "v3a" };
	112	char* sht_var[SHT_NVAR] = {"std", "ltr", "m" };
	113	int sht_npar[SHT_NTYP] = {2, 2, 4, 4, 3, 3, 6, 6};
	114
	115	#ifdef SHTNS_MEM
	116	extern void* fmem[SHT_NTYP];
	117	extern void* fmem_l[SHT_NTYP];
	118	extern void* fmem_m0[SHT_NTYP];
	119	extern void* fmem_m0l[SHT_NTYP];
	120	#endif
	121	#ifdef SHTNS_DCT
	122	extern void* fdct[SHT_NTYP];
	123	extern void* fdct_l[SHT_NTYP];
	124	extern void* fdct_m0[SHT_NTYP];
	125	extern void* fdct_m0l[SHT_NTYP];
	126	#endif
	127	extern void* ffly[6][SHT_NTYP];
	128	extern void* ffly_m[6][SHT_NTYP];
	129	extern void* ffly_m0[6][SHT_NTYP];
	130	#ifdef _OPENMP
	131	extern void* fomp[6][SHT_NTYP];
	132	#endif
	133
	134	// big array holding all sht functions, variants and algorithms
	135	void* sht_func[SHT_NVAR][SHT_NALG][SHT_NTYP];
	136
	137	/// \internal use on-the-fly alogorithm (guess without measuring)
	138	static void set_sht_fly(shtns_cfg shtns, int typ_start)
	139	{
	140	int algo = SHT_FLY2;
	141	if ((shtns->nthreads > 1) && (sht_func[0][SHT_OMP2][typ_start])) algo = SHT_OMP2;
	142	for (int it=typ_start; it<SHT_NTYP; it++) {
	143	for (int v=0; v<SHT_NVAR; v++)
	144	shtns->ftable[v][it] = sht_func[v][algo][it];
	145	}
	146	}
	147
	148	#ifdef SHTNS_MEM
	149	/// \internal choose memory algorithm everywhere.
	150	static void set_sht_mem(shtns_cfg shtns) {
	151	for (int it=0; it<SHT_NTYP; it++) {
	152	for (int v=0; v<SHT_NVAR; v++)
	153	shtns->ftable[v][it] = sht_func[v][SHT_MEM][it];
	154	shtns->ftable[SHT_M][it] = sht_func[SHT_M][SHT_FLY2][it]; // there is no "mem" algo for SHT_M
	155	}
	156	}
	157	#endif
	158
	159	/// \internal copy all algos to sht_func array (should be called by set_grid before choosing variants).
	160	/// if nphi is 1, axisymmetric algorithms are used.
	161	static void init_sht_array_func(shtns_cfg shtns)
	162	{
	163	int it, j;
	164	int alg_lim = SHT_FLY8;
	165
	166	if (shtns->nlat_2 < 8*VSIZE2) { // limit available on-the-fly algorithm to avoid overflow (and segfaults).
	167	it = shtns->nlat_2 / VSIZE2;
	168	switch(it) {
	169	case 0 : alg_lim = SHT_FLY1-1; break;
	170	case 1 : alg_lim = SHT_FLY1; break;
	171	case 2 : alg_lim = SHT_FLY2; break;
	172	case 3 : alg_lim = SHT_FLY3; break;
	173	case 4 : ;
	174	case 5 : alg_lim = SHT_FLY4; break;
	175	default : alg_lim = SHT_FLY6;
	176	}
	177	}
	178	alg_lim -= SHT_FLY1;
	179
	180	memset(sht_func, 0, SHT_NVARSHT_NTYPSHT_NALGsizeof(void) ); // zero out.
	181	sht_func[SHT_STD][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2;
	182	sht_func[SHT_LTR][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2l;
	183	sht_func[SHT_STD][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2;
	184	sht_func[SHT_LTR][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2l;
	185	sht_func[SHT_M][SHT_SV][SHT_TYP_3SY] = SHqst_to_spat_2ml;
	186	sht_func[SHT_M][SHT_SV][SHT_TYP_3AN] = spat_to_SHqst_2ml;
	187
	188	if (shtns->nphi==1) { // axisymmetric transform requested.
	189	for (int j=0; j<=alg_lim; j++) {
	190	memcpy(sht_func[SHT_STD][SHT_FLY1 + j], &ffly_m0[j], sizeof(void)SHT_NTYP);
	191	memcpy(sht_func[SHT_LTR][SHT_FLY1 + j], &ffly_m0[j], sizeof(void)SHT_NTYP);
	192	}
	193	#ifdef SHTNS_DCT
	194	memcpy(sht_func[SHT_STD][SHT_DCT], &fdct_m0, sizeof(void)SHT_NTYP);
	195	memcpy(sht_func[SHT_LTR][SHT_DCT], &fdct_m0l, sizeof(void)SHT_NTYP);
	196	#endif
	197	#ifdef SHTNS_MEM
	198	memcpy(sht_func[SHT_STD][SHT_MEM], &fmem_m0, sizeof(void)SHT_NTYP);
	199	memcpy(sht_func[SHT_LTR][SHT_MEM], &fmem_m0l, sizeof(void)SHT_NTYP);
	200	#endif
	201	} else {
	202	for (int j=0; j<=alg_lim; j++) {
	203	memcpy(sht_func[SHT_STD][SHT_FLY1 + j], &ffly[j], sizeof(void)SHT_NTYP);
	204	memcpy(sht_func[SHT_LTR][SHT_FLY1 + j], &ffly[j], sizeof(void)SHT_NTYP);
	205	memcpy(sht_func[SHT_M][SHT_FLY1 + j], &ffly_m[j], sizeof(void)SHT_NTYP);
	206	#ifdef _OPENMP
	207	memcpy(sht_func[SHT_STD][SHT_OMP1 + j], &fomp[j], sizeof(void)SHT_NTYP);
	208	memcpy(sht_func[SHT_LTR][SHT_OMP1 + j], &fomp[j], sizeof(void)SHT_NTYP);
	209	memcpy(sht_func[SHT_M][SHT_OMP1 + j], &ffly_m[j], sizeof(void)SHT_NTYP); // no omp algo for SHT_M, use fly instead
	210	#endif
	211	}
	212	#ifdef SHTNS_DCT
	213	memcpy(sht_func[SHT_STD][SHT_DCT], &fdct, sizeof(void)SHT_NTYP);
	214	memcpy(sht_func[SHT_LTR][SHT_DCT], &fdct_l, sizeof(void)SHT_NTYP);
	215	#endif
	216	#ifdef SHTNS_MEM
	217	memcpy(sht_func[SHT_STD][SHT_MEM], &fmem, sizeof(void)SHT_NTYP);
	218	memcpy(sht_func[SHT_LTR][SHT_MEM], &fmem_l, sizeof(void)SHT_NTYP);
	219	#endif
	220	}
	221
	222	#ifdef SHTNS_MEM
	223	set_sht_mem(shtns); // default transform is MEM
	224	#else
	225	set_sht_fly(shtns, 0);
	226	#endif
	227	}
	228
	229
	230	/// \internal return the smallest power of 2 larger than n.
	231	static int next_power_of_2(int n)
	232	{
	233	int f = 1;
	234	if ( (n<=0) \|\| (n>(1<<(sizeof(int)*8-2))) ) return 0;
	235	while (f<n) f*=2;
	236	return f;
	237	}
	238
	239	/// \internal find the closest integer that is larger than n and that contains only factors up to fmax.
	240	/// fmax is 7 for optimal FFTW fourier transforms.
	241	/// return only even integers for n>fmax.
	242	static int fft_int(int n, int fmax)
	243	{
	244	int k,f;
	245
	246	if (n<=fmax) return n;
	247	if (fmax<2) return 0;
	248	if (fmax==2) return next_power_of_2(n);
	249
	250	n -= 2-(n&1); // only even n
	251	do {
	252	n+=2; f=2;
	253	while ((2f <= n) && ((n&f)==0)) f = 2; // no divisions for factor 2.
	254	k=3;
	255	while ((k<=fmax) && (k*f <= n)) {
	256	while ((kf <= n) && (n%(kf)==0)) f *= k;
	257	k+=2;
	258	}
	259	} while (f != n);
	260
	261	k = next_power_of_2(n); // what is the closest power of 2 ?
	262	if ((k-n)*33 < n) return k; // rather choose power of 2 if not too far (3%)
	263
	264	return n;
	265	}
	266
	267
	268	/// \internal returns an aproximation of the memory usage in mega bytes for the scalar matrices.
	269	/// \ingroup init
	270	static double sht_mem_size(int lmax, int mmax, int mres, int nlat)
	271	{
	272	double s = 1./(1024*1024);
	273	s = ((nlat+1)/2) sizeof(double) * nlm_calc(lmax, mmax, mres);
	274	return s;
	275	}
	276
	277	/// \internal return the number of shtns config that contain a reference to the memory location *pp.
	278	static int ref_count(shtns_cfg shtns, void* pp)
	279	{
	280	shtns_cfg s2 = sht_data;
	281	void* p;
	282	long int nref, offset;
	283
	284	if ((pp==NULL) \|\| (shtns==NULL)) return -1; // error.
	285
	286	p = (void*)pp; // the pointer to memory location that we want to free
	287	if (p == NULL) return 0; // nothing to do.
	288
	289	offset = (char)pp - (char)shtns; // relative location in shtns_info structure.
	290	nref = 0; // reference count.
	291	while (s2 != NULL) { // scan all configs.
	292	if ( (void)(((char)s2) + offset) == p ) nref++;
	293	s2 = s2->next;
	294	}
	295	return nref; // will be >= 1 (as shtns is included in search)
	296	}
	297
	298	/// \internal check if the memory location *pp is referenced by another sht config before freeing it.
	299	/// returns the number of other references, or -1 on error. If >=1, the memory location could not be freed.
	300	/// If the return value is 0, the ressource has been freed.
	301	static int free_unused(shtns_cfg shtns, void* pp)
	302	{
	303	int n = ref_count(shtns, pp); // howmany shtns config do reference this memory location ?
	304	if (n <= 0) return n; // nothing to free.
	305	if (n == 1) { // no other reference found...
	306	void ap = (void) pp;
	307	free(*ap); // ...so we can free it...
	308	*ap = NULL; // ...and mark as unaloccated.
	309	}
	310	return (n-1);
	311	}
	312
	313	#ifdef SHTNS_MEM
	314	/// \internal Free matrices if on-the-fly has been selected.
	315	static void free_unused_matrices(shtns_cfg shtns)
	316	{
	317	long int marray_size = (MMAX+1)*sizeof(double) + (MIN_ALIGNMENT-1);
	318	int count[SHT_NTYP];
	319	int it, iv, ia, iv2;
	320
	321	for (it=0; it<SHT_NTYP; it++) {
	322	count[it] = 0;
	323	for (iv=0; iv<SHT_NVAR; iv++) {
	324	void* fptr = shtns->ftable[iv][it];
	325	if (fptr != NULL) {
	326	for (ia=0; ia<=SHT_MEM; ia++) // count occurences to mem algo
	327	for (iv2=0; iv2<SHT_NVAR; iv2++)
	328	if (sht_func[iv2][ia][it] == fptr) count[it]++;
	329	}
	330	}
	331	#if SHT_VERBOSE > 1
	332	if (verbose>1) printf(" %d ",count[it]);
	333	#endif
	334	}
	335
	336	if (shtns->dylm == NULL) { // no vector transform : do not try to free
	337	count[SHT_TYP_VAN] = -1; count[SHT_TYP_VSY] = -1;
	338	}
	339
	340	if (count[SHT_TYP_3AN] == 0) { // analysis may be freed.
	341	if (count[SHT_TYP_VAN] == 0) {
	342	PRINT_VERB("freeing vector analysis matrix\n");
	343	free_unused(shtns, &shtns->dzlm);
	344	}
	345	if (count[SHT_TYP_SAN] == 0) {
	346	PRINT_VERB("freeing scalar analysis matrix\n");
	347	free_unused(shtns, &shtns->zlm);
	348	}
	349	} else if (shtns->mmax > 0) { // scalar may be reduced to m=0
	350	if ((count[SHT_TYP_SAN] == 0) && (ref_count(shtns, &shtns->zlm) == 1)) {
	351	PRINT_VERB("keeping scalar analysis matrix up to m=0\n");
	352	shtns->zlm = realloc(shtns->zlm, ((LMAX+1)NLAT_2 +3)sizeof(double) + marray_size );
	353	}
	354	}
	355	if (count[SHT_TYP_3SY] == 0) { // synthesis may be freed.
	356	if (count[SHT_TYP_VSY] + count[SHT_TYP_GSP] + count[SHT_TYP_GTO] == 0) {
	357	PRINT_VERB("freeing vector synthesis matrix\n");
	358	free_unused(shtns, &shtns->dylm);
	359	}
	360	if (count[SHT_TYP_SSY] == 0) {
	361	PRINT_VERB("freeing scalar synthesis matrix\n");
	362	free_unused(shtns, &shtns->ylm);
	363	}
	364	} else if (shtns->mmax > 0) { // scalar may be reduced to m=0
	365	if ((count[SHT_TYP_SSY] == 0) && (ref_count(shtns, &shtns->ylm) == 1)) {
	366	PRINT_VERB("keeping scalar synthesis matrix up to m=0\n");
	367	shtns->ylm = realloc(shtns->ylm, (LMAX+2)NLAT_2sizeof(double) + marray_size );
	368	}
	369	}
	370	}
	371	#endif
	372
	373	/// \internal allocate arrays for SHT related to a given grid.
	374	static void alloc_SHTarrays(shtns_cfg shtns, int on_the_fly, int vect, int analys)
	375	{
	376	long int im, l0;
	377	long int size, marray_size, lstride;
	378
	379	im = (VSIZE2 > 2) ? VSIZE2 : 2;
	380	l0 = ((NLAT+im-1)/im)*im; // align on vector
	381	shtns->ct = (double ) VMALLOC( sizeof(double) l0*3 ); /// ct[] (including st and st_1)
	382	shtns->st = shtns->ct + l0; shtns->st_1 = shtns->ct + 2*l0;
	383
	384	shtns->ylm = NULL; shtns->dylm = NULL; // synthesis
	385	shtns->zlm = NULL; shtns->dzlm = NULL; // analysis
	386
	387	if (on_the_fly == 0) { // Allocate legendre functions lookup tables.
	388	marray_size = (MMAX+1)sizeof(double) + (MIN_ALIGNMENT-1); // for sse2 alignement
	389
	390	/* ylm */
	391	lstride = (LMAX+1); lstride += (lstride&1); // even stride.
	392	size = sizeof(double) * ((NLM-(LMAX+1)+lstride)*NLAT_2);
	393	if (MMAX == 0) size += 3*sizeof(double); // some overflow needed.
	394	shtns->ylm = (double **) malloc( marray_size + size );
	395	shtns->ylm[0] = (double *) PTR_ALIGN( shtns->ylm + (MMAX+1) );
	396	if (MMAX>0) shtns->ylm[1] = shtns->ylm[0] + NLAT_2*lstride;
	397	for (im=1; im<MMAX; im++) shtns->ylm[im+1] = shtns->ylm[im] + NLAT_2(LMAX+1-imMRES);
	398	/* dylm */
	399	if (vect) {
	400	lstride = LMAX; lstride += (lstride&1); // even stride.
	401	size = sizeof(struct DtDp) * ((NLM-(LMAX+1) +lstride/2)*NLAT_2);
	402	if (MMAX == 0) size += 2*sizeof(double); // some overflow needed.
	403	shtns->dylm = (struct DtDp **) malloc( marray_size + size );
	404	shtns->dylm[0] = (struct DtDp *) PTR_ALIGN( shtns->dylm + (MMAX+1) );
	405	if (MMAX>0) shtns->dylm[1] = shtns->dylm[0] + (lstride/2)*NLAT_2; // phi-derivative is zero for m=0
	406	for (im=1; im<MMAX; im++) shtns->dylm[im+1] = shtns->dylm[im] + NLAT_2(LMAX+1-imMRES);
	407	}
	408	if (analys) {
	409	/* zlm */
	410	size = sizeof(double) * (NLM*NLAT_2 + (NLAT_2 & 1));
	411	if (MMAX == 0) size += 2(NLAT_2 & 1)((LMAX+1) & 1) * sizeof(double);
	412	shtns->zlm = (double **) malloc( marray_size + size );
	413	shtns->zlm[0] = (double *) PTR_ALIGN( shtns->zlm + (MMAX+1) );
	414	if (MMAX>0) shtns->zlm[1] = shtns->zlm[0] + NLAT_2*(LMAX+1) + (NLAT_2&1);
	415	for (im=1; im<MMAX; im++) shtns->zlm[im+1] = shtns->zlm[im] + NLAT_2(LMAX+1-imMRES);
	416	/* dzlm */
	417	if (vect) {
	418	size = sizeof(struct DtDp)* (NLM-1)*NLAT_2; // remove l=0
	419	shtns->dzlm = (struct DtDp **) malloc( marray_size + size );
	420	shtns->dzlm[0] = (struct DtDp *) PTR_ALIGN( shtns->dzlm + (MMAX+1) );
	421	if (MMAX>0) shtns->dzlm[1] = shtns->dzlm[0] + NLAT_2*(LMAX);
	422	for (im=1; im<MMAX; im++) shtns->dzlm[im+1] = shtns->dzlm[im] + NLAT_2(LMAX+1-imMRES);
	423	}
	424	}
	425	}
	426	#if SHT_VERBOSE > 1
	427	if (verbose>1) printf(" Memory used for Ylm and Zlm matrices = %.3f Mb x2\n",3.0sizeof(double)NLMNLAT_2/(1024.1024.));
	428	#endif
	429	}
	430
	431
	432	/// \internal free arrays allocated by init_SH_dct
	433	static void free_SH_dct(shtns_cfg shtns)
	434	{
	435	if (shtns->zlm_dct0 == NULL) return;
	436
	437	if (ref_count(shtns, &shtns->dzlm_dct0) == 1) VFREE(shtns->dzlm_dct0);
	438	if (ref_count(shtns, &shtns->zlm_dct0) == 1) VFREE(shtns->zlm_dct0);
	439	shtns->dzlm_dct0 = NULL; shtns->zlm_dct0 = NULL;
	440
	441	free_unused(shtns, &shtns->dykm_dct);
	442	free_unused(shtns, &shtns->ykm_dct);
	443
	444	if (ref_count(shtns, &shtns->idct) == 1) fftw_destroy_plan(shtns->idct); // free unused dct plans
	445	if (ref_count(shtns, &shtns->dct_m0) == 1) fftw_destroy_plan(shtns->dct_m0);
	446	shtns->idct = NULL; shtns->dct_m0 = NULL;
	447	}
	448
	449	/// \internal free arrays allocated by alloc_SHTarrays.
	450	static void free_SHTarrays(shtns_cfg shtns)
	451	{
	452	free_SH_dct(shtns);
	453	free_unused(shtns, &shtns->ylm);
	454	free_unused(shtns, &shtns->dylm);
	455	free_unused(shtns, &shtns->zlm);
	456	free_unused(shtns, &shtns->dzlm);
	457
	458	if (ref_count(shtns, &shtns->ct) == 1) VFREE(shtns->ct);
	459	shtns->ct = NULL; shtns->st = NULL;
	460
	461	if (ref_count(shtns, &shtns->fft) == 1) fftw_destroy_plan(shtns->fft);
	462	if (ref_count(shtns, &shtns->ifft) == 1) fftw_destroy_plan(shtns->ifft);
	463	shtns->fft = NULL; shtns->ifft = NULL; shtns->ncplx_fft = -1; // no fft
	464	}
	465
	466	#ifndef HAVE_FFTW_COST
	467	// substitute undefined symbol in mkl and fftw older than 3.3
	468	#define fftw_cost(a) 0.0
	469	#endif
	470
	471	/// \internal initialize FFTs using FFTW.
	472	/// \param[in] layout defines the spatial layout (see \ref spat).
	473	/// \param[in] on_the_fly is one, if only on-the-fly transform are considered.
	474	/// returns the number of double to be allocated for a spatial field.
	475	static void planFFT(shtns_cfg shtns, int layout, int on_the_fly)
	476	{
	477	double cost_fft_ip, cost_fft_oop, cost_ifft_ip, cost_ifft_oop;
	478	cplx *ShF;
	479	double *Sh;
	480	fftw_plan fft2, ifft2, fft, ifft;
	481	int nfft, ncplx, nreal;
	482	int theta_inc, phi_inc, phi_embed;
	483	#ifdef HAVE_FFTW_COST
	484	int in_place = 1; // try to use in-place real fft.
	485	#else
	486	int in_place = 0; // do not try to use in-place real fft if no timing data available.
	487	#endif
	488
	489	if (NPHI <= 2MMAX) shtns_runerr("the sampling condition Nphi > 2Mmax is not met.");
	490
	491	#ifdef OMP_FFTW
	492	if ((shtns->fftw_plan_mode & (FFTW_EXHAUSTIVE \| FFTW_PATIENT)) && (omp_threads > 1)) {
	493	shtns->fftw_plan_mode = FFTW_PATIENT;
	494	fftw_plan_with_nthreads(omp_threads);
	495	} else fftw_plan_with_nthreads(shtns->nthreads);
	496	#endif
	497
	498	shtns->k_stride_a = 1; shtns->m_stride_a = NLAT; // default strides
	499
	500	shtns->fft = NULL; shtns->ifft = NULL;
	501	shtns->dct_m0 = NULL; shtns->idct = NULL; // set dct plans to uninitialized.
	502
	503	shtns->nspat = NPHI * NLAT; // default spatial size
	504
	505	if (NPHI==1) // no FFT needed.
	506	{
	507	shtns->fftc_mode = -1; // no FFT
	508	#if SHT_VERBOSE > 0
	509	if (verbose) printf(" => no fft : Mmax=0, Nphi=1, Nlat=%d\n",NLAT);
	510	#endif
	511	shtns->ncplx_fft = -1; // no fft.
	512	return;
	513	}
	514
	515	/* NPHI > 1 */
	516	theta_inc=1; phi_inc=NLAT; phi_embed=2*(NPHI/2+1); // SHT_NATIVE_LAYOUT is the default.
	517	if (layout & SHT_THETA_CONTIGUOUS) { theta_inc=1; phi_inc=NLAT; phi_embed=NPHI; }
	518	if (layout & SHT_PHI_CONTIGUOUS) { phi_inc=1; theta_inc=NPHI; phi_embed=NPHI; }
	519	nfft = NPHI;
	520	ncplx = NPHI/2 +1;
	521	nreal = phi_embed;
	522	if ((theta_inc != 1)\|\|(phi_inc != NLAT)\|\|(nreal < 2*ncplx)) in_place = 0; // we need to do the fft out-of-place.
	523
	524	#if SHT_VERBOSE > 0
	525	if (verbose) {
	526	printf(" => using FFTW : Mmax=%d, Nphi=%d, Nlat=%d (data layout : phi_inc=%d, theta_inc=%d, phi_embed=%d)\n",MMAX,NPHI,NLAT,phi_inc,theta_inc,phi_embed);
	527	if (NPHI <= (SHT_NL_ORDER+1)MMAX) printf(" !! Warning : anti-aliasing condition Nphi > %dMmax is not met !\n", SHT_NL_ORDER+1);
	528	if (NPHI != fft_int(NPHI,7)) printf(" !! Warning : Nphi is not optimal for FFTW !\n");
	529	}
	530	#endif
	531
	532	// Allocate dummy Spatial Fields.
	533	ShF = (cplx ) VMALLOC(ncplx NLAT * sizeof(cplx));
	534	Sh = (double ) VMALLOC(ncplx NLAT * sizeof(cplx));
	535	fft = NULL; ifft = NULL; fft2 = NULL; ifft2 = NULL;
	536
	537	// complex fft for fly transform is a bit different.
	538	if (layout & SHT_PHI_CONTIGUOUS) { // out-of-place split dft
	539	fftw_iodim dim, many;
	540	shtns->fftc_mode = 1;
	541	//default internal
	542	dim.n = NPHI; dim.os = 1; dim.is = NLAT; // complex transpose
	543	many.n = NLAT/2; many.os = 2*NPHI; many.is = 2;
	544	shtns->ifftc = fftw_plan_guru_split_dft(1, &dim, 1, &many, ((double)ShF)+1, (double)ShF, Sh+NPHI, Sh, shtns->fftw_plan_mode);
	545
	546	// legacy analysis fft
	547	//dim.n = NPHI; dim.is = 1; dim.os = NLAT;
	548	//many.n = NLAT/2; many.is = 2*NPHI; many.os = 2;
	549	// new internal
	550	dim.n = NPHI; dim.is = 1; dim.os = 2; // split complex, but without global transpose (faster).
	551	many.n = NLAT/2; many.is = 2NPHI; many.os = 2NPHI;
	552	shtns->fftc = fftw_plan_guru_split_dft(1, &dim, 1, &many, Sh+NPHI, Sh, ((double)ShF)+1, (double)ShF, shtns->fftw_plan_mode);
	553	shtns->k_stride_a = NPHI; shtns->m_stride_a = 2;
	554
	555	#if SHT_VERBOSE > 1
	556	if (verbose>1) {
	557	printf(" fftw cost ifftc=%lg, fftc=%lg ",fftw_cost(shtns->ifftc), fftw_cost(shtns->fftc)); fflush(stdout);
	558	}
	559	#endif
	560	} else { //if (layout & SHT_THETA_CONTIGUOUS) { // use only in-place here, supposed to be faster.
	561	shtns->fftc_mode = 0;
	562	shtns->ifftc = fftw_plan_many_dft(1, &nfft, NLAT/2, ShF, &nfft, NLAT/2, 1, ShF, &nfft, NLAT/2, 1, FFTW_BACKWARD, shtns->fftw_plan_mode);
	563	shtns->fftc = shtns->ifftc; // same thing, with m>0 and m<0 exchanged.
	564	#if SHT_VERBOSE > 1
	565	if (verbose>1) {
	566	printf(" fftw cost ifftc=%lg ",fftw_cost(shtns->ifftc)); fflush(stdout);
	567	}
	568	#endif
	569	}
	570
	571	#if _GCC_VEC_
	572	if (on_the_fly == 0)
	573	#endif
	574	{ // the real ffts are required if _GCC_VEC_ == 0 or if on_the_fly is zero.
	575	// IFFT : unnormalized. FFT : must be normalized.
	576	cost_fft_ip = 0.0; cost_ifft_ip = 0.0; cost_fft_oop = 0.0; cost_ifft_oop = 0.0;
	577	if (in_place) { // in-place FFT (if allowed)
	578	ifft2 = fftw_plan_many_dft_c2r(1, &nfft, NLAT, ShF, &ncplx, NLAT, 1, (double*) ShF, &nreal, phi_inc, theta_inc, shtns->fftw_plan_mode);
	579	#if SHT_VERBOSE > 1
	580	if (verbose>1) {
	581	printf(" in-place cost : ifft=%lg ",fftw_cost(ifft2)); fflush(stdout);
	582	}
	583	#endif
	584	if (ifft2 != NULL) {
	585	fft2 = fftw_plan_many_dft_r2c(1, &nfft, NLAT, (double*) ShF, &nreal, phi_inc, theta_inc, ShF, &ncplx, NLAT, 1, shtns->fftw_plan_mode);
	586	#if SHT_VERBOSE > 1
	587	if (verbose>1) {
	588	printf("fft=%lg\n",fftw_cost(fft2)); fflush(stdout);
	589	}
	590	#endif
	591	if (fft2 != NULL) {
	592	cost_fft_ip = fftw_cost(fft2); cost_ifft_ip = fftw_cost(ifft2);
	593	}
	594	}
	595	}
	596	if ( (in_place == 0) \|\| (cost_fft_ip * cost_ifft_ip > 0.0) )
	597	{ // out-of-place FFT
	598	ifft = fftw_plan_many_dft_c2r(1, &nfft, NLAT, ShF, &ncplx, NLAT, 1, Sh, &nreal, phi_inc, theta_inc, shtns->fftw_plan_mode);
	599	#if SHT_VERBOSE > 1
	600	if (verbose>1) { printf(" oop cost : ifft=%lg ",fftw_cost(ifft)); fflush(stdout); }
	601	#endif
	602	if (ifft == NULL) shtns_runerr("[FFTW] ifft planning failed !");
	603	fft = fftw_plan_many_dft_r2c(1, &nfft, NLAT, Sh, &nreal, phi_inc, theta_inc, ShF, &ncplx, NLAT, 1, shtns->fftw_plan_mode);
	604	#if SHT_VERBOSE > 1
	605	if (verbose>1) { printf("fft=%lg\n",fftw_cost(fft)); fflush(stdout); }
	606	#endif
	607	if (fft == NULL) shtns_runerr("[FFTW] fft planning failed !");
	608	cost_fft_oop = fftw_cost(fft); cost_ifft_oop = fftw_cost(ifft);
	609	}
	610	if ( (cost_fft_ip * cost_ifft_ip > 0.0) && (cost_fft_oop * cost_ifft_oop > 0.0) ) { // both have been succesfully timed.
	611	if ( cost_fft_oop + SHT_NL_ORDERcost_ifft_oop < cost_fft_ip + SHT_NL_ORDERcost_ifft_ip )
	612	in_place = 0; // disable in-place, because out-of-place is faster.
	613	}
	614
	615	if (in_place) {
	616	/* IN-PLACE FFT */
	617	if (fft != NULL) fftw_destroy_plan(fft);
	618	if (ifft != NULL) fftw_destroy_plan(ifft);
	619	fft = fft2; ifft = ifft2;
	620	shtns->ncplx_fft = 0; // fft is done in-place, no allocation needed.
	621	shtns->nspat = phi_embed * NLAT; // more space must be reserved for inplace ffts.
	622	} else {
	623	/* OUT-OF-PLACE FFT */
	624	if (fft2 != NULL) fftw_destroy_plan(fft2); // use OUT-OF-PLACE FFT
	625	if (ifft2 != NULL) fftw_destroy_plan(ifft2);
	626	shtns->ncplx_fft = ncplx * NLAT; // fft is done out-of-place, store allocation size.
	627	#if SHT_VERBOSE > 1
	628	if (verbose>1) printf(" out-of-place fft \n");
	629	#endif
	630	}
	631	shtns->fft = fft; shtns->ifft = ifft;
	632	}
	633	VFREE(Sh); VFREE(ShF);
	634
	635	#if SHT_VERBOSE > 2
	636	if (verbose>2) {
	637	printf(" *** fft plan :\n");
	638	fftw_print_plan(fft);
	639	printf("\n *** ifft plan :\n");
	640	fftw_print_plan(ifft);
	641	printf("\n");
	642	}
	643	#endif
	644	}
	645
	646	#ifdef SHTNS_DCT
	647	/// \internal initialize DCTs using FFTW. Must be called if MTR_DCT is changed.
	648	static int planDCT(shtns_cfg shtns)
	649	{
	650	double *Sh;
	651	int ndct = NLAT;
	652	fftw_r2r_kind r2r_kind;
	653	fftw_iodim dims, hdims[2];
	654	double Sh0[NLAT] SSE; // temp storage on the stack, aligned.
	655
	656	#ifdef OMP_FFTW
	657	if (shtns->fftw_plan_mode & (FFTW_EXHAUSTIVE \| FFTW_PATIENT)) {
	658	fftw_plan_with_nthreads(omp_threads);
	659	} else fftw_plan_with_nthreads(1);
	660	#endif
	661	// Allocate dummy Spatial Fields.
	662	Sh = (double ) VMALLOC((NPHI/2 +1) NLAT2 sizeof(double));
	663
	664	if (shtns->dct_m0 == NULL) { // allocate only once since it does not change.
	665	int stride = (NPHI>1) ? 2 : 1;
	666	r2r_kind = FFTW_REDFT10; // out-of-place.
	667	shtns->dct_m0 = fftw_plan_many_r2r(1, &ndct, 1, Sh, &ndct, stride, stride*NLAT, Sh0, &ndct, 1, NLAT, &r2r_kind, shtns->fftw_plan_mode); // out-of-place.
	668	if (shtns->dct_m0 == NULL) return 0; // dct_m0 planning failed.
	669	#if SHT_VERBOSE > 2
	670	if (verbose>2) {
	671	printf(" *** dct_m0 plan :\n"); fftw_print_plan(shtns->dct_m0); printf("\n");
	672	}
	673	#endif
	674	}
	675
	676	if (NPHI > 1) { // complex data for NPHI>1, recompute as it does depend on MTR_DCT
	677	if (shtns->idct != NULL) fftw_destroy_plan(shtns->idct);
	678
	679	dims.n = NLAT; dims.is = 2; dims.os = 2; // real and imaginary part.
	680	hdims[0].n = MTR_DCT+1; hdims[0].is = 2NLAT; hdims[0].os = 2NLAT;
	681	hdims[1].n = 2; hdims[1].is = 1; hdims[1].os = 1;
	682	r2r_kind = FFTW_REDFT01;
	683	shtns->idct = fftw_plan_guru_r2r(1, &dims, 2, hdims, Sh, Sh, &r2r_kind, shtns->fftw_plan_mode);
	684	} else { // NPHI == 1
	685	if (shtns->idct == NULL) {
	686	r2r_kind = FFTW_REDFT01;
	687	shtns->idct = fftw_plan_many_r2r(1, &ndct, 1, Sh, &ndct, 1, NLAT, Sh, &ndct, 1, NLAT, &r2r_kind, shtns->fftw_plan_mode);
	688	}
	689	}
	690
	691	VFREE(Sh);
	692	if (shtns->idct == NULL) return 0; // idct planning failed.
	693	#if SHT_VERBOSE > 2
	694	if (verbose>2) {
	695	printf(" *** idct plan :\n"); fftw_print_plan(shtns->idct); printf("\n");
	696	}
	697	#endif
	698	return 1; // success.
	699	}
	700
	701	/// \internal SET MTR_DCT and updates fftw_plan for DCT's
	702	static int Set_MTR_DCT(shtns_cfg shtns, int m)
	703	{
	704	if ((shtns->zlm_dct0 == NULL)\|\|(m == MTR_DCT)) return MTR_DCT;
	705	if ( m < 0 ) { // don't use dct
	706	shtns->mtr_dct = -1;
	707	} else {
	708	if (m>MMAX) m=MMAX;
	709	shtns->mtr_dct = m;
	710	if (planDCT(shtns) == 0)
	711	shtns->mtr_dct = -1; // failure
	712	}
	713	return MTR_DCT;
	714	}
	715
	716	/// \internal returns the m-truncation of DCT part of synthesis
	717	static int Get_MTR_DCT(shtns_cfg shtns) {
	718	return MTR_DCT;
	719	}
	720	#endif /* SHTNS_DCT */
	721
	722
	723	/// \internal Sets the value tm[im] used for polar optimiation on-the-fly.
	724	static void PolarOptimize(shtns_cfg shtns, double eps)
	725	{
	726	int im, m, l, it;
	727	double v;
	728	double y[LMAX+1];
	729
	730	for (im=0;im<=MMAX;im++) shtns->tm[im] = 0;
	731
	732	if (eps > 0.0) {
	733	for (im=1;im<=MMAX;im++) {
	734	m = im*MRES;
	735	it = shtns->tm[im-1] -1; // tm[im] is monotonic.
	736	do {
	737	it++;
	738	legendre_sphPlm_array(shtns, LMAX, im, shtns->ct[it], y+m);
	739	v = 0.0;
	740	for (l=m; l<=LMAX; l++) {
	741	double ya = fabs(y[l]);
	742	if ( v < ya ) v = ya;
	743	}
	744	} while (v < eps);
	745	shtns->tm[im] = it;
	746	}
	747	#if SHT_VERBOSE > 0
	748	if (verbose) printf(" + polar optimization threshold = %.1e\n",eps);
	749	#endif
	750	#if SHT_VERBOSE > 1
	751	if (verbose>1) {
	752	printf(" tm[im]=");
	753	for (im=0;im<=MMAX;im++)
	754	printf(" %d",shtns->tm[im]);
	755	printf("\n");
	756	}
	757	#endif
	758	}
	759	}
	760
	761	#ifdef SHTNS_MEM
	762
	763	/// \internal Perform some optimization on the SHT matrices.
	764	static void OptimizeMatrices(shtns_cfg shtns, double eps)
	765	{
	766	unsigned short *tm;
	767	double ylm, zlm;
	768	struct DtDp** dylm;
	769	struct DtDp** dzlm;
	770	int im,m,l,it;
	771	int vector = (shtns->dylm != NULL);
	772
	773	tm = shtns->tm;
	774	ylm = shtns->ylm; dylm = shtns->dylm;
	775	zlm = shtns->zlm; dzlm = shtns->dzlm;
	776
	777	for (im=0;im<=MMAX;im++) tm[im] = 0;
	778	/// POLAR OPTIMIZATION : analyzing coefficients, some can be safely neglected.
	779	if (eps > 0.0) {
	780	for (im=1;im<=MMAX;im++) {
	781	m = im*MRES;
	782	tm[im] = NLAT_2;
	783	for (l=m;l<=LMAX;l++) {
	784	it = tm[im-1]; // tm[im] is monotonic.
	785	while( fabs(ylm[im][it*(LMAX-m+1) + (l-m)]) < eps ) { it++; }
	786	if (tm[im] > it) tm[im] = it;
	787	}
	788	}
	789	#if SHT_VERBOSE > 0
	790	if (verbose) printf(" + polar optimization threshold = %.1e\n",eps);
	791	#endif
	792	#if SHT_VERBOSE > 1
	793	if (verbose>1) {
	794	printf(" tm[im]=");
	795	for (im=0;im<=MMAX;im++)
	796	printf(" %d",tm[im]);
	797	printf("\n");
	798	}
	799	#endif
	800
	801	for (im=1; im<=MMAX; im++) { // im >= 1
	802	if (tm[im] > 0) { // we can remove the data corresponding to polar values.
	803	m = im*MRES;
	804	ylm[im] += tm[im]*(LMAX-m+1); // shift pointers (still one block for each m)
	805	if (vector) dylm[im] += tm[im]*(LMAX-m+1);
	806	if (zlm[0] != NULL) {
	807	for (l=m; l<LMAX; l+=2) {
	808	for (it=0; it<NLAT_2-tm[im]; it++) { // copy data to avoid cache misses.
	809	zlm[im][(l-m)(NLAT_2-tm[im]) + it2] = zlm[im][(l-m)NLAT_2 + (it+tm[im])2];
	810	zlm[im][(l-m)(NLAT_2-tm[im]) + it2+1] = zlm[im][(l-m)NLAT_2 + (it+tm[im])2+1];
	811	if (vector) {
	812	dzlm[im][(l-m)(NLAT_2-tm[im]) + it2].t = dzlm[im][(l-m)NLAT_2 + (it+tm[im])2].t;
	813	dzlm[im][(l-m)(NLAT_2-tm[im]) + it2].p = dzlm[im][(l-m)NLAT_2 + (it+tm[im])2].p;
	814	dzlm[im][(l-m)(NLAT_2-tm[im]) + it2+1].t = dzlm[im][(l-m)NLAT_2 + (it+tm[im])2+1].t;
	815	dzlm[im][(l-m)(NLAT_2-tm[im]) + it2+1].p = dzlm[im][(l-m)NLAT_2 + (it+tm[im])2+1].p;
	816	}
	817	}
	818	}
	819	if (l==LMAX) {
	820	for (it=0; it<NLAT_2-tm[im]; it++) {
	821	zlm[im][(l-m)(NLAT_2-tm[im]) + it] = zlm[im][(l-m)NLAT_2 + (it+tm[im])];
	822	if (vector) {
	823	dzlm[im][(l-m)(NLAT_2-tm[im]) + it].t = dzlm[im][(l-m)NLAT_2 + (it+tm[im])].t;
	824	dzlm[im][(l-m)(NLAT_2-tm[im]) + it].p = dzlm[im][(l-m)NLAT_2 + (it+tm[im])].p;
	825	}
	826	}
	827	}
	828	}
	829	}
	830	}
	831	}
	832
	833	/// Compression of dzlm for m=0, as .p is 0
	834	if ((vector) && (dzlm[0] != NULL)) { // for sht_reg_poles there is no dzlm defined.
	835	im=0; m=0;
	836	double* yg = (double *) dzlm[im];
	837	for (l=1; l<LMAX; l+=2) { // l=0 is zero, so we start at l=1.
	838	for (it=0; it<NLAT_2; it++) {
	839	yg[(l-1)NLAT_2 + it2] = dzlm[im][(l-1)NLAT_2 + it2].t; // l
	840	yg[(l-1)NLAT_2 + it2+1] = dzlm[im][(l-1)NLAT_2 + it2+1].t; // l+1
	841	}
	842	}
	843	if (l==LMAX) { // last l is stored right away, without interleaving.
	844	for (it=0; it<NLAT_2; it++) {
	845	yg[(l-1)NLAT_2 + it] = dzlm[im][(l-1)NLAT_2 + it].t; // l (odd)
	846	}
	847	}
	848	}
	849	if ((zlm[0] != NULL) && (NLAT_2 & 1)) {
	850	zlm[0][NLAT_2] = 0.0; // take care to write 0.0 for this sse2 padding value.
	851	if ( (MMAX==0) && (!(LMAX & 1)) ) { // NLAT_2 odd + LMAX even
	852	zlm[0][(LMAX+1)*NLAT_2 +1] = 0.0; // overflow for im=0
	853	zlm[0][(LMAX+1)*NLAT_2 +2] = 0.0;
	854	}
	855	}
	856	}
	857
	858	// how to access the ylm matrix for im=0
	859	#define YL0(it, l) ( shtns->ylm[0][ (it)(((LMAX+2)>>1)2) + (l) ] )
	860	#define DYL0(it, l) ( ((double)(shtns->dylm[0]))[ (it)(((LMAX+1)>>1)*2) + (l-1) ] )
	861
	862	#define YLM(it, l, im) ( (im==0) ? YL0(it,l) : shtns->ylm[im][(it)(LMAX-(im)MRES+1) + ((l)-(im)*MRES)] )
	863	#define DTYLM(it, l, im) ( (im==0) ? ( (l==0) ? 0.0 : DYL0(it,l) ) : shtns->dylm[im][(it)(LMAX-(im)MRES+1) + ((l)-(im)*MRES)].t )
	864	#define DPYLM(it, l, im) ( (im==0) ? 0.0 : shtns->dylm[im][(it)(LMAX-(im)MRES+1) + ((l)-(im)*MRES)].p )
	865
	866	/// \internal Precompute the matrix for SH synthesis.
	867	static void init_SH_synth(shtns_cfg shtns)
	868	{
	869	double yl, dyl; // temp storage for derivative for legendre function values.
	870	long int it,im,m,l;
	871	double *ct = shtns->ct;
	872	double *st = shtns->st;
	873	int vector = (shtns->dylm != NULL);
	874
	875	yl = (double) malloc( 2sizeof(double) *(LMAX+1) );
	876	dyl = yl + (LMAX+1);
	877
	878	{ im=0; m=0;
	879	for (it=0; it<NLAT_2; it++) {
	880	legendre_sphPlm_deriv_array_hp(shtns, LMAX, im, ct[it], st[it], yl, dyl); // fixed im legendre functions lookup table.
	881	for (l=0; l<=LMAX; l++) YL0(it, l) = yl[l];
	882	for (l=LMAX+1; l<=LMAX+3; l++) YL0(it, l) = 0.0; // allow overflow.
	883	if (vector) {
	884	for (l=1; l<=LMAX; l++) DYL0(it, l) = dyl[l];
	885	for (l=LMAX+1; l<=LMAX+2; l++) DYL0(it, l) = 0.0; // allow overflow.
	886	}
	887	}
	888	}
	889	for (im=1; im<=MMAX; im++) {
	890	double *ylm = shtns->ylm[im];
	891	struct DtDp* dylm = vector ? shtns->dylm[im] : NULL;
	892	m = im*MRES;
	893	for (it=0; it<NLAT_2; it++) {
	894	legendre_sphPlm_deriv_array_hp(shtns, LMAX, im, ct[it], st[it], yl, dyl); // fixed im legendre functions lookup table.
	895	for (l=m; l<=LMAX; l++) {
	896	ylm[it(LMAX-m+1) + (l-m)] = yl[l-m] st[it];
	897	if (vector) {
	898	dylm[it*(LMAX-m+1) + (l-m)].t = dyl[l-m];
	899	dylm[it(LMAX-m+1) + (l-m)].p = yl[l-m] m; // 1/sint(t) dYlm/dphi
	900	}
	901	}
	902	}
	903	}
	904	free(yl);
	905	}
	906
	907
	908	/// \internal Precompute matrices for SH synthesis and analysis, on a Gauss-Legendre grid.
	909	static void init_SH_gauss(shtns_cfg shtns)
	910	{
	911	long int it,im,m,l;
	912	int vector = (shtns->dylm != NULL);
	913
	914	init_SH_synth(shtns);
	915
	916	// for analysis (decomposition, direct transform) : transpose and multiply by gauss weight and other normalizations.
	917	// interleave l and l+1 : this stores data in the way it will be read.
	918	double **zlm = shtns->zlm;
	919	struct DtDp** dzlm = shtns->dzlm;
	920	for (im=0; im<=MMAX; im++) {
	921	m = im*MRES;
	922	long int talign = 0;
	923
	924	for (it=0;it<NLAT_2;it++) {
	925	double norm = shtns->wg[it];
	926	if ( (m>0) && (shtns->norm & SHT_REAL_NORM) ) norm *= 2; // "Real" norm : zlm must be doubled for m>0
	927	long int l0 = m;
	928	if (m==0) {
	929	zlm[im][it] = YL0(it, 0) * norm;
	930	// les derivees sont nulles pour l=0
	931	l0++;
	932	talign = (NLAT_2&1);
	933	}
	934	for (l=l0; l<LMAX; l+=2) {
	935	double nz0 = norm; double nz1 = norm;
	936	if (SHT_NORM == sht_schmidt) {
	937	nz0 = (2l+1); nz1 = (2l+3);
	938	}
	939	zlm[im][(l-m)NLAT_2 + it2 +talign] = YLM(it, l, im) * nz0;
	940	zlm[im][(l-m)NLAT_2 + it2 +1 +talign] = YLM(it, l+1, im) * nz1;
	941	if (vector) {
	942	nz0 = shtns->l_2[l]; nz1 = shtns->l_2[l+1];
	943	dzlm[im][(l-l0)NLAT_2 + it2].t = DTYLM(it, l, im) * nz0;
	944	dzlm[im][(l-l0)NLAT_2 + it2].p = DPYLM(it, l, im) * nz0;
	945	dzlm[im][(l-l0)NLAT_2 + it2+1].t = DTYLM(it, l+1, im) * nz1;
	946	dzlm[im][(l-l0)NLAT_2 + it2+1].p = DPYLM(it, l+1, im) * nz1;
	947	}
	948	}
	949	if (l==LMAX) { // last l is stored right away, without interleaving.
	950	double nz0 = norm;
	951	if (SHT_NORM == sht_schmidt) nz0 = (2l+1);
	952	zlm[im][(l-m)NLAT_2 + it +talign] = YLM(it, l, im) nz0;
	953	if (vector) {
	954	nz0 *= shtns->l_2[l];
	955	dzlm[im][(l-l0)NLAT_2 + it].t = DTYLM(it, l, im) nz0;
	956	dzlm[im][(l-l0)NLAT_2 + it].p = DPYLM(it, l, im) nz0;
	957	}
	958	}
	959	}
	960	}
	961	}
	962
	963	#endif
	964
	965	#ifdef SHTNS_DCT
	966
	967	static void init_SH_dct_m(shtns_cfg shtns, double* is1, fftw_plan dct, fftw_plan idct, int analysis, const int m0, const int mstep)
	968	{
	969	double yk, yk0, dyk0, yg; // temp storage
	970	struct DtDp dyg, dyk;
	971	int it,im,m,l;
	972	double Z[2*NLAT_2] SSE;
	973	double dZt[2*NLAT_2] SSE;
	974	double dZp[2*NLAT_2] SSE; // equally spaced theta points.
	975
	976	double *st = shtns->st;
	977	double *st_1 = shtns->st_1;
	978	const int vector = (shtns->dylm != NULL);
	979	const int KMAX = LMAX+1;
	980
	981	real iylm_fft_norm = 1.0; // FFT/SHT normalization for zlm (4pi normalized)
	982	if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) iylm_fft_norm = 4*M_PIl; // FFT/SHT normalization for zlm (orthonormalized)
	983	iylm_fft_norm /= (2NPHINLAT_2);
	984
	985	// Even/Odd symmetry : ylm is even or odd across equator, as l-m is even or odd => only NLAT_2 points required.
	986	// temp memory for ykm_dct.
	987	yk = (double ) malloc( sizeof(double) (KMAX+1)*(LMAX+1) );
	988	dyk = (struct DtDp ) malloc( sizeof(struct DtDp) (KMAX+1)*(LMAX+1) );
	989	if ((m0==0)&&(analysis)) {
	990	yk0 = (double ) malloc( sizeof(double) (LMAX/2+1)(2NLAT_2) * 2 ); // temp for zlm_dct0
	991	dyk0 = yk0 + (LMAX/2+1)(2NLAT_2);
	992	}
	993
	994	for (im=m0; im<=MMAX; im+=mstep) {
	995	m = im*MRES;
	996	// go to DCT space
	997	for (it=0;it<=KMAX;it+=2) {
	998	for(l=m; l<=LMAX; l++) {
	999	yk[(it/2)*(LMAX+1-m) + (l-m)] = 0.0;
	1000	dyk[(it/2)*(LMAX+1-m) + (l-m)].t = 0.0;
	1001	dyk[(it/2)*(LMAX+1-m) + (l-m)].p = 0.0;
	1002	}
	1003	}
	1004	for (l=m; l<=LMAX; l++) {
	1005	if (m & 1) { // m odd
	1006	for (it=0; it<NLAT_2; it++) {
	1007	Z[it] = YLM(it, l, im) * st[it]; // P[l+1](x) *st
	1008	if (vector) {
	1009	dZt[it] = DTYLM(it, l, im); // P[l](x) *1
	1010	dZp[it] = DPYLM(it, l, im); // P[l-1](x) *1
	1011	}
	1012	}
	1013	} else { // m even
	1014	for (it=0; it<NLAT_2; it++) {
	1015	Z[it] = YLM(it, l, im); // P[l](x) *1
	1016	if (vector) {
	1017	dZt[it] = DTYLM(it, l, im) st[it]; // P[l+1](x) st
	1018	dZp[it] = YLM(it, l, im) * m; // P[l](x) *st
	1019	}
	1020	}
	1021	}
	1022	if ((l-m)&1) { // odd
	1023	for (it=NLAT_2; it<2*NLAT_2; it++) {
	1024	Z[it] = - Z[2*NLAT_2-it-1]; // reconstruct even/odd part
	1025	dZt[it] = dZt[2*NLAT_2-it-1];
	1026	dZp[it] = - dZp[2*NLAT_2-it-1];
	1027	}
	1028	} else { // even
	1029	for (it=NLAT_2; it<2*NLAT_2; it++) {
	1030	Z[it] = Z[2*NLAT_2-it-1]; // reconstruct even/odd part
	1031	dZt[it] = - dZt[2*NLAT_2-it-1];
	1032	dZp[it] = dZp[2*NLAT_2-it-1];
	1033	}
	1034	}
	1035	fftw_execute_r2r(dct, Z, Z);
	1036	fftw_execute_r2r(dct, dZt, dZt);
	1037	fftw_execute_r2r(dct, dZp, dZp);
	1038	#if SHT_VERBOSE > 1
	1039	if ((LMAX <= 12) && (verbose>1))
	1040	#pragma omp critical
	1041	{
	1042	printf("\nl=%d, m=%d ::\t", l,m);
	1043	for(it=0;it<2NLAT_2;it++) printf("%e ",Z[it]/(2NLAT));
	1044	printf("\n dYt ::\t");
	1045	for(it=0;it<2NLAT_2;it++) printf("%e ",dZt[it]/(2NLAT));
	1046	printf("\n dYp ::\t");
	1047	for(it=0;it<2NLAT_2;it++) printf("%e ",dZp[it]/(2NLAT));
	1048	}
	1049	#endif
	1050	for (it=(l-m)&1; it<=l+1; it+=2) {
	1051	yk[(it/2)(LMAX+1-m) + (l-m)] = Z[it]/(2NLAT); // and transpose
	1052	dyk[(it/2)(LMAX+1-m) + (l-m)].p = dZp[it]/(2NLAT);
	1053	}
	1054	for (it=(l+1-m)&1; it<=l+1; it+=2) {
	1055	dyk[(it/2)(LMAX+1-m) + (l-m)].t = dZt[it]/(2NLAT);
	1056	}
	1057	}
	1058
	1059	/* compute analysis coefficients (fast way)
	1060	* Wklm = int(TkYlm) = int(Tk.sum(i,a_ilmTi)) = sum(i, a_ilm* int(TkTi)) = sum(i, a_ilmJik)
	1061	* with Jik = int(Tk*Ti) = 1/(1-(k-i)^2) + 1/(1-(k+i)^2)
	1062	*/
	1063	if (analysis) {
	1064	#if SHT_VERBOSE > 0
	1065	if ((LMAX>126)&&(m0==0)&&(verbose)) { // only one thread prints this message.
	1066	printf("computing weights m=%d\r",m); fflush(stdout);
	1067	}
	1068	#endif
	1069	for (l=m; l<=LMAX; l++) {
	1070	unsigned k0,k1, k,i,d;
	1071	double Jik0, Jik1, yy, dyp, dyt;
	1072	double lnorm = iylm_fft_norm;
	1073
	1074	if (SHT_NORM == sht_schmidt) lnorm = (2l+1); // Schmidt semi-normalization
	1075	if ( (m>0) && (shtns->norm & SHT_REAL_NORM) ) lnorm *= 2; // "real" norm : zlm must be doubled for m>0
	1076
	1077	k0 = (l-m)&1; k1 = 1-k0;
	1078	for(k=0; k<NLAT; k++) { Z[k] = 0.0; dZt[k] = 0.0; dZp[k] = 0.0; }
	1079	for (i=0; i<=(l+1)/2; i++) {
	1080	yy = yk[i(LMAX+1-m) + (l-m)] lnorm;
	1081	dyp = dyk[i(LMAX+1-m) + (l-m)].p lnorm/(l*(l+1));
	1082	dyt = dyk[i(LMAX+1-m) + (l-m)].t lnorm/(l*(l+1));
	1083	if (i+k0==0) { yy=0.5; dyp=0.5; }
	1084	if (i+k1==0) dyt*=0.5;
	1085	for (k=0; k<NLAT_2; k++) {
	1086	d = (k<i) ? i-k : k-i;
	1087	Jik0 = is1[(i+k0)+k] + is1[d];
	1088	Jik1 = is1[(i+k1)+k] + is1[d];
	1089	Z[2k+k0] += yy Jik0;
	1090	dZt[2k+k1] += dyt Jik1;
	1091	if (m&1) dZp[2k+k0] += dyp Jik0;
	1092	}
	1093	}
	1094	#if SHT_VERBOSE > 1
	1095	if ((LMAX <= 12)&&(verbose>1))
	1096	#pragma omp critical
	1097	{
	1098	printf("\nl=%d, m=%d ::\t",l,m);
	1099	for (k=0; k<(2*NLAT_2); k++) printf("%f ",Z[k]);
	1100	printf("\n dZt ::\t");
	1101	for (k=0; k<(2*NLAT_2); k++) printf("%f ",dZt[k]);
	1102	if (m&1) {
	1103	printf("\n dZp ::\t");
	1104	for (k=0; k<(2*NLAT_2); k++) printf("%f ",dZp[k]);
	1105	}
	1106	}
	1107	#endif
	1108
	1109	if (m == 0) { // we store zlm in dct space for m=0
	1110	if (k0==0) {
	1111	yk0[((l-m)>>1)(2NLAT_2)] = Z[0]*0.5; // store zlm_dct (k=0)
	1112	for (k=1; k<(2NLAT_2); k++) yk0[((l-m)>>1)(2*NLAT_2) +k] = 0.0; // zero out.
	1113	k0=2;
	1114	}
	1115	for (k=k0; k<(2*NLAT_2); k+=2)
	1116	yk0[((l-m)>>1)(2NLAT_2) +k] = Z[k]; // store zlm_dct
	1117
	1118	if (l>0) {
	1119	if (k1==0) {
	1120	dyk0[((l-1-m)>>1)(2NLAT_2)] = dZt[0]*0.5; // store dzlm_dct (k=0)
	1121	for (k=1; k<(2NLAT_2); k++) dyk0[((l-1-m)>>1)(2*NLAT_2) +k] = 0.0; // zero out.
	1122	k1=2;
	1123	}
	1124	for (k=k1; k<(2*NLAT_2); k+=2)
	1125	dyk0[((l-1-m)>>1)(2NLAT_2) +k] = dZt[k]; // store dzlm_dct
	1126	}
	1127	}
	1128
	1129	fftw_execute_r2r(idct, Z, Z); fftw_execute_r2r(idct, dZt, dZt);
	1130	if (m == 0) {
	1131	for (it=0; it<NLAT; it++) { dZp[it] = 0.0; dZt[it] *= st_1[it]; }
	1132	} else if (m & 1) { // m odd
	1133	fftw_execute_r2r(idct, dZp, dZp);
	1134	for (it=0; it<NLAT; it++) { Z[it] *= st_1[it]; }
	1135	} else { // m even
	1136	for (it=0; it<NLAT; it++) { dZp[it] = Z[it]m/(l(l+1)st[it]); dZt[it] = st_1[it]; }
	1137	}
	1138
	1139	long int l0 = (m==0) ? 1 : m;
	1140	long int talign = (m==0)*(NLAT_2 & 1);
	1141	long int sk = (l-l0)&1;
	1142	if (l==0) {
	1143	for (it=0; it<NLAT_2; it++) {
	1144	shtns->zlm[im][it] = Z[it];
	1145	}
	1146	} else if ((sk == 0)&&(l == LMAX)) {
	1147	for (it=0; it<NLAT_2; it++) {
	1148	shtns->zlm[im][(l-m)*NLAT_2 + it + talign] = Z[it];
	1149	if (vector) {
	1150	shtns->dzlm[im][(l-l0)*NLAT_2 + it].p = dZp[it];
	1151	shtns->dzlm[im][(l-l0)*NLAT_2 + it].t = dZt[it];
	1152	}
	1153	}
	1154	} else {
	1155	for (it=0; it<NLAT_2; it++) {
	1156	shtns->zlm[im][(l-m-sk)NLAT_2 + it2 +sk + talign] = Z[it];
	1157	if (vector) {
	1158	shtns->dzlm[im][(l-l0-sk)NLAT_2 + it2 +sk].p = dZp[it];
	1159	shtns->dzlm[im][(l-l0-sk)NLAT_2 + it2 +sk].t = dZt[it];
	1160	}
	1161	}
	1162	}
	1163	}
	1164	}
	1165
	1166	// Compact the coefficients for improved cache efficiency.
	1167	yg = shtns->ykm_dct[im];
	1168	for (it=0; it<= KMAX; it+=2) {
	1169	l = (it < m) ? m : it-(m&1);
	1170	while (l<=LMAX) {
	1171	yg[0] = yk[(it/2)*(LMAX+1-m) + (l-m)];
	1172	l++; yg++;
	1173	}
	1174	if ((m==0) && ((LMAX & 1) == 0)) { yg[0] = 0; yg++; } // SSE2 padding.
	1175	}
	1176	if (MMAX==0) for (int j=0; j<3; j++) yg[j] = 0; // allow some overflow.
	1177	if (vector) {
	1178	dyg = shtns->dykm_dct[im];
	1179	for (it=0; it<= KMAX; it+=2) {
	1180	l = (it-2 < m) ? m : it-2+(m&1);
	1181	while (l<=LMAX) {
	1182	dyg[0].t = dyk[(it/2)*(LMAX+1-m) + (l-m)].t;
	1183	dyg[0].p = dyk[(it/2)*(LMAX+1-m) + (l-m)].p;
	1184	l++; dyg++;
	1185	}
	1186	}
	1187	if (im == 0) { // compact and reorder m=0 dylm because .p = 0 :
	1188	dyg = shtns->dykm_dct[im];
	1189	yg = (double *) shtns->dykm_dct[im];
	1190	for (it=0; it<= KMAX; it+=2) {
	1191	dyg++;
	1192	for (l=it-1; l<=LMAX; l++) {
	1193	if (l>0) {
	1194	yg[0] = dyg[0].t;
	1195	yg++; dyg++;
	1196	}
	1197	}
	1198	if (LMAX&1) { // padding for SSE2 alignement.
	1199	yg[0] = 0.0; yg++;
	1200	}
	1201	}
	1202	}
	1203	}
	1204	}
	1205
	1206	// compact yk to zlm_dct0
	1207	if ((m0==0)&&(analysis)) {
	1208	long int klim = (LMAX * SHT_NL_ORDER) + 2; // max k needed for nl-terms...
	1209	klim = (klim/2)*2; // must be even...
	1210	if (klim > 2NLAT_2) klim = 2NLAT_2; // but no more than 2*NLAT_2.
	1211	shtns->klim = klim; // store for use in codelets.
	1212	yg = shtns->zlm_dct0;
	1213	for (l=0; l<=LMAX; l+=2) {
	1214	for (it=l; it<klim; it++) { // for m=0, zl coeff with i<l are zeros.
	1215	*yg = yk0[it];
	1216	yg++;
	1217	}
	1218	yk0 += 2*NLAT_2;
	1219	}
	1220	if (vector) {
	1221	yg = shtns->dzlm_dct0;
	1222	for (l=1; l<=LMAX; l+=2) {
	1223	for (it=l-1; it<klim; it++) { // for m=0, dzl coeff with i<l-1 are zeros.
	1224	*yg = dyk0[it];
	1225	yg++;
	1226	}
	1227	dyk0 += 2*NLAT_2;
	1228	}
	1229	}
	1230	free(yk0 - (2NLAT_2)(LMAX/2+1));
	1231	}
	1232
	1233	free(dyk); free(yk);
	1234	}
	1235
	1236	/// \internal Computes the matrices required for SH transform on a regular grid (with or without DCT).
	1237	/// \param analysis : 0 => synthesis only.
	1238	static void init_SH_dct(shtns_cfg shtns, int analysis)
	1239	{
	1240	fftw_plan dct, idct;
	1241	long int it,im,m,l;
	1242	long int sk, dsk;
	1243	double Z[2*NLAT_2] SSE;
	1244	double is1[NLAT]; // tabulate values for integrals.
	1245
	1246	const long int marray_size = sizeof(void)(MMAX+1) + (MIN_ALIGNMENT-1);
	1247	const int vector = (shtns->dylm != NULL);
	1248	const int KMAX = LMAX+1;
	1249
	1250	for(im=0, sk=0, dsk=0; im<=MMAX; im++) { // how much memory to allocate for ykm_dct ?
	1251	m = im*MRES;
	1252	for (it=0; it<= KMAX; it+=2) {
	1253	l = (it < m) ? m : it-(m&1);
	1254	sk += LMAX+1 - l;
	1255	if ((m==0) && ((LMAX & 1) ==0)) sk++; // SSE padding for m=0
	1256	}
	1257	for (it=0; it<= KMAX; it+=2) {
	1258	l = (it-2 < m) ? m : it-2+(m&1);
	1259	dsk += LMAX+1 - l;
	1260	}
	1261	}
	1262	if (MMAX == 0) sk+=3; // allow some overflow.
	1263	for (l=0, it=0; l<=LMAX; l+=2) // how much memory for zlm_dct0 ?
	1264	it += (2*NLAT_2 -l);
	1265	for (l=1, im=0; l<=LMAX; l+=2) // how much memory for dzlm_dct0 ?
	1266	im += (2*NLAT_2 -l+1);
	1267
	1268	#if SHT_VERBOSE > 1
	1269	if (verbose>1) printf(" Memory used for Ykm_dct matrices = %.3f Mb\n",sizeof(double)(sk + 2.dsk + it)/(1024.*1024.));
	1270	#endif
	1271	shtns->ykm_dct = (double *) malloc( marray_size + sizeof(double)sk );
	1272	shtns->ykm_dct[0] = (double *) PTR_ALIGN( shtns->ykm_dct + (MMAX+1) );
	1273	if (vector) {
	1274	shtns->dykm_dct = (struct DtDp *) malloc( marray_size + sizeof(struct DtDp)dsk );
	1275	shtns->dykm_dct[0] = (struct DtDp *) PTR_ALIGN( shtns->dykm_dct + (MMAX+1) );
	1276	}
	1277	shtns->zlm_dct0 = (double ) VMALLOC( sizeof(double) it );
	1278	if (vector) {
	1279	shtns->dzlm_dct0 = (double ) VMALLOC( sizeof(double) im );
	1280	}
	1281	for (im=0; im<MMAX; im++) {
	1282	m = im*MRES;
	1283	for (it=0, sk=0; it<= KMAX; it+=2) {
	1284	l = (it < m) ? m : it-(m&1);
	1285	sk += LMAX+1 - l;
	1286	if ((m==0) && ((LMAX & 1) ==0)) sk++; // SSE padding for m=0
	1287	}
	1288	for (it=0, dsk=0; it<= KMAX; it+=2) {
	1289	l = (it-2 < m) ? m : it-2+(m&1);
	1290	dsk += LMAX+1 - l;
	1291	}
	1292	shtns->ykm_dct[im+1] = shtns->ykm_dct[im] + sk;
	1293	if (vector) shtns->dykm_dct[im+1] = shtns->dykm_dct[im] + dsk;
	1294	}
	1295
	1296	#if SHT_VERBOSE > 1
	1297	ticks tik0, tik1;
	1298	tik0 = getticks();
	1299	#endif
	1300
	1301	#ifdef OMP_FFTW
	1302	fftw_plan_with_nthreads(1);
	1303	#endif
	1304	dct = fftw_plan_r2r_1d( 2*NLAT_2, Z, Z, FFTW_REDFT10, FFTW_MEASURE ); // quick and dirty dct.
	1305	idct = fftw_plan_r2r_1d( 2*NLAT_2, Z, Z, FFTW_REDFT01, FFTW_MEASURE ); // quick and dirty idct.
	1306
	1307	init_SH_synth(shtns);
	1308
	1309	// precomputation for scalar product of Chebychev polynomials.
	1310	for(it=0; it<NLAT; it++)
	1311	is1[it] = 1./(1. - 4.itit);
	1312
	1313	#ifdef _OPENMP
	1314	#pragma omp parallel
	1315	{
	1316	int n=omp_get_num_threads();
	1317	int k=omp_get_thread_num();
	1318	init_SH_dct_m(shtns, is1, dct, idct, analysis, k, n);
	1319	}
	1320	#else
	1321	init_SH_dct_m(shtns, is1, dct, idct, analysis, 0, 1);
	1322	#endif
	1323
	1324	#if SHT_VERBOSE > 1
	1325	tik1 = getticks();
	1326	if (verbose>1) printf("\n ticks : %.3f\n", elapsed(tik1,tik0)/(NLMNLAT(MMAX+1)));
	1327	#endif
	1328	fftw_destroy_plan(idct); fftw_destroy_plan(dct);
	1329	}
	1330
	1331	#endif /* SHTNS_DCT */
	1332
	1333	/// \internal Generate a gauss grid (including weights)
	1334	static void grid_gauss(shtns_cfg shtns, double latdir)
	1335	{
	1336	long int it;
	1337	real iylm_fft_norm;
	1338	real xg[NLAT], wgl[NLAT]; // gauss points and weights.
	1339	const int overflow = 8*VSIZE2-1;
	1340
	1341	shtns->grid = GRID_GAUSS;
	1342	shtns->wg = VMALLOC((NLAT_2 +overflow) * sizeof(double)); // gauss weights, double precision.
	1343
	1344	iylm_fft_norm = 1.0; // FFT/SHT normalization for zlm (4pi normalized)
	1345	if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) iylm_fft_norm = 4*M_PIl; // FFT/SHT normalization for zlm (orthonormalized)
	1346	iylm_fft_norm /= (2*NPHI);
	1347	#if SHT_VERBOSE > 0
	1348	if (verbose) {
	1349	printf(" => using Gauss nodes\n");
	1350	if (2NLAT <= (SHT_NL_ORDER +1)LMAX) printf(" !! Warning : Gauss-Legendre anti-aliasing condition 2Nlat > %dLmax is not met.\n",SHT_NL_ORDER+1);
	1351	}
	1352	#endif
	1353	gauss_nodes(xg,wgl,NLAT); // generate gauss nodes and weights : ct = ]1,-1[ = cos(theta)
	1354	for (it=0; it<NLAT; it++) {
	1355	shtns->ct[it] = latdir * xg[it];
	1356	shtns->st[it] = SQRT((1.-xg[it])*(1.+xg[it]));
	1357	shtns->st_1[it] = 1.0/SQRT((1.-xg[it])*(1.+xg[it]));
	1358	}
	1359	for (it=0; it<NLAT_2; it++)
	1360	shtns->wg[it] = wgl[it]*iylm_fft_norm; // faster double-precision computations.
	1361	if (NLAT & 1) { // odd NLAT : adjust weigth of middle point.
	1362	shtns->wg[NLAT_2-1] *= 0.5;
	1363	}
	1364	for (it=NLAT_2; it < NLAT_2 +overflow; it++) shtns->wg[it] = 0.0; // padding for multi-way algorithm.
	1365
	1366	#if SHT_VERBOSE > 1
	1367	if (verbose>1) {
	1368	printf(" NLAT=%d, NLAT_2=%d\n",NLAT,NLAT_2);
	1369	// TEST if gauss points are ok.
	1370	double tmax = 0.0;
	1371	for (it = 0; it<NLAT_2; it++) {
	1372	double t = legendre_Pl(NLAT, shtns->ct[it]);
	1373	if (t>tmax) tmax = t;
	1374	// printf("i=%d, x=%12.12g, p=%12.12g\n",it,ct[it],t);
	1375	}
	1376	printf(" max zero at Gauss nodes for Pl[l=NLAT] : %g\n",tmax);
	1377	if (NLAT_2 < 100) {
	1378	printf(" Gauss nodes :");
	1379	for (it=0;it<NLAT_2; it++)
	1380	printf(" %g",shtns->ct[it]);
	1381	printf("\n");
	1382	}
	1383	}
	1384	#endif
	1385	}
	1386
	1387	#ifdef SHTNS_DCT
	1388	/// \internal Generate an equi-spaced theta grid (Chebychev points, excluding poles) for Féjer-DCT SHT.
	1389	static void grid_dct(shtns_cfg shtns, double latdir)
	1390	{
	1391	long int it;
	1392
	1393	shtns->grid = GRID_REGULAR;
	1394	#if SHT_VERBOSE > 0
	1395	if (verbose) {
	1396	printf(" => using equaly spaced nodes with DCT acceleration\n");
	1397	if (NLAT <= SHT_NL_ORDER LMAX) printf(" !! Warning : DCT anti-aliasing condition Nlat > %dLmax is not met.\n",SHT_NL_ORDER);
	1398	if (NLAT != fft_int(NLAT,7)) printf(" !! Warning : Nlat is not optimal for FFTW !\n");
	1399	}
	1400	#endif
	1401	if (NLAT & 1) shtns_runerr("NLAT must be even (DCT)");
	1402	if (NLAT <= LMAX+1) shtns_runerr("NLAT should be at least LMAX+2 (DCT)");
	1403
	1404	for (it=0; it<NLAT; it++) { // Chebychev points : equaly spaced but skipping poles.
	1405	real th = M_PIl; th = (th(2it+1))/(2*NLAT);
	1406	shtns->ct[it] = latdir * COS(th);
	1407	shtns->st[it] = SIN(th);
	1408	shtns->st_1[it] = 1.0/SIN(th);
	1409	}
	1410
	1411	#if SHT_VERBOSE > 1
	1412	if (verbose>1) {
	1413	printf(" NLAT=%d, NLAT_2=%d\n",NLAT,NLAT_2);
	1414	double tmax = 0.0;
	1415	for (it=0;it<NLAT_2; it++) {
	1416	double ct = shtns->ct[it]; double st = shtns->st[it];
	1417	double t = fabs((ctct + stst) -1.0);
	1418	if (t > tmax) tmax=t;
	1419	}
	1420	printf(" max st^2 + ct^2 -1 = %g\n",tmax);
	1421	if (NLAT_2 < 100) {
	1422	printf(" DCT nodes :");
	1423	for (it=0; it<NLAT_2; it++)
	1424	printf(" %g",shtns->ct[it]);
	1425	printf("\n");
	1426	}
	1427	}
	1428	#endif
	1429	}
	1430	#endif /* SHTNS_DCT */
	1431
	1432	/// \internal Generate an equi-spaced theta grid including the poles, for synthesis only.
	1433	static void grid_equal_polar(shtns_cfg shtns, double latdir)
	1434	{
	1435	long int j;
	1436
	1437	shtns->grid = GRID_POLES;
	1438	#if SHT_VERBOSE > 0
	1439	if (verbose) printf(" => using Equaly Spaced Nodes including poles\n");
	1440	#endif
	1441	// cos theta of latidunal points (equaly spaced in theta)
	1442	double f = M_PIl/(NLAT-1);
	1443	for (j=0; j<NLAT; j++) {
	1444	shtns->ct[j] = latdir * cos(f*j);
	1445	shtns->st[j] = sin(f*j);
	1446	shtns->st_1[j] = 1.0/sin(f*j);
	1447	}
	1448	#if SHT_VERBOSE > 0
	1449	if (verbose) printf(" !! Warning : only synthesis (inverse transform) supported for this grid !\n");
	1450	#endif
	1451	}
	1452
	1453
	1454	/* TEST AND TIMING FUNCTIONS */
	1455
	1456	/// \internal return the max error for a back-and-forth SHT transform.
	1457	/// this function is used to internally measure the accuracy.
	1458	double SHT_error(shtns_cfg shtns, int vector)
	1459	{
	1460	cplx Tlm0=0, Slm0=0, Tlm=0, Slm=0;
	1461	double Sh=0, Th=0;
	1462	double t, tmax, n2, err;
	1463	long int i, jj, nlm_cplx;
	1464
	1465	srand( time(NULL) ); // init random numbers.
	1466
	1467	Slm0 = (cplx ) VMALLOC(sizeof(cplx) NLM);
	1468	Slm = (cplx ) VMALLOC(sizeof(cplx) NLM);
	1469	Sh = (double ) VMALLOC( NSPAT_ALLOC(shtns) sizeof(double) );
	1470	if ((Sh==0) \|\| (Slm==0) \|\| (Slm0==0)) shtns_runerr("not enough memory.");
	1471	if (vector) {
	1472	Tlm0 = (cplx ) VMALLOC(sizeof(cplx) NLM);
	1473	Tlm = (cplx ) VMALLOC(sizeof(cplx) NLM);
	1474	Th = (double ) VMALLOC( NSPAT_ALLOC(shtns) sizeof(double) );
	1475	if ((Th==0) \|\| (Tlm==0) \|\| (Tlm0==0)) vector=0;
	1476	}
	1477
	1478	// m = nphi/2 is also real if nphi is even.
	1479	nlm_cplx = ( MMAX2 == NPHI ) ? LiM(shtns, MRESMMAX,MMAX) : NLM;
	1480	t = 1.0 / (RAND_MAX/2);
	1481	for (i=0; i<NLM; i++) {
	1482	if ((i<=LMAX)\|\|(i>=nlm_cplx)) { // m=0 or m*2=nphi : real random data
	1483	Slm0[i] = t*((double) (rand() - RAND_MAX/2));
	1484	if (vector) Tlm0[i] = t*((double) (rand() - RAND_MAX/2));
	1485	} else { // m>0 : complex random data
	1486	Slm0[i] = t((double) (rand() - RAND_MAX/2)) + It*((double) (rand() - RAND_MAX/2));
	1487	if (vector) Tlm0[i] = t((double) (rand() - RAND_MAX/2)) + It*((double) (rand() - RAND_MAX/2));
	1488	}
	1489	}
	1490
	1491	SH_to_spat(shtns, Slm0,Sh); // scalar SHT
	1492	spat_to_SH(shtns, Sh, Slm);
	1493	for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
	1494	t = cabs(Slm[i] - Slm0[i]);
	1495	n2 += t*t;
	1496	if (t>tmax) { tmax = t; jj = i; }
	1497	}
	1498	err = tmax;
	1499	#if SHT_VERBOSE > 1
	1500	if (verbose>1) printf(" scalar SH - poloidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
	1501	#endif
	1502
	1503	if (vector) {
	1504	Slm0[0] = 0.0; Tlm0[0] = 0.0; // l=0, m=0 n'a pas de signification sph/tor
	1505	SHsphtor_to_spat(shtns, Slm0, Tlm0, Sh, Th); // vector SHT
	1506	spat_to_SHsphtor(shtns, Sh, Th, Slm, Tlm);
	1507	for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
	1508	t = cabs(Slm[i] - Slm0[i]);
	1509	n2 += t*t;
	1510	if (t>tmax) { tmax = t; jj = i; }
	1511	}
	1512	if (tmax > err) err = tmax;
	1513	#if SHT_VERBOSE > 1
	1514	if (verbose>1) printf(" vector SH - spheroidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
	1515	#endif
	1516	for (i=0, tmax=0., n2=0., jj=0; i<NLM; i++) { // compute error
	1517	t = cabs(Tlm[i] - Tlm0[i]);
	1518	n2 += t*t;
	1519	if (t>tmax) { tmax = t; jj = i; }
	1520	}
	1521	if (tmax > err) err = tmax;
	1522	#if SHT_VERBOSE > 1
	1523	if (verbose>1) printf(" - toroidal rms error = %.3g max error = %.3g for l=%hu,lm=%ld\n",sqrt(n2/NLM),tmax,shtns->li[jj],jj);
	1524	#endif
	1525	}
	1526
	1527	if (Th) VFREE(Th); if (Tlm) VFREE(Tlm); if (Tlm0) VFREE(Tlm0);
	1528	VFREE(Sh); VFREE(Slm); VFREE(Slm0);
	1529	return(err); // return max error.
	1530	}
	1531
	1532
	1533	#if SHT_VERBOSE == 1
	1534	#define PRINT_DOT if (verbose>=1) { printf("."); fflush(stdout); }
	1535	#else
	1536	#define PRINT_DOT (0);
	1537	#endif
	1538
	1539	/// \internal measure time used for a transform function
	1540	static double get_time(shtns_cfg shtns, int nloop, int npar, char* name, void fptr, void i1, void i2, void i3, void o1, void o2, void *o3, int l)
	1541	{
	1542	double t;
	1543	int i;
	1544	ticks tik0, tik1;
	1545
	1546	if (fptr == NULL) return(0.0);
	1547
	1548	tik1 = getticks();
	1549	for (i=0; i<nloop; i++) {
	1550	switch(npar) {
	1551	case 2: (*(pf2l)fptr)(shtns, i1,o1, l); break; // l may be discarded.
	1552	case 3: (*(pf3l)fptr)(shtns, i1,o1,o2, l); break;
	1553	case 4: (*(pf4l)fptr)(shtns, i1,i2,o1,o2, l); break;
	1554	default: (*(pf6l)fptr)(shtns, i1,i2,i3, o1,o2,o3, l); break;
	1555	}
	1556	if (i==0) tik0 = getticks();
	1557	}
	1558	if (nloop == 1) {
	1559	t = elapsed(tik0, tik1);
	1560	} else {
	1561	tik1 = getticks();
	1562	t = elapsed(tik1, tik0)/(nloop-1); // discard first iteration.
	1563	}
	1564	#if SHT_VERBOSE > 1
	1565	if (verbose>1) { printf(" t(%s) = %.3g",name,t); fflush(stdout); }
	1566	#endif
	1567	return t;
	1568	}
	1569
	1570
	1571	/// \internal choose fastest between on-the-fly and gauss algorithms.
	1572	/// *nlp is the number of loops. If zero, it is set to a good value.
	1573	/// on_the_fly : 1 = skip all memory algorithm. 0 = include memory and on-the-fly. -1 = test only DCT.
	1574	/// returns time without dct / best time with dct (or 0 if no dct available).
	1575	static double choose_best_sht(shtns_cfg shtns, int* nlp, int vector, int dct_mtr)
	1576	{
	1577	cplx Qlm=0, Slm=0, *Tlm=0;
	1578	double Qh=0, Sh=0, *Th=0;
	1579	int m, i, i0, minc, nloop, alg_end;
	1580	int typ_lim = SHT_NTYP; // time every type.
	1581	double t0, t, tt, r;
	1582	double tdct, tnodct;
	1583	clock_t tcpu;
	1584	int on_the_fly_only = (shtns->ylm == NULL); // only on-the-fly.
	1585	int otf_analys = (shtns->wg != NULL); // on-the-fly analysis supported.
	1586
	1587	if (NLAT < VSIZE24) return(0.0); // on-the-fly not possible for NLAT_2 < 2NWAY (overflow) and DCT not efficient for low NLAT.
	1588	if ((dct_mtr != 0) && (shtns->ykm_dct == NULL)) return(0.0); // no dct available : do nothing.
	1589
	1590	size_t nspat = sizeof(double) * NSPAT_ALLOC(shtns);
	1591	size_t nspec = sizeof(cplx)* NLM;
	1592	if (nspec>nspat) nspat=nspec;
	1593	Sh = (double ) VMALLOC(nspat); Slm = (cplx ) VMALLOC(nspec);
	1594	if ((Sh==0) \|\| (Slm==0)) shtns_runerr("not enough memory.");
	1595	if (vector) {
	1596	Th = (double ) VMALLOC(nspat); Qh = (double ) VMALLOC(nspat);
	1597	Tlm = (cplx ) VMALLOC(nspec); Qlm = (cplx ) VMALLOC(nspec);
	1598	if ( (Th==0) \|\| (Qh==0) \|\| (Tlm==0) \|\| (Qlm==0) ) vector = 0;
	1599	}
	1600
	1601	for (i=0;i<NLM;i++) {
	1602	int l = shtns->li[i];
	1603	Slm[i] = shtns->l_2[l] + 0.5Ishtns->l_2[l];
	1604	if (vector) {
	1605	Tlm[i] = 0.5shtns->l_2[l] + Ishtns->l_2[l];
	1606	Qlm[i] = 3shtns->l_2[l] + 2I*shtns->l_2[l];
	1607	}
	1608	}
	1609
	1610	#if SHT_VERBOSE > 0
	1611	if (verbose) {
	1612	if (dct_mtr != 0) printf(" finding optimal m-truncation for DCT synthesis");
	1613	else printf(" finding optimal algorithm");
	1614	fflush(stdout);
	1615	}
	1616	#endif
	1617
	1618	if (*nlp <= 0) {
	1619	// find good nloop by requiring less than 3% difference between 2 consecutive timings.
	1620	m=0; nloop = 1; // number of loops to get timings.
	1621	r = 0.0; tt = 1.0;
	1622	do {
	1623	if ((r > 0.03)\|\|(tt<0.1)) {
	1624	m = 0; nloop *= 3;
	1625	} else m++;
	1626	tcpu = clock();
	1627	t0 = get_time(shtns, nloop, 2, "", sht_func[SHT_STD][SHT_FLY2][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
	1628	tcpu = clock() - tcpu; tt = 1.e-6 * tcpu;
	1629	if (tt >= SHT_TIME_LIMIT) break; // we should not exceed 1 second
	1630	t = get_time(shtns, nloop, 2, "", sht_func[SHT_STD][SHT_FLY2][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
	1631	r = fabs(2.0*(t-t0)/(t+t0));
	1632	#if SHT_VERBOSE > 1
	1633	if (verbose>1) printf(", nloop=%d, r=%g, m=%d (real time = %g s)\n",nloop,r,m,tt);
	1634	if (tt >= 0.01) break; // faster timing in debug mode.
	1635	#endif
	1636	PRINT_DOT
	1637	} while((nloop<10000)&&(m < 3));
	1638	*nlp = nloop;
	1639	} else {
	1640	nloop = *nlp;
	1641	}
	1642	#if SHT_VERBOSE > 1
	1643	if (verbose>1) printf(" => nloop=%d (takes %g s)\n",nloop, tt);
	1644	#endif
	1645	if (vector == 0) typ_lim = SHT_TYP_VSY; // time only scalar transforms.
	1646	// if (tt > 3.0) typ_lim = SHT_TYP_VSY; // time only scalar transforms.
	1647	// if (tt > 10.0) goto done; // timing this will be too slow...
	1648
	1649	int ityp = 0; do {
	1650	if ((dct_mtr != 0) && (ityp >= 4)) break; // dct !=0 : only scalar and vector.
	1651	if (ityp == 2) nloop = (nloop+1)/2; // scalar ar done.
	1652	t0 = 1e100;
	1653	i0 = 0;
	1654	if (MTR_DCT < 0) i0 = SHT_MEM; // skip dct.
	1655	if (on_the_fly_only) i0 = SHT_SV; // only on-the-fly (SV is then also on-the-fly)
	1656	alg_end = SHT_NALG;
	1657	if (shtns->nthreads <= 1) alg_end = SHT_OMP1; // no OpenMP with 1 thread.
	1658	if ((ityp&1) && (otf_analys == 0)) alg_end = SHT_FLY1; // no on-the-fly analysis for regular grid.
	1659	for (i=i0, m=0; i<alg_end; i++) {
	1660	if (sht_func[0][i][ityp] != NULL) m++; // count number of algos
	1661	}
	1662	if (m >= 2) { // don't time if there is only 1 algo !
	1663	#if SHT_VERBOSE > 1
	1664	if (verbose>1) { printf("finding best %s ...",sht_type[ityp]); fflush(stdout); }
	1665	#endif
	1666	i = i0-1; i0 = -1;
	1667	while (++i < alg_end) {
	1668	void *pf = sht_func[0][i][ityp];
	1669	if (pf != NULL) {
	1670	if (ityp&1) { // analysis
	1671	t = get_time(shtns, nloop, sht_npar[ityp], sht_name[i], pf, Sh, Th, Qh, Slm, Tlm, Qlm, LMAX);
	1672	} else {
	1673	t = get_time(shtns, nloop, sht_npar[ityp], sht_name[i], pf, Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
	1674	}
	1675	if (i < SHT_FLY1) t *= 1.03; // 3% penality for memory based transforms.
	1676	#ifdef _OPENMP
	1677	if ((i >= SHT_OMP1)\|\|(i == SHT_SV)) t *= 1.3; // 30% penality for openmp transforms.
	1678	#endif
	1679	if (t < t0) { i0 = i; t0 = t; PRINT_VERB("*"); }
	1680	}
	1681	}
	1682	if (i0 >= 0) {
	1683	for (int iv=0; iv<SHT_NVAR; iv++) {
	1684	if (sht_func[iv][i0][ityp]) shtns->ftable[iv][ityp] = sht_func[iv][i0][ityp];
	1685	if (ityp == 4) { // only one timing for both gradients variants.
	1686	if (sht_func[iv][i0][ityp+1]) shtns->ftable[iv][ityp+1] = sht_func[iv][i0][ityp+1];
	1687	}
	1688	}
	1689	PRINT_DOT
	1690	#if SHT_VERBOSE > 1
	1691	if (verbose>1) printf(" => %s\n",sht_name[i0]);
	1692	#endif
	1693	}
	1694	}
	1695	if (ityp == 4) ityp++; // skip second gradient
	1696	} while(++ityp < typ_lim);
	1697
	1698	#ifdef SHTNS_DCT
	1699	if (dct_mtr != 0) { // find the best DCT timings...
	1700	#if SHT_VERBOSE > 1
	1701	if (verbose>1) { printf("finding best mtr_dct ..."); fflush(stdout); }
	1702	#endif
	1703	minc = MMAX/20 + 1; // don't test every single m.
	1704	m = -1; i = -1; t0 = 0.0; // reference = no dct.
	1705	if (sht_func[SHT_STD][SHT_DCT][SHT_TYP_SSY] != NULL)
	1706	t0 += get_time(shtns, *nlp, 2, "s", shtns->ftable[SHT_STD][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
	1707	if ( (sht_func[SHT_STD][SHT_DCT][SHT_TYP_VSY] != NULL) && (vector) )
	1708	t0 += get_time(shtns, nloop, 4, "v", shtns->ftable[SHT_STD][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
	1709	tnodct = t0;
	1710	for (m=0; m<=MMAX; m+=minc) {
	1711	#if SHT_VERBOSE > 1
	1712	if (verbose>1) printf("\n\tm=%d :",m);
	1713	#endif
	1714	if (Set_MTR_DCT(shtns, m) >= 0) {
	1715	t = get_time(shtns, *nlp, 2, "sdct", sht_func[SHT_STD][SHT_DCT][SHT_TYP_SSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
	1716	if (vector)
	1717	t += get_time(shtns, nloop, 4, "vdct", sht_func[SHT_STD][SHT_DCT][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
	1718	if (t < t0) { t0 = t; i = m; PRINT_VERB("*"); }
	1719	PRINT_DOT
	1720	}
	1721	}
	1722	tdct = t0;
	1723	Set_MTR_DCT(shtns, i); // the best DCT is chosen.
	1724	#if SHT_VERBOSE > 0
	1725	if (verbose) printf(" mtr_dct=%d (%.1f%% performance gain)", MTR_DCTMRES, 100.(tnodct/tdct-1.));
	1726	#endif
	1727	}
	1728	#endif
	1729
	1730	done:
	1731	#if SHT_VERBOSE > 0
	1732	if (verbose) printf("\n");
	1733	#endif
	1734	if (Qlm) VFREE(Qlm); if (Tlm) VFREE(Tlm);
	1735	if (Qh) VFREE(Qh); if (Th) VFREE(Th);
	1736	if (Slm) VFREE(Slm); if (Sh) VFREE(Sh);
	1737	if (dct_mtr > 0) {
	1738	return(tnodct/tdct);
	1739	} else return(0.0);
	1740	}
	1741
	1742
	1743	void shtns_print_version() {
	1744	printf("[" PACKAGE_STRING "] built " __DATE__ ", " __TIME__ ", id: " _SIMD_NAME_ "\n");
	1745	}
	1746
	1747	void fprint_ftable(FILE* fp, void* ftable[SHT_NVAR][SHT_NTYP])
	1748	{
	1749	for (int iv=0; iv<SHT_NVAR; iv++) {
	1750	fprintf(fp, "\n %4s:",sht_var[iv]);
	1751	void** f = ftable[iv];
	1752	for (int it=0; it<SHT_NTYP; it++) {
	1753	if (f[it] != NULL) {
	1754	for (int ia=0; ia<SHT_NALG; ia++)
	1755	if (sht_func[iv][ia][it] == f[it]) {
	1756	fprintf(fp, "%5s ",sht_name[ia]); break;
	1757	}
	1758	} else fprintf(fp, " none ");
	1759	}
	1760	}
	1761	}
	1762
	1763	void shtns_print_cfg(shtns_cfg shtns)
	1764	{
	1765	printf("Lmax=%d, MmaxMres=%d, Mres=%d, Nlm=%d [%d threads, ",LMAX, MMAXMRES, MRES, NLM, shtns->nthreads);
	1766	if (shtns->norm & SHT_REAL_NORM) printf("'real' norm, ");
	1767	if (shtns->norm & SHT_NO_CS_PHASE) printf("no Condon-Shortley phase, ");
	1768	if (SHT_NORM == sht_fourpi) printf("4.pi normalized]\n");
	1769	else if (SHT_NORM == sht_schmidt) printf("Schmidt semi-normalized]\n");
	1770	else printf("orthonormalized]\n");
	1771	if (shtns->ct == NULL) return; // no grid is set
	1772
	1773	switch(shtns->grid) {
	1774	case GRID_GAUSS : printf("Gauss grid"); break;
	1775	case GRID_REGULAR : printf("Regular grid (mtr_dct=%d)",shtns->mtr_dct); break;
	1776	case GRID_POLES : printf("Regular grid including poles"); break;
	1777	default : printf("Unknown grid");
	1778	}
	1779	printf(" : Nlat=%d, Nphi=%d\n", NLAT, NPHI);
	1780	printf(" ");
	1781	for (int it=0; it<SHT_NTYP; it++)
	1782	printf("%5s ",sht_type[it]);
	1783	fprint_ftable(stdout, shtns->ftable);
	1784	printf("\n");
	1785	}
	1786
	1787
	1788	/// \internal saves config to a file for later restart.
	1789	int config_save(shtns_cfg shtns, int req_flags)
	1790	{
	1791	int err = 0;
	1792
	1793	if (shtns->ct == NULL) return -1; // no grid set
	1794
	1795	if ((shtns->nphi > 1)\|\|(shtns->mtr_dct >= 0)) {
	1796	FILE* f = fopen("shtns_cfg_fftw","w");
	1797	if (f != NULL) {
	1798	fftw_export_wisdom_to_file(f);
	1799	fclose(f);
	1800	} else err -= 2;
	1801	}
	1802
	1803	FILE *fcfg = fopen("shtns_cfg","a");
	1804	if (fcfg != NULL) {
	1805	fprintf(fcfg, "%s %s %d %d %d %d %d %d %d %d %d %d",PACKAGE_VERSION, _SIMD_NAME_, shtns->lmax, shtns->mmax, shtns->mres, shtns->nphi, shtns->nlat, shtns->grid, shtns->nthreads, req_flags, shtns->nlorder, shtns->mtr_dct);
	1806	fprint_ftable(fcfg, shtns->ftable);
	1807	fprintf(fcfg,"\n");
	1808	fclose(fcfg);
	1809	} else err -= 4;
	1810
	1811	#if SHT_VERBOSE > 0
	1812	if (err < 0) fprintf(stderr,"! Warning ! SHTns could not save config\n");
	1813	#endif
	1814	return err;
	1815	}
	1816
	1817	/// \internal try to load config from a file
	1818	int config_load(shtns_cfg shtns, int req_flags)
	1819	{
	1820	void* ft2[SHT_NVAR][SHT_NTYP]; // pointers to transform functions.
	1821	int lmax2, mmax2, mres2, nphi2, nlat2, grid2, nthreads2, req_flags2, nlorder2, mtr_dct2;
	1822	int found = 0;
	1823	char version[32], simd[8], alg[8];
	1824
	1825	if (shtns->ct == NULL) return -1; // no grid set
	1826
	1827	if ((req_flags & 255) == sht_quick_init) req_flags += sht_gauss - sht_quick_init; // quick_init uses gauss.
	1828
	1829	FILE *fcfg = fopen("shtns_cfg","r");
	1830	if (fcfg != NULL) {
	1831	int i=0;
	1832	while(1) {
	1833	fscanf(fcfg, "%30s %8s %d %d %d %d %d %d %d %d %d %d",version, simd, &lmax2, &mmax2, &mres2, &nphi2, &nlat2, &grid2, &nthreads2, &req_flags2, &nlorder2, &mtr_dct2);
	1834	for (int iv=0; iv<SHT_NVAR; iv++) {
	1835	fscanf(fcfg, "%7s", alg);
	1836	for (int it=0; it<SHT_NTYP; it++) {
	1837	fscanf(fcfg, "%7s", alg),
	1838	ft2[iv][it] = 0;
	1839	for (int ia=0; ia<SHT_NALG; ia++) {
	1840	if (strcmp(alg, sht_name[ia]) == 0) {
	1841	ft2[iv][it] = sht_func[iv][ia][it];
	1842	break;
	1843	}
	1844	}
	1845	}
	1846	}
	1847	if (feof(fcfg)) break;
	1848	#ifndef SHTNS_DCT
	1849	if (mtr_dct2 <= 0)
	1850	#endif
	1851	if ((shtns->lmax == lmax2) && (shtns->mmax == mmax2) && (shtns->mres == mres2) && (shtns->nthreads == nthreads2) &&
	1852	(shtns->nphi == nphi2) && (shtns->nlat == nlat2) && (shtns->grid == grid2) && (req_flags == req_flags2) &&
	1853	(shtns->nlorder == nlorder2) && (strcmp(simd, _SIMD_NAME_)==0)) {
	1854	#if SHT_VERBOSE > 0
	1855	if (verbose > 0) printf(" + using saved config\n");
	1856	#endif
	1857	#if SHT_VERBOSE > 1
	1858	if (verbose > 1) {
	1859	fprint_ftable(stdout, ft2);
	1860	printf("\n");
	1861	}
	1862	#endif
	1863	#ifdef SHTNS_DCT
	1864	Set_MTR_DCT(shtns, mtr_dct2); // use loaded mtr_dct
	1865	#endif
	1866	for (int iv=0; iv<SHT_NVAR; iv++)
	1867	for (int it=0; it<SHT_NTYP; it++)
	1868	if (ft2[iv][it]) shtns->ftable[iv][it] = ft2[iv][it]; // accept only non-null pointer
	1869	found = 1;
	1870	break;
	1871	}
	1872	}
	1873	fclose(fcfg);
	1874	return found;
	1875	} else {
	1876	#if SHT_VERBOSE > 0
	1877	if (verbose) fprintf(stderr,"! Warning ! SHTns could not load config\n");
	1878	#endif
	1879	return -2; // file not found
	1880	}
	1881	}
	1882
	1883	/// \internal returns 1 if val cannot fit in dest (unsigned)
	1884	#define IS_TOO_LARGE(val, dest) (sizeof(dest) >= sizeof(val)) ? 0 : ( ( val >= (1<<(8*sizeof(dest))) ) ? 1 : 0 )
	1885
	1886	/// \internal returns the size that must be allocated for an shtns_info.
	1887	#define SIZEOF_SHTNS_INFO(mmax) ( sizeof(struct shtns_info) + (mmax+1)*( sizeof(int)+sizeof(unsigned short) ) )
	1888
	1889	/* PUBLIC INITIALIZATION & DESTRUCTION */
	1890
	1891	/** \addtogroup init Initialization functions.
	1892	*/
	1893	//@{
	1894
	1895	/*! This sets the description of spherical harmonic coefficients.
	1896	* It tells SHTns how to interpret spherical harmonic coefficient arrays, and it sets usefull arrays.
	1897	* Returns the configuration to be passed to subsequent transform functions, which is basicaly a pointer to a \ref shtns_info struct.
	1898	* \param lmax : maximum SH degree that we want to describe.
	1899	* \param mmax : number of azimutal wave numbers.
	1900	* \param mres : \c 2.pi/mres is the azimutal periodicity. \c mmax*mres is the maximum SH order.
	1901	* \param norm : define the normalization of the spherical harmonics (\ref shtns_norm)
	1902	* + optionaly disable Condon-Shortley phase (ex: \ref sht_schmidt \| \ref SHT_NO_CS_PHASE)
	1903	* + optionaly use a 'real' normalization (ex: \ref sht_fourpi \| \ref SHT_REAL_NORM)
	1904	*/
	1905	shtns_cfg shtns_create(int lmax, int mmax, int mres, enum shtns_norm norm)
	1906	{
	1907	shtns_cfg shtns, s2;
	1908	int im, m, l, lm;
	1909	int with_cs_phase = 1; /// Condon-Shortley phase (-1)^m is used by default.
	1910	double mpos_renorm = 1.0; /// renormalization of m>0.
	1911	int larrays_ok = 0;
	1912	int legendre_ok = 0;
	1913	int l_2_ok = 0;
	1914
	1915	// if (lmax < 1) shtns_runerr("lmax must be larger than 1");
	1916	if (lmax < 2) shtns_runerr("lmax must be at least 2");
	1917	if (IS_TOO_LARGE(lmax, shtns->lmax)) shtns_runerr("lmax too large");
	1918	if (mmaxmres > lmax) shtns_runerr("MMAXMRES should not exceed LMAX");
	1919	if (mres <= 0) shtns_runerr("MRES must be > 0");
	1920
	1921	// allocate new setup and initialize some variables (used as flags) :
	1922	shtns = malloc( SIZEOF_SHTNS_INFO(mmax) );
	1923	if (shtns == NULL) return shtns; // FAIL
	1924	{
	1925	void p0 = (void) &shtns->tm; // first pointer in struct.
	1926	void p1 = (void) &shtns->Y00_1; // first non-pointer.
	1927	while(p0 < p1) *p0++ = NULL; // write NULL to every pointer.
	1928	shtns->lmidx = (int*) (shtns + 1); // lmidx is stored at the end of the struct...
	1929	shtns->tm = (unsigned short*) (shtns->lmidx + (mmax+1)); // and tm just after.
	1930	shtns->ct = NULL; shtns->st = NULL;
	1931	shtns->nphi = 0; shtns->nlat = 0; shtns->nlat_2 = 0; shtns->nspat = 0; // public data
	1932	}
	1933
	1934	// copy sizes.
	1935	shtns->norm = norm;
	1936	if (norm & SHT_NO_CS_PHASE)
	1937	with_cs_phase = 0;
	1938	if (norm & SHT_REAL_NORM)
	1939	mpos_renorm = 0.5; // normalization for 'real' spherical harmonics.
	1940
	1941	shtns->mmax = mmax; shtns->mres = mres; shtns->lmax = lmax;
	1942	shtns->nlm = nlm_calc(lmax, mmax, mres);
	1943	shtns->nthreads = omp_threads;
	1944	if (omp_threads > mmax+1) shtns->nthreads = mmax+1; // limit the number of threads to mmax+1
	1945	#if SHT_VERBOSE > 0
	1946	if (verbose) {
	1947	shtns_print_version();
	1948	printf(" "); shtns_print_cfg(shtns);
	1949	}
	1950	#endif
	1951
	1952	s2 = sht_data; // check if some data can be shared ...
	1953	while(s2 != NULL) {
	1954	if ((s2->mmax >= mmax) && (s2->mres == mres)) {
	1955	if (s2->lmax == lmax) { // we can reuse the l-related arrays (li + copy lmidx)
	1956	shtns->li = s2->li; shtns->mi = s2->mi;
	1957	for (im=0; im<=mmax; im++) shtns->lmidx[im] = s2->lmidx[im];
	1958	larrays_ok = 1;
	1959	}
	1960	if ( (s2->lmax >= lmax) && (s2->norm == norm) ) { // we can reuse the legendre tables.
	1961	shtns->alm = s2->alm; shtns->blm = s2->blm;
	1962	legendre_ok = 1;
	1963	}
	1964	}
	1965	if (s2->lmax >= lmax) { // we can reuse l_2
	1966	shtns->l_2 = s2->l_2;
	1967	l_2_ok = 1;
	1968	}
	1969	s2 = s2->next;
	1970	}
	1971	if (larrays_ok == 0) {
	1972	// alloc spectral arrays
	1973	shtns->li = (unsigned short ) malloc( 2NLM*sizeof(unsigned short) ); // NLM defined at runtime.
	1974	shtns->mi = shtns->li + NLM;
	1975	for (im=0, lm=0; im<=MMAX; im++) { // init l-related arrays.
	1976	m = im*MRES;
	1977	shtns->lmidx[im] = lm -m; // virtual pointer for l=0
	1978	for (l=im*MRES;l<=LMAX;l++) {
	1979	shtns->li[lm] = l; shtns->mi[lm] = m;
	1980	lm++;
	1981	}
	1982	}
	1983	if (lm != NLM) shtns_runerr("unexpected error");
	1984	}
	1985	if (legendre_ok == 0) { // this quickly precomputes some values for the legendre recursion.
	1986	legendre_precomp(shtns, SHT_NORM, with_cs_phase, mpos_renorm);
	1987	}
	1988	if (l_2_ok == 0) {
	1989	shtns->l_2 = (double ) malloc( (LMAX+1)sizeof(double) );
	1990	shtns->l_2[0] = 0.0; // undefined for l=0 => replace with 0.
	1991	real one = 1.0;
	1992	for (l=1; l<=LMAX; l++) shtns->l_2[l] = one/(l*(l+1));
	1993	}
	1994
	1995	switch(SHT_NORM) {
	1996	case sht_schmidt:
	1997	shtns->Y00_1 = 1.0; shtns->Y10_ct = 1.0;
	1998	break;
	1999	case sht_fourpi:
	2000	shtns->Y00_1 = 1.0; shtns->Y10_ct = sqrt(1./3.);
	2001	break;
	2002	case sht_orthonormal:
	2003	default:
	2004	shtns->Y00_1 = sqrt(4.M_PI); shtns->Y10_ct = sqrt(4.M_PI/3.);
	2005	// Y11_st = sqrt(2.*M_PI/3.); // orthonormal : \f$ \sin\theta\cos\phi/(Y_1^1 + Y_1^{-1}) = -\sqrt{2 \pi /3} \f$
	2006	}
	2007	shtns->Y11_st = shtns->Y10_ct * sqrt(0.5/mpos_renorm);
	2008	if (with_cs_phase) shtns->Y11_st *= -1.0; // correct Condon-Shortley phase
	2009
	2010	// save a pointer to this setup and return.
	2011	shtns->next = sht_data; // reference of previous setup (may be NULL).
	2012	sht_data = shtns; // keep track of new setup.
	2013	return(shtns);
	2014	}
	2015
	2016	/// Copy a given config but allow a different (smaller) mmax and the possibility to enable/disable fft (beta).
	2017	shtns_cfg shtns_create_with_grid(shtns_cfg base, int mmax, int nofft)
	2018	{
	2019	shtns_cfg shtns;
	2020
	2021	if (mmax > base->mmax) return (NULL); // fail if mmax larger than source config.
	2022
	2023	shtns = malloc( SIZEOF_SHTNS_INFO(mmax) );
	2024	memcpy(shtns, base, SIZEOF_SHTNS_INFO(mmax) ); // copy all
	2025	shtns->lmidx = (int*) shtns+1; // lmidx is stored at the end of the struct...
	2026	shtns->tm = (unsigned short*) (shtns->lmidx + (mmax+1)); // ...and tm just after.
	2027
	2028	if (mmax != shtns->mmax) {
	2029	shtns->mmax = mmax;
	2030	for (int im=0; im<=mmax; im++) {
	2031	shtns->lmidx[im] = base->lmidx[im];
	2032	shtns->tm[im] = base->tm[im];
	2033	}
	2034	#ifdef SHTNS_DCT
	2035	if (mmax < shtns->mtr_dct) {
	2036	shtns->idct = NULL; // do not destroy the plan of the source.
	2037	Set_MTR_DCT(shtns, mmax); // adjut mtr_dct if required.
	2038	}
	2039	#endif
	2040	if (mmax == 0) {
	2041	// TODO we may disable fft and replace with a phi-averaging function ...
	2042	// ... then switch to axisymmetric functions :
	2043	// init_sht_array_func(shtns);
	2044	// choose_best_sht(shtns, &nloop, 0);
	2045	}
	2046	}
	2047	if (nofft != 0) {
	2048	shtns->ncplx_fft = -1; // fft disabled.
	2049	}
	2050
	2051	// save a pointer to this setup and return.
	2052	shtns->next = sht_data; // reference of previous setup (may be NULL).
	2053	sht_data = shtns; // keep track of new setup.
	2054	return(shtns);
	2055	}
	2056
	2057	/// release all resources allocated by a grid.
	2058	void shtns_unset_grid(shtns_cfg shtns)
	2059	{
	2060	if (ref_count(shtns, &shtns->wg) == 1) VFREE(shtns->wg);
	2061	shtns->wg = NULL;
	2062	free_SH_dct(shtns);
	2063	free_SHTarrays(shtns);
	2064	shtns->nlat = 0; shtns->nlat_2 = 0;
	2065	shtns->nphi = 0; shtns->nspat = 0;
	2066	}
	2067
	2068	/// release all resources allocated by a given shtns_cfg.
	2069	void shtns_destroy(shtns_cfg shtns)
	2070	{
	2071	free_unused(shtns, &shtns->l_2);
	2072	if (shtns->blm != shtns->alm)
	2073	free_unused(shtns, &shtns->blm);
	2074	free_unused(shtns, &shtns->alm);
	2075	free_unused(shtns, &shtns->li);
	2076
	2077	shtns_unset_grid(shtns);
	2078
	2079	if (sht_data == shtns) {
	2080	sht_data = shtns->next; // forget shtns
	2081	} else {
	2082	shtns_cfg s2 = sht_data;
	2083	while (s2 != NULL) {
	2084	if (s2->next == shtns) {
	2085	s2->next = shtns->next; // forget shtns
	2086	break;
	2087	}
	2088	s2 = s2->next;
	2089	}
	2090	}
	2091	free(shtns);
	2092	}
	2093
	2094	/// clear all allocated memory (hopefully) and go back to 0 state.
	2095	void shtns_reset()
	2096	{
	2097	while (sht_data != NULL) {
	2098	shtns_destroy(sht_data);
	2099	}
	2100	}
	2101
	2102	/*! Initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
	2103	* <b>This function must be called after \ref shtns_create and before any SH transform.</b> and sets all global variables and internal data.
	2104	* returns the required number of doubles to be allocated for a spatial field.
	2105	* \param shtns is the config created by \ref shtns_create for which the grid will be set.
	2106	* \param nlat,nphi pointers to the number of latitudinal and longitudinal grid points respectively. If 0, they are set to optimal values.
	2107	* \param nl_order defines the maximum SH degree to be resolved by analysis : lmax_analysis = lmax*nl_order. It is used to set an optimal and anti-aliasing nlat. If 0, the default SHT_DEFAULT_NL_ORDER is used.
	2108	* \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
	2109	* \param eps polar optimization threshold : polar values of Legendre Polynomials below that threshold are neglected (for high m), leading to increased performance (a few percents)
	2110	* 0 = no polar optimization; 1.e-14 = VERY safe; 1.e-10 = safe; 1.e-6 = aggresive, but still good accuracy.
	2111	*/
	2112	int shtns_set_grid_auto(shtns_cfg shtns, enum shtns_type flags, double eps, int nl_order, int nlat, int nphi)
	2113	{
	2114	double t, mem;
	2115	int im,m;
	2116	int layout;
	2117	int nloop = 0;
	2118	int n_gauss = 0;
	2119	int on_the_fly = 0;
	2120	int quick_init = 0;
	2121	int vector = !(flags & SHT_SCALAR_ONLY);
	2122	int latdir = (flags & SHT_SOUTH_POLE_FIRST) ? -1 : 1; // choose latitudinal direction (change sign of ct)
	2123	int cfg_loaded = 0;
	2124	int analys = 1;
	2125	const int req_flags = flags; // requested flags.
	2126
	2127	#if _GCC_VEC_
	2128	if (*nlat & 1) shtns_runerr("Nlat must be even\n");
	2129	#ifdef __MIC__
	2130	if (*nlat % VSIZE2) shtns_runerr("Nlat must be a multiple of 8 for the MIC\n");
	2131	#endif
	2132	#endif
	2133	shtns_unset_grid(shtns); // release grid if previously allocated.
	2134	if (nl_order <= 0) nl_order = SHT_DEFAULT_NL_ORDER;
	2135	/* shtns.lshift = 0;
	2136	if (nl_order == 0) nl_order = SHT_DEFAULT_NL_ORDER;
	2137	if (nl_order < 0) { shtns.lshift = -nl_order; nl_order = 1; } // linear with a shift in l.
	2138	*/
	2139	shtns->nspat = 0;
	2140	shtns->nlorder = nl_order;
	2141	shtns->mtr_dct = -1; // dct switched off completely.
	2142	layout = flags & 0xFFFF00;
	2143	flags = flags & 255; // clear higher bits.
	2144
	2145	switch (flags) {
	2146	#ifndef SHTNS_DCT
	2147	case sht_auto : flags = sht_gauss; break; // only gauss available.
	2148	case sht_reg_fast:
	2149	case sht_reg_dct: shtns_runerr("regular grid not available (DCT required)."); break;
	2150	#endif
	2151	case sht_gauss_fly : flags = sht_gauss; on_the_fly = 1; break;
	2152	case sht_quick_init : flags = sht_gauss; quick_init = 1; break;
	2153	case sht_reg_poles : analys = 0; quick_init = 1; break;
	2154	default : break;
	2155	}
	2156	#ifndef SHTNS_MEM
	2157	on_the_fly = 1;
	2158	#endif
	2159
	2160	if (*nphi == 0) {
	2161	nphi = fft_int((nl_order+1)MMAX+1, 7); // required fft nodes
	2162	}
	2163	if (*nlat == 0) {
	2164	n_gauss = ((nl_order+1)*LMAX)/2 +1; // required gauss nodes
	2165	n_gauss += (n_gauss&1); // even is better.
	2166	n_gauss = ((n_gauss+(VSIZE2-1))/VSIZE2) * VSIZE2; // multiple of vector size
	2167	if (flags != sht_gauss) {
	2168	m = fft_int(nl_order*LMAX+2, 7); // required dct nodes
	2169	*nlat = m + (m&1); // even is better.
	2170	} else *nlat = n_gauss;
	2171	}
	2172
	2173	mem = sht_mem_size(shtns->lmax, shtns->mmax, shtns->mres, *nlat);
	2174	t=mem; if (analys) t=2; if (vector) t=3;
	2175	#if SHT_VERBOSE > 1
	2176	if (verbose>1) printf("Memory required for precomputed matrices (estimate) : %.3f Mb\n",t);
	2177	#endif
	2178	if ( t > SHTNS_MAX_MEMORY ) { // huge transform has been requested
	2179	on_the_fly = 1;
	2180	if ( (flags == sht_reg_dct) \|\| (flags == sht_reg_fast) ) shtns_runerr("Memory limit exceeded, try using sht_gauss or increase SHTNS_MAX_MEMORY in sht_config.h");
	2181	if (flags != sht_reg_poles) {
	2182	flags = sht_gauss;
	2183	if (n_gauss > 0) *nlat = n_gauss;
	2184	}
	2185	// if (t > 10*SHTNS_MAX_MEMORY) quick_init =1; // do not time such large transforms.
	2186	}
	2187
	2188	if (quick_init == 0) { // do not waste too much time finding optimal fftw.
	2189	shtns->fftw_plan_mode = FFTW_EXHAUSTIVE; // defines the default FFTW planner mode.
	2190	// fftw_set_timelimit(60.0); // do not search plans for more than 1 minute (does it work well ???)
	2191	if (*nphi > 512) shtns->fftw_plan_mode = FFTW_PATIENT;
	2192	if (*nphi > 1024) shtns->fftw_plan_mode = FFTW_MEASURE;
	2193	} else {
	2194	shtns->fftw_plan_mode = FFTW_ESTIMATE;
	2195	if ((mem < 1.0) && (SHT_VERBOSE < 2)) shtns->nthreads = 1; // disable threads for small transforms (in quickinit mode).
	2196	if ((VSIZE2 >= 4) && (nlat >= VSIZE24)) on_the_fly = 1; // with AVX, on-the-fly should be the default (faster).
	2197	if ((shtns->nthreads > 1) && (nlat >= VSIZE216)) on_the_fly = 1; // force multi-thread transforms
	2198	}
	2199
	2200	if (flags == sht_auto) {
	2201	if ( ((nl_order>=2)&&(MMAXMRES > LMAX/2)) \|\| (nlat < SHT_MIN_NLAT_DCT) \|\| (nlat & 1) \|\| (nlat <= LMAX+1) ) {
	2202	flags = sht_gauss; // avoid computing DCT stuff when it is not expected to be faster.
	2203	if (n_gauss > 0) *nlat = n_gauss;
	2204	}
	2205	}
	2206
	2207	if (*nlat <= shtns->lmax) shtns_runerr("Nlat must be larger than Lmax");
	2208	if (IS_TOO_LARGE(*nlat, shtns->nlat)) shtns_runerr("Nlat too large");
	2209	if (IS_TOO_LARGE(*nphi, shtns->nphi)) shtns_runerr("Nphi too large");
	2210
	2211	// copy to global variables.
	2212	shtns->nphi = *nphi;
	2213	shtns->nlat_2 = (nlat+1)/2; shtns->nlat = nlat;
	2214
	2215	if (layout & SHT_LOAD_SAVE_CFG) {
	2216	FILE* f = fopen("shtns_cfg_fftw","r");
	2217	if (f) {
	2218	fftw_import_wisdom_from_file(f); // load fftw wisdom.
	2219	fclose(f);
	2220	}
	2221	}
	2222	planFFT(shtns, layout, on_the_fly); // initialize fftw
	2223	shtns->zlm_dct0 = NULL; // used as a flag.
	2224	init_sht_array_func(shtns); // array of SHT functions is now set.
	2225
	2226	#ifdef SHTNS_DCT
	2227	if (flags == sht_reg_dct) { // pure dct.
	2228	alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
	2229	grid_dct(shtns, latdir); init_SH_dct(shtns, 1);
	2230	OptimizeMatrices(shtns, eps);
	2231	Set_MTR_DCT(shtns, MMAX);
	2232	if (MTR_DCT != MMAX) shtns_runerr("DCT planning failed.");
	2233	}
	2234	if ((flags == sht_auto)\|\|(flags == sht_reg_fast))
	2235	{
	2236	alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
	2237	grid_dct(shtns, latdir); init_SH_dct(shtns, 1);
	2238	OptimizeMatrices(shtns, eps);
	2239	if (NLAT >= SHT_MIN_NLAT_DCT) { // dct requires large NLAT to perform well.
	2240	if ((layout & SHT_LOAD_SAVE_CFG) && (flags == sht_reg_fast))
	2241	cfg_loaded = (config_load(shtns, req_flags) > 0);
	2242	if (!cfg_loaded) {
	2243	t = choose_best_sht(shtns, &nloop, vector, 1); // find optimal MTR_DCT.
	2244	if ((n_gauss > 0)&&(flags == sht_auto)) t *= ((double) n_gauss)/NLAT; // we can revert to gauss with a smaller nlat.
	2245	if (t < MIN_PERF_IMPROVE_DCT) {
	2246	Set_MTR_DCT(shtns, -1); // turn off DCT.
	2247	} else {
	2248	t = SHT_error(shtns, vector);
	2249	if (t > MIN_ACCURACY_DCT) {
	2250	#if SHT_VERBOSE > 0
	2251	if (verbose) printf(" !! Not enough accuracy (%.3g) => DCT disabled.\n",t);
	2252	#endif
	2253	#if SHT_VERBOSE < 2
	2254	Set_MTR_DCT(shtns, -1); // turn off DCT.
	2255	#endif
	2256	}
	2257	}
	2258	}
	2259	}
	2260	if (MTR_DCT < 0) { // free memory used by DCT and disables DCT.
	2261	free_SH_dct(shtns); // free now useless arrays.
	2262	if (flags == sht_auto) {
	2263	flags = sht_gauss; // switch to gauss grid, even better accuracy.
	2264	#if SHT_VERBOSE > 0
	2265	if (verbose) printf(" => switching back to Gauss Grid\n");
	2266	#endif
	2267	for (im=1; im<=MMAX; im++) { // im >= 1
	2268	m = im*MRES;
	2269	shtns->ylm[im] -= shtns->tm[im]*(LMAX-m+1); // restore pointers altered by OptimizeMatrices().
	2270	if (vector) shtns->dylm[im] -= shtns->tm[im]*(LMAX-m+1);
	2271	}
	2272	if (n_gauss > 0) { // we should use the optimal size for gauss-legendre
	2273	free_SHTarrays(shtns);
	2274	*nlat = n_gauss;
	2275	shtns->nlat_2 = (nlat+1)/2; shtns->nlat = nlat;
	2276	planFFT(shtns, layout, on_the_fly); // fft must be replanned because NLAT has changed.
	2277	}
	2278	}
	2279	}
	2280	}
	2281	#endif /* SHTNS_DCT */
	2282	if (flags == sht_gauss)
	2283	{
	2284	alloc_SHTarrays(shtns, on_the_fly, vector, analys); // allocate dynamic arrays
	2285	grid_gauss(shtns, latdir);
	2286	#ifdef SHTNS_MEM
	2287	if (on_the_fly == 0) {
	2288	init_SH_gauss(shtns); // precompute matrices
	2289	OptimizeMatrices(shtns, eps);
	2290	}
	2291	#endif
	2292	}
	2293	if (flags == sht_reg_poles)
	2294	{
	2295	alloc_SHTarrays(shtns, on_the_fly, vector, 0); // allocate dynamic arrays (no analysis)
	2296	grid_equal_polar(shtns, latdir);
	2297	#ifdef SHTNS_MEM
	2298	if (on_the_fly == 0) {
	2299	init_SH_synth(shtns);
	2300	for (im=0; im<=MMAX; im++) shtns->tm[im] = 0; // avoid problems with tm[im] modified ????
	2301	}
	2302	#endif
	2303	}
	2304
	2305	if (on_the_fly == 1) {
	2306	#if SHT_VERBOSE > 0
	2307	if (verbose) printf(" + using on-the-fly transforms.\n");
	2308	#endif
	2309	if (NLAT < VSIZE24) shtns_runerr("on-the-fly only available for nlat>=32"); // avoid overflow with NLAT_2 < VSIZE22
	2310	PolarOptimize(shtns, eps);
	2311	set_sht_fly(shtns, 0); // switch function pointers to "on-the-fly" functions.
	2312	}
	2313
	2314	if ((layout & SHT_LOAD_SAVE_CFG) && (!cfg_loaded)) cfg_loaded = (config_load(shtns, req_flags) > 0);
	2315	if (quick_init == 0) {
	2316	if (!cfg_loaded) {
	2317	choose_best_sht(shtns, &nloop, vector, 0);
	2318	if (layout & SHT_LOAD_SAVE_CFG) config_save(shtns, req_flags);
	2319	}
	2320	#ifdef SHTNS_MEM
	2321	if (on_the_fly == 0) free_unused_matrices(shtns);
	2322	#endif
	2323	t = SHT_error(shtns, vector); // compute SHT accuracy.
	2324	#if SHT_VERBOSE > 0
	2325	if (verbose) printf(" + SHT accuracy = %.3g\n",t);
	2326	#endif
	2327	#if SHT_VERBOSE < 2
	2328	if (t > 1.e-3) {
	2329	shtns_print_cfg(shtns);
	2330	shtns_runerr("bad SHT accuracy"); // stop if something went wrong (but not in debug mode)
	2331	}
	2332	#endif
	2333	}
	2334
	2335	// set_sht_fly(shtns, SHT_TYP_VAN);
	2336	#if SHT_VERBOSE > 1
	2337	if ((omp_threads > 1)&&(verbose>1)) printf(" nthreads = %d\n",shtns->nthreads);
	2338	#endif
	2339	#if SHT_VERBOSE > 0
	2340	if (verbose) printf(" => " PACKAGE_NAME " is ready.\n");
	2341	#endif
	2342	return(shtns->nspat); // returns the number of doubles to be allocated for a spatial field.
	2343	}
	2344
	2345
	2346	/*! Initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
	2347	* <b>This function must be called after \ref shtns_create and before any SH transform.</b> and sets all global variables.
	2348	* returns the required number of doubles to be allocated for a spatial field.
	2349	* \param shtns is the config created by shtns_create for which the grid will be set.
	2350	* \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
	2351	* \param eps polar optimization threshold : polar values of Legendre Polynomials below that threshold are neglected (for high m), leading to increased performance (a few percents)
	2352	* \param nlat,nphi respectively the number of latitudinal and longitudinal grid points.
	2353	* 0 = no polar optimization; 1.e-14 = VERY safe; 1.e-10 = safe; 1.e-6 = aggresive, but still good accuracy.
	2354	*/
	2355	int shtns_set_grid(shtns_cfg shtns, enum shtns_type flags, double eps, int nlat, int nphi)
	2356	{
	2357	if ((nlat == 0)\|\|(nphi == 0)) shtns_runerr("nlat or nphi is zero !");
	2358	return( shtns_set_grid_auto(shtns, flags, eps, 0, &nlat, &nphi) );
	2359	}
	2360
	2361	/*! Simple initialization of Spherical Harmonic transforms (backward and forward, vector and scalar, ...) of given size.
	2362	* This function sets all global variables by calling \ref shtns_create followed by \ref shtns_set_grid, with the
	2363	* default normalization and the default polar optimization (see \ref sht_config.h).
	2364	* Returns the configuration to be passed to subsequent transform functions, which is basicaly a pointer to a \ref shtns_info struct.
	2365	* \param lmax : maximum SH degree that we want to describe.
	2366	* \param mmax : number of azimutal wave numbers.
	2367	* \param mres : \c 2.pi/mres is the azimutal periodicity. \c mmax*mres is the maximum SH order.
	2368	* \param nlat,nphi : respectively the number of latitudinal and longitudinal grid points.
	2369	* \param flags allows to choose the type of transform (see \ref shtns_type) and the spatial data layout (see \ref spat)
	2370	*/
	2371	shtns_cfg shtns_init(enum shtns_type flags, int lmax, int mmax, int mres, int nlat, int nphi)
	2372	{
	2373	shtns_cfg shtns = shtns_create(lmax, mmax, mres, SHT_DEFAULT_NORM);
	2374	if (shtns != NULL)
	2375	shtns_set_grid(shtns, flags, SHT_DEFAULT_POLAR_OPT, nlat, nphi);
	2376	return shtns;
	2377	}
	2378
	2379	/** Enables OpenMP parallel transforms, if available (see \ref compil).
	2380	Call before any initialization of shtns to use mutliple threads. Returns the actual number of threads.
	2381	\li If num_threads > 0, specifies the maximum number of threads that should be used.
	2382	\li If num_threads <= 0, maximum number of threads is automatically set to the number of processors.
	2383	\li If num_threads == 1, openmp will be disabled. */
	2384	int shtns_use_threads(int num_threads)
	2385	{
	2386	#ifdef _OPENMP
	2387	int procs = omp_get_num_procs();
	2388	if (num_threads <= 0) num_threads = omp_get_max_threads();
	2389	else if (num_threads > 4procs) num_threads = 4procs; // limit the number of threads
	2390	omp_threads = num_threads;
	2391	#endif
	2392	#ifdef OMP_FFTW
	2393	fftw_init_threads(); // enable threads for FFTW.
	2394	#endif
	2395	return omp_threads;
	2396	}
	2397
	2398	/// fill the given array with Gauss weights. returns the number of weights written, which
	2399	/// may be zero if the grid is not a Gauss grid.
	2400	int shtns_gauss_wts(shtns_cfg shtns, double *wts)
	2401	{
	2402	int i = 0;
	2403	if (shtns->wg) {
	2404	double rescale = 2*NPHI; // weights are stored with a rescaling that depends on SHT_NORM.
	2405	if ((SHT_NORM != sht_fourpi)&&(SHT_NORM != sht_schmidt)) rescale *= 0.25/M_PI;
	2406
	2407	do {
	2408	wts[i] = shtns->wg[i] * rescale;
	2409	} while(++i < shtns->nlat_2);
	2410	}
	2411	return i;
	2412	}
	2413
	2414	//@}
	2415
	2416
	2417	#ifdef SHT_F77_API
	2418
	2419	/* FORTRAN API */
	2420
	2421	/** \addtogroup fortapi Fortran API.
	2422	* Call from fortran without the trailing '_'.
	2423	* see the \link SHT_example.f Fortran example \endlink for a simple usage of SHTns from Fortran language.
	2424	*/
	2425	//@{
	2426
	2427	/// Set verbosity level
	2428	void shtns_verbose_(int *v)
	2429	{
	2430	shtns_verbose(*v);
	2431	}
	2432
	2433	/// Enable threads
	2434	void shtns_use_threads_(int *num_threads)
	2435	{
	2436	shtns_use_threads(*num_threads);
	2437	}
	2438
	2439	/// Print info
	2440	void shtns_print_cfg_()
	2441	{
	2442	shtns_print_version();
	2443	if (sht_data) shtns_print_cfg(sht_data);
	2444	}
	2445
	2446	/// Initializes spherical harmonic transforms of given size using Gauss algorithm with default polar optimization.
	2447	void shtns_init_sh_gauss_(int layout, int lmax, int mmax, int mres, int nlat, int nphi)
	2448	{
	2449	shtns_reset();
	2450	shtns_cfg shtns = shtns_create(lmax, mmax, *mres, SHT_DEFAULT_NORM);
	2451	shtns_set_grid(shtns, sht_gauss \| layout, SHT_DEFAULT_POLAR_OPT, nlat, *nphi);
	2452	}
	2453
	2454	/// Initializes spherical harmonic transforms of given size using Fastest available algorithm and polar optimization.
	2455	void shtns_init_sh_auto_(int layout, int lmax, int mmax, int mres, int nlat, int nphi)
	2456	{
	2457	shtns_reset();
	2458	shtns_cfg shtns = shtns_create(lmax, mmax, *mres, SHT_DEFAULT_NORM);
	2459	shtns_set_grid(shtns, sht_auto \| layout, SHT_DEFAULT_POLAR_OPT, nlat, *nphi);
	2460	}
	2461
	2462	/// Initializes spherical harmonic transforms of given size using a regular grid and agressive optimizations.
	2463	void shtns_init_sh_reg_fast_(int layout, int lmax, int mmax, int mres, int nlat, int nphi)
	2464	{
	2465	shtns_reset();
	2466	shtns_cfg shtns = shtns_create(lmax, mmax, *mres, SHT_DEFAULT_NORM);
	2467	shtns_set_grid(shtns, sht_reg_fast \| layout, 1.e-6, nlat, *nphi);
	2468	}
	2469
	2470	/// Initializes spherical harmonic transform SYNTHESIS ONLY of given size using a regular grid including poles.
	2471	void shtns_init_sh_poles_(int layout, int lmax, int mmax, int mres, int nlat, int nphi)
	2472	{
	2473	shtns_reset();
	2474	shtns_cfg shtns = shtns_create(lmax, mmax, *mres, SHT_DEFAULT_NORM);
	2475	shtns_set_grid(shtns, sht_reg_poles \| layout, 0, nlat, *nphi);
	2476	}
	2477
	2478	/// Defines the size and convention of the transform.
	2479	/// Allow to choose the normalization and whether or not to include the Condon-Shortley phase.
	2480	/// \see shtns_create
	2481	void shtns_set_size_(int lmax, int mmax, int mres, int norm)
	2482	{
	2483	shtns_reset();
	2484	shtns_create(lmax, mmax, mres, norm);
	2485	}
	2486
	2487	/// Precompute matrices for synthesis and analysis.
	2488	/// Allow to choose polar optimization threshold and algorithm type.
	2489	/// \see shtns_set_grid
	2490	void shtns_precompute_(int type, int layout, double eps, int nlat, int *nphi)
	2491	{
	2492	shtns_set_grid(sht_data, type \| layout, eps, nlat, *nphi);
	2493	}
	2494
	2495	/// Same as shtns_precompute_ but choose optimal nlat and/or nphi.
	2496	/// \see shtns_set_grid_auto
	2497	void shtns_precompute_auto_(int type, int layout, double eps, int nl_order, int nlat, int nphi)
	2498	{
	2499	shtns_set_grid_auto(sht_data, type \| layout, eps, nl_order, nlat, nphi);
	2500	}
	2501
	2502	/// Clear everything
	2503	void shtns_reset_() {
	2504	shtns_reset();
	2505	}
	2506
	2507	/// returns nlm, the number of complex*16 elements in an SH array.
	2508	/// call from fortran using \code call shtns_calc_nlm(nlm, lmax, mmax, mres) \endcode
	2509	void shtns_calc_nlm_(int nlm, const int const lmax, const int const mmax, const int const mres)
	2510	{
	2511	nlm = nlm_calc(lmax, mmax, mres);
	2512	}
	2513
	2514	/// returns lm, the index in an SH array of mode (l,m).
	2515	/// call from fortran using \code call shtns_lmidx(lm, l, m) \endcode
	2516	void shtns_lmidx_(int lm, const int const l, const int *const m)
	2517	{
	2518	unsigned im = *m;
	2519	unsigned mres = sht_data->mres;
	2520	if (mres > 1) {
	2521	unsigned k = im % mres;
	2522	im = im / mres;
	2523	if (k) printf("wrong m");
	2524	}
	2525	lm = LiM(sht_data, l, im) + 1; // convert to fortran convention index.
	2526	}
	2527
	2528	/// returns l and m, degree and order of an index in SH array lm.
	2529	/// call from fortran using \code call shtns_l_m(l, m, lm) \endcode
	2530	void shtns_l_m_(int l, int m, const int *const lm)
	2531	{
	2532	l = sht_data->li[lm -1]; // convert from fortran convention index.
	2533	m = sht_data->mi[lm -1];
	2534	}
	2535
	2536	/// fills the given array with the cosine of the co-latitude angle (NLAT real*8)
	2537	/// if no grid has been set, the first element will be set to zero.
	2538	void shtns_cos_array_(double *costh)
	2539	{
	2540	if (sht_data->ct) {
	2541	for (int i=0; i<sht_data->nlat; i++)
	2542	costh[i] = sht_data->ct[i];
	2543	} else costh[0] = 0.0; // mark as invalid.
	2544	}
	2545
	2546	/// fills the given array with the gaussian quadrature weights ((NLAT+1)/2 real*8).
	2547	/// when there is no gaussian grid, the first element is set to zero.
	2548	void shtns_gauss_wts_(double *wts)
	2549	{
	2550	int i = shtns_gauss_wts(sht_data, wts);
	2551	if (i==0) wts[0] = 0; // mark as invalid.
	2552	}
	2553
	2554
	2555	/** \name Point evaluation of Spherical Harmonics
	2556	Evaluate at a given point (\f$cos(\theta)\f$ and \f$\phi\f$) a spherical harmonic representation.
	2557	*/
	2558	//@{
	2559	/// \see SH_to_point for argument description
	2560	void shtns_sh_to_point_(double spat, cplx Qlm, double cost, double phi)
	2561	{
	2562	spat = SH_to_point(sht_data, Qlm, cost, *phi);
	2563	}
	2564
	2565	/// \see SHqst_to_point for argument description
	2566	void shtns_qst_to_point_(double vr, double vt, double *vp,
	2567	cplx Qlm, cplx Slm, cplx Tlm, double cost, double *phi)
	2568	{
	2569	SHqst_to_point(sht_data, Qlm, Slm, Tlm, cost, phi, vr, vt, vp);
	2570	}
	2571	//@}
	2572
	2573
	2574	//@}
	2575
	2576	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/omp/shtns/sht_init.c

Download in other formats: