Context Navigation

mic_spat_to_SH.gen.c

main

Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 18.1 KB

Rev	Line
[2131108]	1	/*
	2	* Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
	3	* written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
	4	*
	5	* nathanael.schaeffer@ujf-grenoble.fr
	6	*
	7	* This software is governed by the CeCILL license under French law and
	8	* abiding by the rules of distribution of free software. You can use,
	9	* modify and/or redistribute the software under the terms of the CeCILL
	10	* license as circulated by CEA, CNRS and INRIA at the following URL
	11	* "http://www.cecill.info".
	12	*
	13	* The fact that you are presently reading this means that you have had
	14	* knowledge of the CeCILL license and that you accept its terms.
	15	*
	16	*/
	17
	18	# This file is meta-code for SHT.c (spherical harmonic transform).
	19	# It is intended for "make" to generate C code for 3 similar SHT functions
	20	# (namely spat_to_SH [Q tag], spat_to_SHsphtor [V tag], spat_to_SH3 [both Q&V tags])
	21	# from one generic function + tags.
	22	# Basically, there are tags at the beginning of lines (Q,V) that are information
	23	# to keep or remove the line depending on the function to build. (Q for scalar, V for vector, # for comment)
	24	#
	25	//////////////////////////////////////////////////
	26
	27	static
	28	QX void GEN3(_an1,NWAY,SUFFIX)(shtns_cfg shtns, double BrF, cplx Qlm, const long int llim, const int imlim)
	29	VX void GEN3(_an2,NWAY,SUFFIX)(shtns_cfg shtns, double BtF, double BpF, cplx Slm, cplx Tlm, const long int llim, const int imlim)
	30	3 void GEN3(_an3,NWAY,SUFFIX)(shtns_cfg shtns, double BrF, double BtF, double BpF, cplx Qlm, cplx Slm, cplx Tlm, const long int llim, const int imlim)
	31	{
	32	#define NW (NWAY*2)
	33
	34	double alm, al;
	35	double wg, ct, *st;
	36	V double *l_2;
	37	long int nk, k, l,m;
	38	unsigned m0, mstep;
	39	int k_inc, m_inc;
	40	#ifndef SHT_AXISYM
	41	unsigned im;
	42	V double m_1;
	43	#endif
	44	Q v2d qq[llim];
	45	V v2d ss[llim];
	46	V v2d tt[llim];
	47
	48	Q double rer[NLAT_2 + NW*VSIZE2] SSE;
	49	Q double ror[NLAT_2 + NW*VSIZE2] SSE;
	50	V double ter[NLAT_2 + NW*VSIZE2] SSE;
	51	V double tor[NLAT_2 + NW*VSIZE2] SSE;
	52	V double per[NLAT_2 + NW*VSIZE2] SSE;
	53	V double por[NLAT_2 + NW*VSIZE2] SSE;
	54	#ifndef SHT_AXISYM
	55	Q double rei[NLAT_2 + NW*VSIZE2] SSE;
	56	Q double roi[NLAT_2 + NW*VSIZE2] SSE;
	57	V double tei[NLAT_2 + NW*VSIZE2] SSE;
	58	V double toi[NLAT_2 + NW*VSIZE2] SSE;
	59	V double pei[NLAT_2 + NW*VSIZE2] SSE;
	60	V double poi[NLAT_2 + NW*VSIZE2] SSE;
	61	#endif
	62
	63	nk = NLAT_2; // copy NLAT_2 to a local variable for faster access (inner loop limit)
	64	#if _GCC_VEC_
	65	nk = ((unsigned) nk+(VSIZE2-1))/VSIZE2;
	66	#endif
	67	wg = shtns->wg; ct = shtns->ct; st = shtns->st;
	68	V l_2 = shtns->l_2;
	69	for (k=nkVSIZE2; k<(nk-1+NW)VSIZE2; ++k) { // never written, so this is now done for all m's
	70	Q rer[k] = 0.0; ror[k] = 0.0;
	71	V ter[k] = 0.0; tor[k] = 0.0;
	72	V per[k] = 0.0; por[k] = 0.0;
	73	#ifndef SHT_AXISYM
	74	Q rei[k] = 0.0; roi[k] = 0.0;
	75	V tei[k] = 0.0; toi[k] = 0.0;
	76	V pei[k] = 0.0; poi[k] = 0.0;
	77	#endif
	78	}
	79
	80	// ACCESS PATTERN
	81	k_inc = shtns->k_stride_a; m_inc = shtns->m_stride_a;
	82
	83	#ifndef _OPENMP
	84	m0 = 0; mstep = 1;
	85	#else
	86	m0 = omp_get_thread_num();
	87	mstep = omp_get_num_threads();
	88	if (m0 == 0)
	89	#endif
	90	{ // im=0 : dzl.p = 0.0 and evrything is REAL
	91	alm = shtns->blm;
	92	Q double r0 = 0.0;
	93	Q k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
	94	Q double an = BrF[kk_inc]; double bn = BrF[kk_inc +1];
	95	Q double bs = BrF[(NLAT-2-k)k_inc]; double as = BrF[(NLAT-2-k)k_inc +1];
	96	Q rer[k] = an+as; ror[k] = an-as;
	97	Q rer[k+1] = bn+bs; ror[k+1] = bn-bs;
	98	Q r0 += (an+as)wg[k] + (bn+bs)wg[k+1];
	99	Q k+=2;
	100	Q } while(k < nk*VSIZE2);
	101	V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
	102	V double an = BtF[kk_inc]; double bn = BtF[kk_inc +1];
	103	V double bs = BtF[(NLAT-2-k)k_inc]; double as = BtF[(NLAT-2-k)k_inc +1];
	104	V ter[k] = an+as; tor[k] = an-as;
	105	V ter[k+1] = bn+bs; tor[k+1] = bn-bs;
	106	V k+=2;
	107	V } while(k < nk*VSIZE2);
	108	V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
	109	V double an = BpF[kk_inc]; double bn = BpF[kk_inc +1];
	110	V double bs = BpF[(NLAT-2-k)k_inc]; double as = BpF[(NLAT-2-k)k_inc +1];
	111	V per[k] = an+as; por[k] = an-as;
	112	V per[k+1] = bn+bs; por[k+1] = bn-bs;
	113	V k+=2;
	114	V } while(k < nk*VSIZE2);
	115	Q Qlm[0] = r0 * alm[0]; // l=0 is done.
	116	V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform.
	117	k = 0;
	118	Q double* q_ = (double*) qq;
	119	V double* s_ = (double) ss; double t_ = (double*) tt;
	120	for (l=0;l<llim;++l) {
	121	Q q_[l] = 0.0;
	122	V s_[l] = 0.0; t_[l] = 0.0;
	123	}
	124	do {
	125	al = alm;
	126	rnd cost[NW], y0[NW], y1[NW];
	127	V rnd sint[NW], dy0[NW], dy1[NW];
	128	Q rnd rerk[NW], rork[NW]; // help the compiler to cache into registers.
	129	V rnd terk[NW], tork[NW], perk[NW], pork[NW];
	130	for (int j=0; j<NW; ++j) {
	131	cost[j] = vread(ct, k+j);
	132	y0[j] = vall(al[0]) * vread(wg, k+j); // weight of Gauss quadrature appears here
	133	V dy0[j] = vall(0.0);
	134	V sint[j] = -vread(st, k+j);
	135	y1[j] = (vall(al[1])y0[j]) cost[j];
	136	V dy1[j] = (vall(al[1])y0[j]) sint[j];
	137	Q rerk[j] = vread(rer, k+j); rork[j] = vread(ror, k+j); // cache into registers.
	138	V terk[j] = vread(ter, k+j); tork[j] = vread(tor, k+j);
	139	V perk[j] = vread(per, k+j); pork[j] = vread(por, k+j);
	140	}
	141	al+=2; l=1;
	142	while(l<llim) {
	143	for (int j=0; j<NW; ++j) {
	144	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]sint[j]) + vall(al[0])dy0[j];
	145	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
	146	}
	147	Q rnd q = y1[0] * rork[0];
	148	V rnd s = dy1[0] * terk[0];
	149	V rnd t = dy1[0] * perk[0];
	150	for (int j=1; j<NW; ++j) {
	151	Q q += y1[j] * rork[j];
	152	V s += dy1[j] * terk[j];
	153	V t += dy1[j] * perk[j];
	154	}
	155	Q q_[l-1] += reduce_add(q);
	156	V s_[l-1] += reduce_add(s);
	157	V t_[l-1] -= reduce_add(t);
	158	for (int j=0; j<NW; ++j) {
	159	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]sint[j]) + vall(al[2])dy1[j];
	160	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
	161	}
	162	Q q = y0[0] * rerk[0];
	163	V s = dy0[0] * tork[0];
	164	V t = dy0[0] * pork[0];
	165	for (int j=1; j<NW; ++j) {
	166	Q q += y0[j] * rerk[j];
	167	V s += dy0[j] * tork[j];
	168	V t += dy0[j] * pork[j];
	169	}
	170	Q q_[l] += reduce_add(q);
	171	V s_[l] += reduce_add(s);
	172	V t_[l] -= reduce_add(t);
	173	al+=4; l+=2;
	174	}
	175	if (l==llim) {
	176	Q rnd q = y1[0] * rork[0];
	177	V rnd s = dy1[0] * terk[0];
	178	V rnd t = dy1[0] * perk[0];
	179	for (int j=1; j<NW; ++j) {
	180	Q q += y1[j] * rork[j];
	181	V s += dy1[j] * terk[j];
	182	V t += dy1[j] * perk[j];
	183	}
	184	Q q_[l-1] += reduce_add(q);
	185	V s_[l-1] += reduce_add(s);
	186	V t_[l-1] -= reduce_add(t);
	187	}
	188	k+=NW;
	189	} while (k < nk);
	190	for (l=1; l<=llim; ++l) {
	191	Q Qlm[l] = q_[l-1];
	192	V Slm[l] = s_[l-1]l_2[l]; Tlm[l] = t_[l-1]l_2[l];
	193	}
	194	#ifdef SHT_VAR_LTR
	195	for (l=llim+1; l<= LMAX; ++l) {
	196	Q ((v2d*)Qlm)[l] = vdup(0.0);
	197	V ((v2d)Slm)[l] = vdup(0.0); ((v2d)Tlm)[l] = vdup(0.0);
	198	}
	199	#ifndef SHT_AXISYM
	200	if (imlim <= MMAX) { // zero out m >= imlim
	201	l = LiM(shtns, imlim*MRES, imlim);
	202	do {
	203	Q ((v2d*)Qlm)[l] = vdup(0.0);
	204	V ((v2d)Slm)[l] = vdup(0.0); ((v2d)Tlm)[l] = vdup(0.0);
	205	} while(++l < shtns->nlm);
	206	}
	207	#endif
	208	#endif
	209	m0=mstep;
	210	}
	211
	212	#ifndef SHT_AXISYM
	213	for (im=m0; im<imlim; im+=mstep) {
	214	m = im*MRES;
	215	l = shtns->tm[im] / VSIZE2;
	216	alm = shtns->blm + im(2LMAX -m+MRES);
	217	Q k = ((lVSIZE2)>>1)2; // k must be even here.
	218	Q do { // compute symmetric and antisymmetric parts, and reorganize data.
	219	Q double an, bn, ani, bni, bs, as, bsi, asi, t;
	220	3 double sina = st[k]; double sinb = st[k+1];
	221	Q ani = BrF[imm_inc + kk_inc]; bni = BrF[imm_inc + kk_inc +1]; // north
	222	Q an = BrF[(NPHI-im)m_inc + kk_inc]; bn = BrF[(NPHI-im)m_inc + kk_inc +1];
	223	Q t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
	224	3 an = sina; ani= sina; bn = sinb; bni = sinb;
	225	Q bsi = BrF[imm_inc + (NLAT-2 -k)k_inc]; asi = BrF[imm_inc + (NLAT-2-k)k_inc + 1]; // south
	226	Q bs = BrF[(NPHI-im)m_inc +(NLAT-2-k)k_inc]; as = BrF[(NPHI-im)m_inc +(NLAT-2-k)k_inc +1];
	227	Q t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
	228	3 as = sina; asi= sina; bs = sinb; bsi = sinb;
	229	Q rer[k] = an+as; rei[k] = ani+asi; rer[k+1] = bn+bs; rei[k+1] = bni+bsi;
	230	Q ror[k] = an-as; roi[k] = ani-asi; ror[k+1] = bn-bs; roi[k+1] = bni-bsi;
	231	Q k+=2;
	232	Q } while (k<nk*VSIZE2);
	233	V k = ((lVSIZE2)>>1)2; // k must be even here.
	234	V do { // compute symmetric and antisymmetric parts, and reorganize data.
	235	V double an, bn, ani, bni, bs, as, bsi, asi, t;
	236	V ani = BtF[imm_inc + kk_inc]; bni = BtF[imm_inc + kk_inc +1]; // north
	237	V an = BtF[(NPHI-im)m_inc + kk_inc]; bn = BtF[(NPHI-im)m_inc + kk_inc +1];
	238	V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
	239	V bsi = BtF[imm_inc + (NLAT-2 -k)k_inc]; asi = BtF[imm_inc + (NLAT-2-k)k_inc + 1]; // south
	240	V bs = BtF[(NPHI-im)m_inc +(NLAT-2-k)k_inc]; as = BtF[(NPHI-im)m_inc +(NLAT-2-k)k_inc +1];
	241	V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
	242	V ter[k] = an+as; tei[k] = ani+asi; ter[k+1] = bn+bs; tei[k+1] = bni+bsi;
	243	V tor[k] = an-as; toi[k] = ani-asi; tor[k+1] = bn-bs; toi[k+1] = bni-bsi;
	244	V k+=2;
	245	V } while (k<nk*VSIZE2);
	246	V k = ((lVSIZE2)>>1)2; // k must be even here.
	247	V do { // compute symmetric and antisymmetric parts, and reorganize data.
	248	V double an, bn, ani, bni, bs, as, bsi, asi, t;
	249	V ani = BpF[imm_inc + kk_inc]; bni = BpF[imm_inc + kk_inc +1]; // north
	250	V an = BpF[(NPHI-im)m_inc + kk_inc]; bn = BpF[(NPHI-im)m_inc + kk_inc +1];
	251	V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
	252	V bsi = BpF[imm_inc + (NLAT-2 -k)k_inc]; asi = BpF[imm_inc + (NLAT-2-k)k_inc + 1]; // south
	253	V bs = BpF[(NPHI-im)m_inc +(NLAT-2-k)k_inc]; as = BpF[(NPHI-im)m_inc +(NLAT-2-k)k_inc +1];
	254	V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
	255	V per[k] = an+as; pei[k] = ani+asi; per[k+1] = bn+bs; pei[k+1] = bni+bsi;
	256	V por[k] = an-as; poi[k] = ani-asi; por[k+1] = bn-bs; poi[k+1] = bni-bsi;
	257	V k+=2;
	258	V } while (k<nk*VSIZE2);
	259	V m_1 = 1.0/m;
	260	k=l;
	261	for (l=0; l<=llim-m; l++) {
	262	Q qq[l] = vdup(0.0);
	263	V ss[l] = vdup(0.0); tt[l] = vdup(0.0);
	264	}
	265	do {
	266	Q v2d* q = qq;
	267	V v2d* s = ss; v2d* t = tt;
	268	al = alm;
	269	rnd cost[NW], y0[NW], y1[NW];
	270	V rnd st2[NW], dy0[NW], dy1[NW];
	271	Q rnd rerk[NW], reik[NW], rork[NW], roik[NW]; // help the compiler to cache into registers.
	272	V rnd terk[NW], teik[NW], tork[NW], toik[NW];
	273	V rnd perk[NW], peik[NW], pork[NW], poik[NW];
	274	for (int j=0; j<NW; ++j) {
	275	cost[j] = vread(st, k+j);
	276	y0[j] = vall(0.5);
	277	V st2[j] = cost[j]cost[j]vall(-m_1);
	278	V y0[j] = vall(m); // for the vector transform, compute ylmm/sint
	279	}
	280	Q l=m;
	281	V l=m-1;
	282	long int ny = 0; // exponent to extend double precision range.
	283	if ((int)llim <= SHT_L_RESCALE_FLY) {
	284	do { // sin(theta)^m
	285	if (l&1) for (int j=0; j<NW; ++j) y0[j] *= cost[j];
	286	for (int j=0; j<NW; ++j) cost[j] *= cost[j];
	287	} while(l >>= 1);
	288	} else {
	289	long int nsint = 0;
	290	do { // sin(theta)^m (use rescaling to avoid underflow)
	291	if (l&1) {
	292	for (int j=0; j<NW; ++j) y0[j] *= cost[j];
	293	ny += nsint;
	294	if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
	295	ny--;
	296	for (int j=0; j<NW; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
	297	}
	298	}
	299	for (int j=0; j<NW; ++j) cost[j] *= cost[j];
	300	nsint += nsint;
	301	if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
	302	nsint--;
	303	for (int j=0; j<NW; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
	304	}
	305	} while(l >>= 1);
	306	}
	307	for (int j=0; j<NW; ++j) {
	308	y0[j] *= vall(al[0]);
	309	cost[j] = vread(ct, k+j);
	310	V dy0[j] = cost[j]*y0[j];
	311	y1[j] = (vall(al[1])y0[j]) cost[j];
	312	V dy1[j] = (vall(al[1])y0[j]) (cost[j]*cost[j] + st2[j]);
	313	}
	314	l=m; al+=2;
	315	while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
	316	for (int j=0; j<NW; ++j) {
	317	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]st2[j]) + vall(al[0])dy0[j];
	318	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
	319	}
	320	for (int j=0; j<NW; ++j) {
	321	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]st2[j]) + vall(al[2])dy1[j];
	322	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
	323	}
	324	l+=2; al+=4;
	325	if (fabs(vlo(y0[NW-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
	326	++ny;
	327	for (int j=0; j<NW; ++j) {
	328	y0[j] = vall(1.0/SHT_SCALE_FACTOR); y1[j] = vall(1.0/SHT_SCALE_FACTOR);
	329	V dy0[j] = vall(1.0/SHT_SCALE_FACTOR); dy1[j] = vall(1.0/SHT_SCALE_FACTOR);
	330	}
	331	}
	332	}
	333	if (ny == 0) {
	334	Q q+=(l-m);
	335	V s+=(l-m); t+=(l-m);
	336	for (int j=0; j<NW; ++j) { // prefetch
	337	y0[j] = vread(wg, k+j); y1[j] = vread(wg, k+j); // weight appears here (must be after the previous accuracy loop).
	338	V dy0[j] = vread(wg, k+j); dy1[j] = vread(wg, k+j);
	339	Q rerk[j] = vread( rer, k+j); reik[j] = vread( rei, k+j); rork[j] = vread( ror, k+j); roik[j] = vread( roi, k+j);
	340	V terk[j] = vread( ter, k+j); teik[j] = vread( tei, k+j); tork[j] = vread( tor, k+j); toik[j] = vread( toi, k+j);
	341	V perk[j] = vread( per, k+j); peik[j] = vread( pei, k+j); pork[j] = vread( por, k+j); poik[j] = vread( poi, k+j);
	342	}
	343	while (l<llim) { // compute even and odd parts
	344	Q rnd qq0 = y0[0] * rerk[0];
	345	Q rnd qq1 = y0[0] * reik[0];
	346	V rnd ss0 = dy0[0] * tork[0] + y0[0] * peik[0];
	347	V rnd ss1 = dy0[0] * toik[0] - y0[0] * perk[0];
	348	V rnd tt0 = dy0[0] * pork[0] - y0[0] * teik[0];
	349	V rnd tt1 = dy0[0] * poik[0] + y0[0] * terk[0];
	350	Q for (int j=1; j<NW; ++j) qq0 += y0[j] * rerk[j]; // real even
	351	Q for (int j=1; j<NW; ++j) qq1 += y0[j] * reik[j]; // imag even
	352	V for (int j=1; j<NW; ++j) ss0 += dy0[j] * tork[j] + y0[j] * peik[j];
	353	V for (int j=1; j<NW; ++j) ss1 += dy0[j] * toik[j] - y0[j] * perk[j];
	354	V for (int j=1; j<NW; ++j) tt0 += dy0[j] * pork[j] - y0[j] * teik[j];
	355	V for (int j=1; j<NW; ++j) tt1 += dy0[j] * poik[j] + y0[j] * terk[j];
	356	Q q[0] += v2d_reduce(qq0, qq1);
	357	V s[0] += v2d_reduce(ss0, ss1);
	358	V t[0] -= v2d_reduce(tt0, tt1);
	359	for (int j=0; j<NW; ++j) {
	360	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]st2[j]) + vall(al[0])dy0[j];
	361	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
	362	}
	363	Q qq0 = y1[0] * rork[0];
	364	Q qq1 = y1[0] * roik[0];
	365	V ss0 = dy1[0] * terk[0] + y1[0] * poik[0];
	366	V ss1 = dy1[0] * teik[0] - y1[0] * pork[0];
	367	V tt0 = dy1[0] * perk[0] - y1[0] * toik[0];
	368	V tt1 = dy1[0] * peik[0] + y1[0] * tork[0];
	369	Q for (int j=1; j<NW; ++j) qq0 += y1[j] * rork[j]; // real odd
	370	Q for (int j=1; j<NW; ++j) qq1 += y1[j] * roik[j]; // imag odd
	371	V for (int j=1; j<NW; ++j) ss0 += dy1[j] * terk[j] + y1[j] * poik[j];
	372	V for (int j=1; j<NW; ++j) ss1 += dy1[j] * teik[j] - y1[j] * pork[j];
	373	V for (int j=1; j<NW; ++j) tt0 += dy1[j] * perk[j] - y1[j] * toik[j];
	374	V for (int j=1; j<NW; ++j) tt1 += dy1[j] * peik[j] + y1[j] * tork[j];
	375	Q q[1] += v2d_reduce(qq0, qq1);
	376	V s[1] += v2d_reduce(ss0, ss1);
	377	V t[1] -= v2d_reduce(tt0, tt1);
	378	Q q+=2;
	379	V s+=2; t+=2;
	380	for (int j=0; j<NW; ++j) {
	381	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]st2[j]) + vall(al[2])dy1[j];
	382	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
	383	}
	384	l+=2; al+=4;
	385	}
	386	if (l==llim) {
	387	Q rnd qq0 = y0[0] * rerk[0];
	388	Q rnd qq1 = y0[0] * reik[0];
	389	V rnd ss0 = dy0[0] * tork[0] + y0[0] * peik[0];
	390	V rnd ss1 = dy0[0] * toik[0] - y0[0] * perk[0];
	391	V rnd tt0 = dy0[0] * pork[0] - y0[0] * teik[0];
	392	V rnd tt1 = dy0[0] * poik[0] + y0[0] * terk[0];
	393	Q for (int j=1; j<NW; ++j) qq0 += y0[j] * rerk[j]; // real even
	394	Q for (int j=1; j<NW; ++j) qq1 += y0[j] * reik[j]; // imag even
	395	V for (int j=1; j<NW; ++j) ss0 += dy0[j] * tork[j] + y0[j] * peik[j];
	396	V for (int j=1; j<NW; ++j) ss1 += dy0[j] * toik[j] - y0[j] * perk[j];
	397	V for (int j=1; j<NW; ++j) tt0 += dy0[j] * pork[j] - y0[j] * teik[j];
	398	V for (int j=1; j<NW; ++j) tt1 += dy0[j] * poik[j] + y0[j] * terk[j];
	399	Q q[0] += v2d_reduce(qq0, qq1);
	400	V s[0] += v2d_reduce(ss0, ss1);
	401	V t[0] -= v2d_reduce(tt0, tt1);
	402	}
	403	}
	404	k+=NW;
	405	} while (k < nk);
	406	l = LiM(shtns, m, im);
	407	Q v2d Ql = (v2d) &Qlm[l];
	408	V v2d Sl = (v2d) &Slm[l];
	409	V v2d Tl = (v2d) &Tlm[l];
	410	for (l=0; l<=llim-m; ++l) {
	411	QX Ql[l] = qq[l];
	412	3 Ql[l] = qq[l] * vdup(m_1);
	413	V Sl[l] = ss[l] * vdup(l_2[l+m]);
	414	V Tl[l] = tt[l] * vdup(l_2[l+m]);
	415	}
	416	#ifdef SHT_VAR_LTR
	417	for (l=llim+1-m; l<=LMAX-m; ++l) {
	418	Q Ql[l] = vdup(0.0);
	419	V Sl[l] = vdup(0.0); Tl[l] = vdup(0.0);
	420	}
	421	#endif
	422	}
	423	#endif
	424	}
	425
	426	static
	427	QX void GEN3(spat_to_SH_mic,NWAY,SUFFIX)(shtns_cfg shtns, double Vr, cplx Qlm, long int llim) {
	428	VX void GEN3(spat_to_SHsphtor_mic,NWAY,SUFFIX)(shtns_cfg shtns, double Vt, double Vp, cplx Slm, cplx Tlm, long int llim) {
	429	3 void GEN3(spat_to_SHqst_mic,NWAY,SUFFIX)(shtns_cfg shtns, double Vr, double Vt, double Vp, cplx Qlm, cplx Slm, cplx Tlm, long int llim) {
	430
	431	Q double *BrF; // contains the Fourier transformed data
	432	V double BtF, BpF; // contains the Fourier transformed data
	433	unsigned imlim=0;
	434
	435	Q BrF = Vr;
	436	V BtF = Vt; BpF = Vp;
	437	#ifndef SHT_AXISYM
	438	imlim = MTR;
	439	#ifdef SHT_VAR_LTR
	440	if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
	441	#endif
	442	if (shtns->fftc_mode >= 0) {
	443	if (shtns->fftc_mode == 0) { // in-place
	444	Q fftw_execute_dft(shtns->fftc,(cplx)BrF, (cplx)BrF);
	445	V fftw_execute_dft(shtns->fftc,(cplx)BtF, (cplx)BtF);
	446	V fftw_execute_dft(shtns->fftc,(cplx)BpF, (cplx)BpF);
	447	} else { // alloc memory for the transpose FFT
	448	unsigned long nv = shtns->nspat;
	449	QX BrF = (double) VMALLOC( nv sizeof(double) );
	450	VX BtF = (double) VMALLOC( 2nv * sizeof(double) );
	451	VX BpF = BtF + nv;
	452	3 BrF = (double) VMALLOC( 3nv * sizeof(double) );
	453	3 BtF = BrF + nv; BpF = BtF + nv;
	454	Q fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, BrF+1, BrF);
	455	V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, BtF+1, BtF);
	456	V fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, BpF+1, BpF);
	457	}
	458	}
	459	#endif
	460	imlim += 1;
	461
	462	#pragma omp parallel num_threads(shtns->nthreads)
	463	{
	464	QX GEN3(_an1,NWAY,SUFFIX)(shtns, BrF, Qlm, llim, imlim);
	465	VX GEN3(_an2,NWAY,SUFFIX)(shtns, BtF, BpF, Slm, Tlm, llim, imlim);
	466	3 GEN3(_an3,NWAY,SUFFIX)(shtns, BrF, BtF, BpF, Qlm, Slm, Tlm, llim, imlim);
	467	}
	468
	469	#ifndef SHT_AXISYM
	470	if (shtns->fftc_mode > 0) { // free memory
	471	Q VFREE(BrF);
	472	VX VFREE(BtF); // this frees also BpF.
	473	}
	474	#endif
	475
	476	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/omp/shtns/SHT/mic_spat_to_SH.gen.c

Download in other formats: