Context Navigation

mic_spat_to_SH.gen.c

main

Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 18.1 KB

Line
1	/*
2	* Copyright (c) 2010-2015 Centre National de la Recherche Scientifique.
3	* written by Nathanael Schaeffer (CNRS, ISTerre, Grenoble, France).
4	*
5	* nathanael.schaeffer@ujf-grenoble.fr
6	*
7	* This software is governed by the CeCILL license under French law and
8	* abiding by the rules of distribution of free software. You can use,
9	* modify and/or redistribute the software under the terms of the CeCILL
10	* license as circulated by CEA, CNRS and INRIA at the following URL
11	* "http://www.cecill.info".
12	*
13	* The fact that you are presently reading this means that you have had
14	* knowledge of the CeCILL license and that you accept its terms.
15	*
16	*/
17
18	# This file is meta-code for SHT.c (spherical harmonic transform).
19	# it is intended for "make" to generate C code for 3 similar SHT functions,
20	# (namely spat_to_SH [Q tag], spat_to_SHsphtor [V tag], spat_to_SH3 [both Q&V tags])
21	# from one generic function + tags.
22	# Basically, there are tags at the beginning of lines (Q,V) that are information
23	# to keep or remove the line depending on the function to build. (Q for scalar, V for vector, # for comment)
24	#
25	//////////////////////////////////////////////////
26
27	static
28	QX void GEN3(_an1,NWAY,SUFFIX)(shtns_cfg shtns, double BrF, cplx Qlm, const long int llim, const int imlim)
29	VX void GEN3(_an2,NWAY,SUFFIX)(shtns_cfg shtns, double BtF, double BpF, cplx Slm, cplx Tlm, const long int llim, const int imlim)
30	3 void GEN3(_an3,NWAY,SUFFIX)(shtns_cfg shtns, double BrF, double BtF, double BpF, cplx Qlm, cplx Slm, cplx Tlm, const long int llim, const int imlim)
31	{
32	#define NW (NWAY*2)
33
34	double alm, al;
35	double wg, ct, *st;
36	V double *l_2;
37	long int nk, k, l,m;
38	unsigned m0, mstep;
39	int k_inc, m_inc;
40	#ifndef SHT_AXISYM
41	unsigned im;
42	V double m_1;
43	#endif
44	Q v2d qq[llim];
45	V v2d ss[llim];
46	V v2d tt[llim];
47
48	Q double rer[NLAT_2 + NW*VSIZE2] SSE;
49	Q double ror[NLAT_2 + NW*VSIZE2] SSE;
50	V double ter[NLAT_2 + NW*VSIZE2] SSE;
51	V double tor[NLAT_2 + NW*VSIZE2] SSE;
52	V double per[NLAT_2 + NW*VSIZE2] SSE;
53	V double por[NLAT_2 + NW*VSIZE2] SSE;
54	#ifndef SHT_AXISYM
55	Q double rei[NLAT_2 + NW*VSIZE2] SSE;
56	Q double roi[NLAT_2 + NW*VSIZE2] SSE;
57	V double tei[NLAT_2 + NW*VSIZE2] SSE;
58	V double toi[NLAT_2 + NW*VSIZE2] SSE;
59	V double pei[NLAT_2 + NW*VSIZE2] SSE;
60	V double poi[NLAT_2 + NW*VSIZE2] SSE;
61	#endif
62
63	nk = NLAT_2; // copy NLAT_2 to a local variable for faster access (inner loop limit)
64	#if _GCC_VEC_
65	nk = ((unsigned) nk+(VSIZE2-1))/VSIZE2;
66	#endif
67	wg = shtns->wg; ct = shtns->ct; st = shtns->st;
68	V l_2 = shtns->l_2;
69	for (k=nkVSIZE2; k<(nk-1+NW)VSIZE2; ++k) { // never written, so this is now done for all m's
70	Q rer[k] = 0.0; ror[k] = 0.0;
71	V ter[k] = 0.0; tor[k] = 0.0;
72	V per[k] = 0.0; por[k] = 0.0;
73	#ifndef SHT_AXISYM
74	Q rei[k] = 0.0; roi[k] = 0.0;
75	V tei[k] = 0.0; toi[k] = 0.0;
76	V pei[k] = 0.0; poi[k] = 0.0;
77	#endif
78	}
79
80	// ACCESS PATTERN
81	k_inc = shtns->k_stride_a; m_inc = shtns->m_stride_a;
82
83	#ifndef _OPENMP
84	m0 = 0; mstep = 1;
85	#else
86	m0 = omp_get_thread_num();
87	mstep = omp_get_num_threads();
88	if (m0 == 0)
89	#endif
90	{ // im=0 : dzl.p = 0.0 and evrything is REAL
91	alm = shtns->blm;
92	Q double r0 = 0.0;
93	Q k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
94	Q double an = BrF[kk_inc]; double bn = BrF[kk_inc +1];
95	Q double bs = BrF[(NLAT-2-k)k_inc]; double as = BrF[(NLAT-2-k)k_inc +1];
96	Q rer[k] = an+as; ror[k] = an-as;
97	Q rer[k+1] = bn+bs; ror[k+1] = bn-bs;
98	Q r0 += (an+as)wg[k] + (bn+bs)wg[k+1];
99	Q k+=2;
100	Q } while(k < nk*VSIZE2);
101	V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
102	V double an = BtF[kk_inc]; double bn = BtF[kk_inc +1];
103	V double bs = BtF[(NLAT-2-k)k_inc]; double as = BtF[(NLAT-2-k)k_inc +1];
104	V ter[k] = an+as; tor[k] = an-as;
105	V ter[k+1] = bn+bs; tor[k+1] = bn-bs;
106	V k+=2;
107	V } while(k < nk*VSIZE2);
108	V k=0; do { // compute symmetric and antisymmetric parts. (do not weight here, it is cheaper to weight y0)
109	V double an = BpF[kk_inc]; double bn = BpF[kk_inc +1];
110	V double bs = BpF[(NLAT-2-k)k_inc]; double as = BpF[(NLAT-2-k)k_inc +1];
111	V per[k] = an+as; por[k] = an-as;
112	V per[k+1] = bn+bs; por[k+1] = bn-bs;
113	V k+=2;
114	V } while(k < nk*VSIZE2);
115	Q Qlm[0] = r0 * alm[0]; // l=0 is done.
116	V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform.
117	k = 0;
118	Q double* q_ = (double*) qq;
119	V double* s_ = (double) ss; double t_ = (double*) tt;
120	for (l=0;l<llim;++l) {
121	Q q_[l] = 0.0;
122	V s_[l] = 0.0; t_[l] = 0.0;
123	}
124	do {
125	al = alm;
126	rnd cost[NW], y0[NW], y1[NW];
127	V rnd sint[NW], dy0[NW], dy1[NW];
128	Q rnd rerk[NW], rork[NW]; // help the compiler to cache into registers.
129	V rnd terk[NW], tork[NW], perk[NW], pork[NW];
130	for (int j=0; j<NW; ++j) {
131	cost[j] = vread(ct, k+j);
132	y0[j] = vall(al[0]) * vread(wg, k+j); // weight of Gauss quadrature appears here
133	V dy0[j] = vall(0.0);
134	V sint[j] = -vread(st, k+j);
135	y1[j] = (vall(al[1])y0[j]) cost[j];
136	V dy1[j] = (vall(al[1])y0[j]) sint[j];
137	Q rerk[j] = vread(rer, k+j); rork[j] = vread(ror, k+j); // cache into registers.
138	V terk[j] = vread(ter, k+j); tork[j] = vread(tor, k+j);
139	V perk[j] = vread(per, k+j); pork[j] = vread(por, k+j);
140	}
141	al+=2; l=1;
142	while(l<llim) {
143	for (int j=0; j<NW; ++j) {
144	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]sint[j]) + vall(al[0])dy0[j];
145	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
146	}
147	Q rnd q = y1[0] * rork[0];
148	V rnd s = dy1[0] * terk[0];
149	V rnd t = dy1[0] * perk[0];
150	for (int j=1; j<NW; ++j) {
151	Q q += y1[j] * rork[j];
152	V s += dy1[j] * terk[j];
153	V t += dy1[j] * perk[j];
154	}
155	Q q_[l-1] += reduce_add(q);
156	V s_[l-1] += reduce_add(s);
157	V t_[l-1] -= reduce_add(t);
158	for (int j=0; j<NW; ++j) {
159	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]sint[j]) + vall(al[2])dy1[j];
160	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
161	}
162	Q q = y0[0] * rerk[0];
163	V s = dy0[0] * tork[0];
164	V t = dy0[0] * pork[0];
165	for (int j=1; j<NW; ++j) {
166	Q q += y0[j] * rerk[j];
167	V s += dy0[j] * tork[j];
168	V t += dy0[j] * pork[j];
169	}
170	Q q_[l] += reduce_add(q);
171	V s_[l] += reduce_add(s);
172	V t_[l] -= reduce_add(t);
173	al+=4; l+=2;
174	}
175	if (l==llim) {
176	Q rnd q = y1[0] * rork[0];
177	V rnd s = dy1[0] * terk[0];
178	V rnd t = dy1[0] * perk[0];
179	for (int j=1; j<NW; ++j) {
180	Q q += y1[j] * rork[j];
181	V s += dy1[j] * terk[j];
182	V t += dy1[j] * perk[j];
183	}
184	Q q_[l-1] += reduce_add(q);
185	V s_[l-1] += reduce_add(s);
186	V t_[l-1] -= reduce_add(t);
187	}
188	k+=NW;
189	} while (k < nk);
190	for (l=1; l<=llim; ++l) {
191	Q Qlm[l] = q_[l-1];
192	V Slm[l] = s_[l-1]l_2[l]; Tlm[l] = t_[l-1]l_2[l];
193	}
194	#ifdef SHT_VAR_LTR
195	for (l=llim+1; l<= LMAX; ++l) {
196	Q ((v2d*)Qlm)[l] = vdup(0.0);
197	V ((v2d)Slm)[l] = vdup(0.0); ((v2d)Tlm)[l] = vdup(0.0);
198	}
199	#ifndef SHT_AXISYM
200	if (imlim <= MMAX) { // zero out m >= imlim
201	l = LiM(shtns, imlim*MRES, imlim);
202	do {
203	Q ((v2d*)Qlm)[l] = vdup(0.0);
204	V ((v2d)Slm)[l] = vdup(0.0); ((v2d)Tlm)[l] = vdup(0.0);
205	} while(++l < shtns->nlm);
206	}
207	#endif
208	#endif
209	m0=mstep;
210	}
211
212	#ifndef SHT_AXISYM
213	for (im=m0; im<imlim; im+=mstep) {
214	m = im*MRES;
215	l = shtns->tm[im] / VSIZE2;
216	alm = shtns->blm + im(2LMAX -m+MRES);
217	Q k = ((lVSIZE2)>>1)2; // k must be even here.
218	Q do { // compute symmetric and antisymmetric parts, and reorganize data.
219	Q double an, bn, ani, bni, bs, as, bsi, asi, t;
220	3 double sina = st[k]; double sinb = st[k+1];
221	Q ani = BrF[imm_inc + kk_inc]; bni = BrF[imm_inc + kk_inc +1]; // north
222	Q an = BrF[(NPHI-im)m_inc + kk_inc]; bn = BrF[(NPHI-im)m_inc + kk_inc +1];
223	Q t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
224	3 an = sina; ani= sina; bn = sinb; bni = sinb;
225	Q bsi = BrF[imm_inc + (NLAT-2 -k)k_inc]; asi = BrF[imm_inc + (NLAT-2-k)k_inc + 1]; // south
226	Q bs = BrF[(NPHI-im)m_inc +(NLAT-2-k)k_inc]; as = BrF[(NPHI-im)m_inc +(NLAT-2-k)k_inc +1];
227	Q t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
228	3 as = sina; asi= sina; bs = sinb; bsi = sinb;
229	Q rer[k] = an+as; rei[k] = ani+asi; rer[k+1] = bn+bs; rei[k+1] = bni+bsi;
230	Q ror[k] = an-as; roi[k] = ani-asi; ror[k+1] = bn-bs; roi[k+1] = bni-bsi;
231	Q k+=2;
232	Q } while (k<nk*VSIZE2);
233	V k = ((lVSIZE2)>>1)2; // k must be even here.
234	V do { // compute symmetric and antisymmetric parts, and reorganize data.
235	V double an, bn, ani, bni, bs, as, bsi, asi, t;
236	V ani = BtF[imm_inc + kk_inc]; bni = BtF[imm_inc + kk_inc +1]; // north
237	V an = BtF[(NPHI-im)m_inc + kk_inc]; bn = BtF[(NPHI-im)m_inc + kk_inc +1];
238	V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
239	V bsi = BtF[imm_inc + (NLAT-2 -k)k_inc]; asi = BtF[imm_inc + (NLAT-2-k)k_inc + 1]; // south
240	V bs = BtF[(NPHI-im)m_inc +(NLAT-2-k)k_inc]; as = BtF[(NPHI-im)m_inc +(NLAT-2-k)k_inc +1];
241	V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
242	V ter[k] = an+as; tei[k] = ani+asi; ter[k+1] = bn+bs; tei[k+1] = bni+bsi;
243	V tor[k] = an-as; toi[k] = ani-asi; tor[k+1] = bn-bs; toi[k+1] = bni-bsi;
244	V k+=2;
245	V } while (k<nk*VSIZE2);
246	V k = ((lVSIZE2)>>1)2; // k must be even here.
247	V do { // compute symmetric and antisymmetric parts, and reorganize data.
248	V double an, bn, ani, bni, bs, as, bsi, asi, t;
249	V ani = BpF[imm_inc + kk_inc]; bni = BpF[imm_inc + kk_inc +1]; // north
250	V an = BpF[(NPHI-im)m_inc + kk_inc]; bn = BpF[(NPHI-im)m_inc + kk_inc +1];
251	V t = ani-an; an += ani; ani = bn-bni; bn += bni; bni = t;
252	V bsi = BpF[imm_inc + (NLAT-2 -k)k_inc]; asi = BpF[imm_inc + (NLAT-2-k)k_inc + 1]; // south
253	V bs = BpF[(NPHI-im)m_inc +(NLAT-2-k)k_inc]; as = BpF[(NPHI-im)m_inc +(NLAT-2-k)k_inc +1];
254	V t = bsi-bs; bs += bsi; bsi = as-asi; as += asi; asi = t;
255	V per[k] = an+as; pei[k] = ani+asi; per[k+1] = bn+bs; pei[k+1] = bni+bsi;
256	V por[k] = an-as; poi[k] = ani-asi; por[k+1] = bn-bs; poi[k+1] = bni-bsi;
257	V k+=2;
258	V } while (k<nk*VSIZE2);
259	V m_1 = 1.0/m;
260	k=l;
261	for (l=0; l<=llim-m; l++) {
262	Q qq[l] = vdup(0.0);
263	V ss[l] = vdup(0.0); tt[l] = vdup(0.0);
264	}
265	do {
266	Q v2d* q = qq;
267	V v2d* s = ss; v2d* t = tt;
268	al = alm;
269	rnd cost[NW], y0[NW], y1[NW];
270	V rnd st2[NW], dy0[NW], dy1[NW];
271	Q rnd rerk[NW], reik[NW], rork[NW], roik[NW]; // help the compiler to cache into registers.
272	V rnd terk[NW], teik[NW], tork[NW], toik[NW];
273	V rnd perk[NW], peik[NW], pork[NW], poik[NW];
274	for (int j=0; j<NW; ++j) {
275	cost[j] = vread(st, k+j);
276	y0[j] = vall(0.5);
277	V st2[j] = cost[j]cost[j]vall(-m_1);
278	V y0[j] = vall(m); // for the vector transform, compute ylmm/sint
279	}
280	Q l=m;
281	V l=m-1;
282	long int ny = 0; // exponent to extend double precision range.
283	if ((int)llim <= SHT_L_RESCALE_FLY) {
284	do { // sin(theta)^m
285	if (l&1) for (int j=0; j<NW; ++j) y0[j] *= cost[j];
286	for (int j=0; j<NW; ++j) cost[j] *= cost[j];
287	} while(l >>= 1);
288	} else {
289	long int nsint = 0;
290	do { // sin(theta)^m (use rescaling to avoid underflow)
291	if (l&1) {
292	for (int j=0; j<NW; ++j) y0[j] *= cost[j];
293	ny += nsint;
294	if (vlo(y0[0]) < (SHT_ACCURACY+1.0/SHT_SCALE_FACTOR)) {
295	ny--;
296	for (int j=0; j<NW; ++j) y0[j] *= vall(SHT_SCALE_FACTOR);
297	}
298	}
299	for (int j=0; j<NW; ++j) cost[j] *= cost[j];
300	nsint += nsint;
301	if (vlo(cost[0]) < 1.0/SHT_SCALE_FACTOR) {
302	nsint--;
303	for (int j=0; j<NW; ++j) cost[j] *= vall(SHT_SCALE_FACTOR);
304	}
305	} while(l >>= 1);
306	}
307	for (int j=0; j<NW; ++j) {
308	y0[j] *= vall(al[0]);
309	cost[j] = vread(ct, k+j);
310	V dy0[j] = cost[j]*y0[j];
311	y1[j] = (vall(al[1])y0[j]) cost[j];
312	V dy1[j] = (vall(al[1])y0[j]) (cost[j]*cost[j] + st2[j]);
313	}
314	l=m; al+=2;
315	while ((ny<0) && (l<llim)) { // ylm treated as zero and ignored if ny < 0
316	for (int j=0; j<NW; ++j) {
317	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]st2[j]) + vall(al[0])dy0[j];
318	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
319	}
320	for (int j=0; j<NW; ++j) {
321	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]st2[j]) + vall(al[2])dy1[j];
322	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
323	}
324	l+=2; al+=4;
325	if (fabs(vlo(y0[NW-1])) > SHT_ACCURACY*SHT_SCALE_FACTOR + 1.0) { // rescale when value is significant
326	++ny;
327	for (int j=0; j<NW; ++j) {
328	y0[j] = vall(1.0/SHT_SCALE_FACTOR); y1[j] = vall(1.0/SHT_SCALE_FACTOR);
329	V dy0[j] = vall(1.0/SHT_SCALE_FACTOR); dy1[j] = vall(1.0/SHT_SCALE_FACTOR);
330	}
331	}
332	}
333	if (ny == 0) {
334	Q q+=(l-m);
335	V s+=(l-m); t+=(l-m);
336	for (int j=0; j<NW; ++j) { // prefetch
337	y0[j] = vread(wg, k+j); y1[j] = vread(wg, k+j); // weight appears here (must be after the previous accuracy loop).
338	V dy0[j] = vread(wg, k+j); dy1[j] = vread(wg, k+j);
339	Q rerk[j] = vread( rer, k+j); reik[j] = vread( rei, k+j); rork[j] = vread( ror, k+j); roik[j] = vread( roi, k+j);
340	V terk[j] = vread( ter, k+j); teik[j] = vread( tei, k+j); tork[j] = vread( tor, k+j); toik[j] = vread( toi, k+j);
341	V perk[j] = vread( per, k+j); peik[j] = vread( pei, k+j); pork[j] = vread( por, k+j); poik[j] = vread( poi, k+j);
342	}
343	while (l<llim) { // compute even and odd parts
344	Q rnd qq0 = y0[0] * rerk[0];
345	Q rnd qq1 = y0[0] * reik[0];
346	V rnd ss0 = dy0[0] * tork[0] + y0[0] * peik[0];
347	V rnd ss1 = dy0[0] * toik[0] - y0[0] * perk[0];
348	V rnd tt0 = dy0[0] * pork[0] - y0[0] * teik[0];
349	V rnd tt1 = dy0[0] * poik[0] + y0[0] * terk[0];
350	Q for (int j=1; j<NW; ++j) qq0 += y0[j] * rerk[j]; // real even
351	Q for (int j=1; j<NW; ++j) qq1 += y0[j] * reik[j]; // imag even
352	V for (int j=1; j<NW; ++j) ss0 += dy0[j] * tork[j] + y0[j] * peik[j];
353	V for (int j=1; j<NW; ++j) ss1 += dy0[j] * toik[j] - y0[j] * perk[j];
354	V for (int j=1; j<NW; ++j) tt0 += dy0[j] * pork[j] - y0[j] * teik[j];
355	V for (int j=1; j<NW; ++j) tt1 += dy0[j] * poik[j] + y0[j] * terk[j];
356	Q q[0] += v2d_reduce(qq0, qq1);
357	V s[0] += v2d_reduce(ss0, ss1);
358	V t[0] -= v2d_reduce(tt0, tt1);
359	for (int j=0; j<NW; ++j) {
360	V dy0[j] = vall(al[1])(cost[j]dy1[j] + y1[j]st2[j]) + vall(al[0])dy0[j];
361	y0[j] = vall(al[1])(cost[j]y1[j]) + vall(al[0])*y0[j];
362	}
363	Q qq0 = y1[0] * rork[0];
364	Q qq1 = y1[0] * roik[0];
365	V ss0 = dy1[0] * terk[0] + y1[0] * poik[0];
366	V ss1 = dy1[0] * teik[0] - y1[0] * pork[0];
367	V tt0 = dy1[0] * perk[0] - y1[0] * toik[0];
368	V tt1 = dy1[0] * peik[0] + y1[0] * tork[0];
369	Q for (int j=1; j<NW; ++j) qq0 += y1[j] * rork[j]; // real odd
370	Q for (int j=1; j<NW; ++j) qq1 += y1[j] * roik[j]; // imag odd
371	V for (int j=1; j<NW; ++j) ss0 += dy1[j] * terk[j] + y1[j] * poik[j];
372	V for (int j=1; j<NW; ++j) ss1 += dy1[j] * teik[j] - y1[j] * pork[j];
373	V for (int j=1; j<NW; ++j) tt0 += dy1[j] * perk[j] - y1[j] * toik[j];
374	V for (int j=1; j<NW; ++j) tt1 += dy1[j] * peik[j] + y1[j] * tork[j];
375	Q q[1] += v2d_reduce(qq0, qq1);
376	V s[1] += v2d_reduce(ss0, ss1);
377	V t[1] -= v2d_reduce(tt0, tt1);
378	Q q+=2;
379	V s+=2; t+=2;
380	for (int j=0; j<NW; ++j) {
381	V dy1[j] = vall(al[3])(cost[j]dy0[j] + y0[j]st2[j]) + vall(al[2])dy1[j];
382	y1[j] = vall(al[3])(cost[j]y0[j]) + vall(al[2])*y1[j];
383	}
384	l+=2; al+=4;
385	}
386	if (l==llim) {
387	Q rnd qq0 = y0[0] * rerk[0];
388	Q rnd qq1 = y0[0] * reik[0];
389	V rnd ss0 = dy0[0] * tork[0] + y0[0] * peik[0];
390	V rnd ss1 = dy0[0] * toik[0] - y0[0] * perk[0];
391	V rnd tt0 = dy0[0] * pork[0] - y0[0] * teik[0];
392	V rnd tt1 = dy0[0] * poik[0] + y0[0] * terk[0];
393	Q for (int j=1; j<NW; ++j) qq0 += y0[j] * rerk[j]; // real even
394	Q for (int j=1; j<NW; ++j) qq1 += y0[j] * reik[j]; // imag even
395	V for (int j=1; j<NW; ++j) ss0 += dy0[j] * tork[j] + y0[j] * peik[j];
396	V for (int j=1; j<NW; ++j) ss1 += dy0[j] * toik[j] - y0[j] * perk[j];
397	V for (int j=1; j<NW; ++j) tt0 += dy0[j] * pork[j] - y0[j] * teik[j];
398	V for (int j=1; j<NW; ++j) tt1 += dy0[j] * poik[j] + y0[j] * terk[j];
399	Q q[0] += v2d_reduce(qq0, qq1);
400	V s[0] += v2d_reduce(ss0, ss1);
401	V t[0] -= v2d_reduce(tt0, tt1);
402	}
403	}
404	k+=NW;
405	} while (k < nk);
406	l = LiM(shtns, m, im);
407	Q v2d Ql = (v2d) &Qlm[l];
408	V v2d Sl = (v2d) &Slm[l];
409	V v2d Tl = (v2d) &Tlm[l];
410	for (l=0; l<=llim-m; ++l) {
411	QX Ql[l] = qq[l];
412	3 Ql[l] = qq[l] * vdup(m_1);
413	V Sl[l] = ss[l] * vdup(l_2[l+m]);
414	V Tl[l] = tt[l] * vdup(l_2[l+m]);
415	}
416	#ifdef SHT_VAR_LTR
417	for (l=llim+1-m; l<=LMAX-m; ++l) {
418	Q Ql[l] = vdup(0.0);
419	V Sl[l] = vdup(0.0); Tl[l] = vdup(0.0);
420	}
421	#endif
422	}
423	#endif
424	}
425
426	static
427	QX void GEN3(spat_to_SH_mic,NWAY,SUFFIX)(shtns_cfg shtns, double Vr, cplx Qlm, long int llim) {
428	VX void GEN3(spat_to_SHsphtor_mic,NWAY,SUFFIX)(shtns_cfg shtns, double Vt, double Vp, cplx Slm, cplx Tlm, long int llim) {
429	3 void GEN3(spat_to_SHqst_mic,NWAY,SUFFIX)(shtns_cfg shtns, double Vr, double Vt, double Vp, cplx Qlm, cplx Slm, cplx Tlm, long int llim) {
430
431	Q double *BrF; // contains the Fourier transformed data
432	V double BtF, BpF; // contains the Fourier transformed data
433	unsigned imlim=0;
434
435	Q BrF = Vr;
436	V BtF = Vt; BpF = Vp;
437	#ifndef SHT_AXISYM
438	imlim = MTR;
439	#ifdef SHT_VAR_LTR
440	if (imlim*MRES > (unsigned) llim) imlim = ((unsigned) llim)/MRES; // 32bit mul and div should be faster
441	#endif
442	if (shtns->fftc_mode >= 0) {
443	if (shtns->fftc_mode == 0) { // in-place
444	Q fftw_execute_dft(shtns->fftc,(cplx)BrF, (cplx)BrF);
445	V fftw_execute_dft(shtns->fftc,(cplx)BtF, (cplx)BtF);
446	V fftw_execute_dft(shtns->fftc,(cplx)BpF, (cplx)BpF);
447	} else { // alloc memory for the transpose FFT
448	unsigned long nv = shtns->nspat;
449	QX BrF = (double) VMALLOC( nv sizeof(double) );
450	VX BtF = (double) VMALLOC( 2nv * sizeof(double) );
451	VX BpF = BtF + nv;
452	3 BrF = (double) VMALLOC( 3nv * sizeof(double) );
453	3 BtF = BrF + nv; BpF = BtF + nv;
454	Q fftw_execute_split_dft(shtns->fftc, Vr+NPHI, Vr, BrF+1, BrF);
455	V fftw_execute_split_dft(shtns->fftc, Vt+NPHI, Vt, BtF+1, BtF);
456	V fftw_execute_split_dft(shtns->fftc, Vp+NPHI, Vp, BpF+1, BpF);
457	}
458	}
459	#endif
460	imlim += 1;
461
462	#pragma omp parallel num_threads(shtns->nthreads)
463	{
464	QX GEN3(_an1,NWAY,SUFFIX)(shtns, BrF, Qlm, llim, imlim);
465	VX GEN3(_an2,NWAY,SUFFIX)(shtns, BtF, BpF, Slm, Tlm, llim, imlim);
466	3 GEN3(_an3,NWAY,SUFFIX)(shtns, BrF, BtF, BpF, Qlm, Slm, Tlm, llim, imlim);
467	}
468
469	#ifndef SHT_AXISYM
470	if (shtns->fftc_mode > 0) { // free memory
471	Q VFREE(BrF);
472	VX VFREE(BtF); // this frees also BpF.
473	}
474	#endif
475
476	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/omp/shtns/SHT/mic_spat_to_SH.gen.c

Download in other formats: