source: CIVL/examples/fortran/nek5000/core/crs_xxt.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100755
File size: 27.3 KB
Line 
1#include <stddef.h>
2#include <stdlib.h>
3#include <stdio.h>
4#include <limits.h>
5#include <float.h>
6#include <string.h>
7#include <math.h>
8#include "gslib.h"
9
10#define crs_setup PREFIXED_NAME(crs_xxt_setup)
11#define crs_solve PREFIXED_NAME(crs_xxt_solve)
12#define crs_stats PREFIXED_NAME(crs_xxt_stats)
13#define crs_free PREFIXED_NAME(crs_xxt_free )
14
15/*
16 portable log base 2
17
18 does a binary search to find leading order bit
19
20 UINT_BITS = number of bits in a uint
21 BITS(0) = UINT_BITS
22 BITS(i) = half of BITS(i-1), rounded up
23 MASK(i) = bitmask with BITS(i) 1's followed by BITS(i) 0's
24*/
25
26static unsigned lg(uint v)
27{
28 unsigned r = 0;
29#define UINT_BITS (sizeof(uint)*CHAR_BIT)
30#define BITS(i) ((UINT_BITS+(1<<(i))-1)>>(i))
31#define MASK(i) ((((uint)1<<BITS(i)) - 1) << BITS(i))
32#define CHECK(i) if((BITS(i)!=1) && (v&MASK(i))) v>>=BITS(i), r+=BITS(i)
33 CHECK(1); CHECK(2); CHECK(3); CHECK(4); CHECK(5); CHECK(6); CHECK(7);
34 CHECK(8); CHECK(9); /* this covers up to 1024-bit uints */
35 if(v&2) ++r;
36 return r;
37#undef UINT_BITS
38#undef BITS
39#undef MASK
40#undef CHECK
41}
42
43/* factors: L is in CSR format
44 D is a diagonal matrix stored as a vector
45 actual factorization is:
46
47 -1 T
48 A = (I-L) D (I-L)
49
50 -1 -T -1
51 A = (I-L) D (I-L)
52
53 (triangular factor is unit diagonal; the diagonal is not stored)
54*/
55struct sparse_cholesky {
56 uint n, *Lrp, *Lj;
57 double *L, *D;
58};
59
60/*
61 symbolic factorization: finds the sparsity structure of L
62
63 uses the concept of elimination tree:
64 the parent of node j is node i when L(i,j) is the first
65 non-zero in column j below the diagonal (i>j)
66 L's structure is discovered row-by-row; the first time
67 an entry in column j is set, it must be the parent
68
69 the nonzeros in L are the nonzeros in A + paths up the elimination tree
70
71 linear in the number of nonzeros of L
72*/
73static void factor_symbolic(uint n, const uint *Arp, const uint *Aj,
74 struct sparse_cholesky *out, buffer *buf)
75{
76 uint *visit = tmalloc(uint,2*n), *parent = visit+n;
77 uint *Lrp, *Lj;
78 uint i,nz=0;
79
80 out->n=n;
81
82 for(i=0;i<n;++i) {
83 uint p=Arp[i], pe=Arp[i+1];
84 visit[i]=i, parent[i]=n;
85 for(;p!=pe;++p) {
86 uint j=Aj[p]; if(j>=i) break;
87 for(;visit[j]!=i;j=parent[j]) {
88 ++nz, visit[j]=i;
89 if(parent[j]==n) { parent[j]=i; break; }
90 }
91 }
92 }
93
94 Lrp=out->Lrp=tmalloc(uint,n+1+nz);
95 Lj =out->Lj =Lrp+n+1;
96
97 Lrp[0]=0;
98 for(i=0;i<n;++i) {
99 uint p=Arp[i], pe=Arp[i+1], count=0, *Ljr=&Lj[Lrp[i]];
100 visit[i]=i;
101 for(;p!=pe;++p) {
102 uint j=Aj[p]; if(j>=i) break;
103 for(;visit[j]!=i;j=parent[j]) Ljr[count++]=j, visit[j]=i;
104 }
105 sortv(Ljr, Ljr,count,sizeof(uint), buf);
106 Lrp[i+1]=Lrp[i]+count;
107 }
108 free(visit);
109}
110
111/*
112 numeric factorization:
113
114 L is built row-by-row, using: ( ' indicates transpose )
115
116
117 [ A r ] = [ (I-L) ] [ D^(-1) ] [ (I-L)' -s ]
118 [ r' a ] [ -s' 1 ] [ 1/d ] [ 1 ]
119
120 = [ A (I-L) D^(-1) (-s) ]
121 [ r' s' D^(-1) s + 1/d ]
122
123 so, if r' is the next row of A, up to but excluding the diagonal,
124 then the next row of L, s', obeys
125
126 r = - (I-L) D^(-1) s
127
128 let y = (I-L)^(-1) (-r)
129 then s = D y, and d = 1/(s' y)
130
131*/
132static void factor_numeric(uint n, const uint *Arp, const uint *Aj,
133 const double *A,
134 struct sparse_cholesky *out,
135 uint *visit, double *y)
136{
137 const uint *Lrp=out->Lrp, *Lj=out->Lj;
138 double *D, *L;
139 uint i;
140
141 D=out->D=tmalloc(double,n+Lrp[n]);
142 L=out->L=D+n;
143
144 for(i=0;i<n;++i) {
145 uint p,pe; double a=0;
146 visit[i]=n;
147 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
148 uint j=Lj[p]; y[j]=0, visit[j]=i;
149 }
150 for(p=Arp[i],pe=Arp[i+1];p!=pe;++p) {
151 uint j=Aj[p];
152 if(j>=i) { if(j==i) a=A[p]; break; }
153 y[j]=-A[p];
154 }
155 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
156 uint q,qe,j=Lj[p]; double lij,yj=y[j];
157 for(q=Lrp[j],qe=Lrp[j+1];q!=qe;++q) {
158 uint k=Lj[q]; if(visit[k]==i) yj+=L[q]*y[k];
159 }
160 y[j]=yj;
161 L[p]=lij=D[j]*yj;
162 a-=yj*lij;
163 }
164 D[i]=1/a;
165 }
166}
167
168/* x = A^(-1) b; works when x and b alias */
169void sparse_cholesky_solve(
170 double *x, const struct sparse_cholesky *fac, double *b)
171{
172 const uint n=fac->n, *Lrp=fac->Lrp, *Lj=fac->Lj;
173 const double *L=fac->L, *D=fac->D;
174 uint i, p,pe;
175 for(i=0;i<n;++i) {
176 double xi = b[i];
177 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) xi+=L[p]*x[Lj[p]];
178 x[i]=xi;
179 }
180 for(i=0;i<n;++i) x[i]*=D[i];
181 for(i=n;i;) {
182 double xi = x[--i];
183 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) x[Lj[p]]+=L[p]*xi;
184 }
185}
186
187void sparse_cholesky_factor(uint n, const uint *Arp, const uint *Aj,
188 const double *A,
189 struct sparse_cholesky *out, buffer *buf)
190{
191 const uint n_uints_as_dbls = (n*sizeof(uint)+sizeof(double)-1)/sizeof(double);
192 buffer_reserve(buf,(n_uints_as_dbls+n)*sizeof(double));
193 factor_symbolic(n,Arp,Aj,out,buf);
194 factor_numeric(n,Arp,Aj,A,out,buf->ptr,n_uints_as_dbls+(double*)buf->ptr);
195}
196
197void sparse_cholesky_free(struct sparse_cholesky *fac)
198{
199 free(fac->Lrp); fac->Lj=fac->Lrp=0;
200 free(fac->D); fac->L =fac->D =0;
201}
202
203struct csr_mat {
204 uint n, *Arp, *Aj; double *A;
205};
206
207struct xxt {
208
209 /* communication */
210
211 struct comm comm;
212 uint pcoord; /* coordinate in communication tree */
213 unsigned plevels; /* # of stages of communication */
214 sint *pother; /* let p = pother[i], then during stage i of fan-in,
215 if p>=0, receive from p
216 if p< 0, send to (-p-1)
217 fan-out is just the reverse ...
218 on proc 0, pother is never negative
219 on others, pother is negative for the last stage only */
220 comm_req *req;
221
222 /* separators */
223
224 unsigned nsep; /* number of separators */
225 uint *sep_size; /* # of dofs on each separator,
226 ordered from the bottom to the top of the tree:
227 separator 0 is the bottom-most one (dofs not shared)
228 separator nsep-1 is the root of the tree */
229
230 unsigned null_space;
231 double *share_weight;
232
233 /* vector sizes */
234
235 uint un; /* user's vector size */
236
237 /* xxt_solve works with "condensed" vectors;
238 same dofs as in user's vectors, but no duplicates and no Dirichlet nodes,
239 and also ordered topologically (children before parents) according to the
240 separator tree */
241
242 uint cn; /* size of condensed vectors */
243 sint *perm_u2c; /* permutation from user vector to condensed vector,
244 p=perm_u2c[i]; xu[i] = p=-1 ? 0 : xc[p]; */
245 uint ln, sn; /* xc[0 ... ln-1] are not shared (ln=sep_size[0])
246 xc[ln ... ln+sn-1] are shared
247 ln+sn = cn */
248
249 uint xn; /* # of columns of x = sum_i(sep_size[i]) - sep_size[0] */
250
251 /* data */
252 struct sparse_cholesky fac_A_ll;
253 struct csr_mat A_sl;
254 uint *Xp; double *X; /* column i of X starts at X[Xp[i]] */
255
256 /* execution buffers */
257 double *vl, *vc, *vx, *combuf;
258};
259
260
261/*
262 for the binary communication tree, the procs are divided in half
263 at each level, with the second half always the larger
264
265 e.g., for np = 13:
266
267 +------13-------+
268 | |
269 +---6---+ +---7---+
270 | | | |
271 +-3-+ +-3-+ +-3-+ +-4-+
272 1 2 1 2 1 2 2 2
273 1^1 1^1 1^1 1^1 1^1
274
275 plevels is the number of levels in the tree
276 = np==1 ? 1 : ( lg(np-1)+2 )
277
278 labelling the nodes with proc id's gives this communication tree:
279
280 +-------0-------+
281 | |
282 +---0---+ +---6---+
283 | | | |
284 +-0-+ +-3-+ +-6-+ +-9-+
285 0 1 3 4 6 7 9 b
286 1^2 4^5 7^8 9^a b^c
287
288 consider proc 7 (pid = 7);
289 pcoord gives the position of the leaf labelled 7:
290 Root Right Left Right Left -> RRLRL -> 11010
291 so pcoord = 11010 binary
292 note the parent coordinate can be found by bit shifting right
293 (i.e. dividing by 2)
294*/
295
296/* sets: pcoord, nsep, plevels, pother, req */
297static void locate_proc(struct xxt *data)
298{
299 const uint id = data->comm.id;
300 uint n = data->comm.np, c=1, odd=0, base=0;
301 unsigned level=0;
302 while(n>1) {
303 ++level;
304 odd=(odd<<1)|(n&1);
305 c<<=1, n>>=1;
306 if(id>=base+n) c|=1, base+=n, n+=(odd&1);
307 }
308 data->pcoord=c;
309 data->nsep = level+1;
310 data->plevels = data->nsep-1;
311 data->pother = tmalloc(sint,data->plevels);
312 data->req = tmalloc(comm_req,data->plevels);
313 for(level=0;level<data->plevels;++level) {
314 if((c&1)==1) {
315 uint targ = id - (n-(odd&1));
316 data->pother[level]=-(sint)(targ+1);
317 data->plevels = level+1;
318 break;
319 } else {
320 data->pother[level]=id+n;
321 c>>=1, n=(n<<1)+(odd&1), odd>>=1;
322 }
323 }
324}
325
326/* the tuple list describing the condensed dofs:
327 [(separator level, share count, global id)] */
328struct dof { ulong id; uint level, count; };
329
330/* determine the size of each separator;
331 sums the separator sizes following the fan-in, fan-out comm. pattern
332 uses the share-counts to avoid counting dofs more than once */
333/* sets: xn, sep_size, ln, sn */
334static void discover_sep_sizes(struct xxt *data,
335 struct array *dofa, buffer *buf)
336{
337 const unsigned ns=data->nsep, nl=data->plevels;
338 const uint n = dofa->n;
339 float *v, *recv;
340 unsigned i,lvl; uint j;
341 const struct dof *dof = dofa->ptr;
342
343 buffer_reserve(buf,2*ns*sizeof(float));
344 v=buf->ptr, recv=v+ns;
345
346 for(i=0;i<ns;++i) v[i]=0;
347 for(j=0;j<n;++j) v[dof[j].level]+=1/(float)dof[j].count;
348
349 /* fan-in */
350 for(lvl=0;lvl<nl;++lvl) {
351 sint other = data->pother[lvl];
352 unsigned s = ns-(lvl+1);
353 if(other<0) {
354 comm_send(&data->comm,v +lvl+1,s*sizeof(float),-other-1,s);
355 } else {
356 comm_recv(&data->comm,recv+lvl+1,s*sizeof(float),other,s);
357 for(i=lvl+1;i<ns;++i) v[i]+=recv[i];
358 }
359 }
360 /* fan-out */
361 for(;lvl;) {
362 sint other = data->pother[--lvl];
363 unsigned s = ns-(lvl+1);
364 if(other<0)
365 comm_recv(&data->comm,v+lvl+1,s*sizeof(float),-other-1,s);
366 else
367 comm_send(&data->comm,v+lvl+1,s*sizeof(float),other,s);
368 }
369
370 data->xn=0;
371 data->sep_size = tmalloc(uint,ns);
372 for(i=0;i<ns;++i) {
373 uint s=v[i]+.1f;
374 data->sep_size[i]=s;
375 data->xn+=s;
376 }
377 data->ln=data->sep_size[0];
378 data->sn=data->cn-data->ln;
379 data->xn-=data->ln;
380}
381
382/* assuming [A,Aend) is sorted,
383 removes 0's and any duplicate entries,
384 returns new end */
385static ulong *unique_nonzero(ulong *A, ulong *Aend)
386{
387 if(Aend==A) return A;
388 else {
389 ulong *end = Aend-1, last=*end, *p=A,*q=A,v=0;
390 *end = 1;
391 while(*q==0) ++q; /* *q==0 => q!=end since *end==0 */
392 *end = 0;
393 while(q!=end) {
394 v=*q++, *p++=v;
395 while(*q==v) ++q; /* *q==v => q!=end since *end==0 */
396 }
397 if(last!=v) *p++=last;
398 return p;
399 }
400}
401
402static void merge_sep_ids(struct xxt *data, ulong *sep_id, ulong *other,
403 ulong *work, unsigned s0, buffer *buf)
404{
405 const unsigned ns = data->nsep;
406 unsigned s;
407 ulong *p=sep_id, *q=other;
408 for(s=s0;s<ns;++s) {
409 ulong *end;
410 uint size = data->sep_size[s];
411 memcpy(work ,p,size*sizeof(ulong));
412 memcpy(work+size,q,size*sizeof(ulong));
413 sortv_long(work, work,2*size,sizeof(ulong), buf);
414 end = unique_nonzero(work,work+2*size);
415 memcpy(p,work,(end-work)*sizeof(ulong));
416 p+=size, q+=size;
417 }
418}
419
420static void init_sep_ids(struct xxt *data, struct array *dofa, ulong *xid)
421{
422 const unsigned ns=data->nsep;
423 const uint n=data->cn, *sep_size=data->sep_size;
424 unsigned s=1;
425 uint i, size;
426 const struct dof *dof = dofa->ptr;
427 if(ns==1) return;
428 size=sep_size[s];
429 for(i=data->ln;i<n;++i) {
430 unsigned si = dof[i].level;
431 while(s!=si) {
432 memset(xid,0,size*sizeof(ulong));
433 xid+=size;
434 if(++s != ns) size=data->sep_size[s];
435 }
436 *xid++ = dof[i].id, --size;
437 }
438 while(s!=ns) {
439 memset(xid,0,size*sizeof(ulong));
440 xid+=size;
441 if(++s != ns) size=data->sep_size[s];
442 }
443}
444
445static void find_perm_x2c(uint ln, uint cn, const struct array *dofc,
446 uint xn, const ulong *xid, sint *perm)
447{
448 const struct dof *dof = dofc->ptr, *dof_end = dof+cn;
449 const ulong *xid_end = xid+xn; uint i=ln;
450 dof+=ln;
451 while(dof!=dof_end) {
452 ulong v=dof->id;
453 while(*xid!=v) ++xid, *perm++ = -1;
454 *perm++ = i++, ++dof, ++xid;
455 }
456 while(xid!=xid_end) ++xid, *perm++ = -1;
457}
458
459/* sets: perm_x2c */
460static sint *discover_sep_ids(struct xxt *data, struct array *dofa, buffer *buf)
461{
462 const unsigned ns=data->nsep, nl=data->plevels;
463 const uint xn=data->xn, *sep_size=data->sep_size;
464 ulong *xid, *recv, *work, *p;
465 unsigned lvl;
466 uint size,ss;
467 sint *perm_x2c;
468
469 size=0; for(lvl=1;lvl<ns;++lvl) if(sep_size[lvl]>size) size=sep_size[lvl];
470 xid=tmalloc(ulong,2*xn+2*size), recv=xid+xn, work=recv+xn;
471
472 init_sep_ids(data,dofa,xid);
473
474 if(nl) {
475 /* fan-in */
476 p=xid, size=xn;
477 for(lvl=0;lvl<nl;++lvl) {
478 sint other = data->pother[lvl];
479 if(other<0) {
480 comm_send(&data->comm,p ,size*sizeof(ulong),-other-1,size);
481 } else {
482 comm_recv(&data->comm,recv,size*sizeof(ulong),other,size);
483 merge_sep_ids(data,p,recv,work,lvl+1,buf);
484 }
485 ss=data->sep_size[lvl+1];
486 if(ss>=size || lvl==nl-1) break;
487 p+=ss, size-=ss;
488 }
489 /* fan-out */
490 for(;;) {
491 sint other = data->pother[lvl];
492 if(other<0)
493 comm_recv(&data->comm,p,size*sizeof(ulong),-other-1,size);
494 else
495 comm_send(&data->comm,p,size*sizeof(ulong),other,size);
496 if(lvl==0) break;
497 ss=data->sep_size[lvl];
498 p-=ss, size+=ss, --lvl;
499 }
500 }
501
502 perm_x2c=tmalloc(sint,xn);
503 find_perm_x2c(data->ln,data->cn,dofa, xn,xid, perm_x2c);
504 free(xid);
505
506 return perm_x2c;
507}
508
509static void apply_QQt(struct xxt *data, double *v, uint n, uint tag)
510{
511 const unsigned nl=data->plevels;
512 double *p=v, *recv=data->combuf;
513 unsigned lvl, nsend=0;
514 uint size=n, ss;
515
516 if(n==0 || nl==0) return;
517
518 tag=tag*2+0;
519 /* fan-in */
520 for(lvl=0;lvl<nl;++lvl) {
521 sint other = data->pother[lvl];
522 if(other<0) {
523 comm_send(&data->comm,p ,size*sizeof(double),-other-1,tag);
524 } else {
525 uint i;
526 comm_recv(&data->comm,recv,size*sizeof(double),other ,tag);
527 for(i=0;i<size;++i) p[i]+=recv[i];
528 }
529 ss=data->sep_size[lvl+1];
530 if(ss>=size || lvl==nl-1) break;
531 p+=ss, size-=ss;
532 }
533 /* fan-out */
534 for(;;) {
535 sint other = data->pother[lvl];
536 if(other<0) {
537 comm_recv (&data->comm,p,size*sizeof(double),-other-1,tag);
538 } else {
539 comm_isend(&data->req[nsend++],&data->comm,
540 p,size*sizeof(double),other ,tag);
541 }
542 if(lvl==0) break;
543 ss=data->sep_size[lvl];
544 p-=ss, size+=ss, --lvl;
545 }
546 if(nsend) comm_wait(data->req,nsend);
547}
548
549static double sum(struct xxt *data, double v, uint n, uint tag)
550{
551 const unsigned nl=data->plevels;
552 double r;
553 unsigned lvl,nsend=0;
554 uint size=n, ss;
555
556 tag=tag*2+1;
557 if(n==0 || nl==0) return v;
558 /* fan-in */
559 for(lvl=0;lvl<nl;++lvl) {
560 sint other = data->pother[lvl];
561 if(other<0) {
562 comm_send(&data->comm,&v,sizeof(double),-other-1,tag);
563 } else {
564 comm_recv(&data->comm,&r,sizeof(double),other ,tag);
565 v+=r;
566 }
567 ss=data->sep_size[lvl+1];
568 if(ss>=size || lvl==nl-1) break;
569 size-=ss;
570 }
571 /* fan-out */
572 for(;;) {
573 sint other = data->pother[lvl];
574 if(other<0) {
575 comm_recv (&data->comm,&v,sizeof(double),-other-1,tag);
576 } else {
577 comm_isend(&data->req[nsend++],&data->comm,
578 &v,sizeof(double),other ,tag);
579 }
580 if(lvl==0) break;
581 ss=data->sep_size[lvl];
582 size+=ss, --lvl;
583 }
584 if(nsend) comm_wait(data->req,nsend);
585 return v;
586}
587
588/* sorts an array of ids, removes 0's and duplicates;
589 just returns the permutation */
590static uint unique_ids(uint n, const ulong *id, sint *perm, buffer *buf)
591{
592 uint *p, i, un=0; ulong last=0;
593 p = sortp_long(buf,0, id,n,sizeof(ulong));
594 for(i=0;i<n;++i) {
595 uint j = p[i]; ulong v = id[j];
596 if(v==0) perm[j]=-1;
597 else {
598 if(v!=last) last=v, ++un;
599 perm[j]=un-1;
600 }
601 }
602 buf->n=0;
603 return un;
604}
605
606/* given user's list of dofs (as id's)
607 uses gather-scatter to find share-count and separator # for each
608 outputs as a list, sorted topologically (children before parents)
609 according to the sep. tree (and without duplicates),
610 as well as the permutation to get there from the user's list */
611/* sets: un, cn, perm_u2c */
612static void discover_dofs(
613 struct xxt *data, uint n, const ulong *id,
614 struct array *dofa, buffer *buf, const struct comm *comm)
615{
616 const uint pcoord = data->pcoord, ns=data->nsep;
617 sint *perm;
618 uint i, cn, *p, *pi;
619 ulong *bid;
620 struct gs_data *gsh; sint *v;
621 struct dof *dof;
622
623 data->un = n;
624 data->perm_u2c = perm = tmalloc(sint,n);
625 data->cn = cn = unique_ids(n,id,perm,buf);
626 array_init(struct dof,dofa,cn), dofa->n=cn, dof=dofa->ptr;
627 buffer_reserve(buf,cn*sizeof(ulong)), bid=buf->ptr;
628 for(i=0;i<n;++i) if(perm[i]>=0) bid[perm[i]]=dof[perm[i]].id=id[i];
629
630 gsh = gs_setup((const slong*)bid,cn,comm,0,gs_crystal_router,0);
631 v = tmalloc(sint,cn);
632
633 for(i=0;i<cn;++i) v[i]=pcoord;
634 gs(v,gs_sint,gs_bpr,0,gsh,buf);
635 for(i=0;i<cn;++i) dof[i].level=ns-1-lg((uint)v[i]);
636
637 for(i=0;i<cn;++i) v[i]=1;
638 gs(v,gs_sint,gs_add,0,gsh,buf);
639 for(i=0;i<cn;++i) dof[i].count=v[i];
640
641 free(v);
642 gs_free(gsh);
643
644 if(!cn) return;
645 buffer_reserve(buf,2*cn*sizeof(uint));
646 p = sortp(buf,0, &dof[0].level,cn,sizeof(struct dof));
647 pi = p+cn; for(i=0;i<cn;++i) pi[p[i]]=i;
648 for(i=0;i<n;++i) if(perm[i]>=0) perm[i]=pi[perm[i]];
649 sarray_permute_buf(struct dof,dof,cn, buf);
650}
651
652/* vl += A_ls * vs */
653static void apply_p_Als(double *vl, struct xxt *data, const double *vs, uint ns)
654{
655 const uint *Arp = data->A_sl.Arp,
656 *Aj = data->A_sl.Aj;
657 const double *A = data->A_sl.A;
658 uint i,p,pe;
659 for(i=0;i<ns;++i)
660 for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
661 vl[Aj[p]]+=A[p]*vs[i];
662}
663
664/* vs -= A_sl * vl */
665static void apply_m_Asl(double *vs, uint ns, struct xxt *data, const double *vl)
666{
667 const uint *Arp = data->A_sl.Arp,
668 *Aj = data->A_sl.Aj;
669 const double *A = data->A_sl.A;
670 uint i,p,pe;
671 for(i=0;i<ns;++i)
672 for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
673 vs[i]-=A[p]*vl[Aj[p]];
674}
675
676/* returns a column of S : vs = -S(0:ei-1,ei) */
677static void apply_S_col(double *vs, struct xxt *data,
678 struct csr_mat *A_ss, uint ei, double *vl)
679{
680 const uint ln=data->ln;
681 const uint *Asl_rp = data->A_sl.Arp, *Ass_rp = A_ss->Arp,
682 *Asl_j = data->A_sl.Aj, *Ass_j = A_ss->Aj;
683 const double *Asl = data->A_sl.A, *Ass = A_ss->A;
684 uint i,p,pe;
685 for(i=0;i<ei;++i) vs[i]=0;
686 for(p=Ass_rp[ei],pe=Ass_rp[ei+1];p!=pe;++p) {
687 uint j=Ass_j[p];
688 if(j>=ei) break;
689 vs[j]=-Ass[p];
690 }
691 for(i=0;i<ln;++i) vl[i]=0;
692 for(p=Asl_rp[ei],pe=Asl_rp[ei+1];p!=pe;++p) vl[Asl_j[p]]=-Asl[p];
693 sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
694 apply_m_Asl(vs,ei,data,vl);
695}
696
697static void apply_S(double *Svs, uint ns, struct xxt *data,
698 struct csr_mat *A_ss, const double *vs, double *vl)
699{
700 const uint ln=data->ln;
701 const uint *Ass_rp = A_ss->Arp,
702 *Ass_j = A_ss->Aj;
703 const double *Ass = A_ss->A;
704 uint i, p,pe;
705 for(i=0;i<ns;++i) {
706 double sum=0;
707 for(p=Ass_rp[i],pe=Ass_rp[i+1];p!=pe;++p) {
708 uint j=Ass_j[p];
709 if(j>=ns) break;
710 sum+=Ass[p]*vs[j];
711 }
712 Svs[i]=sum;
713 }
714 for(i=0;i<ln;++i) vl[i]=0;
715 apply_p_Als(vl,data,vs,ns);
716 sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
717 apply_m_Asl(Svs,ns,data,vl);
718}
719
720/* vx = X' * vs */
721static void apply_Xt(double *vx, uint nx, const struct xxt *data,
722 const double *vs)
723{
724 const double *X = data->X; const uint *Xp = data->Xp;
725 uint i; for(i=0;i<nx;++i) vx[i]=tensor_dot(vs,X+Xp[i],Xp[i+1]-Xp[i]);
726}
727
728/* vs = X * vx */
729static void apply_X(double *vs, uint ns, const struct xxt *data,
730 const double *vx, uint nx)
731{
732 const double *X = data->X; const uint *Xp = data->Xp;
733 uint i,j;
734 for(i=0;i<ns;++i) vs[i]=0;
735 for(i=0;i<nx;++i) {
736 const double v = vx[i];
737 const double *x = X+Xp[i]; uint n=Xp[i+1]-Xp[i];
738 for(j=0;j<n;++j) vs[j]+=x[j]*v;
739 }
740}
741
742static void allocate_X(struct xxt *data, sint *perm_x2c)
743{
744 uint xn=data->xn;
745 uint i,h=0;
746 if(data->null_space && xn) --xn;
747 data->Xp = tmalloc(uint,xn+1);
748 data->Xp[0]=0;
749 for(i=0;i<xn;++i) {
750 if(perm_x2c[i]!=-1) ++h;
751 data->Xp[i+1]=data->Xp[i]+h;
752 }
753 data->X = tmalloc(double,data->Xp[xn]);
754}
755
756static void orthogonalize(struct xxt *data, struct csr_mat *A_ss,
757 sint *perm_x2c, buffer *buf)
758{
759 uint ln=data->ln, sn=data->sn, xn=data->xn;
760 double *vl, *vs, *vx, *Svs;
761 uint i,j;
762
763 allocate_X(data,perm_x2c);
764
765 buffer_reserve(buf,(ln+2*sn+xn)*sizeof(double));
766 vl=buf->ptr, vs=vl+ln, Svs=vs+sn, vx=Svs+sn;
767
768 if(data->null_space && xn) --xn;
769 for(i=0;i<xn;++i) {
770 uint ns=data->Xp[i+1]-data->Xp[i];
771 sint ui = perm_x2c[i];
772 double ytsy, *x;
773
774 if(ui == -1) {
775 for(j=0;j<i;++j) vx[j]=0;
776 } else {
777 ui-=ln;
778 apply_S_col(vs, data,A_ss, ui, vl);
779 apply_Xt(vx,i, data, vs);
780 }
781 apply_QQt(data,vx,i,xn-i);
782 apply_X(vs,ns, data, vx,i);
783 if(ui!=-1) vs[ui]=1;
784 apply_S(Svs,ns, data,A_ss, vs, vl);
785 ytsy = tensor_dot(vs,Svs,ns);
786 ytsy = sum(data,ytsy,i+1,xn-(i+1));
787 if(ytsy<DBL_EPSILON/128) ytsy=0; else ytsy = 1/sqrt(ytsy);
788 x=&data->X[data->Xp[i]];
789 for(j=0;j<ns;++j) x[j]=ytsy*vs[j];
790 }
791}
792
793struct yale_mat { uint i,j; double v; };
794
795/* produces CSR matrix from Yale-like format, summing duplicates */
796static void condense_matrix(struct array *mat, uint nr,
797 struct csr_mat *out, buffer *buf)
798{
799 uint k, nz=mat->n;
800 struct yale_mat *p, *q;
801 sarray_sort_2(struct yale_mat,mat->ptr,mat->n, i,0, j,0, buf);
802
803 p = mat->ptr;
804 for(k=0;k+1<nz;++k,++p) if(p[0].i==p[1].i && p[0].j==p[1].j) break;
805 if(++k<nz) {
806 uint i=p->i,j=p->j;
807 q = p+1;
808 for(;k<nz;++k,++q) {
809 if(i==q->i&&j==q->j) p->v += q->v, --mat->n;
810 else ++p, p->i=i=q->i,p->j=j=q->j, p->v=q->v;
811 }
812 }
813
814 nz=mat->n;
815 out->n=nr;
816 out->Arp = tmalloc(uint,nr+1+mat->n);
817 out->Aj = out->Arp+nr+1;
818 out->A = tmalloc(double,mat->n);
819 for(k=0;k<nr;++k) out->Arp[k]=0;
820 for(p=mat->ptr,k=0;k<nz;++k,++p)
821 out->Arp[p->i]++, out->Aj[k]=p->j, out->A[k]=p->v;
822 nz=0; for(k=0;k<=nr;++k) { uint t=out->Arp[k]; out->Arp[k]=nz, nz+=t; }
823}
824
825static void separate_matrix(
826 uint nz, const uint *Ai, const uint *Aj, const double *A,
827 const sint *perm, uint ln, uint sn,
828 struct csr_mat *out_ll, struct csr_mat *out_sl, struct csr_mat *out_ss,
829 buffer *buf
830)
831{
832 uint k,n;
833 struct array mat_ll, mat_sl, mat_ss;
834 struct yale_mat *mll, *msl, *mss;
835 array_init(struct yale_mat,&mat_ll,2*nz), mll=mat_ll.ptr;
836 array_init(struct yale_mat,&mat_sl,2*nz), msl=mat_sl.ptr;
837 array_init(struct yale_mat,&mat_ss,2*nz), mss=mat_ss.ptr;
838 for(k=0;k<nz;++k) {
839 sint i=perm[Ai[k]], j=perm[Aj[k]];
840 if(i<0 || j<0 || A[k]==0) continue;
841 if((uint)i<ln) {
842 if((uint)j<ln)
843 n=mat_ll.n++,mll[n].i=i,mll[n].j=j,mll[n].v=A[k];
844 } else {
845 if((uint)j<ln)
846 n=mat_sl.n++,msl[n].i=i-ln,msl[n].j=j,msl[n].v=A[k];
847 else
848 n=mat_ss.n++,mss[n].i=i-ln,mss[n].j=j-ln,mss[n].v=A[k];
849 }
850 }
851 condense_matrix(&mat_ll,ln,out_ll,buf);
852 condense_matrix(&mat_sl,sn,out_sl,buf);
853 condense_matrix(&mat_ss,sn,out_ss,buf);
854 array_free(&mat_ll);
855 array_free(&mat_sl);
856 array_free(&mat_ss);
857}
858
859struct xxt *crs_setup(
860 uint n, const ulong *id,
861 uint nz, const uint *Ai, const uint *Aj, const double *A,
862 uint null_space, const struct comm *comm)
863{
864 struct xxt *data = tmalloc(struct xxt,1);
865 sint *perm_x2c;
866 struct array dofa;
867 struct csr_mat A_ll, A_ss;
868 buffer buf;
869
870 comm_dup(&data->comm,comm);
871
872 locate_proc(data);
873
874 data->null_space=null_space;
875
876 buffer_init(&buf,1024);
877
878 discover_dofs(data,n,id,&dofa,&buf,&data->comm);
879 discover_sep_sizes(data,&dofa,&buf);
880
881 perm_x2c = discover_sep_ids(data,&dofa,&buf);
882 if(data->null_space) {
883 uint i; double count = 0; struct dof *dof = dofa.ptr;
884 for(i=0;i<data->cn;++i) count+=1/(double)dof[i].count;
885 count=1/sum(data,count,data->xn,0);
886 data->share_weight=tmalloc(double,data->cn);
887 for(i=0;i<data->cn;++i)
888 data->share_weight[i]=count/dof[i].count;
889 }
890 array_free(&dofa);
891
892 if(!data->null_space || data->xn!=0) {
893 separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
894 data->ln,data->sn,
895 &A_ll,&data->A_sl,&A_ss,
896 &buf);
897 } else {
898 separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
899 data->ln-1,1,
900 &A_ll,&data->A_sl,&A_ss,
901 &buf);
902 }
903
904 sparse_cholesky_factor(A_ll.n,A_ll.Arp,A_ll.Aj,A_ll.A,
905 &data->fac_A_ll, &buf);
906 free(A_ll.Arp); free(A_ll.A);
907
908 data->vl = tmalloc(double,data->ln+data->cn+2*data->xn);
909 data->vc = data->vl+data->ln;
910 data->vx = data->vc+data->cn;
911 data->combuf = data->vx+data->xn;
912
913 orthogonalize(data,&A_ss,perm_x2c,&buf);
914 free(A_ss.Arp); free(A_ss.A);
915 free(perm_x2c);
916 buffer_free(&buf);
917
918 return data;
919}
920
921void crs_solve(double *x, struct xxt *data, const double *b)
922{
923 uint cn=data->cn, un=data->un, ln=data->ln, sn=data->sn, xn=data->xn;
924 double *vl=data->vl, *vc=data->vc, *vx=data->vx;
925 uint i;
926 for(i=0;i<cn;++i) vc[i]=0;
927 for(i=0;i<un;++i) {
928 sint p=data->perm_u2c[i];
929 if(p>=0) vc[p]+=b[i];
930 }
931 if(xn>0 && (!data->null_space || xn>1)) {
932 if(data->null_space) --xn;
933 sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
934 apply_m_Asl(vc+ln,sn, data, vc);
935 apply_Xt(vx,xn, data, vc+ln);
936 apply_QQt(data,vx,xn,0);
937 apply_X(vc+ln,sn, data, vx,xn);
938 for(i=0;i<ln;++i) vl[i]=0;
939 apply_p_Als(vl, data, vc+ln,sn);
940 sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
941 for(i=0;i<ln;++i) vc[i]-=vl[i];
942 } else {
943 sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
944 if(data->null_space) {
945 if(xn==0) vc[ln-1]=0;
946 else if(sn==1) vc[ln]=0;
947 }
948 }
949 if(data->null_space) {
950 double s=0;
951 for(i=0;i<cn;++i) s+=data->share_weight[i]*vc[i];
952 s = sum(data,s,data->xn,0);
953 for(i=0;i<cn;++i) vc[i]-=s;
954 }
955 for(i=0;i<un;++i) {
956 sint p=data->perm_u2c[i];
957 x[i] = p>=0 ? vc[p] : 0;
958 }
959}
960
961void crs_stats(struct xxt *data)
962{
963 int a,b; uint xcol;
964 if(data->comm.id==0) {
965 unsigned s;
966 printf("xxt: separator sizes on %d =",(int)data->comm.id);
967 for(s=0;s<data->nsep;++s) printf(" %d",(int)data->sep_size[s]);
968 printf("\n");
969 printf("xxt: shared dofs on %d = %d\n",(int)data->comm.id,(int)data->sn);
970 }
971 a=data->ln;
972 comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
973 if(data->comm.id==0) printf("xxt: max non-shared dofs = %d\n",a);
974 a=data->sn;
975 comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
976 if(data->comm.id==0) printf("xxt: max shared dofs = %d\n",a);
977 xcol=data->xn; if(xcol&&data->null_space) --xcol;
978 a=xcol;
979 comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
980 if(data->comm.id==0) printf("xxt: max X cols = %d\n",a);
981 a=data->Xp[xcol]*sizeof(double);
982 comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
983 if(data->comm.id==0) printf("xxt: max X size = %d bytes\n",a);
984}
985
986void crs_free(struct xxt *data)
987{
988 comm_free(&data->comm);
989 free(data->pother);
990 free(data->req);
991 free(data->sep_size);
992 free(data->perm_u2c);
993 if(data->null_space) free(data->share_weight);
994 sparse_cholesky_free(&data->fac_A_ll);
995 free(data->A_sl.Arp); free(data->A_sl.A);
996 free(data->Xp); free(data->X);
997 free(data->vl);
998 free(data);
999}
Note: See TracBrowser for help on using the repository browser.