| 1 | #include <stddef.h>
|
|---|
| 2 | #include <stdlib.h>
|
|---|
| 3 | #include <stdio.h>
|
|---|
| 4 | #include <limits.h>
|
|---|
| 5 | #include <float.h>
|
|---|
| 6 | #include <string.h>
|
|---|
| 7 | #include <math.h>
|
|---|
| 8 | #include "gslib.h"
|
|---|
| 9 |
|
|---|
| 10 | #define crs_setup PREFIXED_NAME(crs_xxt_setup)
|
|---|
| 11 | #define crs_solve PREFIXED_NAME(crs_xxt_solve)
|
|---|
| 12 | #define crs_stats PREFIXED_NAME(crs_xxt_stats)
|
|---|
| 13 | #define crs_free PREFIXED_NAME(crs_xxt_free )
|
|---|
| 14 |
|
|---|
| 15 | /*
|
|---|
| 16 | portable log base 2
|
|---|
| 17 |
|
|---|
| 18 | does a binary search to find leading order bit
|
|---|
| 19 |
|
|---|
| 20 | UINT_BITS = number of bits in a uint
|
|---|
| 21 | BITS(0) = UINT_BITS
|
|---|
| 22 | BITS(i) = half of BITS(i-1), rounded up
|
|---|
| 23 | MASK(i) = bitmask with BITS(i) 1's followed by BITS(i) 0's
|
|---|
| 24 | */
|
|---|
| 25 |
|
|---|
| 26 | static unsigned lg(uint v)
|
|---|
| 27 | {
|
|---|
| 28 | unsigned r = 0;
|
|---|
| 29 | #define UINT_BITS (sizeof(uint)*CHAR_BIT)
|
|---|
| 30 | #define BITS(i) ((UINT_BITS+(1<<(i))-1)>>(i))
|
|---|
| 31 | #define MASK(i) ((((uint)1<<BITS(i)) - 1) << BITS(i))
|
|---|
| 32 | #define CHECK(i) if((BITS(i)!=1) && (v&MASK(i))) v>>=BITS(i), r+=BITS(i)
|
|---|
| 33 | CHECK(1); CHECK(2); CHECK(3); CHECK(4); CHECK(5); CHECK(6); CHECK(7);
|
|---|
| 34 | CHECK(8); CHECK(9); /* this covers up to 1024-bit uints */
|
|---|
| 35 | if(v&2) ++r;
|
|---|
| 36 | return r;
|
|---|
| 37 | #undef UINT_BITS
|
|---|
| 38 | #undef BITS
|
|---|
| 39 | #undef MASK
|
|---|
| 40 | #undef CHECK
|
|---|
| 41 | }
|
|---|
| 42 |
|
|---|
| 43 | /* factors: L is in CSR format
|
|---|
| 44 | D is a diagonal matrix stored as a vector
|
|---|
| 45 | actual factorization is:
|
|---|
| 46 |
|
|---|
| 47 | -1 T
|
|---|
| 48 | A = (I-L) D (I-L)
|
|---|
| 49 |
|
|---|
| 50 | -1 -T -1
|
|---|
| 51 | A = (I-L) D (I-L)
|
|---|
| 52 |
|
|---|
| 53 | (triangular factor is unit diagonal; the diagonal is not stored)
|
|---|
| 54 | */
|
|---|
| 55 | struct sparse_cholesky {
|
|---|
| 56 | uint n, *Lrp, *Lj;
|
|---|
| 57 | double *L, *D;
|
|---|
| 58 | };
|
|---|
| 59 |
|
|---|
| 60 | /*
|
|---|
| 61 | symbolic factorization: finds the sparsity structure of L
|
|---|
| 62 |
|
|---|
| 63 | uses the concept of elimination tree:
|
|---|
| 64 | the parent of node j is node i when L(i,j) is the first
|
|---|
| 65 | non-zero in column j below the diagonal (i>j)
|
|---|
| 66 | L's structure is discovered row-by-row; the first time
|
|---|
| 67 | an entry in column j is set, it must be the parent
|
|---|
| 68 |
|
|---|
| 69 | the nonzeros in L are the nonzeros in A + paths up the elimination tree
|
|---|
| 70 |
|
|---|
| 71 | linear in the number of nonzeros of L
|
|---|
| 72 | */
|
|---|
| 73 | static void factor_symbolic(uint n, const uint *Arp, const uint *Aj,
|
|---|
| 74 | struct sparse_cholesky *out, buffer *buf)
|
|---|
| 75 | {
|
|---|
| 76 | uint *visit = tmalloc(uint,2*n), *parent = visit+n;
|
|---|
| 77 | uint *Lrp, *Lj;
|
|---|
| 78 | uint i,nz=0;
|
|---|
| 79 |
|
|---|
| 80 | out->n=n;
|
|---|
| 81 |
|
|---|
| 82 | for(i=0;i<n;++i) {
|
|---|
| 83 | uint p=Arp[i], pe=Arp[i+1];
|
|---|
| 84 | visit[i]=i, parent[i]=n;
|
|---|
| 85 | for(;p!=pe;++p) {
|
|---|
| 86 | uint j=Aj[p]; if(j>=i) break;
|
|---|
| 87 | for(;visit[j]!=i;j=parent[j]) {
|
|---|
| 88 | ++nz, visit[j]=i;
|
|---|
| 89 | if(parent[j]==n) { parent[j]=i; break; }
|
|---|
| 90 | }
|
|---|
| 91 | }
|
|---|
| 92 | }
|
|---|
| 93 |
|
|---|
| 94 | Lrp=out->Lrp=tmalloc(uint,n+1+nz);
|
|---|
| 95 | Lj =out->Lj =Lrp+n+1;
|
|---|
| 96 |
|
|---|
| 97 | Lrp[0]=0;
|
|---|
| 98 | for(i=0;i<n;++i) {
|
|---|
| 99 | uint p=Arp[i], pe=Arp[i+1], count=0, *Ljr=&Lj[Lrp[i]];
|
|---|
| 100 | visit[i]=i;
|
|---|
| 101 | for(;p!=pe;++p) {
|
|---|
| 102 | uint j=Aj[p]; if(j>=i) break;
|
|---|
| 103 | for(;visit[j]!=i;j=parent[j]) Ljr[count++]=j, visit[j]=i;
|
|---|
| 104 | }
|
|---|
| 105 | sortv(Ljr, Ljr,count,sizeof(uint), buf);
|
|---|
| 106 | Lrp[i+1]=Lrp[i]+count;
|
|---|
| 107 | }
|
|---|
| 108 | free(visit);
|
|---|
| 109 | }
|
|---|
| 110 |
|
|---|
| 111 | /*
|
|---|
| 112 | numeric factorization:
|
|---|
| 113 |
|
|---|
| 114 | L is built row-by-row, using: ( ' indicates transpose )
|
|---|
| 115 |
|
|---|
| 116 |
|
|---|
| 117 | [ A r ] = [ (I-L) ] [ D^(-1) ] [ (I-L)' -s ]
|
|---|
| 118 | [ r' a ] [ -s' 1 ] [ 1/d ] [ 1 ]
|
|---|
| 119 |
|
|---|
| 120 | = [ A (I-L) D^(-1) (-s) ]
|
|---|
| 121 | [ r' s' D^(-1) s + 1/d ]
|
|---|
| 122 |
|
|---|
| 123 | so, if r' is the next row of A, up to but excluding the diagonal,
|
|---|
| 124 | then the next row of L, s', obeys
|
|---|
| 125 |
|
|---|
| 126 | r = - (I-L) D^(-1) s
|
|---|
| 127 |
|
|---|
| 128 | let y = (I-L)^(-1) (-r)
|
|---|
| 129 | then s = D y, and d = 1/(s' y)
|
|---|
| 130 |
|
|---|
| 131 | */
|
|---|
| 132 | static void factor_numeric(uint n, const uint *Arp, const uint *Aj,
|
|---|
| 133 | const double *A,
|
|---|
| 134 | struct sparse_cholesky *out,
|
|---|
| 135 | uint *visit, double *y)
|
|---|
| 136 | {
|
|---|
| 137 | const uint *Lrp=out->Lrp, *Lj=out->Lj;
|
|---|
| 138 | double *D, *L;
|
|---|
| 139 | uint i;
|
|---|
| 140 |
|
|---|
| 141 | D=out->D=tmalloc(double,n+Lrp[n]);
|
|---|
| 142 | L=out->L=D+n;
|
|---|
| 143 |
|
|---|
| 144 | for(i=0;i<n;++i) {
|
|---|
| 145 | uint p,pe; double a=0;
|
|---|
| 146 | visit[i]=n;
|
|---|
| 147 | for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
|
|---|
| 148 | uint j=Lj[p]; y[j]=0, visit[j]=i;
|
|---|
| 149 | }
|
|---|
| 150 | for(p=Arp[i],pe=Arp[i+1];p!=pe;++p) {
|
|---|
| 151 | uint j=Aj[p];
|
|---|
| 152 | if(j>=i) { if(j==i) a=A[p]; break; }
|
|---|
| 153 | y[j]=-A[p];
|
|---|
| 154 | }
|
|---|
| 155 | for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
|
|---|
| 156 | uint q,qe,j=Lj[p]; double lij,yj=y[j];
|
|---|
| 157 | for(q=Lrp[j],qe=Lrp[j+1];q!=qe;++q) {
|
|---|
| 158 | uint k=Lj[q]; if(visit[k]==i) yj+=L[q]*y[k];
|
|---|
| 159 | }
|
|---|
| 160 | y[j]=yj;
|
|---|
| 161 | L[p]=lij=D[j]*yj;
|
|---|
| 162 | a-=yj*lij;
|
|---|
| 163 | }
|
|---|
| 164 | D[i]=1/a;
|
|---|
| 165 | }
|
|---|
| 166 | }
|
|---|
| 167 |
|
|---|
| 168 | /* x = A^(-1) b; works when x and b alias */
|
|---|
| 169 | void sparse_cholesky_solve(
|
|---|
| 170 | double *x, const struct sparse_cholesky *fac, double *b)
|
|---|
| 171 | {
|
|---|
| 172 | const uint n=fac->n, *Lrp=fac->Lrp, *Lj=fac->Lj;
|
|---|
| 173 | const double *L=fac->L, *D=fac->D;
|
|---|
| 174 | uint i, p,pe;
|
|---|
| 175 | for(i=0;i<n;++i) {
|
|---|
| 176 | double xi = b[i];
|
|---|
| 177 | for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) xi+=L[p]*x[Lj[p]];
|
|---|
| 178 | x[i]=xi;
|
|---|
| 179 | }
|
|---|
| 180 | for(i=0;i<n;++i) x[i]*=D[i];
|
|---|
| 181 | for(i=n;i;) {
|
|---|
| 182 | double xi = x[--i];
|
|---|
| 183 | for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) x[Lj[p]]+=L[p]*xi;
|
|---|
| 184 | }
|
|---|
| 185 | }
|
|---|
| 186 |
|
|---|
| 187 | void sparse_cholesky_factor(uint n, const uint *Arp, const uint *Aj,
|
|---|
| 188 | const double *A,
|
|---|
| 189 | struct sparse_cholesky *out, buffer *buf)
|
|---|
| 190 | {
|
|---|
| 191 | const uint n_uints_as_dbls = (n*sizeof(uint)+sizeof(double)-1)/sizeof(double);
|
|---|
| 192 | buffer_reserve(buf,(n_uints_as_dbls+n)*sizeof(double));
|
|---|
| 193 | factor_symbolic(n,Arp,Aj,out,buf);
|
|---|
| 194 | factor_numeric(n,Arp,Aj,A,out,buf->ptr,n_uints_as_dbls+(double*)buf->ptr);
|
|---|
| 195 | }
|
|---|
| 196 |
|
|---|
| 197 | void sparse_cholesky_free(struct sparse_cholesky *fac)
|
|---|
| 198 | {
|
|---|
| 199 | free(fac->Lrp); fac->Lj=fac->Lrp=0;
|
|---|
| 200 | free(fac->D); fac->L =fac->D =0;
|
|---|
| 201 | }
|
|---|
| 202 |
|
|---|
| 203 | struct csr_mat {
|
|---|
| 204 | uint n, *Arp, *Aj; double *A;
|
|---|
| 205 | };
|
|---|
| 206 |
|
|---|
| 207 | struct xxt {
|
|---|
| 208 |
|
|---|
| 209 | /* communication */
|
|---|
| 210 |
|
|---|
| 211 | struct comm comm;
|
|---|
| 212 | uint pcoord; /* coordinate in communication tree */
|
|---|
| 213 | unsigned plevels; /* # of stages of communication */
|
|---|
| 214 | sint *pother; /* let p = pother[i], then during stage i of fan-in,
|
|---|
| 215 | if p>=0, receive from p
|
|---|
| 216 | if p< 0, send to (-p-1)
|
|---|
| 217 | fan-out is just the reverse ...
|
|---|
| 218 | on proc 0, pother is never negative
|
|---|
| 219 | on others, pother is negative for the last stage only */
|
|---|
| 220 | comm_req *req;
|
|---|
| 221 |
|
|---|
| 222 | /* separators */
|
|---|
| 223 |
|
|---|
| 224 | unsigned nsep; /* number of separators */
|
|---|
| 225 | uint *sep_size; /* # of dofs on each separator,
|
|---|
| 226 | ordered from the bottom to the top of the tree:
|
|---|
| 227 | separator 0 is the bottom-most one (dofs not shared)
|
|---|
| 228 | separator nsep-1 is the root of the tree */
|
|---|
| 229 |
|
|---|
| 230 | unsigned null_space;
|
|---|
| 231 | double *share_weight;
|
|---|
| 232 |
|
|---|
| 233 | /* vector sizes */
|
|---|
| 234 |
|
|---|
| 235 | uint un; /* user's vector size */
|
|---|
| 236 |
|
|---|
| 237 | /* xxt_solve works with "condensed" vectors;
|
|---|
| 238 | same dofs as in user's vectors, but no duplicates and no Dirichlet nodes,
|
|---|
| 239 | and also ordered topologically (children before parents) according to the
|
|---|
| 240 | separator tree */
|
|---|
| 241 |
|
|---|
| 242 | uint cn; /* size of condensed vectors */
|
|---|
| 243 | sint *perm_u2c; /* permutation from user vector to condensed vector,
|
|---|
| 244 | p=perm_u2c[i]; xu[i] = p=-1 ? 0 : xc[p]; */
|
|---|
| 245 | uint ln, sn; /* xc[0 ... ln-1] are not shared (ln=sep_size[0])
|
|---|
| 246 | xc[ln ... ln+sn-1] are shared
|
|---|
| 247 | ln+sn = cn */
|
|---|
| 248 |
|
|---|
| 249 | uint xn; /* # of columns of x = sum_i(sep_size[i]) - sep_size[0] */
|
|---|
| 250 |
|
|---|
| 251 | /* data */
|
|---|
| 252 | struct sparse_cholesky fac_A_ll;
|
|---|
| 253 | struct csr_mat A_sl;
|
|---|
| 254 | uint *Xp; double *X; /* column i of X starts at X[Xp[i]] */
|
|---|
| 255 |
|
|---|
| 256 | /* execution buffers */
|
|---|
| 257 | double *vl, *vc, *vx, *combuf;
|
|---|
| 258 | };
|
|---|
| 259 |
|
|---|
| 260 |
|
|---|
| 261 | /*
|
|---|
| 262 | for the binary communication tree, the procs are divided in half
|
|---|
| 263 | at each level, with the second half always the larger
|
|---|
| 264 |
|
|---|
| 265 | e.g., for np = 13:
|
|---|
| 266 |
|
|---|
| 267 | +------13-------+
|
|---|
| 268 | | |
|
|---|
| 269 | +---6---+ +---7---+
|
|---|
| 270 | | | | |
|
|---|
| 271 | +-3-+ +-3-+ +-3-+ +-4-+
|
|---|
| 272 | 1 2 1 2 1 2 2 2
|
|---|
| 273 | 1^1 1^1 1^1 1^1 1^1
|
|---|
| 274 |
|
|---|
| 275 | plevels is the number of levels in the tree
|
|---|
| 276 | = np==1 ? 1 : ( lg(np-1)+2 )
|
|---|
| 277 |
|
|---|
| 278 | labelling the nodes with proc id's gives this communication tree:
|
|---|
| 279 |
|
|---|
| 280 | +-------0-------+
|
|---|
| 281 | | |
|
|---|
| 282 | +---0---+ +---6---+
|
|---|
| 283 | | | | |
|
|---|
| 284 | +-0-+ +-3-+ +-6-+ +-9-+
|
|---|
| 285 | 0 1 3 4 6 7 9 b
|
|---|
| 286 | 1^2 4^5 7^8 9^a b^c
|
|---|
| 287 |
|
|---|
| 288 | consider proc 7 (pid = 7);
|
|---|
| 289 | pcoord gives the position of the leaf labelled 7:
|
|---|
| 290 | Root Right Left Right Left -> RRLRL -> 11010
|
|---|
| 291 | so pcoord = 11010 binary
|
|---|
| 292 | note the parent coordinate can be found by bit shifting right
|
|---|
| 293 | (i.e. dividing by 2)
|
|---|
| 294 | */
|
|---|
| 295 |
|
|---|
| 296 | /* sets: pcoord, nsep, plevels, pother, req */
|
|---|
| 297 | static void locate_proc(struct xxt *data)
|
|---|
| 298 | {
|
|---|
| 299 | const uint id = data->comm.id;
|
|---|
| 300 | uint n = data->comm.np, c=1, odd=0, base=0;
|
|---|
| 301 | unsigned level=0;
|
|---|
| 302 | while(n>1) {
|
|---|
| 303 | ++level;
|
|---|
| 304 | odd=(odd<<1)|(n&1);
|
|---|
| 305 | c<<=1, n>>=1;
|
|---|
| 306 | if(id>=base+n) c|=1, base+=n, n+=(odd&1);
|
|---|
| 307 | }
|
|---|
| 308 | data->pcoord=c;
|
|---|
| 309 | data->nsep = level+1;
|
|---|
| 310 | data->plevels = data->nsep-1;
|
|---|
| 311 | data->pother = tmalloc(sint,data->plevels);
|
|---|
| 312 | data->req = tmalloc(comm_req,data->plevels);
|
|---|
| 313 | for(level=0;level<data->plevels;++level) {
|
|---|
| 314 | if((c&1)==1) {
|
|---|
| 315 | uint targ = id - (n-(odd&1));
|
|---|
| 316 | data->pother[level]=-(sint)(targ+1);
|
|---|
| 317 | data->plevels = level+1;
|
|---|
| 318 | break;
|
|---|
| 319 | } else {
|
|---|
| 320 | data->pother[level]=id+n;
|
|---|
| 321 | c>>=1, n=(n<<1)+(odd&1), odd>>=1;
|
|---|
| 322 | }
|
|---|
| 323 | }
|
|---|
| 324 | }
|
|---|
| 325 |
|
|---|
| 326 | /* the tuple list describing the condensed dofs:
|
|---|
| 327 | [(separator level, share count, global id)] */
|
|---|
| 328 | struct dof { ulong id; uint level, count; };
|
|---|
| 329 |
|
|---|
| 330 | /* determine the size of each separator;
|
|---|
| 331 | sums the separator sizes following the fan-in, fan-out comm. pattern
|
|---|
| 332 | uses the share-counts to avoid counting dofs more than once */
|
|---|
| 333 | /* sets: xn, sep_size, ln, sn */
|
|---|
| 334 | static void discover_sep_sizes(struct xxt *data,
|
|---|
| 335 | struct array *dofa, buffer *buf)
|
|---|
| 336 | {
|
|---|
| 337 | const unsigned ns=data->nsep, nl=data->plevels;
|
|---|
| 338 | const uint n = dofa->n;
|
|---|
| 339 | float *v, *recv;
|
|---|
| 340 | unsigned i,lvl; uint j;
|
|---|
| 341 | const struct dof *dof = dofa->ptr;
|
|---|
| 342 |
|
|---|
| 343 | buffer_reserve(buf,2*ns*sizeof(float));
|
|---|
| 344 | v=buf->ptr, recv=v+ns;
|
|---|
| 345 |
|
|---|
| 346 | for(i=0;i<ns;++i) v[i]=0;
|
|---|
| 347 | for(j=0;j<n;++j) v[dof[j].level]+=1/(float)dof[j].count;
|
|---|
| 348 |
|
|---|
| 349 | /* fan-in */
|
|---|
| 350 | for(lvl=0;lvl<nl;++lvl) {
|
|---|
| 351 | sint other = data->pother[lvl];
|
|---|
| 352 | unsigned s = ns-(lvl+1);
|
|---|
| 353 | if(other<0) {
|
|---|
| 354 | comm_send(&data->comm,v +lvl+1,s*sizeof(float),-other-1,s);
|
|---|
| 355 | } else {
|
|---|
| 356 | comm_recv(&data->comm,recv+lvl+1,s*sizeof(float),other,s);
|
|---|
| 357 | for(i=lvl+1;i<ns;++i) v[i]+=recv[i];
|
|---|
| 358 | }
|
|---|
| 359 | }
|
|---|
| 360 | /* fan-out */
|
|---|
| 361 | for(;lvl;) {
|
|---|
| 362 | sint other = data->pother[--lvl];
|
|---|
| 363 | unsigned s = ns-(lvl+1);
|
|---|
| 364 | if(other<0)
|
|---|
| 365 | comm_recv(&data->comm,v+lvl+1,s*sizeof(float),-other-1,s);
|
|---|
| 366 | else
|
|---|
| 367 | comm_send(&data->comm,v+lvl+1,s*sizeof(float),other,s);
|
|---|
| 368 | }
|
|---|
| 369 |
|
|---|
| 370 | data->xn=0;
|
|---|
| 371 | data->sep_size = tmalloc(uint,ns);
|
|---|
| 372 | for(i=0;i<ns;++i) {
|
|---|
| 373 | uint s=v[i]+.1f;
|
|---|
| 374 | data->sep_size[i]=s;
|
|---|
| 375 | data->xn+=s;
|
|---|
| 376 | }
|
|---|
| 377 | data->ln=data->sep_size[0];
|
|---|
| 378 | data->sn=data->cn-data->ln;
|
|---|
| 379 | data->xn-=data->ln;
|
|---|
| 380 | }
|
|---|
| 381 |
|
|---|
| 382 | /* assuming [A,Aend) is sorted,
|
|---|
| 383 | removes 0's and any duplicate entries,
|
|---|
| 384 | returns new end */
|
|---|
| 385 | static ulong *unique_nonzero(ulong *A, ulong *Aend)
|
|---|
| 386 | {
|
|---|
| 387 | if(Aend==A) return A;
|
|---|
| 388 | else {
|
|---|
| 389 | ulong *end = Aend-1, last=*end, *p=A,*q=A,v=0;
|
|---|
| 390 | *end = 1;
|
|---|
| 391 | while(*q==0) ++q; /* *q==0 => q!=end since *end==0 */
|
|---|
| 392 | *end = 0;
|
|---|
| 393 | while(q!=end) {
|
|---|
| 394 | v=*q++, *p++=v;
|
|---|
| 395 | while(*q==v) ++q; /* *q==v => q!=end since *end==0 */
|
|---|
| 396 | }
|
|---|
| 397 | if(last!=v) *p++=last;
|
|---|
| 398 | return p;
|
|---|
| 399 | }
|
|---|
| 400 | }
|
|---|
| 401 |
|
|---|
| 402 | static void merge_sep_ids(struct xxt *data, ulong *sep_id, ulong *other,
|
|---|
| 403 | ulong *work, unsigned s0, buffer *buf)
|
|---|
| 404 | {
|
|---|
| 405 | const unsigned ns = data->nsep;
|
|---|
| 406 | unsigned s;
|
|---|
| 407 | ulong *p=sep_id, *q=other;
|
|---|
| 408 | for(s=s0;s<ns;++s) {
|
|---|
| 409 | ulong *end;
|
|---|
| 410 | uint size = data->sep_size[s];
|
|---|
| 411 | memcpy(work ,p,size*sizeof(ulong));
|
|---|
| 412 | memcpy(work+size,q,size*sizeof(ulong));
|
|---|
| 413 | sortv_long(work, work,2*size,sizeof(ulong), buf);
|
|---|
| 414 | end = unique_nonzero(work,work+2*size);
|
|---|
| 415 | memcpy(p,work,(end-work)*sizeof(ulong));
|
|---|
| 416 | p+=size, q+=size;
|
|---|
| 417 | }
|
|---|
| 418 | }
|
|---|
| 419 |
|
|---|
| 420 | static void init_sep_ids(struct xxt *data, struct array *dofa, ulong *xid)
|
|---|
| 421 | {
|
|---|
| 422 | const unsigned ns=data->nsep;
|
|---|
| 423 | const uint n=data->cn, *sep_size=data->sep_size;
|
|---|
| 424 | unsigned s=1;
|
|---|
| 425 | uint i, size;
|
|---|
| 426 | const struct dof *dof = dofa->ptr;
|
|---|
| 427 | if(ns==1) return;
|
|---|
| 428 | size=sep_size[s];
|
|---|
| 429 | for(i=data->ln;i<n;++i) {
|
|---|
| 430 | unsigned si = dof[i].level;
|
|---|
| 431 | while(s!=si) {
|
|---|
| 432 | memset(xid,0,size*sizeof(ulong));
|
|---|
| 433 | xid+=size;
|
|---|
| 434 | if(++s != ns) size=data->sep_size[s];
|
|---|
| 435 | }
|
|---|
| 436 | *xid++ = dof[i].id, --size;
|
|---|
| 437 | }
|
|---|
| 438 | while(s!=ns) {
|
|---|
| 439 | memset(xid,0,size*sizeof(ulong));
|
|---|
| 440 | xid+=size;
|
|---|
| 441 | if(++s != ns) size=data->sep_size[s];
|
|---|
| 442 | }
|
|---|
| 443 | }
|
|---|
| 444 |
|
|---|
| 445 | static void find_perm_x2c(uint ln, uint cn, const struct array *dofc,
|
|---|
| 446 | uint xn, const ulong *xid, sint *perm)
|
|---|
| 447 | {
|
|---|
| 448 | const struct dof *dof = dofc->ptr, *dof_end = dof+cn;
|
|---|
| 449 | const ulong *xid_end = xid+xn; uint i=ln;
|
|---|
| 450 | dof+=ln;
|
|---|
| 451 | while(dof!=dof_end) {
|
|---|
| 452 | ulong v=dof->id;
|
|---|
| 453 | while(*xid!=v) ++xid, *perm++ = -1;
|
|---|
| 454 | *perm++ = i++, ++dof, ++xid;
|
|---|
| 455 | }
|
|---|
| 456 | while(xid!=xid_end) ++xid, *perm++ = -1;
|
|---|
| 457 | }
|
|---|
| 458 |
|
|---|
| 459 | /* sets: perm_x2c */
|
|---|
| 460 | static sint *discover_sep_ids(struct xxt *data, struct array *dofa, buffer *buf)
|
|---|
| 461 | {
|
|---|
| 462 | const unsigned ns=data->nsep, nl=data->plevels;
|
|---|
| 463 | const uint xn=data->xn, *sep_size=data->sep_size;
|
|---|
| 464 | ulong *xid, *recv, *work, *p;
|
|---|
| 465 | unsigned lvl;
|
|---|
| 466 | uint size,ss;
|
|---|
| 467 | sint *perm_x2c;
|
|---|
| 468 |
|
|---|
| 469 | size=0; for(lvl=1;lvl<ns;++lvl) if(sep_size[lvl]>size) size=sep_size[lvl];
|
|---|
| 470 | xid=tmalloc(ulong,2*xn+2*size), recv=xid+xn, work=recv+xn;
|
|---|
| 471 |
|
|---|
| 472 | init_sep_ids(data,dofa,xid);
|
|---|
| 473 |
|
|---|
| 474 | if(nl) {
|
|---|
| 475 | /* fan-in */
|
|---|
| 476 | p=xid, size=xn;
|
|---|
| 477 | for(lvl=0;lvl<nl;++lvl) {
|
|---|
| 478 | sint other = data->pother[lvl];
|
|---|
| 479 | if(other<0) {
|
|---|
| 480 | comm_send(&data->comm,p ,size*sizeof(ulong),-other-1,size);
|
|---|
| 481 | } else {
|
|---|
| 482 | comm_recv(&data->comm,recv,size*sizeof(ulong),other,size);
|
|---|
| 483 | merge_sep_ids(data,p,recv,work,lvl+1,buf);
|
|---|
| 484 | }
|
|---|
| 485 | ss=data->sep_size[lvl+1];
|
|---|
| 486 | if(ss>=size || lvl==nl-1) break;
|
|---|
| 487 | p+=ss, size-=ss;
|
|---|
| 488 | }
|
|---|
| 489 | /* fan-out */
|
|---|
| 490 | for(;;) {
|
|---|
| 491 | sint other = data->pother[lvl];
|
|---|
| 492 | if(other<0)
|
|---|
| 493 | comm_recv(&data->comm,p,size*sizeof(ulong),-other-1,size);
|
|---|
| 494 | else
|
|---|
| 495 | comm_send(&data->comm,p,size*sizeof(ulong),other,size);
|
|---|
| 496 | if(lvl==0) break;
|
|---|
| 497 | ss=data->sep_size[lvl];
|
|---|
| 498 | p-=ss, size+=ss, --lvl;
|
|---|
| 499 | }
|
|---|
| 500 | }
|
|---|
| 501 |
|
|---|
| 502 | perm_x2c=tmalloc(sint,xn);
|
|---|
| 503 | find_perm_x2c(data->ln,data->cn,dofa, xn,xid, perm_x2c);
|
|---|
| 504 | free(xid);
|
|---|
| 505 |
|
|---|
| 506 | return perm_x2c;
|
|---|
| 507 | }
|
|---|
| 508 |
|
|---|
| 509 | static void apply_QQt(struct xxt *data, double *v, uint n, uint tag)
|
|---|
| 510 | {
|
|---|
| 511 | const unsigned nl=data->plevels;
|
|---|
| 512 | double *p=v, *recv=data->combuf;
|
|---|
| 513 | unsigned lvl, nsend=0;
|
|---|
| 514 | uint size=n, ss;
|
|---|
| 515 |
|
|---|
| 516 | if(n==0 || nl==0) return;
|
|---|
| 517 |
|
|---|
| 518 | tag=tag*2+0;
|
|---|
| 519 | /* fan-in */
|
|---|
| 520 | for(lvl=0;lvl<nl;++lvl) {
|
|---|
| 521 | sint other = data->pother[lvl];
|
|---|
| 522 | if(other<0) {
|
|---|
| 523 | comm_send(&data->comm,p ,size*sizeof(double),-other-1,tag);
|
|---|
| 524 | } else {
|
|---|
| 525 | uint i;
|
|---|
| 526 | comm_recv(&data->comm,recv,size*sizeof(double),other ,tag);
|
|---|
| 527 | for(i=0;i<size;++i) p[i]+=recv[i];
|
|---|
| 528 | }
|
|---|
| 529 | ss=data->sep_size[lvl+1];
|
|---|
| 530 | if(ss>=size || lvl==nl-1) break;
|
|---|
| 531 | p+=ss, size-=ss;
|
|---|
| 532 | }
|
|---|
| 533 | /* fan-out */
|
|---|
| 534 | for(;;) {
|
|---|
| 535 | sint other = data->pother[lvl];
|
|---|
| 536 | if(other<0) {
|
|---|
| 537 | comm_recv (&data->comm,p,size*sizeof(double),-other-1,tag);
|
|---|
| 538 | } else {
|
|---|
| 539 | comm_isend(&data->req[nsend++],&data->comm,
|
|---|
| 540 | p,size*sizeof(double),other ,tag);
|
|---|
| 541 | }
|
|---|
| 542 | if(lvl==0) break;
|
|---|
| 543 | ss=data->sep_size[lvl];
|
|---|
| 544 | p-=ss, size+=ss, --lvl;
|
|---|
| 545 | }
|
|---|
| 546 | if(nsend) comm_wait(data->req,nsend);
|
|---|
| 547 | }
|
|---|
| 548 |
|
|---|
| 549 | static double sum(struct xxt *data, double v, uint n, uint tag)
|
|---|
| 550 | {
|
|---|
| 551 | const unsigned nl=data->plevels;
|
|---|
| 552 | double r;
|
|---|
| 553 | unsigned lvl,nsend=0;
|
|---|
| 554 | uint size=n, ss;
|
|---|
| 555 |
|
|---|
| 556 | tag=tag*2+1;
|
|---|
| 557 | if(n==0 || nl==0) return v;
|
|---|
| 558 | /* fan-in */
|
|---|
| 559 | for(lvl=0;lvl<nl;++lvl) {
|
|---|
| 560 | sint other = data->pother[lvl];
|
|---|
| 561 | if(other<0) {
|
|---|
| 562 | comm_send(&data->comm,&v,sizeof(double),-other-1,tag);
|
|---|
| 563 | } else {
|
|---|
| 564 | comm_recv(&data->comm,&r,sizeof(double),other ,tag);
|
|---|
| 565 | v+=r;
|
|---|
| 566 | }
|
|---|
| 567 | ss=data->sep_size[lvl+1];
|
|---|
| 568 | if(ss>=size || lvl==nl-1) break;
|
|---|
| 569 | size-=ss;
|
|---|
| 570 | }
|
|---|
| 571 | /* fan-out */
|
|---|
| 572 | for(;;) {
|
|---|
| 573 | sint other = data->pother[lvl];
|
|---|
| 574 | if(other<0) {
|
|---|
| 575 | comm_recv (&data->comm,&v,sizeof(double),-other-1,tag);
|
|---|
| 576 | } else {
|
|---|
| 577 | comm_isend(&data->req[nsend++],&data->comm,
|
|---|
| 578 | &v,sizeof(double),other ,tag);
|
|---|
| 579 | }
|
|---|
| 580 | if(lvl==0) break;
|
|---|
| 581 | ss=data->sep_size[lvl];
|
|---|
| 582 | size+=ss, --lvl;
|
|---|
| 583 | }
|
|---|
| 584 | if(nsend) comm_wait(data->req,nsend);
|
|---|
| 585 | return v;
|
|---|
| 586 | }
|
|---|
| 587 |
|
|---|
| 588 | /* sorts an array of ids, removes 0's and duplicates;
|
|---|
| 589 | just returns the permutation */
|
|---|
| 590 | static uint unique_ids(uint n, const ulong *id, sint *perm, buffer *buf)
|
|---|
| 591 | {
|
|---|
| 592 | uint *p, i, un=0; ulong last=0;
|
|---|
| 593 | p = sortp_long(buf,0, id,n,sizeof(ulong));
|
|---|
| 594 | for(i=0;i<n;++i) {
|
|---|
| 595 | uint j = p[i]; ulong v = id[j];
|
|---|
| 596 | if(v==0) perm[j]=-1;
|
|---|
| 597 | else {
|
|---|
| 598 | if(v!=last) last=v, ++un;
|
|---|
| 599 | perm[j]=un-1;
|
|---|
| 600 | }
|
|---|
| 601 | }
|
|---|
| 602 | buf->n=0;
|
|---|
| 603 | return un;
|
|---|
| 604 | }
|
|---|
| 605 |
|
|---|
| 606 | /* given user's list of dofs (as id's)
|
|---|
| 607 | uses gather-scatter to find share-count and separator # for each
|
|---|
| 608 | outputs as a list, sorted topologically (children before parents)
|
|---|
| 609 | according to the sep. tree (and without duplicates),
|
|---|
| 610 | as well as the permutation to get there from the user's list */
|
|---|
| 611 | /* sets: un, cn, perm_u2c */
|
|---|
| 612 | static void discover_dofs(
|
|---|
| 613 | struct xxt *data, uint n, const ulong *id,
|
|---|
| 614 | struct array *dofa, buffer *buf, const struct comm *comm)
|
|---|
| 615 | {
|
|---|
| 616 | const uint pcoord = data->pcoord, ns=data->nsep;
|
|---|
| 617 | sint *perm;
|
|---|
| 618 | uint i, cn, *p, *pi;
|
|---|
| 619 | ulong *bid;
|
|---|
| 620 | struct gs_data *gsh; sint *v;
|
|---|
| 621 | struct dof *dof;
|
|---|
| 622 |
|
|---|
| 623 | data->un = n;
|
|---|
| 624 | data->perm_u2c = perm = tmalloc(sint,n);
|
|---|
| 625 | data->cn = cn = unique_ids(n,id,perm,buf);
|
|---|
| 626 | array_init(struct dof,dofa,cn), dofa->n=cn, dof=dofa->ptr;
|
|---|
| 627 | buffer_reserve(buf,cn*sizeof(ulong)), bid=buf->ptr;
|
|---|
| 628 | for(i=0;i<n;++i) if(perm[i]>=0) bid[perm[i]]=dof[perm[i]].id=id[i];
|
|---|
| 629 |
|
|---|
| 630 | gsh = gs_setup((const slong*)bid,cn,comm,0,gs_crystal_router,0);
|
|---|
| 631 | v = tmalloc(sint,cn);
|
|---|
| 632 |
|
|---|
| 633 | for(i=0;i<cn;++i) v[i]=pcoord;
|
|---|
| 634 | gs(v,gs_sint,gs_bpr,0,gsh,buf);
|
|---|
| 635 | for(i=0;i<cn;++i) dof[i].level=ns-1-lg((uint)v[i]);
|
|---|
| 636 |
|
|---|
| 637 | for(i=0;i<cn;++i) v[i]=1;
|
|---|
| 638 | gs(v,gs_sint,gs_add,0,gsh,buf);
|
|---|
| 639 | for(i=0;i<cn;++i) dof[i].count=v[i];
|
|---|
| 640 |
|
|---|
| 641 | free(v);
|
|---|
| 642 | gs_free(gsh);
|
|---|
| 643 |
|
|---|
| 644 | if(!cn) return;
|
|---|
| 645 | buffer_reserve(buf,2*cn*sizeof(uint));
|
|---|
| 646 | p = sortp(buf,0, &dof[0].level,cn,sizeof(struct dof));
|
|---|
| 647 | pi = p+cn; for(i=0;i<cn;++i) pi[p[i]]=i;
|
|---|
| 648 | for(i=0;i<n;++i) if(perm[i]>=0) perm[i]=pi[perm[i]];
|
|---|
| 649 | sarray_permute_buf(struct dof,dof,cn, buf);
|
|---|
| 650 | }
|
|---|
| 651 |
|
|---|
| 652 | /* vl += A_ls * vs */
|
|---|
| 653 | static void apply_p_Als(double *vl, struct xxt *data, const double *vs, uint ns)
|
|---|
| 654 | {
|
|---|
| 655 | const uint *Arp = data->A_sl.Arp,
|
|---|
| 656 | *Aj = data->A_sl.Aj;
|
|---|
| 657 | const double *A = data->A_sl.A;
|
|---|
| 658 | uint i,p,pe;
|
|---|
| 659 | for(i=0;i<ns;++i)
|
|---|
| 660 | for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
|
|---|
| 661 | vl[Aj[p]]+=A[p]*vs[i];
|
|---|
| 662 | }
|
|---|
| 663 |
|
|---|
| 664 | /* vs -= A_sl * vl */
|
|---|
| 665 | static void apply_m_Asl(double *vs, uint ns, struct xxt *data, const double *vl)
|
|---|
| 666 | {
|
|---|
| 667 | const uint *Arp = data->A_sl.Arp,
|
|---|
| 668 | *Aj = data->A_sl.Aj;
|
|---|
| 669 | const double *A = data->A_sl.A;
|
|---|
| 670 | uint i,p,pe;
|
|---|
| 671 | for(i=0;i<ns;++i)
|
|---|
| 672 | for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
|
|---|
| 673 | vs[i]-=A[p]*vl[Aj[p]];
|
|---|
| 674 | }
|
|---|
| 675 |
|
|---|
| 676 | /* returns a column of S : vs = -S(0:ei-1,ei) */
|
|---|
| 677 | static void apply_S_col(double *vs, struct xxt *data,
|
|---|
| 678 | struct csr_mat *A_ss, uint ei, double *vl)
|
|---|
| 679 | {
|
|---|
| 680 | const uint ln=data->ln;
|
|---|
| 681 | const uint *Asl_rp = data->A_sl.Arp, *Ass_rp = A_ss->Arp,
|
|---|
| 682 | *Asl_j = data->A_sl.Aj, *Ass_j = A_ss->Aj;
|
|---|
| 683 | const double *Asl = data->A_sl.A, *Ass = A_ss->A;
|
|---|
| 684 | uint i,p,pe;
|
|---|
| 685 | for(i=0;i<ei;++i) vs[i]=0;
|
|---|
| 686 | for(p=Ass_rp[ei],pe=Ass_rp[ei+1];p!=pe;++p) {
|
|---|
| 687 | uint j=Ass_j[p];
|
|---|
| 688 | if(j>=ei) break;
|
|---|
| 689 | vs[j]=-Ass[p];
|
|---|
| 690 | }
|
|---|
| 691 | for(i=0;i<ln;++i) vl[i]=0;
|
|---|
| 692 | for(p=Asl_rp[ei],pe=Asl_rp[ei+1];p!=pe;++p) vl[Asl_j[p]]=-Asl[p];
|
|---|
| 693 | sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
|
|---|
| 694 | apply_m_Asl(vs,ei,data,vl);
|
|---|
| 695 | }
|
|---|
| 696 |
|
|---|
| 697 | static void apply_S(double *Svs, uint ns, struct xxt *data,
|
|---|
| 698 | struct csr_mat *A_ss, const double *vs, double *vl)
|
|---|
| 699 | {
|
|---|
| 700 | const uint ln=data->ln;
|
|---|
| 701 | const uint *Ass_rp = A_ss->Arp,
|
|---|
| 702 | *Ass_j = A_ss->Aj;
|
|---|
| 703 | const double *Ass = A_ss->A;
|
|---|
| 704 | uint i, p,pe;
|
|---|
| 705 | for(i=0;i<ns;++i) {
|
|---|
| 706 | double sum=0;
|
|---|
| 707 | for(p=Ass_rp[i],pe=Ass_rp[i+1];p!=pe;++p) {
|
|---|
| 708 | uint j=Ass_j[p];
|
|---|
| 709 | if(j>=ns) break;
|
|---|
| 710 | sum+=Ass[p]*vs[j];
|
|---|
| 711 | }
|
|---|
| 712 | Svs[i]=sum;
|
|---|
| 713 | }
|
|---|
| 714 | for(i=0;i<ln;++i) vl[i]=0;
|
|---|
| 715 | apply_p_Als(vl,data,vs,ns);
|
|---|
| 716 | sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
|
|---|
| 717 | apply_m_Asl(Svs,ns,data,vl);
|
|---|
| 718 | }
|
|---|
| 719 |
|
|---|
| 720 | /* vx = X' * vs */
|
|---|
| 721 | static void apply_Xt(double *vx, uint nx, const struct xxt *data,
|
|---|
| 722 | const double *vs)
|
|---|
| 723 | {
|
|---|
| 724 | const double *X = data->X; const uint *Xp = data->Xp;
|
|---|
| 725 | uint i; for(i=0;i<nx;++i) vx[i]=tensor_dot(vs,X+Xp[i],Xp[i+1]-Xp[i]);
|
|---|
| 726 | }
|
|---|
| 727 |
|
|---|
| 728 | /* vs = X * vx */
|
|---|
| 729 | static void apply_X(double *vs, uint ns, const struct xxt *data,
|
|---|
| 730 | const double *vx, uint nx)
|
|---|
| 731 | {
|
|---|
| 732 | const double *X = data->X; const uint *Xp = data->Xp;
|
|---|
| 733 | uint i,j;
|
|---|
| 734 | for(i=0;i<ns;++i) vs[i]=0;
|
|---|
| 735 | for(i=0;i<nx;++i) {
|
|---|
| 736 | const double v = vx[i];
|
|---|
| 737 | const double *x = X+Xp[i]; uint n=Xp[i+1]-Xp[i];
|
|---|
| 738 | for(j=0;j<n;++j) vs[j]+=x[j]*v;
|
|---|
| 739 | }
|
|---|
| 740 | }
|
|---|
| 741 |
|
|---|
| 742 | static void allocate_X(struct xxt *data, sint *perm_x2c)
|
|---|
| 743 | {
|
|---|
| 744 | uint xn=data->xn;
|
|---|
| 745 | uint i,h=0;
|
|---|
| 746 | if(data->null_space && xn) --xn;
|
|---|
| 747 | data->Xp = tmalloc(uint,xn+1);
|
|---|
| 748 | data->Xp[0]=0;
|
|---|
| 749 | for(i=0;i<xn;++i) {
|
|---|
| 750 | if(perm_x2c[i]!=-1) ++h;
|
|---|
| 751 | data->Xp[i+1]=data->Xp[i]+h;
|
|---|
| 752 | }
|
|---|
| 753 | data->X = tmalloc(double,data->Xp[xn]);
|
|---|
| 754 | }
|
|---|
| 755 |
|
|---|
| 756 | static void orthogonalize(struct xxt *data, struct csr_mat *A_ss,
|
|---|
| 757 | sint *perm_x2c, buffer *buf)
|
|---|
| 758 | {
|
|---|
| 759 | uint ln=data->ln, sn=data->sn, xn=data->xn;
|
|---|
| 760 | double *vl, *vs, *vx, *Svs;
|
|---|
| 761 | uint i,j;
|
|---|
| 762 |
|
|---|
| 763 | allocate_X(data,perm_x2c);
|
|---|
| 764 |
|
|---|
| 765 | buffer_reserve(buf,(ln+2*sn+xn)*sizeof(double));
|
|---|
| 766 | vl=buf->ptr, vs=vl+ln, Svs=vs+sn, vx=Svs+sn;
|
|---|
| 767 |
|
|---|
| 768 | if(data->null_space && xn) --xn;
|
|---|
| 769 | for(i=0;i<xn;++i) {
|
|---|
| 770 | uint ns=data->Xp[i+1]-data->Xp[i];
|
|---|
| 771 | sint ui = perm_x2c[i];
|
|---|
| 772 | double ytsy, *x;
|
|---|
| 773 |
|
|---|
| 774 | if(ui == -1) {
|
|---|
| 775 | for(j=0;j<i;++j) vx[j]=0;
|
|---|
| 776 | } else {
|
|---|
| 777 | ui-=ln;
|
|---|
| 778 | apply_S_col(vs, data,A_ss, ui, vl);
|
|---|
| 779 | apply_Xt(vx,i, data, vs);
|
|---|
| 780 | }
|
|---|
| 781 | apply_QQt(data,vx,i,xn-i);
|
|---|
| 782 | apply_X(vs,ns, data, vx,i);
|
|---|
| 783 | if(ui!=-1) vs[ui]=1;
|
|---|
| 784 | apply_S(Svs,ns, data,A_ss, vs, vl);
|
|---|
| 785 | ytsy = tensor_dot(vs,Svs,ns);
|
|---|
| 786 | ytsy = sum(data,ytsy,i+1,xn-(i+1));
|
|---|
| 787 | if(ytsy<DBL_EPSILON/128) ytsy=0; else ytsy = 1/sqrt(ytsy);
|
|---|
| 788 | x=&data->X[data->Xp[i]];
|
|---|
| 789 | for(j=0;j<ns;++j) x[j]=ytsy*vs[j];
|
|---|
| 790 | }
|
|---|
| 791 | }
|
|---|
| 792 |
|
|---|
| 793 | struct yale_mat { uint i,j; double v; };
|
|---|
| 794 |
|
|---|
| 795 | /* produces CSR matrix from Yale-like format, summing duplicates */
|
|---|
| 796 | static void condense_matrix(struct array *mat, uint nr,
|
|---|
| 797 | struct csr_mat *out, buffer *buf)
|
|---|
| 798 | {
|
|---|
| 799 | uint k, nz=mat->n;
|
|---|
| 800 | struct yale_mat *p, *q;
|
|---|
| 801 | sarray_sort_2(struct yale_mat,mat->ptr,mat->n, i,0, j,0, buf);
|
|---|
| 802 |
|
|---|
| 803 | p = mat->ptr;
|
|---|
| 804 | for(k=0;k+1<nz;++k,++p) if(p[0].i==p[1].i && p[0].j==p[1].j) break;
|
|---|
| 805 | if(++k<nz) {
|
|---|
| 806 | uint i=p->i,j=p->j;
|
|---|
| 807 | q = p+1;
|
|---|
| 808 | for(;k<nz;++k,++q) {
|
|---|
| 809 | if(i==q->i&&j==q->j) p->v += q->v, --mat->n;
|
|---|
| 810 | else ++p, p->i=i=q->i,p->j=j=q->j, p->v=q->v;
|
|---|
| 811 | }
|
|---|
| 812 | }
|
|---|
| 813 |
|
|---|
| 814 | nz=mat->n;
|
|---|
| 815 | out->n=nr;
|
|---|
| 816 | out->Arp = tmalloc(uint,nr+1+mat->n);
|
|---|
| 817 | out->Aj = out->Arp+nr+1;
|
|---|
| 818 | out->A = tmalloc(double,mat->n);
|
|---|
| 819 | for(k=0;k<nr;++k) out->Arp[k]=0;
|
|---|
| 820 | for(p=mat->ptr,k=0;k<nz;++k,++p)
|
|---|
| 821 | out->Arp[p->i]++, out->Aj[k]=p->j, out->A[k]=p->v;
|
|---|
| 822 | nz=0; for(k=0;k<=nr;++k) { uint t=out->Arp[k]; out->Arp[k]=nz, nz+=t; }
|
|---|
| 823 | }
|
|---|
| 824 |
|
|---|
| 825 | static void separate_matrix(
|
|---|
| 826 | uint nz, const uint *Ai, const uint *Aj, const double *A,
|
|---|
| 827 | const sint *perm, uint ln, uint sn,
|
|---|
| 828 | struct csr_mat *out_ll, struct csr_mat *out_sl, struct csr_mat *out_ss,
|
|---|
| 829 | buffer *buf
|
|---|
| 830 | )
|
|---|
| 831 | {
|
|---|
| 832 | uint k,n;
|
|---|
| 833 | struct array mat_ll, mat_sl, mat_ss;
|
|---|
| 834 | struct yale_mat *mll, *msl, *mss;
|
|---|
| 835 | array_init(struct yale_mat,&mat_ll,2*nz), mll=mat_ll.ptr;
|
|---|
| 836 | array_init(struct yale_mat,&mat_sl,2*nz), msl=mat_sl.ptr;
|
|---|
| 837 | array_init(struct yale_mat,&mat_ss,2*nz), mss=mat_ss.ptr;
|
|---|
| 838 | for(k=0;k<nz;++k) {
|
|---|
| 839 | sint i=perm[Ai[k]], j=perm[Aj[k]];
|
|---|
| 840 | if(i<0 || j<0 || A[k]==0) continue;
|
|---|
| 841 | if((uint)i<ln) {
|
|---|
| 842 | if((uint)j<ln)
|
|---|
| 843 | n=mat_ll.n++,mll[n].i=i,mll[n].j=j,mll[n].v=A[k];
|
|---|
| 844 | } else {
|
|---|
| 845 | if((uint)j<ln)
|
|---|
| 846 | n=mat_sl.n++,msl[n].i=i-ln,msl[n].j=j,msl[n].v=A[k];
|
|---|
| 847 | else
|
|---|
| 848 | n=mat_ss.n++,mss[n].i=i-ln,mss[n].j=j-ln,mss[n].v=A[k];
|
|---|
| 849 | }
|
|---|
| 850 | }
|
|---|
| 851 | condense_matrix(&mat_ll,ln,out_ll,buf);
|
|---|
| 852 | condense_matrix(&mat_sl,sn,out_sl,buf);
|
|---|
| 853 | condense_matrix(&mat_ss,sn,out_ss,buf);
|
|---|
| 854 | array_free(&mat_ll);
|
|---|
| 855 | array_free(&mat_sl);
|
|---|
| 856 | array_free(&mat_ss);
|
|---|
| 857 | }
|
|---|
| 858 |
|
|---|
| 859 | struct xxt *crs_setup(
|
|---|
| 860 | uint n, const ulong *id,
|
|---|
| 861 | uint nz, const uint *Ai, const uint *Aj, const double *A,
|
|---|
| 862 | uint null_space, const struct comm *comm)
|
|---|
| 863 | {
|
|---|
| 864 | struct xxt *data = tmalloc(struct xxt,1);
|
|---|
| 865 | sint *perm_x2c;
|
|---|
| 866 | struct array dofa;
|
|---|
| 867 | struct csr_mat A_ll, A_ss;
|
|---|
| 868 | buffer buf;
|
|---|
| 869 |
|
|---|
| 870 | comm_dup(&data->comm,comm);
|
|---|
| 871 |
|
|---|
| 872 | locate_proc(data);
|
|---|
| 873 |
|
|---|
| 874 | data->null_space=null_space;
|
|---|
| 875 |
|
|---|
| 876 | buffer_init(&buf,1024);
|
|---|
| 877 |
|
|---|
| 878 | discover_dofs(data,n,id,&dofa,&buf,&data->comm);
|
|---|
| 879 | discover_sep_sizes(data,&dofa,&buf);
|
|---|
| 880 |
|
|---|
| 881 | perm_x2c = discover_sep_ids(data,&dofa,&buf);
|
|---|
| 882 | if(data->null_space) {
|
|---|
| 883 | uint i; double count = 0; struct dof *dof = dofa.ptr;
|
|---|
| 884 | for(i=0;i<data->cn;++i) count+=1/(double)dof[i].count;
|
|---|
| 885 | count=1/sum(data,count,data->xn,0);
|
|---|
| 886 | data->share_weight=tmalloc(double,data->cn);
|
|---|
| 887 | for(i=0;i<data->cn;++i)
|
|---|
| 888 | data->share_weight[i]=count/dof[i].count;
|
|---|
| 889 | }
|
|---|
| 890 | array_free(&dofa);
|
|---|
| 891 |
|
|---|
| 892 | if(!data->null_space || data->xn!=0) {
|
|---|
| 893 | separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
|
|---|
| 894 | data->ln,data->sn,
|
|---|
| 895 | &A_ll,&data->A_sl,&A_ss,
|
|---|
| 896 | &buf);
|
|---|
| 897 | } else {
|
|---|
| 898 | separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
|
|---|
| 899 | data->ln-1,1,
|
|---|
| 900 | &A_ll,&data->A_sl,&A_ss,
|
|---|
| 901 | &buf);
|
|---|
| 902 | }
|
|---|
| 903 |
|
|---|
| 904 | sparse_cholesky_factor(A_ll.n,A_ll.Arp,A_ll.Aj,A_ll.A,
|
|---|
| 905 | &data->fac_A_ll, &buf);
|
|---|
| 906 | free(A_ll.Arp); free(A_ll.A);
|
|---|
| 907 |
|
|---|
| 908 | data->vl = tmalloc(double,data->ln+data->cn+2*data->xn);
|
|---|
| 909 | data->vc = data->vl+data->ln;
|
|---|
| 910 | data->vx = data->vc+data->cn;
|
|---|
| 911 | data->combuf = data->vx+data->xn;
|
|---|
| 912 |
|
|---|
| 913 | orthogonalize(data,&A_ss,perm_x2c,&buf);
|
|---|
| 914 | free(A_ss.Arp); free(A_ss.A);
|
|---|
| 915 | free(perm_x2c);
|
|---|
| 916 | buffer_free(&buf);
|
|---|
| 917 |
|
|---|
| 918 | return data;
|
|---|
| 919 | }
|
|---|
| 920 |
|
|---|
| 921 | void crs_solve(double *x, struct xxt *data, const double *b)
|
|---|
| 922 | {
|
|---|
| 923 | uint cn=data->cn, un=data->un, ln=data->ln, sn=data->sn, xn=data->xn;
|
|---|
| 924 | double *vl=data->vl, *vc=data->vc, *vx=data->vx;
|
|---|
| 925 | uint i;
|
|---|
| 926 | for(i=0;i<cn;++i) vc[i]=0;
|
|---|
| 927 | for(i=0;i<un;++i) {
|
|---|
| 928 | sint p=data->perm_u2c[i];
|
|---|
| 929 | if(p>=0) vc[p]+=b[i];
|
|---|
| 930 | }
|
|---|
| 931 | if(xn>0 && (!data->null_space || xn>1)) {
|
|---|
| 932 | if(data->null_space) --xn;
|
|---|
| 933 | sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
|
|---|
| 934 | apply_m_Asl(vc+ln,sn, data, vc);
|
|---|
| 935 | apply_Xt(vx,xn, data, vc+ln);
|
|---|
| 936 | apply_QQt(data,vx,xn,0);
|
|---|
| 937 | apply_X(vc+ln,sn, data, vx,xn);
|
|---|
| 938 | for(i=0;i<ln;++i) vl[i]=0;
|
|---|
| 939 | apply_p_Als(vl, data, vc+ln,sn);
|
|---|
| 940 | sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
|
|---|
| 941 | for(i=0;i<ln;++i) vc[i]-=vl[i];
|
|---|
| 942 | } else {
|
|---|
| 943 | sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
|
|---|
| 944 | if(data->null_space) {
|
|---|
| 945 | if(xn==0) vc[ln-1]=0;
|
|---|
| 946 | else if(sn==1) vc[ln]=0;
|
|---|
| 947 | }
|
|---|
| 948 | }
|
|---|
| 949 | if(data->null_space) {
|
|---|
| 950 | double s=0;
|
|---|
| 951 | for(i=0;i<cn;++i) s+=data->share_weight[i]*vc[i];
|
|---|
| 952 | s = sum(data,s,data->xn,0);
|
|---|
| 953 | for(i=0;i<cn;++i) vc[i]-=s;
|
|---|
| 954 | }
|
|---|
| 955 | for(i=0;i<un;++i) {
|
|---|
| 956 | sint p=data->perm_u2c[i];
|
|---|
| 957 | x[i] = p>=0 ? vc[p] : 0;
|
|---|
| 958 | }
|
|---|
| 959 | }
|
|---|
| 960 |
|
|---|
| 961 | void crs_stats(struct xxt *data)
|
|---|
| 962 | {
|
|---|
| 963 | int a,b; uint xcol;
|
|---|
| 964 | if(data->comm.id==0) {
|
|---|
| 965 | unsigned s;
|
|---|
| 966 | printf("xxt: separator sizes on %d =",(int)data->comm.id);
|
|---|
| 967 | for(s=0;s<data->nsep;++s) printf(" %d",(int)data->sep_size[s]);
|
|---|
| 968 | printf("\n");
|
|---|
| 969 | printf("xxt: shared dofs on %d = %d\n",(int)data->comm.id,(int)data->sn);
|
|---|
| 970 | }
|
|---|
| 971 | a=data->ln;
|
|---|
| 972 | comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
|
|---|
| 973 | if(data->comm.id==0) printf("xxt: max non-shared dofs = %d\n",a);
|
|---|
| 974 | a=data->sn;
|
|---|
| 975 | comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
|
|---|
| 976 | if(data->comm.id==0) printf("xxt: max shared dofs = %d\n",a);
|
|---|
| 977 | xcol=data->xn; if(xcol&&data->null_space) --xcol;
|
|---|
| 978 | a=xcol;
|
|---|
| 979 | comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
|
|---|
| 980 | if(data->comm.id==0) printf("xxt: max X cols = %d\n",a);
|
|---|
| 981 | a=data->Xp[xcol]*sizeof(double);
|
|---|
| 982 | comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
|
|---|
| 983 | if(data->comm.id==0) printf("xxt: max X size = %d bytes\n",a);
|
|---|
| 984 | }
|
|---|
| 985 |
|
|---|
| 986 | void crs_free(struct xxt *data)
|
|---|
| 987 | {
|
|---|
| 988 | comm_free(&data->comm);
|
|---|
| 989 | free(data->pother);
|
|---|
| 990 | free(data->req);
|
|---|
| 991 | free(data->sep_size);
|
|---|
| 992 | free(data->perm_u2c);
|
|---|
| 993 | if(data->null_space) free(data->share_weight);
|
|---|
| 994 | sparse_cholesky_free(&data->fac_A_ll);
|
|---|
| 995 | free(data->A_sl.Arp); free(data->A_sl.A);
|
|---|
| 996 | free(data->Xp); free(data->X);
|
|---|
| 997 | free(data->vl);
|
|---|
| 998 | free(data);
|
|---|
| 999 | }
|
|---|