/*BHEADER********************************************************************** * Copyright (c) 2008, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * This file is part of HYPRE. See file COPYRIGHT for details. * * HYPRE is free software; you can redistribute it and/or modify it under the * terms of the GNU Lesser General Public License (as published by the Free * Software Foundation) version 2.1 dated February 1999. * * $Revision: 2.4 $ ***********************************************************************EHEADER*/ #include "headers.h" /*--------------------------------------------------------------------------- * hypre_BoomerAMGBuildInterp *--------------------------------------------------------------------------*/ int hypre_BoomerAMGBuildInterp( hypre_ParCSRMatrix *A, int *CF_marker, hypre_ParCSRMatrix *S, HYPRE_BigInt *num_cpts_global, int num_functions, int *dof_func, int debug_flag, double trunc_factor, int max_elmts, int *col_offd_S_to_A, hypre_ParCSRMatrix **P_ptr) { MPI_Comm comm = hypre_ParCSRMatrixComm(A); hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A); hypre_ParCSRCommHandle *comm_handle; hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); double *A_diag_data = hypre_CSRMatrixData(A_diag); int *A_diag_i = hypre_CSRMatrixI(A_diag); int *A_diag_j = hypre_CSRMatrixJ(A_diag); hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); double *A_offd_data = hypre_CSRMatrixData(A_offd); int *A_offd_i = hypre_CSRMatrixI(A_offd); int *A_offd_j = hypre_CSRMatrixJ(A_offd); int num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd); hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S); int *S_diag_i = hypre_CSRMatrixI(S_diag); int *S_diag_j = hypre_CSRMatrixJ(S_diag); hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S); int *S_offd_i = hypre_CSRMatrixI(S_offd); int *S_offd_j = hypre_CSRMatrixJ(S_offd); hypre_ParCSRMatrix *P; HYPRE_BigInt *col_map_offd_P = NULL; int *tmp_map_offd = NULL; int *CF_marker_offd = NULL; int *dof_func_offd = NULL; hypre_CSRMatrix *A_ext; double *A_ext_data; int *A_ext_i; int *A_ext_j; hypre_CSRMatrix *P_diag; hypre_CSRMatrix *P_offd; double *P_diag_data = NULL; int *P_diag_i; int *P_diag_j = NULL; double *P_offd_data = NULL; int *P_offd_i; int *P_offd_j = NULL; int P_diag_size, P_offd_size; int *P_marker = NULL; int *P_marker_offd = NULL; int jj_counter,jj_counter_offd; int *jj_count, *jj_count_offd; int jj_begin_row,jj_begin_row_offd; int jj_end_row,jj_end_row_offd; int start_indexing = 0; /* start indexing for P_data at 0 */ int n_fine = hypre_CSRMatrixNumRows(A_diag); int strong_f_marker; int *fine_to_coarse; /*int *fine_to_coarse_offd;*/ int *coarse_counter; int coarse_shift; HYPRE_BigInt total_global_cpts; HYPRE_BigInt my_first_cpt; int num_cols_P_offd; int i,i1,i2; int j,jl,jj,jj1; int start; int sgn; int c_num; double diagonal; double sum; double distribute; double zero = 0.0; double one = 1.0; int my_id; int num_procs; int num_threadsID; int num_sends; int index; int ns, ne, size, rest; int *int_buf_data = NULL; double wall_time; /* for debugging instrumentation */ MPI_Comm_size(comm, &num_procs); MPI_Comm_rank(comm,&my_id); num_threadsID = hypre_NumThreads(); #ifdef HYPRE_NO_GLOBAL_PARTITION my_first_cpt = num_cpts_global[0]; if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1]; MPI_Bcast(&total_global_cpts, 1, MPI_HYPRE_BIG_INT, num_procs-1, comm); #else my_first_cpt = num_cpts_global[my_id]; total_global_cpts = num_cpts_global[num_procs]; #endif /*------------------------------------------------------------------- * Get the CF_marker data for the off-processor columns *-------------------------------------------------------------------*/ if (debug_flag==4) wall_time = time_getWallclockSeconds(); if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(int, num_cols_A_offd); if (num_functions > 1 && num_cols_A_offd) dof_func_offd = hypre_CTAlloc(int, num_cols_A_offd); if (!comm_pkg) { #ifdef HYPRE_NO_GLOBAL_PARTITION hypre_NewCommPkgCreate(A); #else hypre_MatvecCommPkgCreate(A); #endif comm_pkg = hypre_ParCSRMatrixCommPkg(A); } num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); if (hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends)) int_buf_data = hypre_CTAlloc(int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends)); index = 0; for (i = 0; i < num_sends; i++) { start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++) int_buf_data[index++] = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)]; } comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, CF_marker_offd); hypre_ParCSRCommHandleDestroy(comm_handle); if (num_functions > 1) { index = 0; for (i = 0; i < num_sends; i++) { start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++) int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)]; } comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, dof_func_offd); hypre_ParCSRCommHandleDestroy(comm_handle); } if (debug_flag==4) { wall_time = time_getWallclockSeconds() - wall_time; printf("Proc = %d Interp: Comm 1 CF_marker = %f\n", my_id, wall_time); fflush(NULL); } /*---------------------------------------------------------------------- * Get the ghost rows of A *---------------------------------------------------------------------*/ if (debug_flag==4) wall_time = time_getWallclockSeconds(); if (num_procs > 1) { A_ext = hypre_ParCSRMatrixExtractConvBExt(A,A,1); A_ext_i = hypre_CSRMatrixI(A_ext); A_ext_j = hypre_CSRMatrixJ(A_ext); A_ext_data = hypre_CSRMatrixData(A_ext); } /*index = 0; for (i=0; i < num_cols_A_offd; i++) { for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++) { k = A_ext_j[j]; if (k >= col_1 && k < col_n) { A_ext_j[index] = k - col_1; A_ext_data[index++] = A_ext_data[j]; } else { kc = hypre_BinarySearch(col_map_offd,k,num_cols_A_offd); if (kc > -1) { A_ext_j[index] = -kc-1; A_ext_data[index++] = A_ext_data[j]; } } } A_ext_i[i] = index; } for (i = num_cols_A_offd; i > 0; i--) A_ext_i[i] = A_ext_i[i-1]; if (num_procs > 1) A_ext_i[0] = 0;*/ if (debug_flag==4) { wall_time = time_getWallclockSeconds() - wall_time; printf("Proc = %d Interp: Comm 2 Get A_ext = %f\n", my_id, wall_time); fflush(NULL); } /*----------------------------------------------------------------------- * First Pass: Determine size of P and fill in fine_to_coarse mapping. *-----------------------------------------------------------------------*/ /*----------------------------------------------------------------------- * Intialize counters and allocate mapping vector. *-----------------------------------------------------------------------*/ coarse_counter = hypre_CTAlloc(int, num_threadsID); jj_count = hypre_CTAlloc(int, num_threadsID); jj_count_offd = hypre_CTAlloc(int, num_threadsID); fine_to_coarse = hypre_CTAlloc(int, n_fine); /*#define HYPRE_SMP_PRIVATE i #include "../utilities/hypre_smp_forloop.h"*/ for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1; jj_counter = start_indexing; jj_counter_offd = start_indexing; /*----------------------------------------------------------------------- * Loop over fine grid. *-----------------------------------------------------------------------*/ /* RDF: this looks a little tricky, but doable */ #define HYPRE_SMP_PRIVATE i,j,i1,jj,ns,ne,size,rest #include "../utilities/hypre_smp_forloop.h" for (j = 0; j < num_threadsID; j++) { size = n_fine/num_threadsID; rest = n_fine - size*num_threadsID; if (j < rest) { ns = j*size+j; ne = (j+1)*size+j+1; } else { ns = j*size+rest; ne = (j+1)*size+rest; } for (i = ns; i < ne; i++) { /*-------------------------------------------------------------------- * If i is a C-point, interpolation is the identity. Also set up * mapping vector. *--------------------------------------------------------------------*/ if (CF_marker[i] >= 0) { jj_count[j]++; fine_to_coarse[i] = coarse_counter[j]; coarse_counter[j]++; } /*-------------------------------------------------------------------- * If i is an F-point, interpolation is from the C-points that * strongly influence i. *--------------------------------------------------------------------*/ else { for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++) { i1 = S_diag_j[jj]; if (CF_marker[i1] >= 0) { jj_count[j]++; } } if (num_procs > 1) { if (col_offd_S_to_A) { for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++) { i1 = col_offd_S_to_A[S_offd_j[jj]]; if (CF_marker_offd[i1] >= 0) { jj_count_offd[j]++; } } } else { for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++) { i1 = S_offd_j[jj]; if (CF_marker_offd[i1] >= 0) { jj_count_offd[j]++; } } } } } } } /*----------------------------------------------------------------------- * Allocate arrays. *-----------------------------------------------------------------------*/ for (i=0; i < num_threadsID-1; i++) { coarse_counter[i+1] += coarse_counter[i]; jj_count[i+1] += jj_count[i]; jj_count_offd[i+1] += jj_count_offd[i]; } i = num_threadsID-1; jj_counter = jj_count[i]; jj_counter_offd = jj_count_offd[i]; P_diag_size = jj_counter; P_diag_i = hypre_CTAlloc(int, n_fine+1); if (P_diag_size) { P_diag_j = hypre_CTAlloc(int, P_diag_size); P_diag_data = hypre_CTAlloc(double, P_diag_size); } P_diag_i[n_fine] = jj_counter; P_offd_size = jj_counter_offd; P_offd_i = hypre_CTAlloc(int, n_fine+1); if (P_offd_size) { P_offd_j = hypre_CTAlloc(int, P_offd_size); P_offd_data = hypre_CTAlloc(double, P_offd_size); } /*----------------------------------------------------------------------- * Intialize some stuff. *-----------------------------------------------------------------------*/ jj_counter = start_indexing; jj_counter_offd = start_indexing; if (debug_flag==4) { wall_time = time_getWallclockSeconds() - wall_time; printf("Proc = %d Interp: Internal work 1 = %f\n", my_id, wall_time); fflush(NULL); } /*----------------------------------------------------------------------- * Send and receive fine_to_coarse info. *-----------------------------------------------------------------------*/ if (debug_flag==4) wall_time = time_getWallclockSeconds(); /*fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt, num_cols_A_offd); */ #define HYPRE_SMP_PRIVATE i,j,ns,ne,size,rest,coarse_shift #include "../utilities/hypre_smp_forloop.h" for (j = 0; j < num_threadsID; j++) { coarse_shift = 0; if (j > 0) coarse_shift = coarse_counter[j-1]; size = n_fine/num_threadsID; rest = n_fine - size*num_threadsID; if (j < rest) { ns = j*size+j; ne = (j+1)*size+j+1; } else { ns = j*size+rest; ne = (j+1)*size+rest; } for (i = ns; i < ne; i++) fine_to_coarse[i] += coarse_shift; } /* index = 0; for (i = 0; i < num_sends; i++) { start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++) big_buf_data[index++] = my_first_cpt+ (HYPRE_BigInt)fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)]; } comm_handle = hypre_ParCSRCommHandleCreate( 21, comm_pkg, big_buf_data, fine_to_coarse_offd); hypre_ParCSRCommHandleDestroy(comm_handle); if (debug_flag==4) { wall_time = time_getWallclockSeconds() - wall_time; printf("Proc = %d Interp: Comm 4 FineToCoarse = %f\n", my_id, wall_time); fflush(NULL); } */ if (debug_flag==4) wall_time = time_getWallclockSeconds(); /*----------------------------------------------------------------------- * Loop over fine grid points. *-----------------------------------------------------------------------*/ #define HYPRE_SMP_PRIVATE i,j,jl,i1,i2,jj,jj1,ns,ne,size,rest,sum,diagonal,distribute,P_marker,P_marker_offd,strong_f_marker,jj_counter,jj_counter_offd,sgn,c_num,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd #include "../utilities/hypre_smp_forloop.h" for (jl = 0; jl < num_threadsID; jl++) { size = n_fine/num_threadsID; rest = n_fine - size*num_threadsID; if (jl < rest) { ns = jl*size+jl; ne = (jl+1)*size+jl+1; } else { ns = jl*size+rest; ne = (jl+1)*size+rest; } jj_counter = 0; if (jl > 0) jj_counter = jj_count[jl-1]; jj_counter_offd = 0; if (jl > 0) jj_counter_offd = jj_count_offd[jl-1]; if (n_fine) P_marker = hypre_CTAlloc(int, n_fine); else P_marker = NULL; if (num_cols_A_offd) P_marker_offd = hypre_CTAlloc(int, num_cols_A_offd); else P_marker_offd = NULL; for (i = 0; i < n_fine; i++) { P_marker[i] = -1; } for (i = 0; i < num_cols_A_offd; i++) { P_marker_offd[i] = -1; } strong_f_marker = -2; for (i = ns; i < ne; i++) { /*-------------------------------------------------------------------- * If i is a c-point, interpolation is the identity. *--------------------------------------------------------------------*/ if (CF_marker[i] >= 0) { P_diag_i[i] = jj_counter; P_diag_j[jj_counter] = fine_to_coarse[i]; P_diag_data[jj_counter] = one; jj_counter++; } /*-------------------------------------------------------------------- * If i is an F-point, build interpolation. *--------------------------------------------------------------------*/ else { /* Diagonal part of P */ P_diag_i[i] = jj_counter; jj_begin_row = jj_counter; for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++) { i1 = S_diag_j[jj]; /*-------------------------------------------------------------- * If neighbor i1 is a C-point, set column number in P_diag_j * and initialize interpolation weight to zero. *--------------------------------------------------------------*/ if (CF_marker[i1] >= 0) { P_marker[i1] = jj_counter; P_diag_j[jj_counter] = fine_to_coarse[i1]; P_diag_data[jj_counter] = zero; jj_counter++; } /*-------------------------------------------------------------- * If neighbor i1 is an F-point, mark it as a strong F-point * whose connection needs to be distributed. *--------------------------------------------------------------*/ else if (CF_marker[i1] != -3) { P_marker[i1] = strong_f_marker; } } jj_end_row = jj_counter; /* Off-Diagonal part of P */ P_offd_i[i] = jj_counter_offd; jj_begin_row_offd = jj_counter_offd; if (num_procs > 1) { if (col_offd_S_to_A) { for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++) { i1 = col_offd_S_to_A[S_offd_j[jj]]; /*----------------------------------------------------------- * If neighbor i1 is a C-point, set column number in P_offd_j * and initialize interpolation weight to zero. *-----------------------------------------------------------*/ if (CF_marker_offd[i1] >= 0) { P_marker_offd[i1] = jj_counter_offd; /*P_offd_j[jj_counter_offd] = fine_to_coarse_offd[i1];*/ P_offd_j[jj_counter_offd] = i1; P_offd_data[jj_counter_offd] = zero; jj_counter_offd++; } /*----------------------------------------------------------- * If neighbor i1 is an F-point, mark it as a strong F-point * whose connection needs to be distributed. *-----------------------------------------------------------*/ else if (CF_marker_offd[i1] != -3) { P_marker_offd[i1] = strong_f_marker; } } } else { for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++) { i1 = S_offd_j[jj]; /*----------------------------------------------------------- * If neighbor i1 is a C-point, set column number in P_offd_j * and initialize interpolation weight to zero. *-----------------------------------------------------------*/ if (CF_marker_offd[i1] >= 0) { P_marker_offd[i1] = jj_counter_offd; /*P_offd_j[jj_counter_offd] = fine_to_coarse_offd[i1];*/ P_offd_j[jj_counter_offd] = i1; P_offd_data[jj_counter_offd] = zero; jj_counter_offd++; } /*----------------------------------------------------------- * If neighbor i1 is an F-point, mark it as a strong F-point * whose connection needs to be distributed. *-----------------------------------------------------------*/ else if (CF_marker_offd[i1] != -3) { P_marker_offd[i1] = strong_f_marker; } } } } jj_end_row_offd = jj_counter_offd; diagonal = A_diag_data[A_diag_i[i]]; /* Loop over ith row of A. First, the diagonal part of A */ for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++) { i1 = A_diag_j[jj]; /*-------------------------------------------------------------- * Case 1: neighbor i1 is a C-point and strongly influences i, * accumulate a_{i,i1} into the interpolation weight. *--------------------------------------------------------------*/ if (P_marker[i1] >= jj_begin_row) { P_diag_data[P_marker[i1]] += A_diag_data[jj]; } /*-------------------------------------------------------------- * Case 2: neighbor i1 is an F-point and strongly influences i, * distribute a_{i,i1} to C-points that strongly infuence i. * Note: currently no distribution to the diagonal in this case. *--------------------------------------------------------------*/ else if (P_marker[i1] == strong_f_marker) { sum = zero; /*----------------------------------------------------------- * Loop over row of A for point i1 and calculate the sum * of the connections to c-points that strongly influence i. *-----------------------------------------------------------*/ sgn = 1; if (A_diag_data[A_diag_i[i1]] < 0) sgn = -1; /* Diagonal block part of row i1 */ for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++) { i2 = A_diag_j[jj1]; if (P_marker[i2] >= jj_begin_row && (sgn*A_diag_data[jj1]) < 0) { sum += A_diag_data[jj1]; } } /* Off-Diagonal block part of row i1 */ if (num_procs > 1) { for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++) { i2 = A_offd_j[jj1]; if (P_marker_offd[i2] >= jj_begin_row_offd && (sgn*A_offd_data[jj1]) < 0) { sum += A_offd_data[jj1]; } } } if (sum != 0) { distribute = A_diag_data[jj] / sum; /*----------------------------------------------------------- * Loop over row of A for point i1 and do the distribution. *-----------------------------------------------------------*/ /* Diagonal block part of row i1 */ for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++) { i2 = A_diag_j[jj1]; if (P_marker[i2] >= jj_begin_row && (sgn*A_diag_data[jj1]) < 0) { P_diag_data[P_marker[i2]] += distribute * A_diag_data[jj1]; } } /* Off-Diagonal block part of row i1 */ if (num_procs > 1) { for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++) { i2 = A_offd_j[jj1]; if (P_marker_offd[i2] >= jj_begin_row_offd && (sgn*A_offd_data[jj1]) < 0) { P_offd_data[P_marker_offd[i2]] += distribute * A_offd_data[jj1]; } } } } else { if (num_functions == 1 || dof_func[i] == dof_func[i1]) diagonal += A_diag_data[jj]; } } /*-------------------------------------------------------------- * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1} * into the diagonal. *--------------------------------------------------------------*/ else if (CF_marker[i1] != -3) { if (num_functions == 1 || dof_func[i] == dof_func[i1]) diagonal += A_diag_data[jj]; } } /*---------------------------------------------------------------- * Still looping over ith row of A. Next, loop over the * off-diagonal part of A *---------------------------------------------------------------*/ if (num_procs > 1) { for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++) { i1 = A_offd_j[jj]; /*-------------------------------------------------------------- * Case 1: neighbor i1 is a C-point and strongly influences i, * accumulate a_{i,i1} into the interpolation weight. *--------------------------------------------------------------*/ if (P_marker_offd[i1] >= jj_begin_row_offd) { P_offd_data[P_marker_offd[i1]] += A_offd_data[jj]; } /*------------------------------------------------------------ * Case 2: neighbor i1 is an F-point and strongly influences i, * distribute a_{i,i1} to C-points that strongly infuence i. * Note: currently no distribution to the diagonal in this case. *-----------------------------------------------------------*/ else if (P_marker_offd[i1] == strong_f_marker) { sum = zero; /*--------------------------------------------------------- * Loop over row of A_ext for point i1 and calculate the sum * of the connections to c-points that strongly influence i. *---------------------------------------------------------*/ /* find row number */ c_num = A_offd_j[jj]; sgn = 1; if (A_ext_data[A_ext_i[c_num]] < 0) sgn = -1; for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++) { i2 = A_ext_j[jj1]; if (i2 > -1) { /* in the diagonal block */ if (P_marker[i2] >= jj_begin_row && (sgn*A_ext_data[jj1]) < 0) { sum += A_ext_data[jj1]; } } else { /* in the off_diagonal block */ if (P_marker_offd[-i2-1] >= jj_begin_row_offd && (sgn*A_ext_data[jj1]) < 0) { sum += A_ext_data[jj1]; } } } if (sum != 0) { distribute = A_offd_data[jj] / sum; /*--------------------------------------------------------- * Loop over row of A_ext for point i1 and do * the distribution. *--------------------------------------------------------*/ /* Diagonal block part of row i1 */ for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++) { i2 = A_ext_j[jj1]; if (i2 > -1) /* in the diagonal block */ { if (P_marker[i2] >= jj_begin_row && (sgn*A_ext_data[jj1]) < 0) { P_diag_data[P_marker[i2]] += distribute * A_ext_data[jj1]; } } else { /* in the off_diagonal block */ if (P_marker_offd[-i2-1] >= jj_begin_row_offd && (sgn*A_ext_data[jj1]) < 0) P_offd_data[P_marker_offd[-i2-1]] += distribute * A_ext_data[jj1]; } } } else { if (num_functions == 1 || dof_func[i] == dof_func_offd[i1]) diagonal += A_offd_data[jj]; } } /*----------------------------------------------------------- * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1} * into the diagonal. *-----------------------------------------------------------*/ else if (CF_marker_offd[i1] != -3) { if (num_functions == 1 || dof_func[i] == dof_func_offd[i1]) diagonal += A_offd_data[jj]; } } } /*----------------------------------------------------------------- * Set interpolation weight by dividing by the diagonal. *-----------------------------------------------------------------*/ if (diagonal == 0.0) { printf(" Warning! zero diagonal! Proc id %d row %d\n", my_id,i); diagonal = A_diag_data[A_diag_i[i]]; } for (jj = jj_begin_row; jj < jj_end_row; jj++) { P_diag_data[jj] /= -diagonal; } for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++) { P_offd_data[jj] /= -diagonal; } } strong_f_marker--; P_offd_i[i+1] = jj_counter_offd; } hypre_TFree(P_marker); hypre_TFree(P_marker_offd); } P = hypre_ParCSRMatrixCreate(comm, hypre_ParCSRMatrixGlobalNumRows(A), total_global_cpts, hypre_ParCSRMatrixColStarts(A), num_cpts_global, 0, P_diag_i[n_fine], P_offd_i[n_fine]); P_diag = hypre_ParCSRMatrixDiag(P); hypre_CSRMatrixData(P_diag) = P_diag_data; hypre_CSRMatrixI(P_diag) = P_diag_i; hypre_CSRMatrixJ(P_diag) = P_diag_j; P_offd = hypre_ParCSRMatrixOffd(P); hypre_CSRMatrixData(P_offd) = P_offd_data; hypre_CSRMatrixI(P_offd) = P_offd_i; hypre_CSRMatrixJ(P_offd) = P_offd_j; hypre_ParCSRMatrixOwnsRowStarts(P) = 0; /* Compress P, removing coefficients smaller than trunc_factor * Max */ if (trunc_factor != 0.0 || max_elmts > 0) { hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts); P_diag_data = hypre_CSRMatrixData(P_diag); P_diag_i = hypre_CSRMatrixI(P_diag); P_diag_j = hypre_CSRMatrixJ(P_diag); P_offd_data = hypre_CSRMatrixData(P_offd); P_offd_i = hypre_CSRMatrixI(P_offd); P_offd_j = hypre_CSRMatrixJ(P_offd); P_diag_size = P_diag_i[n_fine]; P_offd_size = P_offd_i[n_fine]; } num_cols_P_offd = 0; if (P_offd_size) { if (num_cols_A_offd) P_marker = hypre_CTAlloc(int, num_cols_A_offd); /*#define HYPRE_SMP_PRIVATE i #include "../utilities/hypre_smp_forloop.h"*/ for (i=0; i < num_cols_A_offd; i++) P_marker[i] = 0; num_cols_P_offd = 0; for (i=0; i < P_offd_size; i++) { index = P_offd_j[i]; if (!P_marker[index]) { num_cols_P_offd++; P_marker[index] = 1; } } if (num_cols_P_offd) { col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt,num_cols_P_offd); tmp_map_offd = hypre_CTAlloc(int,num_cols_P_offd); } index = 0; for (i=0; i < num_cols_P_offd; i++) { while (P_marker[index]==0) index++; tmp_map_offd[i] = index++; } /*#define HYPRE_SMP_PRIVATE i #include "../utilities/hypre_smp_forloop.h"*/ for (i=0; i < P_offd_size; i++) P_offd_j[i] = hypre_BinarySearch(tmp_map_offd, P_offd_j[i], num_cols_P_offd); hypre_TFree(P_marker); } for (i=0; i < n_fine; i++) if (CF_marker[i] == -3) CF_marker[i] = -1; if (num_cols_P_offd) { hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P; hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd; } if (num_procs > 1) hypre_GetCommPkgRTFromCommPkgA(P,A, fine_to_coarse, tmp_map_offd); *P_ptr = P; hypre_TFree(tmp_map_offd); hypre_TFree(CF_marker_offd); hypre_TFree(dof_func_offd); hypre_TFree(int_buf_data); hypre_TFree(fine_to_coarse); /*hypre_TFree(fine_to_coarse_offd);*/ hypre_TFree(coarse_counter); hypre_TFree(jj_count); hypre_TFree(jj_count_offd); if (num_procs > 1) hypre_CSRMatrixDestroy(A_ext); return(0); } int hypre_BoomerAMGInterpTruncation( hypre_ParCSRMatrix *P, double trunc_factor, int max_elmts) { hypre_CSRMatrix *P_diag = hypre_ParCSRMatrixDiag(P); int *P_diag_i = hypre_CSRMatrixI(P_diag); int *P_diag_j = hypre_CSRMatrixJ(P_diag); double *P_diag_data = hypre_CSRMatrixData(P_diag); int *P_diag_j_new; double *P_diag_data_new; hypre_CSRMatrix *P_offd = hypre_ParCSRMatrixOffd(P); int *P_offd_i = hypre_CSRMatrixI(P_offd); int *P_offd_j = hypre_CSRMatrixJ(P_offd); double *P_offd_data = hypre_CSRMatrixData(P_offd); int *P_offd_j_new; double *P_offd_data_new; int n_fine = hypre_CSRMatrixNumRows(P_diag); int num_cols = hypre_CSRMatrixNumCols(P_diag); int i, j, start_j; int ierr = 0; double max_coef; int next_open = 0; int now_checking = 0; int next_open_offd = 0; int now_checking_offd = 0; int num_lost = 0; int num_lost_offd = 0; int num_lost_global = 0; int num_lost_global_offd = 0; int P_diag_size; int P_offd_size; int num_elmts; int cnt, cnt_diag, cnt_offd; double row_sum; double scale; /* Threading variables. Entry i of num_lost_(offd_)per_thread holds the * number of dropped entries over thread i's row range. Cum_lost_per_thread * will temporarily store the cumulative number of dropped entries up to * each thread. */ int my_thread_num, num_threadsID, start, stop; int * max_num_threads = hypre_CTAlloc(int, 1); int * cum_lost_per_thread; int * num_lost_per_thread; int * num_lost_offd_per_thread; /* Initialize threading variables */ max_num_threads[0] = hypre_NumThreads(); cum_lost_per_thread = hypre_CTAlloc(int, max_num_threads[0]); num_lost_per_thread = hypre_CTAlloc(int, max_num_threads[0]); num_lost_offd_per_thread = hypre_CTAlloc(int, max_num_threads[0]); for(i=0; i < max_num_threads[0]; i++) { num_lost_per_thread[i] = 0; num_lost_offd_per_thread[i] = 0; } #ifdef HYPRE_USING_OPENMP #pragma omp parallel private(i,my_thread_num,num_threadsID,max_coef,j,start_j,row_sum,scale,num_lost,now_checking,next_open,num_lost_offd,now_checking_offd,next_open_offd,start,stop,cnt_diag,cnt_offd,num_elmts,cnt) #endif { my_thread_num = hypre_GetThreadNum(); num_threadsID = hypre_NumActiveThreads(); /* Compute each thread's range of rows to truncate and compress. Note, * that i, j and data are all compressed as entries are dropped, but * that the compression only occurs locally over each thread's row * range. P_diag_i is only made globally consistent at the end of this * routine. During the dropping phases, P_diag_i[stop] will point to * the start of the next thread's row range. */ /* my row range */ start = (n_fine/num_threadsID)*my_thread_num; if (my_thread_num == num_threadsID-1) { stop = n_fine; } else { stop = (n_fine/num_threadsID)*(my_thread_num+1); } /* * Truncate based on truncation tolerance */ if (trunc_factor > 0) { num_lost_offd = 0; next_open = P_diag_i[start]; now_checking = P_diag_i[start]; next_open_offd = P_offd_i[start];; now_checking_offd = P_offd_i[start];; for (i = start; i < stop; i++) { max_coef = 0; for (j = P_diag_i[i]; j < P_diag_i[i+1]; j++) max_coef = (max_coef < fabs(P_diag_data[j])) ? fabs(P_diag_data[j]) : max_coef; for (j = P_offd_i[i]; j < P_offd_i[i+1]; j++) max_coef = (max_coef < fabs(P_offd_data[j])) ? fabs(P_offd_data[j]) : max_coef; max_coef *= trunc_factor; start_j = P_diag_i[i]; P_diag_i[i] -= num_lost; row_sum = 0; scale = 0; for (j = start_j; j < P_diag_i[i+1]; j++) { row_sum += P_diag_data[now_checking]; if (fabs(P_diag_data[now_checking]) < max_coef) { num_lost++; now_checking++; } else { scale += P_diag_data[now_checking]; P_diag_data[next_open] = P_diag_data[now_checking]; P_diag_j[next_open] = P_diag_j[now_checking]; now_checking++; next_open++; } } start_j = P_offd_i[i]; P_offd_i[i] -= num_lost_offd; for (j = start_j; j < P_offd_i[i+1]; j++) { row_sum += P_offd_data[now_checking_offd]; if (fabs(P_offd_data[now_checking_offd]) < max_coef) { num_lost_offd++; now_checking_offd++; } else { scale += P_offd_data[now_checking_offd]; P_offd_data[next_open_offd] = P_offd_data[now_checking_offd]; P_offd_j[next_open_offd] = P_offd_j[now_checking_offd]; now_checking_offd++; next_open_offd++; } } /* normalize row of P */ if (scale != 0.) { if (scale != row_sum) { scale = row_sum/scale; for (j = P_diag_i[i]; j < (P_diag_i[i+1]-num_lost); j++) P_diag_data[j] *= scale; for (j = P_offd_i[i]; j < (P_offd_i[i+1]-num_lost_offd); j++) P_offd_data[j] *= scale; } } } /* store number of dropped elements and number of threads */ if(my_thread_num == 0) { max_num_threads[0] = num_threadsID; } num_lost_per_thread[my_thread_num] = num_lost; num_lost_offd_per_thread[my_thread_num] = num_lost_offd; } /* end if (trunc_factor > 0) */ /*P_diag_i[n_fine] -= num_lost; P_offd_i[n_fine] -= num_lost_offd; } */ if (max_elmts > 0) { int P_mxnum, cnt1, last_index, last_index_offd; int *P_aux_j; double *P_aux_data; /* find maximum row length locally over this row range */ P_mxnum = 0; for (i=start; i P_mxnum) P_mxnum = cnt1; } /*rowlength = 0; if (n_fine) rowlength = P_diag_i[1]+P_offd_i[1]; P_mxnum = rowlength; for (i=1; i P_mxnum) P_mxnum = rowlength; }*/ if (P_mxnum > max_elmts) { num_lost = 0; num_lost_offd = 0; /* two temporary arrays to hold row i for temporary operations */ P_aux_j = hypre_CTAlloc(int, P_mxnum); P_aux_data = hypre_CTAlloc(double, P_mxnum); cnt_diag = P_diag_i[start]; cnt_offd = P_offd_i[start]; for (i = start; i < stop; i++) { /* Note P_diag_i[stop] is the starting point for the next thread * in j and data, not the stop point for this thread */ last_index = P_diag_i[i+1]; last_index_offd = P_offd_i[i+1]; if(i == stop-1) { last_index -= num_lost_per_thread[my_thread_num]; last_index_offd -= num_lost_offd_per_thread[my_thread_num]; } row_sum = 0; num_elmts = P_diag_i[i+1]-P_diag_i[i]+P_offd_i[i+1]-P_offd_i[i]; if (max_elmts < num_elmts) { cnt = 0; for (j = P_diag_i[i]; j < P_diag_i[i+1]; j++) { P_aux_j[cnt] = P_diag_j[j]; P_aux_data[cnt++] = P_diag_data[j]; row_sum += P_diag_data[j]; } num_lost += cnt; cnt1 = cnt; for (j = P_offd_i[i]; j < P_offd_i[i+1]; j++) { P_aux_j[cnt] = P_offd_j[j]+num_cols; P_aux_data[cnt++] = P_offd_data[j]; row_sum += P_offd_data[j]; } num_lost_offd += cnt-cnt1; /* sort data */ hypre_qsort2abs(P_aux_j,P_aux_data,0,cnt-1); scale = 0; P_diag_i[i] = cnt_diag; P_offd_i[i] = cnt_offd; for (j = 0; j < max_elmts; j++) { scale += P_aux_data[j]; if (P_aux_j[j] < num_cols) { P_diag_j[cnt_diag] = P_aux_j[j]; P_diag_data[cnt_diag++] = P_aux_data[j]; } else { P_offd_j[cnt_offd] = P_aux_j[j]-num_cols; P_offd_data[cnt_offd++] = P_aux_data[j]; } } num_lost -= cnt_diag-P_diag_i[i]; num_lost_offd -= cnt_offd-P_offd_i[i]; /* normalize row of P */ if (scale != 0.) { if (scale != row_sum) { scale = row_sum/scale; for (j = P_diag_i[i]; j < cnt_diag; j++) P_diag_data[j] *= scale; for (j = P_offd_i[i]; j < cnt_offd; j++) P_offd_data[j] *= scale; } } } else { if (P_diag_i[i] != cnt_diag) { start_j = P_diag_i[i]; P_diag_i[i] = cnt_diag; for (j = start_j; j < P_diag_i[i+1]; j++) { P_diag_j[cnt_diag] = P_diag_j[j]; P_diag_data[cnt_diag++] = P_diag_data[j]; } } else cnt_diag += P_diag_i[i+1]-P_diag_i[i]; if (P_offd_i[i] != cnt_offd) { start_j = P_offd_i[i]; P_offd_i[i] = cnt_offd; for (j = start_j; j < P_offd_i[i+1]; j++) { P_offd_j[cnt_offd] = P_offd_j[j]; P_offd_data[cnt_offd++] = P_offd_data[j]; } } else cnt_offd += P_offd_i[i+1]-P_offd_i[i]; } } num_lost_per_thread[my_thread_num] += num_lost; num_lost_offd_per_thread[my_thread_num] += num_lost_offd; hypre_TFree(P_aux_j); hypre_TFree(P_aux_data); } /* end if (P_mxnum > max_elmts) */ } /* end if (max_elmts > 0) */ /* Sum up num_lost_global */ #ifdef HYPRE_USING_OPENMP #pragma omp barrier #endif if(my_thread_num == 0) { num_lost_global = 0; num_lost_global_offd = 0; for(i = 0; i < max_num_threads[0]; i++) { num_lost_global += num_lost_per_thread[i]; num_lost_global_offd += num_lost_offd_per_thread[i]; } } #ifdef HYPRE_USING_OPENMP #pragma omp barrier #endif /* * Synchronize and create new diag data structures */ if (num_lost_global) { /* Each thread has it's own locally compressed CSR matrix from rows start * to stop. Now, we have to copy each thread's chunk into the new * process-wide CSR data structures * * First, we compute the new process-wide number of nonzeros (i.e., * P_diag_size), and compute cum_lost_per_thread[k] so that this * entry holds the cumulative sum of entries dropped up to and * including thread k. */ if(my_thread_num == 0) { P_diag_size = P_diag_i[n_fine]; for(i = 0; i < max_num_threads[0]; i++) { P_diag_size -= num_lost_per_thread[i]; if(i > 0) { cum_lost_per_thread[i] = num_lost_per_thread[i] + cum_lost_per_thread[i-1]; } else { cum_lost_per_thread[i] = num_lost_per_thread[i]; } } P_diag_j_new = hypre_CTAlloc(int,P_diag_size); P_diag_data_new = hypre_CTAlloc(double,P_diag_size); } #ifdef HYPRE_USING_OPENMP #pragma omp barrier #endif /* points to next open spot in new data structures for this thread */ if(my_thread_num == 0) { next_open = 0; } else { /* remember, cum_lost_per_thread[k] stores the num dropped up to and * including thread k */ next_open = P_diag_i[start] - cum_lost_per_thread[my_thread_num-1]; } /* copy the j and data arrays over */ for(i = P_diag_i[start]; i < P_diag_i[stop] - num_lost_per_thread[my_thread_num]; i++) { P_diag_j_new[next_open] = P_diag_j[i]; P_diag_data_new[next_open] = P_diag_data[i]; next_open += 1; } #ifdef HYPRE_USING_OPENMP #pragma omp barrier #endif /* update P_diag_i with number of dropped entries by all lower ranked * threads */ if(my_thread_num > 0) { for(i=start; i 0) { cum_lost_per_thread[i] = num_lost_offd_per_thread[i] + cum_lost_per_thread[i-1]; } else { cum_lost_per_thread[i] = num_lost_offd_per_thread[i]; } } P_offd_j_new = hypre_CTAlloc(int,P_offd_size); P_offd_data_new = hypre_CTAlloc(double,P_offd_size); } #ifdef HYPRE_USING_OPENMP #pragma omp barrier #endif /* points to next open spot in new data structures for this thread */ if(my_thread_num == 0) { next_open = 0; } else { /* remember, cum_lost_per_thread[k] stores the num dropped up to and * including thread k */ next_open = P_offd_i[start] - cum_lost_per_thread[my_thread_num-1]; } /* copy the j and data arrays over */ for(i = P_offd_i[start]; i < P_offd_i[stop] - num_lost_offd_per_thread[my_thread_num]; i++) { P_offd_j_new[next_open] = P_offd_j[i]; P_offd_data_new[next_open] = P_offd_data[i]; next_open += 1; } #ifdef HYPRE_USING_OPENMP #pragma omp barrier #endif /* update P_offd_i with number of dropped entries by all lower ranked * threads */ if(my_thread_num > 0) { for(i=start; i= right) return; swap2( v, w, left, (left+right)/2); last = left; for (i = left+1; i <= right; i++) if (fabs(w[i]) > fabs(w[left])) { swap2(v, w, ++last, i); } swap2(v, w, left, last); hypre_qsort2abs(v, w, left, last-1); hypre_qsort2abs(v, w, last+1, right); }