/*BHEADER**********************************************************************
 * Copyright (c) 2008,  Lawrence Livermore National Security, LLC.
 * Produced at the Lawrence Livermore National Laboratory.
 * This file is part of HYPRE.  See file COPYRIGHT for details.
 *
 * HYPRE is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License (as published by the Free
 * Software Foundation) version 2.1 dated February 1999.
 *
 * $Revision: 2.4 $
 ***********************************************************************EHEADER*/




#include "headers.h"

/*--------------------------------------------------------------------------
 * OLD NOTES:
 * Sketch of John's code to build RAP
 *
 * Uses two integer arrays icg and ifg as marker arrays
 *
 *  icg needs to be of size n_fine; size of ia.
 *     A negative value of icg(i) indicates i is a f-point, otherwise
 *     icg(i) is the converts from fine to coarse grid orderings. 
 *     Note that I belive the code assumes that if i<j and both are
 *     c-points, then icg(i) < icg(j).
 *  ifg needs to be of size n_coarse; size of irap
 *     I don't think it has meaning as either input or output.
 *
 * In the code, both the interpolation and restriction operator
 * are stored row-wise in the array b. If i is a f-point,
 * ib(i) points the row of the interpolation operator for point
 * i. If i is a c-point, ib(i) points the row of the restriction
 * operator for point i.
 *
 * In the CSR storage for rap, its guaranteed that the rows will
 * be ordered ( i.e. ic<jc -> irap(ic) < irap(jc)) but I don't
 * think there is a guarantee that the entries within a row will
 * be ordered in any way except that the diagonal entry comes first.
 *
 * As structured now, the code requires that the size of rap be
 * predicted up front. To avoid this, one could execute the code
 * twice, the first time would only keep track of icg ,ifg and ka.
 * Then you would know how much memory to allocate for rap and jrap.
 * The second time would fill in these arrays. Actually you might
 * be able to include the filling in of jrap into the first pass;
 * just overestimate its size (its an integer array) and cut it
 * back before the second time through. This would avoid some if tests
 * in the second pass.
 *
 * Questions
 *            1) parallel (PetSc) version?
 *            2) what if we don't store R row-wise and don't
 *               even want to store a copy of it in this form
 *               temporarily? 
 *--------------------------------------------------------------------------*/
         
hypre_BigCSRMatrix *
hypre_ExchangeRAPData( hypre_BigCSRMatrix *RAP_int,
                       hypre_ParCSRCommPkg *comm_pkg_RT)
{
   int     *RAP_int_i;
   HYPRE_BigInt *RAP_int_j = NULL;
   double  *RAP_int_data = NULL;
   int     num_cols = 0;

   MPI_Comm comm = hypre_ParCSRCommPkgComm(comm_pkg_RT);
   int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg_RT);
   int *recv_procs = hypre_ParCSRCommPkgRecvProcs(comm_pkg_RT);
   int *recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_RT);
   int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg_RT);
   int *send_procs = hypre_ParCSRCommPkgSendProcs(comm_pkg_RT);
   int *send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg_RT);

   hypre_BigCSRMatrix *RAP_ext;

   int     *RAP_ext_i;
   HYPRE_BigInt *RAP_ext_j = NULL;
   double  *RAP_ext_data = NULL;

   hypre_ParCSRCommHandle *comm_handle;
   hypre_ParCSRCommPkg *tmp_comm_pkg;

   int *jdata_recv_vec_starts;
   int *jdata_send_map_starts;

   int num_rows;
   int num_nonzeros;
   int i, j;
   int num_procs;

   MPI_Comm_size(comm,&num_procs);


   RAP_ext_i = hypre_CTAlloc(int, send_map_starts[num_sends]+1);

   jdata_recv_vec_starts = hypre_CTAlloc(int, num_recvs+1);
   jdata_send_map_starts = hypre_CTAlloc(int, num_sends+1);
 
/*--------------------------------------------------------------------------
 * recompute RAP_int_i so that RAP_int_i[j+1] contains the number of
 * elements of row j (to be determined through send_map_elmnts on the
 * receiving end)
 *--------------------------------------------------------------------------*/

   if (num_recvs)
   {
        RAP_int_i = hypre_BigCSRMatrixI(RAP_int);
        RAP_int_j = hypre_BigCSRMatrixJ(RAP_int);
        RAP_int_data = hypre_BigCSRMatrixData(RAP_int);
        num_cols = hypre_BigCSRMatrixNumCols(RAP_int);
   }

   jdata_recv_vec_starts[0] = 0;
   for (i=0; i < num_recvs; i++)
   {
        jdata_recv_vec_starts[i+1] = RAP_int_i[recv_vec_starts[i+1]];
   }
 
   for (i=num_recvs; i > 0; i--)
        for (j = recv_vec_starts[i]; j > recv_vec_starts[i-1]; j--)
                RAP_int_i[j] -= RAP_int_i[j-1];

/*--------------------------------------------------------------------------
 * initialize communication 
 *--------------------------------------------------------------------------*/

   if (num_recvs && num_sends)
      comm_handle = hypre_ParCSRCommHandleCreate(12,comm_pkg_RT,
                &RAP_int_i[1], &RAP_ext_i[1]);
   else if (num_recvs)
      comm_handle = hypre_ParCSRCommHandleCreate(12,comm_pkg_RT,
                &RAP_int_i[1], NULL);
   else if (num_sends)
      comm_handle = hypre_ParCSRCommHandleCreate(12,comm_pkg_RT,
                NULL, &RAP_ext_i[1]);

   tmp_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg, 1);
   hypre_ParCSRCommPkgComm(tmp_comm_pkg) = comm;
   hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_recvs;
   hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_sends;
   hypre_ParCSRCommPkgSendProcs(tmp_comm_pkg) = recv_procs;
   hypre_ParCSRCommPkgRecvProcs(tmp_comm_pkg) = send_procs;

   hypre_ParCSRCommHandleDestroy(comm_handle);
   comm_handle = NULL;

/*--------------------------------------------------------------------------
 * compute num_nonzeros for RAP_ext
 *--------------------------------------------------------------------------*/

   for (i=0; i < num_sends; i++)
        for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
                RAP_ext_i[j+1] += RAP_ext_i[j];

   num_rows = send_map_starts[num_sends];
   num_nonzeros = RAP_ext_i[num_rows];
   if (num_nonzeros)
   {

      RAP_ext_j = hypre_CTAlloc(HYPRE_BigInt, num_nonzeros);
      RAP_ext_data = hypre_CTAlloc(double, num_nonzeros);
   }

   for (i=0; i < num_sends+1; i++)
   {
        jdata_send_map_starts[i] = RAP_ext_i[send_map_starts[i]];
   }

   hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) = jdata_send_map_starts;      
   hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) = jdata_recv_vec_starts;      

   comm_handle = hypre_ParCSRCommHandleCreate(1,tmp_comm_pkg,RAP_int_data,
                                        RAP_ext_data);
   hypre_ParCSRCommHandleDestroy(comm_handle);
   comm_handle = NULL;

   comm_handle = hypre_ParCSRCommHandleCreate(21,tmp_comm_pkg,RAP_int_j,
                                        RAP_ext_j);
   RAP_ext = hypre_BigCSRMatrixCreate(num_rows,num_cols,num_nonzeros);

   hypre_BigCSRMatrixI(RAP_ext) = RAP_ext_i;
   if (num_nonzeros)
   {
      hypre_BigCSRMatrixJ(RAP_ext) = RAP_ext_j;
      hypre_BigCSRMatrixData(RAP_ext) = RAP_ext_data;
   }

   hypre_ParCSRCommHandleDestroy(comm_handle); 
   comm_handle = NULL;

   hypre_TFree(jdata_recv_vec_starts);
   hypre_TFree(jdata_send_map_starts);
   hypre_TFree(tmp_comm_pkg);

   return RAP_ext;
}

/*--------------------------------------------------------------------------
 * hypre_BoomerAMGBuildCoarseOperator
 *--------------------------------------------------------------------------*/

int
hypre_BoomerAMGBuildCoarseOperator( hypre_ParCSRMatrix  *RT,
                                    hypre_ParCSRMatrix  *A,
                                    hypre_ParCSRMatrix  *P,
                                    hypre_ParCSRMatrix **RAP_ptr )

{
   MPI_Comm        comm = hypre_ParCSRMatrixComm(A);

   hypre_CSRMatrix *RT_diag = hypre_ParCSRMatrixDiag(RT);
   hypre_CSRMatrix *RT_offd = hypre_ParCSRMatrixOffd(RT);
   int             num_cols_diag_RT = hypre_CSRMatrixNumCols(RT_diag);
   int             num_cols_offd_RT = hypre_CSRMatrixNumCols(RT_offd);
   int             num_rows_offd_RT = hypre_CSRMatrixNumRows(RT_offd);
   hypre_ParCSRCommPkg   *comm_pkg_RT = hypre_ParCSRMatrixCommPkg(RT);
   int             num_recvs_RT = 0;
   int             num_sends_RT = 0;
   int             *send_map_starts_RT;
   int             *send_map_elmts_RT;

   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
   
   double          *A_diag_data = hypre_CSRMatrixData(A_diag);
   int             *A_diag_i = hypre_CSRMatrixI(A_diag);
   int             *A_diag_j = hypre_CSRMatrixJ(A_diag);

   hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
   
   double          *A_offd_data = hypre_CSRMatrixData(A_offd);
   int             *A_offd_i = hypre_CSRMatrixI(A_offd);
   int             *A_offd_j = hypre_CSRMatrixJ(A_offd);

   int  num_cols_diag_A = hypre_CSRMatrixNumCols(A_diag);
   int  num_cols_offd_A = hypre_CSRMatrixNumCols(A_offd);

   hypre_CSRMatrix *P_diag = hypre_ParCSRMatrixDiag(P);
   
   double          *P_diag_data = hypre_CSRMatrixData(P_diag);
   int             *P_diag_i = hypre_CSRMatrixI(P_diag);
   int             *P_diag_j = hypre_CSRMatrixJ(P_diag);

   hypre_CSRMatrix *P_offd = hypre_ParCSRMatrixOffd(P);
   HYPRE_BigInt    *col_map_offd_P = hypre_ParCSRMatrixColMapOffd(P);
   
   double          *P_offd_data = hypre_CSRMatrixData(P_offd);
   int             *P_offd_i = hypre_CSRMatrixI(P_offd);
   int             *P_offd_j = hypre_CSRMatrixJ(P_offd);

   HYPRE_BigInt  first_col_diag_P = hypre_ParCSRMatrixFirstColDiag(P);
   HYPRE_BigInt  last_col_diag_P;
   int  num_cols_diag_P = hypre_CSRMatrixNumCols(P_diag);
   int  num_cols_offd_P = hypre_CSRMatrixNumCols(P_offd);
   HYPRE_BigInt *coarse_partitioning = hypre_ParCSRMatrixColStarts(P);
   HYPRE_BigInt *RT_partitioning = hypre_ParCSRMatrixColStarts(RT);

   hypre_ParCSRMatrix *RAP;
   HYPRE_BigInt       *col_map_offd_RAP;
   HYPRE_BigInt       *new_col_map_offd_RAP;

   hypre_BigCSRMatrix *RAP_int = NULL;

   double          *RAP_int_data;
   int             *RAP_int_i;
   HYPRE_BigInt    *RAP_int_j;

   hypre_BigCSRMatrix *RAP_ext;

   double          *RAP_ext_data;
   int             *RAP_ext_i;
   HYPRE_BigInt    *RAP_ext_j;
   int             *int_RAP_ext_j = NULL;

   hypre_CSRMatrix *RAP_diag;

   double          *RAP_diag_data;
   int             *RAP_diag_i;
   int             *RAP_diag_j;

   hypre_CSRMatrix *RAP_offd;

   double          *RAP_offd_data;
   int             *RAP_offd_i;
   int             *RAP_offd_j;

   int              RAP_size;
   int              RAP_ext_size;
   int              RAP_diag_size;
   int              RAP_offd_size;
   int              P_ext_diag_size;
   int              P_ext_offd_size;
   HYPRE_BigInt     first_col_diag_RAP;
   HYPRE_BigInt     last_col_diag_RAP;
   int              num_cols_offd_RAP = 0;
   
   hypre_CSRMatrix *R_diag;
   
   double          *R_diag_data;
   int             *R_diag_i;
   int             *R_diag_j;

   hypre_CSRMatrix *R_offd;
   
   double          *R_offd_data;
   int             *R_offd_i;
   int             *R_offd_j;

   hypre_BigCSRMatrix *Ps_ext;
   
   double          *Ps_ext_data;
   int             *Ps_ext_i;
   HYPRE_BigInt    *Ps_ext_j;

   double          *P_ext_diag_data;
   int             *P_ext_diag_i;
   int             *P_ext_diag_j;

   double          *P_ext_offd_data;
   int             *P_ext_offd_i;
   int             *P_ext_offd_j;

   HYPRE_BigInt    *col_map_offd_Pext;
   int             *map_P_to_Pext;
   int             *map_P_to_RAP;
   int             *map_Pext_to_RAP;

   int             *P_marker;
   int            **P_mark_array;
   int            **A_mark_array;
   int             *A_marker;
   HYPRE_BigInt    *temp;

   HYPRE_BigInt     n_coarse, n_coarse_RT;
   HYPRE_BigInt     value;
   int              square = 1;
   int              num_cols_offd_Pext = 0;
   
   int              ic, i, j, k;
   int              i1, i2, i3, ii, ns, ne, size, rest;
   int              cnt, cnt_offd, cnt_diag;
   int              jj1, jj2, jj3, jcol;
   
   int             *jj_count, *jj_cnt_diag, *jj_cnt_offd;
   int              jj_counter, jj_count_diag, jj_count_offd;
   int              jj_row_begining, jj_row_begin_diag, jj_row_begin_offd;
   int              start_indexing = 0; /* start indexing for RAP_data at 0 */
   int              num_nz_cols_A;
   int              num_procs;
   int              num_threads;

   double           r_entry;
   double           r_a_product;
   double           r_a_p_product;
   
   double           zero = 0.0;

   int              tmp_ii, tmp_ii_1, tmp_ii_2, tmp_ii_10, tmp_ii_11;

   /*-----------------------------------------------------------------------
    *  Copy ParCSRMatrix RT into CSRMatrix R so that we have row-wise access 
    *  to restriction .
    *-----------------------------------------------------------------------*/

   MPI_Comm_size(comm,&num_procs);
   num_threads = hypre_NumThreads();

   if (comm_pkg_RT)
   {
        num_recvs_RT = hypre_ParCSRCommPkgNumRecvs(comm_pkg_RT);
        num_sends_RT = hypre_ParCSRCommPkgNumSends(comm_pkg_RT);
        send_map_starts_RT =hypre_ParCSRCommPkgSendMapStarts(comm_pkg_RT);
        send_map_elmts_RT = hypre_ParCSRCommPkgSendMapElmts(comm_pkg_RT);
   }

   hypre_CSRMatrixTranspose(RT_diag,&R_diag,1); 
   if (num_cols_offd_RT) 
   {
        hypre_CSRMatrixTranspose(RT_offd,&R_offd,1); 
        R_offd_data = hypre_CSRMatrixData(R_offd);
        R_offd_i    = hypre_CSRMatrixI(R_offd);
        R_offd_j    = hypre_CSRMatrixJ(R_offd);
   }

   /*-----------------------------------------------------------------------
    *  Access the CSR vectors for R. Also get sizes of fine and
    *  coarse grids.
    *-----------------------------------------------------------------------*/

   R_diag_data = hypre_CSRMatrixData(R_diag);
   R_diag_i    = hypre_CSRMatrixI(R_diag);
   R_diag_j    = hypre_CSRMatrixJ(R_diag);

   n_coarse = hypre_ParCSRMatrixGlobalNumCols(P);
   num_nz_cols_A = num_cols_diag_A + num_cols_offd_A;

   n_coarse_RT = hypre_ParCSRMatrixGlobalNumCols(RT);
   if (n_coarse != n_coarse_RT)
      square = 0;

   /*-----------------------------------------------------------------------
    *  Generate Ps_ext, i.e. portion of P that is stored on neighbor procs
    *  and needed locally for triple matrix product 
    *-----------------------------------------------------------------------*/

   if (num_procs > 1) 
   {
        Ps_ext = hypre_ParCSRMatrixExtractBigExt(P,A,1);
        Ps_ext_data = hypre_BigCSRMatrixData(Ps_ext);
        Ps_ext_i    = hypre_BigCSRMatrixI(Ps_ext);
        Ps_ext_j    = hypre_BigCSRMatrixJ(Ps_ext);
   }
   P_ext_diag_i = hypre_CTAlloc(int,num_cols_offd_A+1);
   P_ext_offd_i = hypre_CTAlloc(int,num_cols_offd_A+1);


   P_ext_diag_size = 0;
   P_ext_offd_size = 0;
   last_col_diag_P = first_col_diag_P + (HYPRE_BigInt)num_cols_diag_P - 1;

   for (i=0; i < num_cols_offd_A; i++)
   {
      for (j=Ps_ext_i[i]; j < Ps_ext_i[i+1]; j++)
         if (Ps_ext_j[j] < first_col_diag_P || Ps_ext_j[j] > last_col_diag_P)
            P_ext_offd_size++;
         else
            P_ext_diag_size++;
      P_ext_diag_i[i+1] = P_ext_diag_size;
      P_ext_offd_i[i+1] = P_ext_offd_size;
   }
   
   if (P_ext_diag_size)
   {
      P_ext_diag_j = hypre_CTAlloc(int, P_ext_diag_size);
      P_ext_diag_data = hypre_CTAlloc(double, P_ext_diag_size);
   }
   if (P_ext_offd_size)
   {
      P_ext_offd_j = hypre_CTAlloc(int, P_ext_offd_size);
      P_ext_offd_data = hypre_CTAlloc(double, P_ext_offd_size);
   }

   if (P_ext_offd_size || num_cols_offd_P)
      temp = hypre_CTAlloc(HYPRE_BigInt, P_ext_offd_size+num_cols_offd_P);

   cnt_offd = 0;
   cnt_diag = 0;
   cnt = 0;
   for (i=0; i < num_cols_offd_A; i++)
   {
      for (j=Ps_ext_i[i]; j < Ps_ext_i[i+1]; j++)
      {
         value = Ps_ext_j[j];
         if (value < first_col_diag_P || value > last_col_diag_P)
         {
            Ps_ext_j[cnt_offd] = value;
            temp[cnt_offd] = value;
            P_ext_offd_data[cnt_offd++] = Ps_ext_data[j];
         }
         else
         {
            P_ext_diag_j[cnt_diag] = (int)(value - first_col_diag_P);
            P_ext_diag_data[cnt_diag++] = Ps_ext_data[j];
         }
      }
   }
   if (P_ext_offd_size || num_cols_offd_P)
   {
      cnt = P_ext_offd_size;
      for (i=0; i < num_cols_offd_P; i++)
         temp[cnt++] = col_map_offd_P[i];
   }
   if (cnt)
   {
      hypre_BigQsort0(temp, 0, cnt-1);

      num_cols_offd_Pext = 1;
      value = temp[0];
      for (i=1; i < cnt; i++)
      {
         if (temp[i] > value)
         {
            value = temp[i];
            temp[num_cols_offd_Pext++] = value;
         }
      }
   }
 
   if (num_cols_offd_Pext)
        col_map_offd_Pext = hypre_CTAlloc(HYPRE_BigInt,num_cols_offd_Pext);

   for (i=0; i < num_cols_offd_Pext; i++)
      col_map_offd_Pext[i] = temp[i];

   for (i=0 ; i < P_ext_offd_size; i++)
      P_ext_offd_j[i] = hypre_BigBinarySearch(col_map_offd_Pext,
                                           Ps_ext_j[i],
                                           num_cols_offd_Pext);

   if (P_ext_offd_size || num_cols_offd_P)
      hypre_TFree(temp);
   if (num_procs > 1) 
   {
      hypre_BigCSRMatrixDestroy(Ps_ext);
      Ps_ext = NULL;
   }


   if (num_cols_offd_P)
   {
      map_P_to_Pext = hypre_CTAlloc(int,num_cols_offd_P);

      cnt = 0;
      for (i=0; i < num_cols_offd_Pext; i++)
         if (col_map_offd_Pext[i] == col_map_offd_P[cnt])
         {
            map_P_to_Pext[cnt++] = i;
            if (cnt == num_cols_offd_P) break;
         }
   }

   /*-----------------------------------------------------------------------
    *  First Pass: Determine size of RAP_int and set up RAP_int_i if there 
    *  are more than one processor and nonzero elements in R_offd
    *-----------------------------------------------------------------------*/

  P_mark_array = hypre_CTAlloc(int *, num_threads);
  A_mark_array = hypre_CTAlloc(int *, num_threads);

  if (num_cols_offd_RT)
  {
   jj_count = hypre_CTAlloc(int, num_threads);

#define HYPRE_SMP_PRIVATE i,ii,ic,i1,i2,i3,jj1,jj2,jj3,ns,ne,size,rest,jj_counter,jj_row_begining,A_marker,P_marker
#include "../utilities/hypre_smp_forloop.h"
   for (ii = 0; ii < num_threads; ii++)
   {
     size = num_cols_offd_RT/num_threads;
     rest = num_cols_offd_RT - size*num_threads;
     if (ii < rest)
     {
        ns = ii*size+ii;
        ne = (ii+1)*size+ii+1;
     }
     else
     {
        ns = ii*size+rest;
        ne = (ii+1)*size+rest;
     }
   
   /*-----------------------------------------------------------------------
    *  Allocate marker arrays.
    *-----------------------------------------------------------------------*/

   if (num_cols_offd_Pext || num_cols_diag_P)
   {
      P_mark_array[ii] = hypre_CTAlloc(int, num_cols_diag_P+num_cols_offd_Pext);
      P_marker = P_mark_array[ii];
   }
   A_mark_array[ii] = hypre_CTAlloc(int, num_nz_cols_A);
   A_marker = A_mark_array[ii];
   /*-----------------------------------------------------------------------
    *  Initialize some stuff.
    *-----------------------------------------------------------------------*/

   jj_counter = start_indexing;
   for (ic = 0; ic < num_cols_diag_P+num_cols_offd_Pext; ic++)
   {      
      P_marker[ic] = -1;
   }
   for (i = 0; i < num_nz_cols_A; i++)
   {      
      A_marker[i] = -1;
   }   

   /*-----------------------------------------------------------------------
    *  Loop over exterior c-points
    *-----------------------------------------------------------------------*/
    
   for (ic = ns; ic < ne; ic++)
   {
      
      jj_row_begining = jj_counter;

      /*--------------------------------------------------------------------
       *  Loop over entries in row ic of R_offd.
       *--------------------------------------------------------------------*/
   
      for (jj1 = R_offd_i[ic]; jj1 < R_offd_i[ic+1]; jj1++)
      {
         i1  = R_offd_j[jj1];

         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_offd.
          *-----------------------------------------------------------------*/
         
         for (jj2 = A_offd_i[i1]; jj2 < A_offd_i[i1+1]; jj2++)
         {
            i2 = A_offd_j[jj2];

            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/

            if (A_marker[i2] != ic)
            {

               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/

               A_marker[i2] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_ext.
                *-----------------------------------------------------------*/

               for (jj3 = P_ext_diag_i[i2]; jj3 < P_ext_diag_i[i2+1]; jj3++)
               {
                  i3 = P_ext_diag_j[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     jj_counter++;
                  }
               }
               for (jj3 = P_ext_offd_i[i2]; jj3 < P_ext_offd_i[i2+1]; jj3++)
               {
                  i3 = P_ext_offd_j[jj3] + num_cols_diag_P;
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     jj_counter++;
                  }
               }
            }
         }
         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_diag.
          *-----------------------------------------------------------------*/
         
         for (jj2 = A_diag_i[i1]; jj2 < A_diag_i[i1+1]; jj2++)
         {
            i2 = A_diag_j[jj2];

            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/

            if (A_marker[i2+num_cols_offd_A] != ic)
            {

               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/

               A_marker[i2+num_cols_offd_A] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_diag.
                *-----------------------------------------------------------*/

               for (jj3 = P_diag_i[i2]; jj3 < P_diag_i[i2+1]; jj3++)
               {
                  i3 = P_diag_j[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     jj_counter++;
                  }
               }
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_offd.
                *-----------------------------------------------------------*/

               for (jj3 = P_offd_i[i2]; jj3 < P_offd_i[i2+1]; jj3++)
               {
                  i3 = map_P_to_Pext[P_offd_j[jj3]] + num_cols_diag_P;
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     jj_counter++;
                  }
               }
            }
         }
      }
    }

    jj_count[ii] = jj_counter;

   }
  
   /*-----------------------------------------------------------------------
    *  Allocate RAP_int_data and RAP_int_j arrays.
    *-----------------------------------------------------------------------*/
   for (i = 0; i < num_threads-1; i++)
      jj_count[i+1] += jj_count[i];
    
   RAP_size = jj_count[num_threads-1];
   RAP_int_i = hypre_CTAlloc(int, num_cols_offd_RT+1);
   RAP_int_data = hypre_CTAlloc(double, RAP_size);
   RAP_int_j    = hypre_CTAlloc(HYPRE_BigInt, RAP_size);

   RAP_int_i[num_cols_offd_RT] = RAP_size;

   /*-----------------------------------------------------------------------
    *  Second Pass: Fill in RAP_int_data and RAP_int_j.
    *-----------------------------------------------------------------------*/

#define HYPRE_SMP_PRIVATE i,ii,ic,i1,i2,i3,jj1,jj2,jj3,ns,ne,size,rest,jj_counter,jj_row_begining,A_marker,P_marker,r_entry,r_a_product,r_a_p_product
#include "../utilities/hypre_smp_forloop.h"
   for (ii = 0; ii < num_threads; ii++)
   {
     size = num_cols_offd_RT/num_threads;
     rest = num_cols_offd_RT - size*num_threads;
     if (ii < rest)
     {
        ns = ii*size+ii;
        ne = (ii+1)*size+ii+1;
     }
     else
     {
        ns = ii*size+rest;
        ne = (ii+1)*size+rest;
     }

   /*-----------------------------------------------------------------------
    *  Initialize some stuff.
    *-----------------------------------------------------------------------*/
   if (num_cols_offd_Pext || num_cols_diag_P)
      P_marker = P_mark_array[ii];
   A_marker = A_mark_array[ii];

   jj_counter = start_indexing;
   if (ii > 0) jj_counter = jj_count[ii-1];

   for (ic = 0; ic < num_cols_diag_P+num_cols_offd_Pext; ic++)
   {      
      P_marker[ic] = -1;
   }
   for (i = 0; i < num_nz_cols_A; i++)
   {      
      A_marker[i] = -1;
   }   
   
   /*-----------------------------------------------------------------------
    *  Loop over exterior c-points.
    *-----------------------------------------------------------------------*/
    
   for (ic = ns; ic < ne; ic++)
   {
      
      jj_row_begining = jj_counter;
      RAP_int_i[ic] = jj_counter;

      /*--------------------------------------------------------------------
       *  Loop over entries in row ic of R_offd.
       *--------------------------------------------------------------------*/
   
      for (jj1 = R_offd_i[ic]; jj1 < R_offd_i[ic+1]; jj1++)
      {
         i1  = R_offd_j[jj1];
         r_entry = R_offd_data[jj1];

         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_offd.
          *-----------------------------------------------------------------*/
         
         for (jj2 = A_offd_i[i1]; jj2 < A_offd_i[i1+1]; jj2++)
         {
            i2 = A_offd_j[jj2];
            r_a_product = r_entry * A_offd_data[jj2];
            
            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/

            if (A_marker[i2] != ic)
            {

               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/

               A_marker[i2] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_ext.
                *-----------------------------------------------------------*/

               for (jj3 = P_ext_diag_i[i2]; jj3 < P_ext_diag_i[i2+1]; jj3++)
               {
                  i3 = P_ext_diag_j[jj3];
                  r_a_p_product = r_a_product * P_ext_diag_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     RAP_int_data[jj_counter] = r_a_p_product;
                     RAP_int_j[jj_counter] = (HYPRE_BigInt)i3 + first_col_diag_P;
                     jj_counter++;
                  }
                  else
                  {
                     RAP_int_data[P_marker[i3]] += r_a_p_product;
                  }
               }

               for (jj3 = P_ext_offd_i[i2]; jj3 < P_ext_offd_i[i2+1]; jj3++)
               {
                  i3 = P_ext_offd_j[jj3] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_ext_offd_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     RAP_int_data[jj_counter] = r_a_p_product;
                     RAP_int_j[jj_counter] 
                                = col_map_offd_Pext[i3-num_cols_diag_P];
                     jj_counter++;
                  }
                  else
                  {
                     RAP_int_data[P_marker[i3]] += r_a_p_product;
                  }
               }
            }

            /*--------------------------------------------------------------
             *  If i2 is previously visited ( A_marker[12]=ic ) it yields
             *  no new entries in RAP and can just add new contributions.
             *--------------------------------------------------------------*/

            else
            {
               for (jj3 = P_ext_diag_i[i2]; jj3 < P_ext_diag_i[i2+1]; jj3++)
               {
                  i3 = P_ext_diag_j[jj3];
                  r_a_p_product = r_a_product * P_ext_diag_data[jj3];
                  RAP_int_data[P_marker[i3]] += r_a_p_product;
               }
               for (jj3 = P_ext_offd_i[i2]; jj3 < P_ext_offd_i[i2+1]; jj3++)
               {
                  i3 = P_ext_offd_j[jj3] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_ext_offd_data[jj3];
                  RAP_int_data[P_marker[i3]] += r_a_p_product;
               }
            }
         }

         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_diag.
          *-----------------------------------------------------------------*/
         
         for (jj2 = A_diag_i[i1]; jj2 < A_diag_i[i1+1]; jj2++)
         {
            i2 = A_diag_j[jj2];
            r_a_product = r_entry * A_diag_data[jj2];
            
            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/

            if (A_marker[i2+num_cols_offd_A] != ic)
            {

               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/

               A_marker[i2+num_cols_offd_A] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_diag.
                *-----------------------------------------------------------*/

               for (jj3 = P_diag_i[i2]; jj3 < P_diag_i[i2+1]; jj3++)
               {
                  i3 = P_diag_j[jj3];
                  r_a_p_product = r_a_product * P_diag_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     RAP_int_data[jj_counter] = r_a_p_product;
                     RAP_int_j[jj_counter] = (HYPRE_BigInt)i3 + first_col_diag_P;
                     jj_counter++;
                  }
                  else
                  {
                     RAP_int_data[P_marker[i3]] += r_a_p_product;
                  }
               }
               for (jj3 = P_offd_i[i2]; jj3 < P_offd_i[i2+1]; jj3++)
               {
                  i3 = map_P_to_Pext[P_offd_j[jj3]] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_offd_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begining)
                  {
                     P_marker[i3] = jj_counter;
                     RAP_int_data[jj_counter] = r_a_p_product;
                     RAP_int_j[jj_counter] = 
                                col_map_offd_Pext[i3-num_cols_diag_P];
                     jj_counter++;
                  }
                  else
                  {
                     RAP_int_data[P_marker[i3]] += r_a_p_product;
                  }
               }
            }

            /*--------------------------------------------------------------
             *  If i2 is previously visited ( A_marker[12]=ic ) it yields
             *  no new entries in RAP and can just add new contributions.
             *--------------------------------------------------------------*/

            else
            {
               for (jj3 = P_diag_i[i2]; jj3 < P_diag_i[i2+1]; jj3++)
               {
                  i3 = P_diag_j[jj3];
                  r_a_p_product = r_a_product * P_diag_data[jj3];
                  RAP_int_data[P_marker[i3]] += r_a_p_product;
               }
               for (jj3 = P_offd_i[i2]; jj3 < P_offd_i[i2+1]; jj3++)
               {
                  i3 = map_P_to_Pext[P_offd_j[jj3]] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_offd_data[jj3];
                  RAP_int_data[P_marker[i3]] += r_a_p_product;
               }
            }
         }
      }
   }
   if (num_cols_offd_Pext || num_cols_diag_P)
      hypre_TFree(P_mark_array[ii]);
   hypre_TFree(A_mark_array[ii]);
   }

   RAP_int = hypre_BigCSRMatrixCreate(num_cols_offd_RT,num_rows_offd_RT,RAP_size);
   hypre_BigCSRMatrixI(RAP_int) = RAP_int_i;
   hypre_BigCSRMatrixJ(RAP_int) = RAP_int_j;
   hypre_BigCSRMatrixData(RAP_int) = RAP_int_data;
   hypre_TFree(jj_count);
  }

   RAP_ext_size = 0;
   if (num_sends_RT || num_recvs_RT)
   {
        RAP_ext = hypre_ExchangeRAPData(RAP_int,comm_pkg_RT);
        RAP_ext_i = hypre_BigCSRMatrixI(RAP_ext);
        RAP_ext_j = hypre_BigCSRMatrixJ(RAP_ext);
        RAP_ext_data = hypre_BigCSRMatrixData(RAP_ext);
        RAP_ext_size = RAP_ext_i[hypre_BigCSRMatrixNumRows(RAP_ext)];
   }
   if (num_cols_offd_RT)
   {
      hypre_BigCSRMatrixDestroy(RAP_int);
      RAP_int = NULL;
   }
   RAP_diag_i = hypre_CTAlloc(int, num_cols_diag_RT+1);
   RAP_offd_i = hypre_CTAlloc(int, num_cols_diag_RT+1);

   first_col_diag_RAP = first_col_diag_P;
   last_col_diag_RAP = first_col_diag_P + (HYPRE_BigInt)num_cols_diag_P - 1;

   /*-----------------------------------------------------------------------
    *  check for new nonzero columns in RAP_offd generated through RAP_ext
    *-----------------------------------------------------------------------*/

   if (RAP_ext_size || num_cols_offd_Pext)
   {
      temp = hypre_CTAlloc(HYPRE_BigInt,RAP_ext_size+num_cols_offd_Pext);
      cnt = 0;
      for (i=0; i < RAP_ext_size; i++)
         if (RAP_ext_j[i] < first_col_diag_RAP 
                        || RAP_ext_j[i] > last_col_diag_RAP)
            temp[cnt++] = RAP_ext_j[i];
      for (i=0; i < num_cols_offd_Pext; i++)
         temp[cnt++] = col_map_offd_Pext[i];


      if (cnt)
      {
         hypre_BigQsort0(temp,0,cnt-1);
         value = temp[0];
         num_cols_offd_RAP = 1;
         for (i=1; i < cnt; i++)
         {
            if (temp[i] > value)
            {
               value = temp[i];
               temp[num_cols_offd_RAP++] = value;
            }
         }
      }

   /* now evaluate col_map_offd_RAP */
      if (num_cols_offd_RAP)
         col_map_offd_RAP = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_RAP);

      for (i=0 ; i < num_cols_offd_RAP; i++)
         col_map_offd_RAP[i] = temp[i];
  
      hypre_TFree(temp);
   }

   if (num_cols_offd_P)
   {
      map_P_to_RAP = hypre_CTAlloc(int,num_cols_offd_P);

      cnt = 0;
      for (i=0; i < num_cols_offd_RAP; i++)
         if (col_map_offd_RAP[i] == col_map_offd_P[cnt])
         {
            map_P_to_RAP[cnt++] = i;
            if (cnt == num_cols_offd_P) break;
         }
   }

   if (num_cols_offd_Pext)
   {
      map_Pext_to_RAP = hypre_CTAlloc(int,num_cols_offd_Pext);

      cnt = 0;
      for (i=0; i < num_cols_offd_RAP; i++)
         if (col_map_offd_RAP[i] == col_map_offd_Pext[cnt])
         {
            map_Pext_to_RAP[cnt++] = i;
            if (cnt == num_cols_offd_Pext) break;
         }
   }

   /*-----------------------------------------------------------------------
    *  Convert RAP_ext column indices
    *-----------------------------------------------------------------------*/

   int_RAP_ext_j = hypre_CTAlloc(int, RAP_ext_size);

   for (i=0; i < RAP_ext_size; i++)
      if (RAP_ext_j[i] < first_col_diag_RAP 
                        || RAP_ext_j[i] > last_col_diag_RAP)
            int_RAP_ext_j[i] = num_cols_diag_P
                                + hypre_BigBinarySearch(col_map_offd_RAP,
                                                RAP_ext_j[i],num_cols_offd_RAP);
      else
            int_RAP_ext_j[i] = (int)(RAP_ext_j[i]-first_col_diag_RAP);

/*   need to allocate new P_marker etc. and make further changes */
   /*-----------------------------------------------------------------------
    *  Initialize some stuff.
    *-----------------------------------------------------------------------*/
   jj_cnt_diag = hypre_CTAlloc(int, num_threads);
   jj_cnt_offd = hypre_CTAlloc(int, num_threads);
     size = num_cols_diag_RT/num_threads;
     rest = num_cols_diag_RT - size*num_threads;

#define HYPRE_SMP_PRIVATE i,j,k,jcol,ii,ic,i1,i2,i3,jj1,jj2,jj3,ns,ne,jj_count_diag,jj_count_offd,jj_row_begin_diag,jj_row_begin_offd,A_marker,P_marker, tmp_ii_10, tmp_ii, tmp_ii_1
#include "../utilities/hypre_smp_forloop.h"
   for (ii = 0; ii < num_threads; ii++)
   {
     /*size = num_cols_diag_RT/num_threads;
     rest = num_cols_diag_RT - size*num_threads;*/
     if (ii < rest)
     {
        ns = ii*size+ii;
        ne = (ii+1)*size+ii+1;
     }
     else
     {
        ns = ii*size+rest;
        ne = (ii+1)*size+rest;
     }

   P_mark_array[ii] = hypre_CTAlloc(int, num_cols_diag_P+num_cols_offd_RAP);
   A_mark_array[ii] = hypre_CTAlloc(int, num_nz_cols_A);
   P_marker = P_mark_array[ii];
   A_marker = A_mark_array[ii];
   jj_count_diag = start_indexing;
   jj_count_offd = start_indexing;

   for (ic = 0; ic < num_cols_diag_P+num_cols_offd_RAP; ic++)
   {      
      P_marker[ic] = -1;
   }
   for (i = 0; i < num_nz_cols_A; i++)
   {      
      A_marker[i] = -1;
   }   

   /*-----------------------------------------------------------------------
    *  Loop over interior c-points.
    *-----------------------------------------------------------------------*/
   
   for (ic = ns; ic < ne; ic++)
   {
      
      /*--------------------------------------------------------------------
       *  Set marker for diagonal entry, RAP_{ic,ic}. and for all points
       *  being added to row ic of RAP_diag and RAP_offd through RAP_ext
       *--------------------------------------------------------------------*/

      jj_row_begin_diag = jj_count_diag;
      jj_row_begin_offd = jj_count_offd;

      if (square)
         P_marker[ic] = jj_count_diag++;

      for (i=0; i < num_sends_RT; i++)
      {
        tmp_ii_10 = send_map_starts_RT[i+1];
        for (j = send_map_starts_RT[i]; j < tmp_ii_10; j++)
            if (send_map_elmts_RT[j] == ic)
            {
                for (k=RAP_ext_i[j]; k < RAP_ext_i[j+1]; k++)
                {
                   jcol = int_RAP_ext_j[k];
                   if (jcol < num_cols_diag_P)
                   {
                        if (P_marker[jcol] < jj_row_begin_diag)
                        {
                                P_marker[jcol] = jj_count_diag;
                                jj_count_diag++;
                        }
                   }
                   else
                   {
                        if (P_marker[jcol] < jj_row_begin_offd)
                        {
                                P_marker[jcol] = jj_count_offd;
                                jj_count_offd++;
                        }
                   }
                }
                break;
            }
      }
      /*--------------------------------------------------------------------
       *  Loop over entries in row ic of R_diag.
       *--------------------------------------------------------------------*/
   
      for (jj1 = R_diag_i[ic]; jj1 < R_diag_i[ic+1]; jj1++)
      {
         i1  = R_diag_j[jj1];
 
         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_offd.
          *-----------------------------------------------------------------*/
         
         if (num_cols_offd_A)
         {
           for (jj2 = A_offd_i[i1]; jj2 < A_offd_i[i1+1]; jj2++)
           {
            i2 = A_offd_j[jj2];
 
            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/
 
            if (A_marker[i2] != ic)
            {
 
               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/
 
               A_marker[i2] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_ext.
                *-----------------------------------------------------------*/
 
               for (jj3 = P_ext_diag_i[i2]; jj3 < P_ext_diag_i[i2+1]; jj3++)
               {
                  i3 = P_ext_diag_j[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begin_diag)
                  {
                     P_marker[i3] = jj_count_diag;
                     jj_count_diag++;
                  }
               }
               for (jj3 = P_ext_offd_i[i2]; jj3 < P_ext_offd_i[i2+1]; jj3++)
               {
                  i3 = map_Pext_to_RAP[P_ext_offd_j[jj3]]+num_cols_diag_P;
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begin_offd)
                  {
                     P_marker[i3] = jj_count_offd;
                     jj_count_offd++;
                  }
               }
            }
           }
         }
         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_diag.
          *-----------------------------------------------------------------*/
         
         tmp_ii = A_diag_i[i1+1];
         for (jj2 = A_diag_i[i1]; jj2 < tmp_ii; jj2++)
         {
            i2 = A_diag_j[jj2];
 
            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/
 
            if (A_marker[i2+num_cols_offd_A] != ic)
            {
               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/
 
               A_marker[i2+num_cols_offd_A] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_diag.
                *-----------------------------------------------------------*/
 
               tmp_ii_1 =  P_diag_i[i2+1];
               for (jj3 = P_diag_i[i2]; jj3 < tmp_ii_1; jj3++)
               {
                  i3 = P_diag_j[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/
 
                  if (P_marker[i3] < jj_row_begin_diag)
                  {
                     P_marker[i3] = jj_count_diag;
                     jj_count_diag++;
                  }
               }
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_offd.
                *-----------------------------------------------------------*/

               if (num_cols_offd_P)
               { 
                 for (jj3 = P_offd_i[i2]; jj3 < P_offd_i[i2+1]; jj3++)
                 {
                  i3 = map_P_to_RAP[P_offd_j[jj3]] + num_cols_diag_P;
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, mark it and increment
                   *  counter.
                   *--------------------------------------------------------*/
 
                  if (P_marker[i3] < jj_row_begin_offd)
                  {
                     P_marker[i3] = jj_count_offd;
                     jj_count_offd++;
                  }
                 }
               } 
            }
         }
      }
            
      /*--------------------------------------------------------------------
       * Set RAP_diag_i and RAP_offd_i for this row.
       *--------------------------------------------------------------------*/
/* 
      RAP_diag_i[ic] = jj_row_begin_diag;
      RAP_offd_i[ic] = jj_row_begin_offd;
*/      
    }
    jj_cnt_diag[ii] = jj_count_diag;
    jj_cnt_offd[ii] = jj_count_offd;
   }

   for (i=0; i < num_threads-1; i++)
   {
      jj_cnt_diag[i+1] += jj_cnt_diag[i];
      jj_cnt_offd[i+1] += jj_cnt_offd[i];
   }

   jj_count_diag = jj_cnt_diag[num_threads-1];
   jj_count_offd = jj_cnt_offd[num_threads-1];

   RAP_diag_i[num_cols_diag_RT] = jj_count_diag;
   RAP_offd_i[num_cols_diag_RT] = jj_count_offd;
 
   /*-----------------------------------------------------------------------
    *  Allocate RAP_diag_data and RAP_diag_j arrays.
    *  Allocate RAP_offd_data and RAP_offd_j arrays.
    *-----------------------------------------------------------------------*/
 
   RAP_diag_size = jj_count_diag;
   /*printf ("RAP_diag_size %d\n", RAP_diag_size);
   fflush(NULL);*/
   if (RAP_diag_size)
   { 
      RAP_diag_data = hypre_CTAlloc(double, RAP_diag_size);
      RAP_diag_j    = hypre_CTAlloc(int, RAP_diag_size);
   } 
 
   RAP_offd_size = jj_count_offd;
   if (RAP_offd_size)
   { 
      RAP_offd_data = hypre_CTAlloc(double, RAP_offd_size);
      RAP_offd_j    = hypre_CTAlloc(int, RAP_offd_size);
   } 

   if (RAP_offd_size == 0 && num_cols_offd_RAP != 0)
   {
      num_cols_offd_RAP = 0;
      hypre_TFree(col_map_offd_RAP);
   }
   /*-----------------------------------------------------------------------
    *  Second Pass: Fill in RAP_diag_data and RAP_diag_j.
    *  Second Pass: Fill in RAP_offd_data and RAP_offd_j.
    *-----------------------------------------------------------------------*/

#define HYPRE_SMP_PRIVATE i,j,k,jcol,ii,ic,i1,i2,i3,jj1,jj2,jj3,ns,ne,size,rest,jj_count_diag,jj_count_offd,jj_row_begin_diag,jj_row_begin_offd,A_marker,P_marker,r_entry,r_a_product,r_a_p_product, tmp_ii_11, tmp_ii, tmp_ii_1, tmp_ii_2
#include "../utilities/hypre_smp_forloop.h"
   for (ii = 0; ii < num_threads; ii++)
   {
     size = num_cols_diag_RT/num_threads;
     rest = num_cols_diag_RT - size*num_threads;
     if (ii < rest)
     {
        ns = ii*size+ii;
        ne = (ii+1)*size+ii+1;
     }
     else
     {
        ns = ii*size+rest;
        ne = (ii+1)*size+rest;
     }

   /*-----------------------------------------------------------------------
    *  Initialize some stuff.
    *-----------------------------------------------------------------------*/

   P_marker = P_mark_array[ii];
   A_marker = A_mark_array[ii];
   for (ic = 0; ic < num_cols_diag_P+num_cols_offd_RAP; ic++)
   {      
      P_marker[ic] = -1;
   }
   for (i = 0; i < num_nz_cols_A ; i++)
   {      
      A_marker[i] = -1;
   }   
   
   jj_count_diag = start_indexing;
   jj_count_offd = start_indexing;
   if (ii > 0)
   {
      jj_count_diag = jj_cnt_diag[ii-1];
      jj_count_offd = jj_cnt_offd[ii-1];
   }

   /*-----------------------------------------------------------------------
    *  Loop over interior c-points.
    *-----------------------------------------------------------------------*/
    
   for (ic = ns; ic < ne; ic++)
   {
      
      /*--------------------------------------------------------------------
       *  Create diagonal entry, RAP_{ic,ic} and add entries of RAP_ext 
       *--------------------------------------------------------------------*/

      jj_row_begin_diag = jj_count_diag;
      jj_row_begin_offd = jj_count_offd;
      RAP_diag_i[ic] = jj_row_begin_diag;
      RAP_offd_i[ic] = jj_row_begin_offd;

      if (square)
      {
         P_marker[ic] = jj_count_diag;
         /*if (jj_count_diag > RAP_diag_size-1)
	 {   printf(" warning jj_count_diag %d\n", jj_count_diag);
          fflush(NULL);}*/
         RAP_diag_data[jj_count_diag] = zero;
         RAP_diag_j[jj_count_diag] = ic;
         jj_count_diag++;
      }

      for (i=0; i < num_sends_RT; i++)
      {
        tmp_ii_11 = send_map_starts_RT[i+1];
        for (j = send_map_starts_RT[i]; j < tmp_ii_11; j++)
            if (send_map_elmts_RT[j] == ic)
            {
                for (k=RAP_ext_i[j]; k < RAP_ext_i[j+1]; k++)
                {
                   jcol = int_RAP_ext_j[k];
                   if (jcol < num_cols_diag_P)
                   {
                        if (P_marker[jcol] < jj_row_begin_diag)
                        {
                                P_marker[jcol] = jj_count_diag;
                                RAP_diag_data[jj_count_diag] 
                                        = RAP_ext_data[k];
                                RAP_diag_j[jj_count_diag] = jcol;
                                jj_count_diag++;
                        }
                        else
                                RAP_diag_data[P_marker[jcol]]
                                        += RAP_ext_data[k];
                   }
                   else
                   {
                        if (P_marker[jcol] < jj_row_begin_offd)
                        {
                                P_marker[jcol] = jj_count_offd;
                                RAP_offd_data[jj_count_offd] 
                                        = RAP_ext_data[k];
                                RAP_offd_j[jj_count_offd] 
                                        = jcol-num_cols_diag_P;
                                jj_count_offd++;
                        }
                        else
                                RAP_offd_data[P_marker[jcol]]
                                        += RAP_ext_data[k];
                   }
                }
                break;
            }
      }
      /*--------------------------------------------------------------------
       *  Loop over entries in row ic of R_diag.
       *--------------------------------------------------------------------*/

      for (jj1 = R_diag_i[ic]; jj1 < R_diag_i[ic+1]; jj1++)
      {
         i1  = R_diag_j[jj1];
         r_entry = R_diag_data[jj1];

         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_offd.
          *-----------------------------------------------------------------*/
         
         if (num_cols_offd_A)
         {
          for (jj2 = A_offd_i[i1]; jj2 < A_offd_i[i1+1]; jj2++)
          {
            i2 = A_offd_j[jj2];
            r_a_product = r_entry * A_offd_data[jj2];
            
            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/

            if (A_marker[i2] != ic)
            {

               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/

               A_marker[i2] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_ext.
                *-----------------------------------------------------------*/

               for (jj3 = P_ext_diag_i[i2]; jj3 < P_ext_diag_i[i2+1]; jj3++)
               {
                  i3 = P_ext_diag_j[jj3];
                  r_a_p_product = r_a_product * P_ext_diag_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/
                  if (P_marker[i3] < jj_row_begin_diag)
                  {
                        P_marker[i3] = jj_count_diag;
                        RAP_diag_data[jj_count_diag] = r_a_p_product;
                        RAP_diag_j[jj_count_diag] = i3;
                        jj_count_diag++;
                  }
                  else
                        RAP_diag_data[P_marker[i3]] += r_a_p_product;
               }
               for (jj3 = P_ext_offd_i[i2]; jj3 < P_ext_offd_i[i2+1]; jj3++)
               {
                  i3 = map_Pext_to_RAP[P_ext_offd_j[jj3]] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_ext_offd_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/
                  if (P_marker[i3] < jj_row_begin_offd)
                  {
                        P_marker[i3] = jj_count_offd;
                        RAP_offd_data[jj_count_offd] = r_a_p_product;
                        RAP_offd_j[jj_count_offd] = i3 - num_cols_diag_P;
                        jj_count_offd++;
                  }
                  else
                        RAP_offd_data[P_marker[i3]] += r_a_p_product;
               }
            }

            /*--------------------------------------------------------------
             *  If i2 is previously visited ( A_marker[12]=ic ) it yields
             *  no new entries in RAP and can just add new contributions.
             *--------------------------------------------------------------*/
            else
            {
               for (jj3 = P_ext_diag_i[i2]; jj3 < P_ext_diag_i[i2+1]; jj3++)
               {
                  i3 = P_ext_diag_j[jj3];
                  r_a_p_product = r_a_product * P_ext_diag_data[jj3];
                  RAP_diag_data[P_marker[i3]] += r_a_p_product;
               }
               for (jj3 = P_ext_offd_i[i2]; jj3 < P_ext_offd_i[i2+1]; jj3++)
               {
                  i3 = map_Pext_to_RAP[P_ext_offd_j[jj3]] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_ext_offd_data[jj3];
                  RAP_offd_data[P_marker[i3]] += r_a_p_product;
               }
            }
          }
         }

         /*-----------------------------------------------------------------
          *  Loop over entries in row i1 of A_diag.
          *-----------------------------------------------------------------*/
         
         /* for (jj2 = A_diag_i[i1]; jj2 < A_diag_i[i1+1]; jj2++) */

         tmp_ii = A_diag_i[i1+1];
         for (jj2 = A_diag_i[i1]; jj2 < tmp_ii; jj2++) 
         {
            i2 = A_diag_j[jj2];
            r_a_product = r_entry * A_diag_data[jj2];
            
            /*--------------------------------------------------------------
             *  Check A_marker to see if point i2 has been previously
             *  visited. New entries in RAP only occur from unmarked points.
             *--------------------------------------------------------------*/

            if (A_marker[i2+num_cols_offd_A] != ic)
            {
               /*-----------------------------------------------------------
                *  Mark i2 as visited.
                *-----------------------------------------------------------*/

               A_marker[i2+num_cols_offd_A] = ic;
               
               /*-----------------------------------------------------------
                *  Loop over entries in row i2 of P_diag.
                *-----------------------------------------------------------*/

               tmp_ii_1 = P_diag_i[i2+1];
               for (jj3 = P_diag_i[i2]; jj3 < tmp_ii_1; jj3++)
               {
                  i3 = P_diag_j[jj3];
                  r_a_p_product = r_a_product * P_diag_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begin_diag)
                  {
                     P_marker[i3] = jj_count_diag;
                     RAP_diag_data[jj_count_diag] = r_a_p_product;
                     RAP_diag_j[jj_count_diag] = P_diag_j[jj3];
                     jj_count_diag++;
                  }
                  else
                  {
                     RAP_diag_data[P_marker[i3]] += r_a_p_product;
                  }
               }
               if (num_cols_offd_P)
               {
                for (jj3 = P_offd_i[i2]; jj3 < P_offd_i[i2+1]; jj3++)
                {
                  i3 = map_P_to_RAP[P_offd_j[jj3]] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_offd_data[jj3];
                  
                  /*--------------------------------------------------------
                   *  Check P_marker to see that RAP_{ic,i3} has not already
                   *  been accounted for. If it has not, create a new entry.
                   *  If it has, add new contribution.
                   *--------------------------------------------------------*/

                  if (P_marker[i3] < jj_row_begin_offd)
                  {
                     P_marker[i3] = jj_count_offd;
                     RAP_offd_data[jj_count_offd] = r_a_p_product;
                     RAP_offd_j[jj_count_offd] = i3 - num_cols_diag_P;
                     jj_count_offd++;
                  }
                  else
                  {
                     RAP_offd_data[P_marker[i3]] += r_a_p_product;
                  }
                }
               }
            }

            /*--------------------------------------------------------------
             *  If i2 is previously visited ( A_marker[12]=ic ) it yields
             *  no new entries in RAP and can just add new contributions.
             *--------------------------------------------------------------*/

            else
            {
	       tmp_ii_2 =  P_diag_i[i2+1];

               for (jj3 = P_diag_i[i2]; jj3 < tmp_ii_2; jj3++)
               {
                  i3 = P_diag_j[jj3];
                  /* __prefetch_by_load((P_marker + i3)); */
                  r_a_p_product = r_a_product * P_diag_data[jj3];
                  RAP_diag_data[P_marker[i3]] += r_a_p_product;
               }
               if (num_cols_offd_P)
               {
                for (jj3 = P_offd_i[i2]; jj3 < P_offd_i[i2+1]; jj3++)
                {
                  i3 = map_P_to_RAP[P_offd_j[jj3]] + num_cols_diag_P;
                  r_a_p_product = r_a_product * P_offd_data[jj3];
                  RAP_offd_data[P_marker[i3]] += r_a_p_product;
                }
               }
            }
         }
      }
   }
      hypre_TFree(P_mark_array[ii]);   
      hypre_TFree(A_mark_array[ii]);   
   }

   /* check if really all off-diagonal entries occurring in col_map_offd_RAP
	are represented and eliminate if necessary */

   P_marker = hypre_CTAlloc(int,num_cols_offd_RAP);
   for (i=0; i < num_cols_offd_RAP; i++)
      P_marker[i] = -1;

   jj_count_offd = 0;
   for (i=0; i < RAP_offd_size; i++)
   {
      i3 = RAP_offd_j[i];
      if (P_marker[i3])
      {
	 P_marker[i3] = 0;
	 jj_count_offd++;
      }
   }

   if (jj_count_offd < num_cols_offd_RAP)
   {
      new_col_map_offd_RAP = hypre_CTAlloc(HYPRE_BigInt,jj_count_offd);
      jj_counter = 0;
      for (i=0; i < num_cols_offd_RAP; i++)
         if (!P_marker[i]) 
         {
	    P_marker[i] = jj_counter;
	    new_col_map_offd_RAP[jj_counter++] = col_map_offd_RAP[i];
         }
 
      for (i=0; i < RAP_offd_size; i++)
      {
	 i3 = RAP_offd_j[i];
	 RAP_offd_j[i] = P_marker[i3];
      }
      
      num_cols_offd_RAP = jj_count_offd;
      hypre_TFree(col_map_offd_RAP);
      col_map_offd_RAP = new_col_map_offd_RAP;
   }
   hypre_TFree(P_marker);

   RAP = hypre_ParCSRMatrixCreate(comm, n_coarse_RT, n_coarse,
                                  RT_partitioning, coarse_partitioning,
                                  num_cols_offd_RAP, RAP_diag_size,
                                  RAP_offd_size);

/* Have RAP own coarse_partitioning instead of P */
   hypre_ParCSRMatrixSetColStartsOwner(P,0);
   hypre_ParCSRMatrixSetColStartsOwner(RT,0);

   RAP_diag = hypre_ParCSRMatrixDiag(RAP);
   hypre_CSRMatrixI(RAP_diag) = RAP_diag_i; 
   if (RAP_diag_size)
   {
      hypre_CSRMatrixData(RAP_diag) = RAP_diag_data; 
      hypre_CSRMatrixJ(RAP_diag) = RAP_diag_j; 
   }

   RAP_offd = hypre_ParCSRMatrixOffd(RAP);
   hypre_CSRMatrixI(RAP_offd) = RAP_offd_i; 
   if (num_cols_offd_RAP)
   {
        hypre_CSRMatrixData(RAP_offd) = RAP_offd_data; 
        hypre_CSRMatrixJ(RAP_offd) = RAP_offd_j; 
        hypre_ParCSRMatrixColMapOffd(RAP) = col_map_offd_RAP;
   }
   if (num_procs > 1)
   {
        /* hypre_GenerateRAPCommPkg(RAP, A); */
#ifdef HYPRE_NO_GLOBAL_PARTITION
        hypre_NewCommPkgCreate(RAP);
#else
        hypre_MatvecCommPkgCreate(RAP); 
#endif
   }

   *RAP_ptr = RAP;

   /*-----------------------------------------------------------------------
    *  Free R, P_ext and marker arrays.
    *-----------------------------------------------------------------------*/

   hypre_CSRMatrixDestroy(R_diag);
   R_diag = NULL;

   if (num_cols_offd_RT) 
   {
      hypre_CSRMatrixDestroy(R_offd);
      R_offd = NULL;
   }

   if (num_sends_RT || num_recvs_RT) 
   {
      hypre_BigCSRMatrixDestroy(RAP_ext);
      RAP_ext = NULL;
   }
   hypre_TFree(P_mark_array);   
   hypre_TFree(A_mark_array);
   hypre_TFree(P_ext_diag_i);
   hypre_TFree(P_ext_offd_i);
   hypre_TFree(jj_cnt_diag);   
   hypre_TFree(jj_cnt_offd);   
   hypre_TFree(int_RAP_ext_j);   
   if (num_cols_offd_P)
   {
      hypre_TFree(map_P_to_Pext);
      hypre_TFree(map_P_to_RAP);
   }
   if (num_cols_offd_Pext)
   {
      hypre_TFree(col_map_offd_Pext);
      hypre_TFree(map_Pext_to_RAP);
   }
   if (P_ext_diag_size)
   {
      hypre_TFree(P_ext_diag_data);
      hypre_TFree(P_ext_diag_j);
   }
   if (P_ext_offd_size)
   {
      hypre_TFree(P_ext_offd_data);
      hypre_TFree(P_ext_offd_j);
   }
   return(0);
   
}            

