Functions/Subroutines
subroutine	pssyttrd (uplo, n, a, ia, ja, desca, d, e, tau, work, lwork, info)
Function/Subroutine Documentation

◆ pssyttrd()

subroutine pssyttrd	(	character	uplo,
		integer	n,
		real, dimension( * )	a,
		integer	ia,
		integer	ja,
		integer, dimension( * )	desca,
		real, dimension( * )	d,
		real, dimension( * )	e,
		real, dimension( * )	tau,
		real, dimension( * )	work,
		integer	lwork,
		integer	info )
Definition at line 1 of file pssyttrd.f.
*
*  -- ScaLAPACK routine (version 2.0.2) --
*     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
*     May 1 2012
*
*     .. Scalar Arguments ..
      CHARACTER          UPLO
      INTEGER            IA, INFO, JA, LWORK, N
*     ..
*     .. Array Arguments ..
      INTEGER            DESCA( * )
      REAL               A( * ), D( * ), E( * ), TAU( * ), WORK( * )
*     ..
*
*     Purpose
*
*     =======
*
*     PSSYTTRD reduces a real symmetric matrix sub( A ) to symmetric
*     tridiagonal form T by an orthogonal similarity transformation:
*     Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1).
*
*     Notes
*     =====
*
*     Each global data object is described by an associated description
*     vector.  This vector stores the information required to establish
*     the mapping between an object element and its corresponding
*     process and memory location.
*
*     Let A be a generic term for any 2D block cyclicly distributed
*     array.
*     Such a global array has an associated description vector DESCA.
*     In the following comments, the character _ should be read as
*     "of the global array".
*
*     NOTATION        STORED IN      EXPLANATION
*     --------------- -------------- -----------------------------------
*     DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
*                                    DTYPE_A = 1.
*     CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle,
*                                    indicating the BLACS process grid
*                                    A is distributed over. The context
*                                    itself is global, but the handle
*                                    (the integer value) may vary.
*     M_A    (global) DESCA( M_ )    The number of rows in the global
*                                    array A.
*     N_A    (global) DESCA( N_ )    The number of columns in the global
*                                    array A.
*     MB_A   (global) DESCA( MB_ )   The blocking factor used to
*                                    distribute the rows of the array.
*     NB_A   (global) DESCA( NB_ )   The blocking factor used to
*                                    distribute the columns of the
*                                    array.
*     RSRC_A (global) DESCA( RSRC_ ) The process row over which the
*                                    first row of the array A is
*                                    distributed.
*     CSRC_A (global) DESCA( CSRC_ ) The process column over which the
*                                    first column of the array A is
*                                    distributed.
*     LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
*                                    array.  LLD_A >= MAX(1,LOCp(M_A)).
*
*     Let K be the number of rows or columns of a distributed matrix,
*     and assume that its process grid has dimension p x q.
*     LOCp( K ) denotes the number of elements of K that a process
*     would receive if K were distributed over the p processes of its
*     process column.
*     Similarly, LOCq( K ) denotes the number of elements of K that a
*     process would receive if K were distributed over the q processes
*     of its process row.
*     The values of LOCp() and LOCq() may be determined via a call to
*     the ScaLAPACK tool function, NUMROC:
*     LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*     LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*     An upper bound for these quantities may be computed by:
*     LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*     LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
*     Arguments
*     =========
*
*     UPLO    (global input) CHARACTER
*     Specifies whether the upper or lower triangular part of the
*     symmetric matrix sub( A ) is stored:
*     = 'U':  Upper triangular
*     = 'L':  Lower triangular
*
*     N       (global input) INTEGER
*     The number of rows and columns to be operated on, i.e. the
*     order of the distributed submatrix sub( A ). N >= 0.
*
*     A       (local input/local output) REAL pointer into the
*     local memory to an array of dimension (LLD_A,LOCq(JA+N-1)).
*     On entry, this array contains the local pieces of the
*     symmetric distributed matrix sub( A ).  If UPLO = 'U', the
*     leading N-by-N upper triangular part of sub( A ) contains
*     the upper triangular part of the matrix, and its strictly
*     lower triangular part is not referenced. If UPLO = 'L', the
*     leading N-by-N lower triangular part of sub( A ) contains the
*     lower triangular part of the matrix, and its strictly upper
*     triangular part is not referenced. On exit, if UPLO = 'U',
*     the diagonal and first superdiagonal of sub( A ) are over-
*     written by the corresponding elements of the tridiagonal
*     matrix T, and the elements above the first superdiagonal,
*     with the array TAU, represent the orthogonal matrix Q as a
*     product of elementary reflectors; if UPLO = 'L', the diagonal
*     and first subdiagonal of sub( A ) are overwritten by the
*     corresponding elements of the tridiagonal matrix T, and the
*     elements below the first subdiagonal, with the array TAU,
*     represent the orthogonal matrix Q as a product of elementary
*     reflectors. See Further Details.
*
*     IA      (global input) INTEGER
*     The row index in the global array A indicating the first
*     row of sub( A ).
*
*     JA      (global input) INTEGER
*     The column index in the global array A indicating the
*     first column of sub( A ).
*
*     DESCA   (global and local input) INTEGER array of dimension DLEN_.
*     The array descriptor for the distributed matrix A.
*
*     D       (local output) REAL array, dim LOCq(JA+N-1)
*     The diagonal elements of the tridiagonal matrix T:
*     D(i) = A(i,i). D is tied to the distributed matrix A.
*
*     E       (local output) REAL array, dim LOCq(JA+N-1)
*     if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal
*     elements of the tridiagonal matrix T: E(i) = A(i,i+1) if
*     UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the
*     distributed matrix A.
*
*     TAU     (local output) REAL, array, dimension
*     LOCq(JA+N-1). This array contains the scalar factors TAU of
*     the elementary reflectors. TAU is tied to the distributed
*     matrix A.
*
*     WORK    (local workspace) REAL array, dimension (LWORK)
*     On exit, WORK( 1 ) returns the minimal and optimal workspace size.
*
*     LWORK   (local input) INTEGER
*     The dimension of the array WORK.
*     LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS
*     Where:
*         NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB )
*         ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PSSYTTRD', 'L', 0, 0,
*           0, 0 )
*
*         NUMROC is a ScaLAPACK tool function;
*         PJLAENV is a ScaLAPACK environmental inquiry function
*         MYROW, MYCOL, NPROW and NPCOL can be determined by calling
*         the subroutine BLACS_GRIDINFO.
*
*     INFO    (global output) INTEGER
*     = 0:  successful exit
*     < 0:  If the i-th argument is an array and the j-entry had
*     an illegal value, then INFO = -(i*100+j), if the i-th
*     argument is a scalar and had an illegal value, then
*     INFO = -i.
*
*     Further Details
*     ===============
*
*     If UPLO = 'U', the matrix Q is represented as a product of
*     elementary reflectors
*
*     Q = H(n-1) . . . H(2) H(1).
*
*     Each H(i) has the form
*
*     H(i) = I - tau * v * v'
*
*     where tau is a real scalar, and v is a real vector with
*     v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
*     A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1).
*
*     If UPLO = 'L', the matrix Q is represented as a product of
*     elementary reflectors
*
*     Q = H(1) H(2) . . . H(n-1).
*
*     Each H(i) has the form
*
*     H(i) = I - tau * v * v'
*
*     where tau is a real scalar, and v is a real vector with
*     v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in
*     A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1).
*
*     The contents of sub( A ) on exit are illustrated by the following
*     examples with n = 5:
*
*     if UPLO = 'U':                       if UPLO = 'L':
*
*     (  d   e   v2  v3  v4 )              (  d                  )
*     (      d   e   v3  v4 )              (  e   d              )
*     (          d   e   v4 )              (  v1  e   d          )
*     (              d   e  )              (  v1  v2  e   d      )
*     (                  d  )              (  v1  v2  v3  e   d  )
*
*     where d and e denote diagonal and off-diagonal elements of T, and
*     vi denotes an element of the vector defining H(i).
*
*     Data storage requirements
*     =========================
*
*     PSSYTTRD is not intended to be called directly.  All users are
*     encouraged to call PSSYTRD which will then call PSSYTTRD if
*     appropriate.  A must be in cyclic format (i.e. MB = NB = 1),
*     the process grid must be square ( i.e. NPROW = NPCOL ) and
*     only lower triangular storage is supported.
*
*     Local variables
*     ===============
*
*     PSSYTTRD uses five local arrays:
*       WORK ( InV ) dimension ( NP, ANB+1): array V
*       WORK ( InH ) dimension ( NP, ANB+1): array H
*       WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V
*       WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H
*       WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT
*
*     Arrays V and H are replicated across all processor columns.
*     Arrays V^T and H^T are replicated across all processor rows.
*
*         WORK ( InVT ), or V^T, is stored as a tall skinny
*         array ( NQ x ANB-1 ) for efficiency.  Since only the lower
*         triangular portion of A is updated, Av is computed as:
*         tril(A) * v + v^T * tril(A,-1).  This is performed as
*         two local triangular matrix-vector multiplications (both in
*         MVR2) followed by a transpose and a sum across the columns.
*         In the local computation, WORK( InVT ) is used to compute
*         tril(A) * v and WORK( InV ) is used to compute
*         v^T * tril(A,-1)
*
*     The following variables are global indices into A:
*       INDEX:  The current global row and column number.
*       MAXINDEX:  The global row and column for the first row and
*       column in the trailing block of A.
*       LIIB, LIJB:  The first local row, column past global index
*         MAXINDEX (set from NUMROC( MAXINDEX, ... ) + 1).
*
*     The following variables point into the arrays A, V, H, V^T, H^T:
*       BINDEX  =INDEX-MININDEX: The column index in V, H, V^T, H^T.
*       LII:  local index I:  The local row number for row INDEX
*       LIJ:  local index J:  The local column number for column INDEX
*       LIIP1:  local index I+1:  The local row number for row INDEX+1
*       LIJP1:  local index J+1:  The local col number for col INDEX+1
*       LTLI: lower triangular local index I:  The local row for the
*         upper left entry in tril( A(INDEX, INDEX) )
*       LTLIP1: lower triangular local index I+1:  The local row for the
*         upper left entry in tril( A(INDEX+1, INDEX+1) )
*
*         Details:  The distinction between LII and LTLI (and between
*         LIIP1 and LTLIP1) is subtle.  Within the current processor
*         column (i.e. MYCOL .eq. CURCOL) they are the same.  However,
*         on some processors, A( LII, LIJ ) points to an element
*         above the diagonal, on these processors, LTLI = LII+1.
*
*     The following variables give the number of rows and/or columns
*     in various matrices:
*       NP:  The number of local rows in A( 1:N, 1:N )
*       NQ:  The number of local columns in A( 1:N, 1:N )
*       NPM0:  The number of local rows in A( INDEX:N, INDEX:N )
*       NQM0:  The number of local columns in A( INDEX:N, INDEX:N )
*       NPM1:  The number of local rows in A( INDEX+1:N, INDEX:N )
*       NQM1:  The number of local columns in A( INDEX+1:N, INDEX:N )
*       LTNM0:  The number of local rows & columns in
*         tril( A( INDEX:N, INDEX:N ) )
*       LTNM1:  The number of local rows & columns in
*         tril( A( INDEX+1:N, INDEX+1:N ) )
*         NOTE:  LTNM0 == LTNM1 on all processors except the diagonal
*         processors, i.e. those where MYCOL == MYROW.
*
*         Invariants:
*           NP = NPM0 + LII - 1
*           NQ = NQM0 + LIJ - 1
*           NP = NPM1 + LIIP1 - 1
*           NQ = NQM1 + LIJP1 - 1
*           NP = LTLI + LTNM0 - 1
*           NP = LTLIP1 + LTNM1 - 1
*
*       Temporary variables.  The following variables are used within
*       a few lines after they are set and do not hold state from one loop
*       iteration to the next:
*
*     The matrix A:
*       The matrix A does not hold the same values that it would
*       in an unblocked code nor the values that it would hold in
*       a blocked code.
*
*       The value of A is confusing.  It is easiest to state the
*       difference between trueA and A at the point that MVR2 is called,
*       so we will start there.
*
*       Let trueA be the value that A would
*       have at a given point in an unblocked code and A
*       be the value that A has in this code at the same point.
*
*       At the time of the call to MVR2,
*       trueA = A + V' * H + H' * V
*       where H = H( MAXINDEX:N, 1:BINDEX ) and
*       V = V( MAXINDEX:N, 1:BINDEX ).
*
*       At the bottom of the inner loop,
*       trueA = A +  V' * H + H' * V + v' * h + h' * v
*       where H = H( MAXINDEX:N, 1:BINDEX ) and
*       V = V( MAXINDEX:N, 1:BINDEX ) and
*       v = V( liip1:N, BINDEX+1 ) and
*       h = H( liip1:N, BINDEX+1 )
*
*       At the top of the loop, BINDEX gets incremented, hence:
*       trueA = A +  V' * H + H' * V + v' * h + h' * v
*       where H = H( MAXINDEX:N, 1:BINDEX-1 ) and
*       V = V( MAXINDEX:N, 1:BINDEX-1 ) and
*       v = V( liip1:N, BINDEX ) and
*       h = H( liip1:N, BINDEX )
*
*
*       A gets updated at the bottom of the outer loop
*       After this update, trueA = A + v' * h + h' * v
*       where v = V( liip1:N, BINDEX ) and
*       h = H( liip1:N, BINDEX ) and BINDEX = 0
*       Indeed, the previous loop invariant as stated above for the
*       top of the loop still holds, but with BINDEX = 0, H and V
*       are null matrices.
*
*       After the current column of A is updated,
*         trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N )
*       the rest of A is untouched.
*
*       After the current block column of A is updated,
*       trueA = A + V' * H + H' * V
*       where H = H( MAXINDEX:N, 1:BINDEX ) and
*       V = V( MAXINDEX:N, 1:BINDEX )
*
*       This brings us back to the point at which mvr2 is called.
*
*
*     Details of the parallelization:
*
*       We delay spreading v across to all processor columns (which
*       would naturally happen at the bottom of the loop) in order to
*       combine the spread of v( : , i-1 ) with the spread of h( : , i )
*
*       In order to compute h( :, i ), we must update A( :, i )
*       which means that the processor column owning A( :, i ) must
*       have: c, tau, v( i, i ) and h( i, i ).
*
*       The traditional
*       way of computing v (and the one used in pzlatrd.f and
*       zlatrd.f) is:
*         v = tau * v
*         c = v' * h
*         alpha = - tau * c / 2
*         v = v + alpha * h
*       However, the traditional way of computing v requires that tau
*       be broadcast to all processors in the current column (to compute
*       v = tau * v) and then a sum-to-all is required (to
*       compute v' * h ).  We use the following formula instead:
*         c = v' * h
*         v = tau * ( v - c * tau' * h / 2 )
*       The above formula allows tau to be spread down in the
*       same call to SGSUM2D which performs the sum-to-all of c.
*
*       The computation of v, which could be performed in any processor
*       column (or other processor subsets), is performed in the
*       processor column that owns A( :, i+1 ) so that A( :, i+1 )
*       can be updated prior to spreading v across.
*
*       We keep the block column of A up-to-date to minimize the
*       work required in updating the current column of A.  Updating
*       the block column of A is reasonably load balanced whereas
*       updating the current column of A is not (only the current
*       processor column is involved).
*
*     In the following overview of the steps performed, M in the
*     margin indicates message traffic and C indicates O(n^2 nb/sqrt(p))
*     or more flops per processor.
*
*     Inner loop:
*       A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) )
*M      h = house( A(index:n, index) )
*M      Spread v, h across
*M      vt = v^T; ht = h^T
*       A( index+1:n, index+1:maxindex ) -=
*         ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) )
*C      v = tril(A) * h; vt = ht * tril(A,-1)
*MorC   v = v - H*V*h - V*H*h
*M      v = v + vt^T
*M      c = v' * h
*       v = tau * ( v - c * tau' * h / 2 )
*C    A = A - H*V - V*H
*
*
*
*     =================================================================
*
*     .. Parameters ..
      INTEGER            BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_,
     $                   MB_, NB_, RSRC_, CSRC_, LLD_
      parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
     $                   ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
     $                   rsrc_ = 7, csrc_ = 8, lld_ = 9 )
      REAL               ONE
      parameter( one = 1.0e0 )
      REAL               Z_ONE, Z_NEGONE, Z_ZERO
      parameter( z_one = 1.0e0, z_negone = -1.0e0,
     $                   z_zero = 0.0e0 )
      REAL               ZERO
      parameter( zero = 0.0e+0 )
*     ..
*
*
*     .. Local Scalars ..
*
*
      LOGICAL            BALANCED, INTERLEAVE, TWOGEMMS, UPPER
      INTEGER            ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX,
     $                   INDEXA, INDEXINH, INDEXINV, INH, INHB, INHT,
     $                   INHTB, INTMP, INV, INVB, INVT, INVTB, J, LDA,
     $                   LDV, LDZG, LII, LIIB, LIIP1, LIJ, LIJB, LIJP1,
     $                   LTLIP1, LTNM1, LWMIN, MAXINDEX, MININDEX,
     $                   MYCOL, MYFIRSTROW, MYROW, MYSETNUM, NBZG, NP,
     $                   NPB, NPCOL, NPM0, NPM1, NPROW, NPS, NPSET, NQ,
     $                   NQB, NQM1, NUMROWS, NXTCOL, NXTROW, PBMAX,
     $                   PBMIN, PBSIZE, PNB, ROWSPERPROC
      REAL               ALPHA, BETA, C, NORM, ONEOVERBETA, SAFMAX,
     $                   SAFMIN, TOPH, TOPNV, TOPTAU, TOPV, TTOPH, TTOPV
*     ..
*     .. Local Arrays ..
*
*
*
*
      INTEGER            IDUM1( 1 ), IDUM2( 1 )
      REAL               CC( 3 ), DTMP( 5 )
*     ..
*     .. External Subroutines ..
      EXTERNAL           blacs_gridinfo, chk1mat, pchk1mat, pstreecomb,
     $                   pxerbla, scombnrm2, sgebr2d, sgebs2d, sgemm,
     $                   sgemv, sgerv2d, sgesd2d, sgsum2d, slamov,
     $                   sscal, strmvt
*     ..
*     .. External Functions ..
*
      LOGICAL            LSAME
      INTEGER            ICEIL, NUMROC, PJLAENV
      REAL               PSLAMCH, SNRM2
      EXTERNAL           lsame, iceil, numroc, pjlaenv, pslamch, snrm2
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          ichar, max, min, mod, real, sign, sqrt
*     ..
*
*
*     .. Executable Statements ..
*       This is just to keep ftnchek and toolpack/1 happy
      IF( block_cyclic_2d*csrc_*ctxt_*dlen_*dtype_*lld_*mb_*m_*nb_*n_*
     $    rsrc_.LT.0 )RETURN
*
*
*
*     Further details
*     ===============
*
*     At the top of the loop, v and nh have been computed but not
*     spread across.  Hence, A is out-of-date even after the
*     rank 2k update.  Furthermore, we compute the next v before
*     nh is spread across.
*
*     I claim that if we used a sum-to-all on NV, by summing CC within
*     each column, that we could compute NV locally and could avoid
*     spreading V across.  Bruce claims that sum-to-all can be made
*     to cost no more than sum-to-one on the Paragon.  If that is
*     true, this would be a win.  But,
*     the BLACS sum-to-all is just a sum-to-one followed by a broadcast,
*     and hence the present scheme is better for now.
*
*     Get grid parameters
*
      ictxt = desca( ctxt_ )
      CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
*
      safmax = sqrt( pslamch( ictxt, 'O' ) ) / n
      safmin = sqrt( pslamch( ictxt, 'S' ) )
*
*     Test the input parameters
*
      info = 0
      IF( nprow.EQ.-1 ) THEN
         info = -( 600+ctxt_ )
      ELSE
*
*     Here we set execution options for PSSYTTRD
*
         pnb = pjlaenv( ictxt, 2, 'PSSYTTRD', 'L', 0, 0, 0, 0 )
         anb = pjlaenv( ictxt, 3, 'PSSYTTRD', 'L', 0, 0, 0, 0 )
*
         interleave = ( pjlaenv( ictxt, 4, 'PSSYTTRD', 'L', 1, 0, 0,
     $                0 ).EQ.1 )
         twogemms = ( pjlaenv( ictxt, 4, 'PSSYTTRD', 'L', 2, 0, 0,
     $              0 ).EQ.1 )
         balanced = ( pjlaenv( ictxt, 4, 'PSSYTTRD', 'L', 3, 0, 0,
     $              0 ).EQ.1 )
*
         CALL chk1mat( n, 2, n, 2, ia, ja, desca, 6, info )
*
*
         upper = lsame( uplo, 'U' )
         IF( info.EQ.0 .AND. desca( nb_ ).NE.1 )
     $      info = 600 + nb_
         IF( info.EQ.0 ) THEN
*
*
*           Here is the arithmetic:
*             Let maxnpq = max( np, nq, 2 * ANB )
*             LDV = 4 * max( np, nq ) + 2
*             LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB )
*             = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS
*
*           This overestimates memory requirements when ANB > NP/2
*           Memory requirements are lower when interleave = .false.
*           Hence, we could have two sets of memory requirements,
*           one for interleave = .true. and one for interleave = .false.
*
*
            nps = max( numroc( n, 1, 0, 0, nprow ), 2*anb )
            lwmin = 2*( anb+1 )*( 4*nps+2 ) + nps
*
            work( 1 ) = real( lwmin )
            IF( .NOT.lsame( uplo, 'L' ) ) THEN
               info = -1
            ELSE IF( ia.NE.1 ) THEN
               info = -4
            ELSE IF( ja.NE.1 ) THEN
               info = -5
            ELSE IF( nprow.NE.npcol ) THEN
               info = -( 600+ctxt_ )
            ELSE IF( desca( dtype_ ).NE.1 ) THEN
               info = -( 600+dtype_ )
            ELSE IF( desca( mb_ ).NE.1 ) THEN
               info = -( 600+mb_ )
            ELSE IF( desca( nb_ ).NE.1 ) THEN
               info = -( 600+nb_ )
            ELSE IF( desca( rsrc_ ).NE.0 ) THEN
               info = -( 600+rsrc_ )
            ELSE IF( desca( csrc_ ).NE.0 ) THEN
               info = -( 600+csrc_ )
            ELSE IF( lwork.LT.lwmin ) THEN
               info = -11
            END IF
         END IF
         IF( upper ) THEN
            idum1( 1 ) = ichar( 'U' )
         ELSE
            idum1( 1 ) = ichar( 'L' )
         END IF
         idum2( 1 ) = 1
*
         CALL pchk1mat( n, 2, n, 2, ia, ja, desca, 6, 1, idum1, idum2,
     $                  info )
      END IF
*
      IF( info.NE.0 ) THEN
         CALL pxerbla( ictxt, 'PSSYTTRD', -info )
         RETURN
      END IF
*
*     Quick return if possible
*
      IF( n.EQ.0 )
     $   RETURN
*
*
*
*     Reduce the lower triangle of sub( A )
      np = numroc( n, 1, myrow, 0, nprow )
      nq = numroc( n, 1, mycol, 0, npcol )
*
      nxtrow = 0
      nxtcol = 0
*
      liip1 = 1
      lijp1 = 1
      npm1 = np
      nqm1 = nq
*
      lda = desca( lld_ )
      ictxt = desca( ctxt_ )
*
*
*
*     Miscellaneous details:
*     Put tau, D and E in the right places
*     Check signs
*     Place all the arrays in WORK, control their placement
*     in  memory.
*
*
*
*     Loop invariants
*     A(LIIP1, LIJ) points to the first element of A(I+1,J)
*     NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N )
*     A(LII:N,LIJ:N) is one step out of date.
*     proc( CURROW, CURCOL ) owns A(LII,LIJ)
*     proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ)
*
      inh = 1
*
      IF( interleave ) THEN
*
*        H and V are interleaved to minimize memory movement
*        LDV has to be twice as large to accommodate interleaving.
*        In addition, LDV is doubled again to allow v, h and
*        toptau to be spread across and transposed in a
*        single communication operation with minimum memory
*        movement.
*
*        We could reduce LDV back to 2*MAX(NPM1,NQM1)
*        by increasing the memory movement required in
*        the spread and transpose of v, h and toptau.
*        However, since the non-interleaved path already
*        provides a near minimum memory requirement option,
*        we did not provide this additional path.
*
         ldv = 4*( max( npm1, nqm1 ) ) + 2
*
         inh = 1
*
         inv = inh + ldv / 2
         invt = inh + ( anb+1 )*ldv
*
         inht = invt + ldv / 2
         intmp = invt + ldv*( anb+1 )
*
      ELSE
         ldv = max( npm1, nqm1 )
*
         inht = inh + ldv*( anb+1 )
         inv = inht + ldv*( anb+1 )
*
*        The code works without this +1, but only because of a
*        coincidence.  Without the +1, WORK(INVT) gets trashed, but
*        WORK(INVT) is only used once and when it is used, it is
*        multiplied by WORK( INH ) which is zero.  Hence, the fact
*        that WORK(INVT) is trashed has no effect.
*
         invt = inv + ldv*( anb+1 ) + 1
         intmp = invt + ldv*( 2*anb )
*
      END IF
*
      IF( info.NE.0 ) THEN
         CALL pxerbla( ictxt, 'PSSYTTRD', -info )
         work( 1 ) = real( lwmin )
         RETURN
      END IF
*
*
*        This satisfies the loop invariant: trueA = A - V * HT - H * VT,
*        (where V, H, VT and HT all have BINDEX+1 rows/columns)
*        the first ANB times through the loop.
*
*
*
*     Setting either ( InH and InHT ) or InV to Z_ZERO
*     is adequate except in the face of NaNs.
*
*
      DO 10 i = 1, np
         work( inh+i-1 ) = z_zero
         work( inv+i-1 ) = z_zero
   10 CONTINUE
      DO 20 i = 1, nq
         work( inht+i-1 ) = z_zero
   20 CONTINUE
*
*
*
      topnv = z_zero
*
      ltlip1 = lijp1
      ltnm1 = npm1
      IF( mycol.GT.myrow ) THEN
         ltlip1 = ltlip1 + 1
         ltnm1 = ltnm1 - 1
      END IF
*
*
      DO 210 minindex = 1, n - 1, anb
*
*
         maxindex = min( minindex+anb-1, n )
         lijb = numroc( maxindex, 1, mycol, 0, npcol ) + 1
         liib = numroc( maxindex, 1, myrow, 0, nprow ) + 1
*
         nqb = nq - lijb + 1
         npb = np - liib + 1
         inhtb = inht + lijb - 1
         invtb = invt + lijb - 1
         inhb = inh + liib - 1
         invb = inv + liib - 1
*
*
*
*
         DO 160 index = minindex, min( maxindex, n-1 )
*
            bindex = index - minindex
*
            currow = nxtrow
            curcol = nxtcol
*
            nxtrow = mod( currow+1, nprow )
            nxtcol = mod( curcol+1, npcol )
*
            lii = liip1
            lij = lijp1
            npm0 = npm1
*
            IF( myrow.EQ.currow ) THEN
               npm1 = npm1 - 1
               liip1 = liip1 + 1
            END IF
            IF( mycol.EQ.curcol ) THEN
               nqm1 = nqm1 - 1
               lijp1 = lijp1 + 1
               ltlip1 = ltlip1 + 1
               ltnm1 = ltnm1 - 1
            END IF
*
*
*
*
*     V = NV, VT = NVT, H = NH, HT = NHT
*
*
*     Update the current column of A
*
*
            IF( mycol.EQ.curcol ) THEN
*
               indexa = lii + ( lij-1 )*lda
               indexinv = inv + lii - 1 + ( bindex-1 )*ldv
               indexinh = inh + lii - 1 + ( bindex-1 )*ldv
               ttoph = work( inht+lij-1+bindex*ldv )
               ttopv = topnv
*
               IF( index.GT.1 ) THEN
                  DO 30 i = 0, npm0 - 1
*                  A( INDEXA+I ) = A( INDEXA+I )
                     a( indexa+i ) = a( indexa+i ) -
     $                               work( indexinv+ldv+i )*ttoph -
     $                               work( indexinh+ldv+i )*ttopv
   30             CONTINUE
               END IF
*
*
            END IF
*
*
            IF( mycol.EQ.curcol ) THEN
*
*     Compute the householder vector
*
               IF( myrow.EQ.currow ) THEN
                  dtmp( 2 ) = a( lii+( lij-1 )*lda )
               ELSE
                  dtmp( 2 ) = zero
               END IF
               IF( myrow.EQ.nxtrow ) THEN
                  dtmp( 3 ) = a( liip1+( lij-1 )*lda )
                  dtmp( 4 ) = zero
               ELSE
                  dtmp( 3 ) = zero
                  dtmp( 4 ) = zero
               END IF
*
               norm = snrm2( npm1, a( liip1+( lij-1 )*lda ), 1 )
               dtmp( 1 ) = norm
*
*              IF DTMP(5) = 1.0, NORM is too large and might cause
*              overflow, hence PSTREECOMB must be called.  IF DTMP(5)
*              is zero on output, DTMP(1) can be trusted.
*
               dtmp( 5 ) = zero
               IF( dtmp( 1 ).GE.safmax .OR. dtmp( 1 ).LT.safmin ) THEN
                  dtmp( 5 ) = one
                  dtmp( 1 ) = zero
               END IF
*
               dtmp( 1 ) = dtmp( 1 )*dtmp( 1 )
               CALL sgsum2d( ictxt, 'C', ' ', 5, 1, dtmp, 5, -1,
     $                       curcol )
               IF( dtmp( 5 ).EQ.zero ) THEN
                  dtmp( 1 ) = sqrt( dtmp( 1 ) )
               ELSE
                  dtmp( 1 ) = norm
                  CALL pstreecomb( ictxt, 'C', 1, dtmp, -1, mycol,
     $                             scombnrm2 )
               END IF
*
               norm = dtmp( 1 )
*
               d( lij ) = dtmp( 2 )
               IF( myrow.EQ.currow .AND. mycol.EQ.curcol ) THEN
                  a( lii+( lij-1 )*lda ) = d( lij )
               END IF
*
*
               alpha = dtmp( 3 )
*
               norm = sign( norm, alpha )
*
               IF( norm.EQ.zero ) THEN
                  toptau = zero
               ELSE
                  beta = norm + alpha
                  toptau = beta / norm
                  oneoverbeta = 1.0e0 / beta
*
                  CALL sscal( npm1, oneoverbeta,
     $                        a( liip1+( lij-1 )*lda ), 1 )
               END IF
*
               IF( myrow.EQ.nxtrow ) THEN
                  a( liip1+( lij-1 )*lda ) = z_one
               END IF
*
               tau( lij ) = toptau
               e( lij ) = -norm
*
            END IF
*
*
*     Spread v, nh, toptau across
*
            DO 40 i = 0, npm1 - 1
               work( inv+liip1-1+bindex*ldv+npm1+i ) = a( liip1+i+
     $            ( lij-1 )*lda )
   40       CONTINUE
*
            IF( mycol.EQ.curcol ) THEN
               work( inv+liip1-1+bindex*ldv+npm1+npm1 ) = toptau
               CALL sgebs2d( ictxt, 'R', ' ', npm1+npm1+1, 1,
     $                       work( inv+liip1-1+bindex*ldv ),
     $                       npm1+npm1+1 )
            ELSE
               CALL sgebr2d( ictxt, 'R', ' ', npm1+npm1+1, 1,
     $                       work( inv+liip1-1+bindex*ldv ),
     $                       npm1+npm1+1, myrow, curcol )
               toptau = work( inv+liip1-1+bindex*ldv+npm1+npm1 )
            END IF
            DO 50 i = 0, npm1 - 1
               work( inh+liip1-1+( bindex+1 )*ldv+i ) = work( inv+liip1-
     $            1+bindex*ldv+npm1+i )
   50       CONTINUE
*
            IF( index.LT.n ) THEN
               IF( myrow.EQ.nxtrow .AND. mycol.EQ.curcol )
     $            a( liip1+( lij-1 )*lda ) = e( lij )
            END IF
*
*     Transpose v, nh
*
*
            IF( myrow.EQ.mycol ) THEN
               DO 60 i = 0, npm1 + npm1
                  work( invt+lijp1-1+bindex*ldv+i ) = work( inv+liip1-1+
     $               bindex*ldv+i )
   60          CONTINUE
            ELSE
               CALL sgesd2d( ictxt, npm1+npm1, 1,
     $                       work( inv+liip1-1+bindex*ldv ), npm1+npm1,
     $                       mycol, myrow )
               CALL sgerv2d( ictxt, nqm1+nqm1, 1,
     $                       work( invt+lijp1-1+bindex*ldv ), nqm1+nqm1,
     $                       mycol, myrow )
            END IF
*
            DO 70 i = 0, nqm1 - 1
               work( inht+lijp1-1+( bindex+1 )*ldv+i ) = work( invt+
     $            lijp1-1+bindex*ldv+nqm1+i )
   70       CONTINUE
*
*
*           Update the current block column of A
*
            IF( index.GT.1 ) THEN
               DO 90 j = lijp1, lijb - 1
                  DO 80 i = 0, npm1 - 1
*
                     a( liip1+i+( j-1 )*lda ) = a( liip1+i+( j-1 )*lda )
     $                   - work( inv+liip1-1+bindex*ldv+i )*
     $                  work( inht+j-1+bindex*ldv ) -
     $                  work( inh+liip1-1+bindex*ldv+i )*
     $                  work( invt+j-1+bindex*ldv )
   80             CONTINUE
   90          CONTINUE
            END IF
*
*
*
*     Compute NV = A * NHT; NVT = A * NH
*
*           These two lines are necessary because these elements
*           are not always involved in the calls to STRMVT
*           for two reasons:
*           1)  On diagonal processors, the call to TRMVT
*               involves only LTNM1-1 elements
*           2)  On some processes, NQM1 < LTM1 or  LIIP1 < LTLIP1
*               and when the results are combined across all processes,
*               uninitialized values may be included.
            work( inv+liip1-1+( bindex+1 )*ldv ) = z_zero
            work( invt+lijp1-1+( bindex+1 )*ldv+nqm1-1 ) = z_zero
*
*
            IF( myrow.EQ.mycol ) THEN
               IF( ltnm1.GT.1 ) THEN
                  CALL strmvt( 'L', ltnm1-1,
     $                         a( ltlip1+1+( lijp1-1 )*lda ), lda,
     $                         work( invt+lijp1-1+( bindex+1 )*ldv ), 1,
     $                         work( inh+ltlip1+1-1+( bindex+1 )*ldv ),
     $                         1, work( inv+ltlip1+1-1+( bindex+1 )*
     $                         ldv ), 1, work( inht+lijp1-1+( bindex+
     $                         1 )*ldv ), 1 )
               END IF
               DO 100 i = 1, ltnm1
                  work( invt+lijp1+i-1-1+( bindex+1 )*ldv )
     $               = work( invt+lijp1+i-1-1+( bindex+1 )*ldv ) +
     $               a( ltlip1+i-1+( lijp1+i-1-1 )*lda )*
     $               work( inh+ltlip1+i-1-1+( bindex+1 )*ldv )
  100          CONTINUE
            ELSE
               IF( ltnm1.GT.0 )
     $            CALL strmvt( 'L', ltnm1, a( ltlip1+( lijp1-1 )*lda ),
     $                         lda, work( invt+lijp1-1+( bindex+1 )*
     $                         ldv ), 1, work( inh+ltlip1-1+( bindex+
     $                         1 )*ldv ), 1, work( inv+ltlip1-1+
     $                         ( bindex+1 )*ldv ), 1,
     $                         work( inht+lijp1-1+( bindex+1 )*ldv ),
     $                         1 )
*
            END IF
*
*
*     We take advantage of the fact that:
*     A * sum( B ) = sum ( A * B ) for matrices A,B
*
*     trueA = A + V * HT + H * VT
*     hence:  (trueA)v = Av' + V * HT * v + H * VT * v
*     VT * v = sum_p_in_NPROW ( VTp * v )
*     H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v )
*
*     v = v + V * HT * h + H * VT * h
*
*
*
*     tmp = HT * nh1
            DO 110 i = 1, 2*( bindex+1 )
               work( intmp-1+i ) = 0
  110       CONTINUE
*
            IF( balanced ) THEN
               npset = nprow
               mysetnum = myrow
               rowsperproc = iceil( nqb, npset )
               myfirstrow = min( nqb+1, 1+rowsperproc*mysetnum )
               numrows = min( rowsperproc, nqb-myfirstrow+1 )
*
*
*     tmp = HT * v
*
               CALL sgemv( 'C', numrows, bindex+1, z_one,
     $                     work( inhtb+myfirstrow-1 ), ldv,
     $                     work( inhtb+myfirstrow-1+( bindex+1 )*ldv ),
     $                     1, z_zero, work( intmp ), 1 )
*     tmp2 = VT * v
               CALL sgemv( 'C', numrows, bindex+1, z_one,
     $                     work( invtb+myfirstrow-1 ), ldv,
     $                     work( inhtb+myfirstrow-1+( bindex+1 )*ldv ),
     $                     1, z_zero, work( intmp+bindex+1 ), 1 )
*
*
               CALL sgsum2d( ictxt, 'C', ' ', 2*( bindex+1 ), 1,
     $                       work( intmp ), 2*( bindex+1 ), -1, -1 )
            ELSE
*     tmp = HT * v
*
               CALL sgemv( 'C', nqb, bindex+1, z_one, work( inhtb ),
     $                     ldv, work( inhtb+( bindex+1 )*ldv ), 1,
     $                     z_zero, work( intmp ), 1 )
*     tmp2 = VT * v
               CALL sgemv( 'C', nqb, bindex+1, z_one, work( invtb ),
     $                     ldv, work( inhtb+( bindex+1 )*ldv ), 1,
     $                     z_zero, work( intmp+bindex+1 ), 1 )
*
            END IF
*
*
*
            IF( balanced ) THEN
               mysetnum = mycol
*
               rowsperproc = iceil( npb, npset )
               myfirstrow = min( npb+1, 1+rowsperproc*mysetnum )
               numrows = min( rowsperproc, npb-myfirstrow+1 )
*
               CALL sgsum2d( ictxt, 'R', ' ', 2*( bindex+1 ), 1,
     $                       work( intmp ), 2*( bindex+1 ), -1, -1 )
*
*
*     v = v + V * tmp
               IF( index.GT.1. ) THEN
                  CALL sgemv( 'N', numrows, bindex+1, z_negone,
     $                        work( invb+myfirstrow-1 ), ldv,
     $                        work( intmp ), 1, z_one,
     $                        work( invb+myfirstrow-1+( bindex+1 )*
     $                        ldv ), 1 )
*
*     v = v + H * tmp2
                  CALL sgemv( 'N', numrows, bindex+1, z_negone,
     $                        work( inhb+myfirstrow-1 ), ldv,
     $                        work( intmp+bindex+1 ), 1, z_one,
     $                        work( invb+myfirstrow-1+( bindex+1 )*
     $                        ldv ), 1 )
               END IF
*
            ELSE
*     v = v + V * tmp
               CALL sgemv( 'N', npb, bindex+1, z_negone, work( invb ),
     $                     ldv, work( intmp ), 1, z_one,
     $                     work( invb+( bindex+1 )*ldv ), 1 )
*
*
*     v = v + H * tmp2
               CALL sgemv( 'N', npb, bindex+1, z_negone, work( inhb ),
     $                     ldv, work( intmp+bindex+1 ), 1, z_one,
     $                     work( invb+( bindex+1 )*ldv ), 1 )
*
            END IF
*
*
*     Transpose NV and add it back into NVT
*
            IF( myrow.EQ.mycol ) THEN
               DO 120 i = 0, nqm1 - 1
                  work( intmp+i ) = work( invt+lijp1-1+( bindex+1 )*ldv+
     $                              i )
  120          CONTINUE
            ELSE
               CALL sgesd2d( ictxt, nqm1, 1,
     $                       work( invt+lijp1-1+( bindex+1 )*ldv ),
     $                       nqm1, mycol, myrow )
               CALL sgerv2d( ictxt, npm1, 1, work( intmp ), npm1, mycol,
     $                       myrow )
*
            END IF
            DO 130 i = 0, npm1 - 1
               work( inv+liip1-1+( bindex+1 )*ldv+i ) = work( inv+liip1-
     $            1+( bindex+1 )*ldv+i ) + work( intmp+i )
  130       CONTINUE
*
*     Sum-to-one NV rowwise (within a row)
*
            CALL sgsum2d( ictxt, 'R', ' ', npm1, 1,
     $                    work( inv+liip1-1+( bindex+1 )*ldv ), npm1,
     $                    myrow, nxtcol )
*
*
*     Dot product c = NV * NH
*     Sum-to-all c within next processor column
*
*
            IF( mycol.EQ.nxtcol ) THEN
               cc( 1 ) = z_zero
               DO 140 i = 0, npm1 - 1
                  cc( 1 ) = cc( 1 ) + work( inv+liip1-1+( bindex+1 )*
     $                      ldv+i )*work( inh+liip1-1+( bindex+1 )*ldv+
     $                      i )
  140          CONTINUE
               IF( myrow.EQ.nxtrow ) THEN
                  cc( 2 ) = work( inv+liip1-1+( bindex+1 )*ldv )
                  cc( 3 ) = work( inh+liip1-1+( bindex+1 )*ldv )
               ELSE
                  cc( 2 ) = z_zero
                  cc( 3 ) = z_zero
               END IF
               CALL sgsum2d( ictxt, 'C', ' ', 3, 1, cc, 3, -1, nxtcol )
*
               topv = cc( 2 )
               c = cc( 1 )
               toph = cc( 3 )
*
               topnv = toptau*( topv-c*toptau / 2*toph )
*
*
*     Compute V = Tau * (V - C * Tau' / 2 * H )
*
*
               DO 150 i = 0, npm1 - 1
                  work( inv+liip1-1+( bindex+1 )*ldv+i ) = toptau*
     $               ( work( inv+liip1-1+( bindex+1 )*ldv+i )-c*toptau /
     $               2*work( inh+liip1-1+( bindex+1 )*ldv+i ) )
  150          CONTINUE
*
            END IF
*
*
  160    CONTINUE
*
*
*     Perform the rank2k update
*
         IF( maxindex.LT.n ) THEN
*
            DO 170 i = 0, npm1 - 1
               work( intmp+i ) = work( inh+liip1-1+anb*ldv+i )
  170       CONTINUE
*
*
*
            IF( .NOT.twogemms ) THEN
               IF( interleave ) THEN
                  ldzg = ldv / 2
               ELSE
                  CALL slamov( 'A', ltnm1, anb, work( inht+lijp1-1 ),
     $                         ldv, work( invt+lijp1-1+anb*ldv ), ldv )
*
                  CALL slamov( 'A', ltnm1, anb, work( inv+ltlip1-1 ),
     $                         ldv, work( inh+ltlip1-1+anb*ldv ), ldv )
                  ldzg = ldv
               END IF
               nbzg = anb*2
            ELSE
               ldzg = ldv
               nbzg = anb
            END IF
*
*
            DO 180 pbmin = 1, ltnm1, pnb
*
               pbsize = min( pnb, ltnm1-pbmin+1 )
               pbmax = min( ltnm1, pbmin+pnb-1 )
               CALL sgemm( 'N', 'C', pbsize, pbmax, nbzg, z_negone,
     $                     work( inh+ltlip1-1+pbmin-1 ), ldzg,
     $                     work( invt+lijp1-1 ), ldzg, z_one,
     $                     a( ltlip1+pbmin-1+( lijp1-1 )*lda ), lda )
               IF( twogemms ) THEN
                  CALL sgemm( 'N', 'C', pbsize, pbmax, anb, z_negone,
     $                        work( inv+ltlip1-1+pbmin-1 ), ldzg,
     $                        work( inht+lijp1-1 ), ldzg, z_one,
     $                        a( ltlip1+pbmin-1+( lijp1-1 )*lda ), lda )
               END IF
  180       CONTINUE
*
*
*
            DO 190 i = 0, npm1 - 1
               work( inv+liip1-1+i ) = work( inv+liip1-1+anb*ldv+i )
               work( inh+liip1-1+i ) = work( intmp+i )
  190       CONTINUE
            DO 200 i = 0, nqm1 - 1
               work( inht+lijp1-1+i ) = work( inht+lijp1-1+anb*ldv+i )
  200       CONTINUE
*
*
         END IF
*
*     End of the update A code
*
  210 CONTINUE
*
      IF( mycol.EQ.nxtcol ) THEN
         IF( myrow.EQ.nxtrow ) THEN
*
            d( nq ) = a( np+( nq-1 )*lda )
*
            CALL sgebs2d( ictxt, 'C', ' ', 1, 1, d( nq ), 1 )
         ELSE
            CALL sgebr2d( ictxt, 'C', ' ', 1, 1, d( nq ), 1, nxtrow,
     $                    nxtcol )
         END IF
      END IF
*
*
*
*
      work( 1 ) = real( lwmin )
      RETURN
*
*     End of PSSYTTRD
*
*
OpenRadioss 2025.1.11 OpenRadioss project
Functions/Subroutines

Function/Subroutine Documentation

◆ pssyttrd()