pzgbtrs.f
1 SUBROUTINE pzgbtrs( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
2 $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 CHARACTER TRANS
10 INTEGER BWU, BWL, IB, INFO, JA, LAF, LWORK, N, NRHS
11* ..
12* .. Array Arguments ..
13 INTEGER DESCA( * ), DESCB( * ), IPIV(*)
14 COMPLEX*16 A( * ), AF( * ), B( * ), WORK( * )
15* ..
16*
17*
18* Purpose
19* =======
20*
21* PZGBTRS solves a system of linear equations
22*
23* A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24* or
25* A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
26*
27* where A(1:N, JA:JA+N-1) is the matrix used to produce the factors
28* stored in A(1:N,JA:JA+N-1) and AF by PZGBTRF.
29* A(1:N, JA:JA+N-1) is an N-by-N complex banded
30* distributed matrix with lower bandwidth BWL and
31* upper bandwidth BWU.
32*
33* Routine PZGBTRF MUST be called first.
34*
35* =====================================================================
36*
37* Arguments
38* =========
39*
40*
41* TRANS (global input) CHARACTER
42* = 'N': Solve with A(1:N, JA:JA+N-1);
43* = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
44*
45* N (global input) INTEGER
46* The number of rows and columns to be operated on, i.e. the
47* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
48*
49* BWL (global input) INTEGER
50* Number of subdiagonals. 0 <= BWL <= N-1
51*
52* BWU (global input) INTEGER
53* Number of superdiagonals. 0 <= BWU <= N-1
54*
55* NRHS (global input) INTEGER
56* The number of right hand sides, i.e., the number of columns
57* of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
58* NRHS >= 0.
59*
60* A (local input/local output) COMPLEX*16 pointer into
61* local memory to an array with first dimension
62* LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
63* On entry, this array contains the local pieces of the
64* LU factors of the N-by-N unsymmetric banded distributed
65* matrix A(1:N, JA:JA+N-1), as computed by PZGBTRF.
66* This local portion is stored in the packed banded format
67* used in LAPACK. Please see the Notes below and the
68* ScaLAPACK manual for more detail on the format of
69* distributed matrices.
70*
71* JA (global input) INTEGER
72* The index in the global array A that points to the start of
73* the matrix to be operated on (which may be either all of A
74* or a submatrix of A).
75*
76* DESCA (global and local input) INTEGER array of dimension DLEN.
77* if 1D type (DTYPE_A=501), DLEN >= 7;
78* if 2D type (DTYPE_A=1), DLEN >= 9.
79* The array descriptor for the distributed matrix A.
80* Contains information of mapping of A to memory. Please
81* see NOTES below for full description and options.
82*
83* IPIV (local input) INTEGER array, dimension >= DESCA( NB ).
84* Pivot indices for local factorizations.
85* Users *should not* alter the contents between
86* factorization and solve.
87*
88* B (local input/local output) COMPLEX*16 pointer into
89* local memory to an array of local lead dimension lld_b>=NB.
90* On entry, this array contains the
91* local pieces of the right hand sides
92* B(IB:IB+N-1, 1:NRHS).
93* On exit, this contains the local piece of the
94* distributed solution matrix X.
95*
96* IB (global input) INTEGER
97* The row index in the global array B that points to the first
98* row of the matrix to be operated on (which may be either
99* all of B or a submatrix of B).
100*
101* DESCB (global and local input) INTEGER array of dimension DLEN.
102* if 1D type (DTYPE_B=502), DLEN >= 7;
103* if 2D type (DTYPE_B=1), DLEN >= 9.
104* The array descriptor for the distributed matrix B.
105* Contains information of mapping of B to memory. Please
106* see NOTES below for full description and options.
107*
108* AF (local input) COMPLEX*16 array, dimension LAF.
109* Auxiliary Fillin Space.
110* Fillin is created during the factorization routine
111* PZGBTRF and this is stored in AF. If a linear system
112* is to be solved using PZGBTRS after the factorization
113* routine, AF *must not be altered* after the factorization.
114*
115* LAF (local input) INTEGER
116* Size of user-input Auxiliary Fillin space AF. Must be >=
117* (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
118* If LAF is not large enough, an error code will be returned
119* and the minimum acceptable size will be returned in AF( 1 )
120*
121* WORK (local workspace/local output)
122* COMPLEX*16 temporary workspace. This space may
123* be overwritten in between calls to routines. WORK must be
124* the size given in LWORK.
125* On exit, WORK( 1 ) contains the minimal LWORK.
126*
127* LWORK (local input or global input) INTEGER
128* Size of user-input workspace WORK.
129* If LWORK is too small, the minimal acceptable size will be
130* returned in WORK(1) and an error code is returned. LWORK>=
131* NRHS*(NB+2*bwl+4*bwu)
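*
*          As a worked example (illustrative values, not taken from
*          this file): with NB=100, BWL=3, BWU=2 and NRHS=1, the two
*          bounds above give
*          LAF >= (100+2)*(3+2) + 6*(3+2)*(3+2*2) = 720 and
*          LWORK >= 1*(100+2*3+4*2) = 114.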
132*
133* INFO (global output) INTEGER
134* = 0: successful exit
135* < 0: If the i-th argument is an array and the j-th entry
136* had an illegal value, then INFO = -(i*100+j); if the
137* i-th argument is a scalar and had an illegal value,
138* then INFO = -i.
139*
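*          For example (an illustrative case): an illegal fourth
*          entry in the eighth argument, DESCA, yields
*          INFO = -(8*100+4) = -804, matching the checks in the
*          code below.
*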
140* =====================================================================
141*
142*
143* Restrictions
144* ============
145*
146* The following are restrictions on the input parameters. Some of these
147* are temporary and will be removed in future releases, while others
148* may reflect fundamental technical limitations.
149*
150* Non-cyclic restriction: VERY IMPORTANT!
151* P*NB>= mod(JA-1,NB)+N.
152* The mapping for matrices must be blocked, reflecting the nature
153* of the divide and conquer algorithm as a task-parallel algorithm.
154* This formula in words is: no processor may have more than one
155* chunk of the matrix.
156*
157* Blocksize cannot be too small:
158* If the matrix spans more than one processor, the following
159* restriction on NB, the size of each block on each processor,
160* must hold:
161* NB >= (BWL+BWU)+1
162* The bulk of parallel computation is done on the matrix of size
163* O(NB) on each processor. If this is too small, divide and conquer
164* is a poor choice of algorithm.
165*
166* Submatrix reference:
167* JA = IB
168* Alignment restriction that prevents unnecessary communication.
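*
* An illustrative check of these restrictions (values assumed, not
* from this file): with P=4, NB=100, JA=IB=1, BWL=3 and BWU=2, any
* N <= P*NB - mod(JA-1,NB) = 400 satisfies the non-cyclic
* restriction, and NB >= (BWL+BWU)+1 = 6 holds comfortably.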
169*
170*
171* =====================================================================
172*
173*
174* Notes
175* =====
176*
177* If the factorization routine and the solve routine are to be called
178* separately (to solve various sets of righthand sides using the same
179* coefficient matrix), the auxiliary space AF *must not be altered*
180* between calls to the factorization routine and the solve routine.
181*
182* The best algorithm for solving banded and tridiagonal linear systems
183* depends on a variety of parameters, especially the bandwidth.
184* Currently, only algorithms designed for the case N/P >> bw are
185* implemented. These go by many names, including Divide and Conquer,
186* Partitioning, domain decomposition-type, etc.
187*
188* Algorithm description: Divide and Conquer
189*
190* The Divide and Conquer algorithm assumes the matrix is narrowly
191* banded compared with the number of equations. In this situation,
192* it is best to distribute the input matrix A one-dimensionally,
193* with columns atomic and rows divided amongst the processes.
194* The basic algorithm divides the banded matrix up into
195* P pieces with one stored on each processor,
196* and then proceeds in 2 phases for the factorization or 3 for the
197* solution of a linear system.
198* 1) Local Phase:
199* The individual pieces are factored independently and in
200* parallel. These factors are applied to the matrix creating
201* fillin, which is stored in a non-inspectable way in auxiliary
202* space AF. Mathematically, this is equivalent to reordering
203* the matrix A as P A P^T and then factoring the principal
204* leading submatrix of size equal to the sum of the sizes of
205* the matrices factored on each processor. The factors of
206* these submatrices overwrite the corresponding parts of A
207* in memory.
208* 2) Reduced System Phase:
209* A small (max(bwl,bwu)* (P-1)) system is formed representing
210* interaction of the larger blocks, and is stored (as are its
211* factors) in the space AF. A parallel Block Cyclic Reduction
212* algorithm is used. For a linear system, a parallel front solve
213* followed by an analogous backsolve, both using the structure
214* of the factored matrix, are performed.
215* 3) Backsubstitution Phase:
216* For a linear system, a local backsubstitution is performed on
217* each processor in parallel.
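*
* Schematically, the three solve phases read as follows (an
* illustrative sketch, not literal code from this routine):
*
*     frontsolve:      solve L_i * y_i = b_i locally on processor i
*     reduced system:  block cyclic reduction on the
*                      max(bwl,bwu)*(P-1) system held in AF
*     backsolve:       solve U_i * x_i = y_i locally on processor i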
218*
219*
220* Descriptors
221* ===========
222*
223* Descriptors now have *types* and differ from ScaLAPACK 1.0.
224*
225* Note: banded codes can use either the old two dimensional
226* or new one-dimensional descriptors, though the processor grid in
227* both cases *must be one-dimensional*. We describe both types below.
228*
229* Each global data object is described by an associated description
230* vector. This vector stores the information required to establish
231* the mapping between an object element and its corresponding process
232* and memory location.
233*
234* Let A be a generic term for any 2D block cyclically distributed array.
235* Such a global array has an associated description vector DESCA.
236* In the following comments, the character _ should be read as
237* "of the global array".
238*
239* NOTATION STORED IN EXPLANATION
240* --------------- -------------- --------------------------------------
241* DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
242* DTYPE_A = 1.
243* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
244* the BLACS process grid A is distribu-
245* ted over. The context itself is glo-
246* bal, but the handle (the integer
247* value) may vary.
248* M_A (global) DESCA( M_ ) The number of rows in the global
249* array A.
250* N_A (global) DESCA( N_ ) The number of columns in the global
251* array A.
252* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
253* the rows of the array.
254* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
255* the columns of the array.
256* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
257* row of the array A is distributed.
258* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
259* first column of the array A is
260* distributed.
261* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
262* array. LLD_A >= MAX(1,LOCr(M_A)).
263*
264* Let K be the number of rows or columns of a distributed matrix,
265* and assume that its process grid has dimension p x q.
266* LOCr( K ) denotes the number of elements of K that a process
267* would receive if K were distributed over the p processes of its
268* process column.
269* Similarly, LOCc( K ) denotes the number of elements of K that a
270* process would receive if K were distributed over the q processes of
271* its process row.
272* The values of LOCr() and LOCc() may be determined via a call to the
273* ScaLAPACK tool function, NUMROC:
274* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
275* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
276* An upper bound for these quantities may be computed by:
277* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
278* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
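*
* A worked instance (illustrative values): for M = 9, MB_A = 2,
* RSRC_A = 0 and NPROW = 2, process row 0 holds blocks 1, 3 and 5,
* so LOCr( 9 ) = 2+2+1 = 5; process row 1 holds blocks 2 and 4, so
* LOCr( 9 ) = 4. The upper bound gives ceil(ceil(9/2)/2)*2 = 6.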
279*
280*
281* One-dimensional descriptors:
282*
283* One-dimensional descriptors are a new addition to ScaLAPACK since
284* version 1.0. They simplify and shorten the descriptor for 1D
285* arrays.
286*
287* Since ScaLAPACK supports two-dimensional arrays as the fundamental
288* object, we allow 1D arrays to be distributed either over the
289* first dimension of the array (as if the grid were P-by-1) or the
290* 2nd dimension (as if the grid were 1-by-P). This choice is
291* indicated by the descriptor type (501 or 502)
292* as described below.
293*
294* IMPORTANT NOTE: the actual BLACS grid represented by the
295* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
296* irrespective of which one-dimensional descriptor type
297* (501 or 502) is input.
298* This routine will interpret the grid properly either way.
299* ScaLAPACK routines *do not support intercontext operations* so that
300* the grid passed to a single ScaLAPACK routine *must be the same*
301* for all array descriptors passed to that routine.
302*
303* NOTE: In all cases where 1D descriptors are used, 2D descriptors
304* may also be used, since a one-dimensional array is a special case
305* of a two-dimensional array with one dimension of size unity.
306* The two-dimensional array used in this case *must* be of the
307* proper orientation:
308* If the appropriate one-dimensional descriptor is DTYPEA=501
309* (1 by P type), then the two dimensional descriptor must
310* have a CTXT value that refers to a 1 by P BLACS grid;
311* If the appropriate one-dimensional descriptor is DTYPEA=502
312* (P by 1 type), then the two dimensional descriptor must
313* have a CTXT value that refers to a P by 1 BLACS grid.
314*
315*
316* Summary of allowed descriptors, types, and BLACS grids:
317* DTYPE 501 502 1 1
318* BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
319* -----------------------------------------------------
320* A OK NO OK NO
321* B NO OK NO OK
322*
323* Note that a consequence of this chart is that it is not possible
324* for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
325* to opposite requirements for the orientation of the BLACS grid,
326* and as noted before, the *same* BLACS context must be used in
327* all descriptors in a single ScaLAPACK subroutine call.
328*
329* Let A be a generic term for any 1D block cyclically distributed array.
330* Such a global array has an associated description vector DESCA.
331* In the following comments, the character _ should be read as
332* "of the global array".
333*
334* NOTATION STORED IN EXPLANATION
335* --------------- ---------- ------------------------------------------
336* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
337* TYPE_A = 501: 1-by-P grid.
338* TYPE_A = 502: P-by-1 grid.
339* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
340* the BLACS process grid A is distribu-
341* ted over. The context itself is glo-
342* bal, but the handle (the integer
343* value) may vary.
344* N_A (global) DESCA( 3 ) The size of the array dimension being
345* distributed.
346* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
347* the distributed dimension of the array.
348* SRC_A (global) DESCA( 5 ) The process row or column over which the
349* first row or column of the array
350* is distributed.
351* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
352* storing the local blocks of the distri-
353* buted array A. Minimum value of LLD_A
354* depends on TYPE_A.
355* TYPE_A = 501: LLD_A >=
356* MAX( size of undistributed dimension, 1 ).
357* TYPE_A = 502: LLD_A >= MAX( NB_A, 1 ).
358* Reserved DESCA( 7 ) Reserved for future use.
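*
* For instance, a type-501 descriptor for the banded matrix A of
* this routine could be filled as follows (an illustrative sketch;
* ICTXT, N, NB, BWL and BWU are assumed to be set already):
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 2*BWL+2*BWU+1
*     DESCA( 7 ) = 0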
359*
360*
361*
362* =====================================================================
363*
364* Implemented for ScaLAPACK by:
365* Andrew J. Cleary, Livermore National Lab and University of Tenn.,
366* and Markus Hegland, Australian National University. Feb., 1997.
367* Based on code written by : Peter Arbenz, ETH Zurich, 1996.
368*
369* =====================================================================
370*
371* .. Parameters ..
372 DOUBLE PRECISION ONE, ZERO
373 parameter( one = 1.0d+0 )
374 parameter( zero = 0.0d+0 )
375 COMPLEX*16 CONE, CZERO
376 parameter( cone = ( 1.0d+0, 0.0d+0 ) )
377 parameter( czero = ( 0.0d+0, 0.0d+0 ) )
378 INTEGER INT_ONE
379 parameter( int_one = 1 )
380 INTEGER DESCMULT, BIGNUM
381 parameter( descmult = 100, bignum = descmult*descmult )
382 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
383 $ lld_, mb_, m_, nb_, n_, rsrc_
384 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
385 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
386 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
387* ..
388* .. Local Scalars ..
389 INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC,
390 $ first_proc, ictxt, ictxt_new, ictxt_save,
391 $ idum2, idum3, j, ja_new, l, lbwl, lbwu, ldbb,
392 $ ldw, llda, lldb, lm, lmj, ln, lptr, mycol,
393 $ myrow, nb, neicol, np, npact, npcol, nprow,
394 $ npstr, np_save, odd_size, part_offset,
395 $ recovery_val, return_code, store_m_b,
396 $ store_n_a, work_size_min, wptr
397* ..
398* .. Local Arrays ..
399 INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
400 $ param_check( 17, 3 )
401* ..
402* .. External Subroutines ..
403 EXTERNAL blacs_gridinfo, desc_convert, globchk, pxerbla,
404 $ reshape
405* ..
406* .. External Functions ..
407 LOGICAL LSAME
408 INTEGER NUMROC
409 EXTERNAL lsame
410 EXTERNAL numroc
411* ..
412* .. Intrinsic Functions ..
413 INTRINSIC ichar, mod
414* ..
415* .. Executable Statements ..
416*
417*
418* Test the input parameters
419*
420 info = 0
421*
422* Convert descriptor into standard form for easy access to
423* parameters, check that grid is of right shape.
424*
425 desca_1xp( 1 ) = 501
426 descb_px1( 1 ) = 502
427*
428 CALL desc_convert( desca, desca_1xp, return_code )
429*
430 IF( return_code .NE. 0) THEN
431 info = -( 8*100 + 2 )
432 ENDIF
433*
434 CALL desc_convert( descb, descb_px1, return_code )
435*
436 IF( return_code .NE. 0) THEN
437 info = -( 11*100 + 2 )
438 ENDIF
439*
440* Consistency checks for DESCA and DESCB.
441*
442* Context must be the same
443 IF( desca_1xp( 2 ) .NE. descb_px1( 2 ) ) THEN
444 info = -( 11*100 + 2 )
445 ENDIF
446*
447* These are alignment restrictions that may or may not be removed
448* in future releases. -Andy Cleary, April 14, 1996.
449*
450* Block sizes must be the same
451 IF( desca_1xp( 4 ) .NE. descb_px1( 4 ) ) THEN
452 info = -( 11*100 + 4 )
453 ENDIF
454*
455* Source processor must be the same
456*
457 IF( desca_1xp( 5 ) .NE. descb_px1( 5 ) ) THEN
458 info = -( 11*100 + 5 )
459 ENDIF
460*
461* Get values out of descriptor for use in code.
462*
463 ictxt = desca_1xp( 2 )
464 csrc = desca_1xp( 5 )
465 nb = desca_1xp( 4 )
466 llda = desca_1xp( 6 )
467 store_n_a = desca_1xp( 3 )
468 lldb = descb_px1( 6 )
469 store_m_b = descb_px1( 3 )
470*
471* Get grid parameters
472*
473*
474 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
475 np = nprow * npcol
476*
477*
478*
479 IF( lsame( trans, 'n' ) ) THEN
480 IDUM2 = ICHAR( 'n' )
481 ELSE IF ( LSAME( TRANS, 'c' ) ) THEN
482 IDUM2 = ICHAR( 'c' )
483 ELSE
484 INFO = -1
485 END IF
486*
487 IF( LWORK .LT. -1 ) THEN
488 INFO = -16
489 ELSE IF ( LWORK .EQ. -1 ) THEN
490 IDUM3 = -1
491 ELSE
492 IDUM3 = 1
493 ENDIF
494*
495 IF( N .LT. 0 ) THEN
496 INFO = -2
497 ENDIF
498*
499 IF( N+JA-1 .GT. STORE_N_A ) THEN
500 INFO = -( 8*100 + 6 )
501 ENDIF
502*
503 IF(( BWL .GT. N-1 ) .OR.
504 $ ( BWL .LT. 0 ) ) THEN
505 INFO = -3
506 ENDIF
507*
508 IF(( BWU .GT. N-1 ) .OR.
509 $ ( BWU .LT. 0 ) ) THEN
510 INFO = -4
511 ENDIF
512*
513 IF( LLDA .LT. (2*BWL+2*BWU+1) ) THEN
514 INFO = -( 8*100 + 6 )
515 ENDIF
516*
517 IF( NB .LE. 0 ) THEN
518 INFO = -( 8*100 + 4 )
519 ENDIF
520*
521 BW = BWU+BWL
522*
523 IF( N+IB-1 .GT. STORE_M_B ) THEN
524 INFO = -( 11*100 + 3 )
525 ENDIF
526*
527 IF( LLDB .LT. NB ) THEN
528 INFO = -( 11*100 + 6 )
529 ENDIF
530*
531 IF( NRHS .LT. 0 ) THEN
532 INFO = -5
533 ENDIF
534*
535* Current alignment restriction
536*
537 IF( JA .NE. IB ) THEN
538 INFO = -7
539 ENDIF
540*
541* Argument checking that is specific to Divide & Conquer routine
542*
543 IF( NPROW .NE. 1 ) THEN
544 INFO = -( 8*100+2 )
545 ENDIF
546*
547 IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN
548 INFO = -( 2 )
549 CALL PXERBLA( ICTXT,
550 $ 'pzgbtrs, d&c alg.: only 1 block per proc',
551 $ -INFO )
552 RETURN
553 ENDIF
554*
555 IF(( JA+N-1 .GT. NB ) .AND. ( NB .LT. (BWL+BWU+1) )) THEN
556 INFO = -( 8*100+4 )
557 CALL PXERBLA( ICTXT,
558 $ 'pzgbtrs, d&c alg.: nb too small',
559 $ -INFO )
560 RETURN
561 ENDIF
562*
563*
564* Check worksize
565*
566 WORK_SIZE_MIN = NRHS*(NB+2*BWL+4*BWU)
567*
568 WORK( 1 ) = WORK_SIZE_MIN
569*
570 IF( LWORK .LT. WORK_SIZE_MIN ) THEN
571 IF( LWORK .NE. -1 ) THEN
572 INFO = -16
573 CALL PXERBLA( ICTXT,
574 $ 'pzgbtrs: worksize error ',
575 $ -INFO )
576 ENDIF
577 RETURN
578 ENDIF
579*
580* Pack params and positions into arrays for global consistency check
581*
582 PARAM_CHECK( 17, 1 ) = DESCB(5)
583 PARAM_CHECK( 16, 1 ) = DESCB(4)
584 PARAM_CHECK( 15, 1 ) = DESCB(3)
585 PARAM_CHECK( 14, 1 ) = DESCB(2)
586 PARAM_CHECK( 13, 1 ) = DESCB(1)
587 PARAM_CHECK( 12, 1 ) = IB
588 PARAM_CHECK( 11, 1 ) = DESCA(5)
589 PARAM_CHECK( 10, 1 ) = DESCA(4)
590 PARAM_CHECK( 9, 1 ) = DESCA(3)
591 PARAM_CHECK( 8, 1 ) = DESCA(1)
592 PARAM_CHECK( 7, 1 ) = JA
593 PARAM_CHECK( 6, 1 ) = NRHS
594 PARAM_CHECK( 5, 1 ) = BWU
595 PARAM_CHECK( 4, 1 ) = BWL
596 PARAM_CHECK( 3, 1 ) = N
597 PARAM_CHECK( 2, 1 ) = IDUM3
598 PARAM_CHECK( 1, 1 ) = IDUM2
599*
600 PARAM_CHECK( 17, 2 ) = 1105
601 PARAM_CHECK( 16, 2 ) = 1104
602 PARAM_CHECK( 15, 2 ) = 1103
603 PARAM_CHECK( 14, 2 ) = 1102
604 PARAM_CHECK( 13, 2 ) = 1101
605 PARAM_CHECK( 12, 2 ) = 10
606 PARAM_CHECK( 11, 2 ) = 805
607 PARAM_CHECK( 10, 2 ) = 804
608 PARAM_CHECK( 9, 2 ) = 803
609 PARAM_CHECK( 8, 2 ) = 801
610 PARAM_CHECK( 7, 2 ) = 7
611 PARAM_CHECK( 6, 2 ) = 5
612 PARAM_CHECK( 5, 2 ) = 4
613 PARAM_CHECK( 4, 2 ) = 3
614 PARAM_CHECK( 3, 2 ) = 2
615 PARAM_CHECK( 2, 2 ) = 16
616 PARAM_CHECK( 1, 2 ) = 1
617*
618* Want to find errors with MIN( ), so if no error, set it to a big
619* number. If there already is an error, multiply by the
620* descriptor multiplier.
621*
622 IF( INFO .GE. 0 ) THEN
623 INFO = BIGNUM
624 ELSE IF( INFO .LT. -DESCMULT ) THEN
625 INFO = -INFO
626 ELSE
627 INFO = -INFO * DESCMULT
628 END IF
629*
630* Check consistency across processors
631*
632 CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17,
633 $ PARAM_CHECK( 1, 3 ), INFO )
634*
635* Prepare output: set info = 0 if no error, and divide by DESCMULT
636* if error is not in a descriptor entry.
637*
638 IF( INFO .EQ. BIGNUM ) THEN
639 INFO = 0
640 ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN
641 INFO = -INFO / DESCMULT
642 ELSE
643 INFO = -INFO
644 END IF
645*
646 IF( INFO .LT. 0 ) THEN
647 CALL PXERBLA( ICTXT, 'pzgbtrs', -INFO )
648 RETURN
649 END IF
650*
651* Quick return if possible
652*
653 IF( N .EQ. 0 )
654 $ RETURN
655*
656 IF( NRHS .EQ. 0 )
657 $ RETURN
658*
659*
660* Adjust addressing into matrix space to properly get into
661* the beginning part of the relevant data
662*
663 PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) )
664*
665 IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN
666 PART_OFFSET = PART_OFFSET + NB
667 ENDIF
668*
669 IF ( MYCOL .LT. CSRC ) THEN
670 PART_OFFSET = PART_OFFSET - NB
671 ENDIF
672*
673* Form a new BLACS grid (the "standard form" grid) with only procs
674* holding part of the matrix, of size 1xNP where NP is adjusted,
675* starting at csrc=0, with JA modified to reflect dropped procs.
676*
677* First processor to hold part of the matrix:
678*
679 FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL )
680*
681* Calculate the new JA while dropping off unused processors.
682*
683 JA_NEW = MOD( JA-1, NB ) + 1
684*
685* Save and compute new value of NP
686*
687 NP_SAVE = NP
688 NP = ( JA_NEW+N-2 )/NB + 1
689*
690* Call utility routine that forms "standard-form" grid
691*
692 CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE,
693 $ FIRST_PROC, INT_ONE, NP )
694*
695* Use new context from standard grid as context.
696*
697 ICTXT_SAVE = ICTXT
698 ICTXT = ICTXT_NEW
699 DESCA_1XP( 2 ) = ICTXT_NEW
700 DESCB_PX1( 2 ) = ICTXT_NEW
701*
702* Get information about new grid.
703*
704 CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
705*
706* Drop out processors that do not have part of the matrix.
707*
708 IF( MYROW .LT. 0 ) THEN
709 GOTO 1234
710 ENDIF
711*
712*
713*
714* Begin main code
715*
716* Move data into workspace - communicate/copy (overlap)
717*
718 IF (MYCOL .LT. NPCOL-1) THEN
719 CALL ZGESD2D( ICTXT, BWU, NRHS, B(NB-BWU+1), LLDB,
720 $ 0, MYCOL + 1)
721 ENDIF
722*
723 IF (MYCOL .LT. NPCOL-1) THEN
724 LM = NB-BWU
725 ELSE
726 LM = NB
727 ENDIF
728*
729 IF (MYCOL .GT. 0) THEN
730 WPTR = BWU+1
731 ELSE
732 WPTR = 1
733 ENDIF
734*
735 LDW = NB+BWU + 2*BW+BWU
736*
737 CALL ZLAMOV( 'g', LM, NRHS, B(1), LLDB, WORK( WPTR ), LDW )
738*
739* Zero out rest of work
740*
741 DO 1501 J=1, NRHS
742 DO 1502 L=WPTR+LM, LDW
743 WORK( (J-1)*LDW+L ) = CZERO
744 1502 CONTINUE
745 1501 CONTINUE
746*
747 IF (MYCOL .GT. 0) THEN
748 CALL ZGERV2D( ICTXT, BWU, NRHS, WORK(1), LDW,
749 $ 0, MYCOL-1)
750 ENDIF
751*
752********************************************************************
753* PHASE 1: Local computation phase -- Solve L*X = B
754********************************************************************
755*
756* Size of main (or odd) partition in each processor
757*
758 ODD_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL )
759*
760 IF (MYCOL .NE. 0) THEN
761 LBWL = BW
762 LBWU = 0
763 APTR = 1
764 ELSE
765 LBWL = BWL
766 LBWU = BWU
767 APTR = 1+BWU
768 ENDIF
769*
770 IF (MYCOL .NE. NPCOL-1) THEN
771 LM = NB - LBWU
772 LN = NB - BW
773 ELSE IF (MYCOL .NE. 0) THEN
774 LM = ODD_SIZE + BWU
775 LN = MAX(ODD_SIZE-BW,0)
776 ELSE
777 LM = N
778 LN = MAX( N-BW, 0 )
779 ENDIF
780*
781 DO 21 J = 1, LN
782*
783 LMJ = MIN(LBWL,LM-J)
784 L = IPIV( J )
785*
786 IF( L .NE. J ) THEN
787 CALL ZSWAP(NRHS, WORK(L), LDW, WORK(J), LDW)
788 ENDIF
789*
790 LPTR = BW+1 + (J-1)*LLDA + APTR
791*
792 CALL ZGERU(LMJ,NRHS,-CONE, A(LPTR),1, WORK(J),LDW,
793 $ WORK(J+1),LDW)
794*
795 21 CONTINUE
796*
797********************************************************************
798* PHASE 2: Global computation phase -- Solve L*X = B
799********************************************************************
800*
801* Define the initial dimensions of the diagonal blocks
802* The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
803*
804 IF (MYCOL .NE. NPCOL-1) THEN
805 BM = BW - LBWU
806 BN = BW
807 ELSE
808 BM = MIN(BW,ODD_SIZE) + BWU
809 BN = MIN(BW,ODD_SIZE)
810 ENDIF
811*
812* Pointer to first element of block bidiagonal matrix in AF
813* Leading dimension of block bidiagonal system
814*
815 BBPTR = (NB+BWU)*BW + 1
816 LDBB = 2*BW + BWU
817*
818 IF (NPCOL .EQ. 1) THEN
819*
820* In this case the loop over the levels will not be
821* performed.
822 CALL ZGETRS( 'n', N-LN, NRHS, AF(BBPTR+BW*LDBB), LDBB,
823 $ IPIV(LN+1), WORK( LN+1 ), LDW, INFO)
824*
825 ENDIF
826*
827* Loop over levels ...
828*
829* The two integers NPACT (number of active processors) and NPSTR
830* (stride between active processors) are used to control the
831* loop.
832*
833 NPACT = NPCOL
834 NPSTR = 1
835*
836* Begin loop over levels
837 200 IF (NPACT .LE. 1) GOTO 300
838*
839* Test if processor is active
840 IF (MOD(MYCOL,NPSTR) .EQ. 0) THEN
841*
842* Send/Receive blocks
843*
844 IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN
845*
846 NEICOL = MYCOL + NPSTR
847*
848 IF (NEICOL/NPSTR .LE. NPACT-1) THEN
849*
850 IF (NEICOL/NPSTR .LT. NPACT-1) THEN
851 BMN = BW
852 ELSE
853 BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU
854 ENDIF
855*
856 CALL ZGESD2D( ICTXT, BM, NRHS,
857 $ WORK(LN+1), LDW, 0, NEICOL )
858*
859 IF( NPACT .NE. 2 ) THEN
860*
861* Receive answers back from partner processor
862*
863 CALL ZGERV2D(ICTXT, BM+BMN-BW, NRHS,
864 $ WORK( LN+1 ), LDW, 0, NEICOL )
865*
866 BM = BM+BMN-BW
867*
868 ENDIF
869*
870 ENDIF
871*
872 ELSE
873*
874 NEICOL = MYCOL - NPSTR
875*
876 IF (NEICOL .EQ. 0) THEN
877 BMN = BW - BWU
878 ELSE
879 BMN = BW
880 ENDIF
881*
882 CALL ZLAMOV( 'g', BM, NRHS, WORK(LN+1), LDW,
883 $ WORK(NB+BWU+BMN+1), LDW )
884*
885 CALL ZGERV2D( ICTXT, BMN, NRHS, WORK( NB+BWU+1 ),
886 $ LDW, 0, NEICOL )
887*
888* and do the permutations and eliminations
889*
890 IF (NPACT .NE. 2) THEN
891*
892* Solve locally for BW variables
893*
894 CALL ZLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BW,
895 $ IPIV(LN+1), 1)
896*
897 CALL ZTRSM('l','l','n','u', BW, NRHS, CONE,
898 $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW)
899*
900* Use soln just calculated to update RHS
901*
902 CALL ZGEMM( 'n', 'n', BM+BMN-BW, NRHS, BW,
903 $ -CONE, AF(BBPTR+BW*LDBB+BW), LDBB,
904 $ WORK(NB+BWU+1), LDW,
905 $ CONE, WORK(NB+BWU+1+BW), LDW )
906*
907* Give answers back to partner processor
908*
909 CALL ZGESD2D( ICTXT, BM+BMN-BW, NRHS,
910 $ WORK(NB+BWU+1+BW), LDW, 0, NEICOL )
911*
912 ELSE
913*
914* Finish up calculations for final level
915*
916 CALL ZLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BM+BMN,
917 $ IPIV(LN+1), 1)
918*
919 CALL ZTRSM('l','l','n','u', BM+BMN, NRHS, CONE,
920 $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW)
921 ENDIF
922*
923 ENDIF
924*
925 NPACT = (NPACT + 1)/2
926 NPSTR = NPSTR * 2
927 GOTO 200
928*
929 ENDIF
930*
931 300 CONTINUE
932*
933*
934**************************************
935* BACKSOLVE
936********************************************************************
937* PHASE 2: Global computation phase -- Solve U*Y = X
938********************************************************************
939*
940 IF (NPCOL .EQ. 1) THEN
941*
942* In this case the loop over the levels will not be
943* performed.
944* In fact, the backsolve portion was done in the call to
945* ZGETRS in the frontsolve.
946*
947 ENDIF
948*
949* Compute variable needed to reverse loop structure in
950* reduced system.
951*
952 RECOVERY_VAL = NPACT*NPSTR - NPCOL
953*
954* Loop over levels
955* Terminal values of NPACT and NPSTR from frontsolve are used
956*
957 2200 IF( NPACT .GE. NPCOL ) GOTO 2300
958*
959 NPSTR = NPSTR/2
960*
961 NPACT = NPACT*2
962*
963* Have to adjust npact for non-power-of-2
964*
965 NPACT = NPACT-MOD( (RECOVERY_VAL/NPSTR), 2 )
966*
967* Find size of submatrix in this proc at this level
968*
969 IF( MYCOL/NPSTR .LT. NPACT-1 ) THEN
970 BN = BW
971 ELSE
972 BN = MIN(BW, NUMROC(N, NB, NPCOL-1, 0, NPCOL) )
973 ENDIF
974*
975* If this processor is even in this level...
976*
977 IF( MOD( MYCOL, 2*NPSTR ) .EQ. 0 ) THEN
978*
979 NEICOL = MYCOL+NPSTR
980*
981 IF( NEICOL/NPSTR .LE. NPACT-1 ) THEN
982*
983 IF( NEICOL/NPSTR .LT. NPACT-1 ) THEN
984 BMN = BW
985 BNN = BW
986 ELSE
987 BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU
988 BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) )
989 ENDIF
990*
991 IF( NPACT .GT. 2 ) THEN
992*
993 CALL ZGESD2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ),
994 $ LDW, 0, NEICOL )
995*
996 CALL ZGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ),
997 $ LDW, 0, NEICOL )
998*
999 ELSE
1000*
1001 CALL ZGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ),
1002 $ LDW, 0, NEICOL )
1003*
1004 ENDIF
1005*
1006 ENDIF
1007*
1008 ELSE
1009* This processor is odd on this level
1010*
1011 NEICOL = MYCOL - NPSTR
1012*
1013 IF (NEICOL .EQ. 0) THEN
1014 BMN = BW - BWU
1015 ELSE
1016 BMN = BW
1017 ENDIF
1018*
1019 IF( NEICOL .LT. NPCOL-1 ) THEN
1020 BNN = BW
1021 ELSE
1022 BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) )
1023 ENDIF
1024*
1025 IF( NPACT .GT. 2 ) THEN
1026*
1027* Move RHS to make room for received solutions
1028*
1029 CALL ZLAMOV( 'g', BW, NRHS, WORK(NB+BWU+1),
1030 $ LDW, WORK(NB+BWU+BW+1), LDW )
1031*
1032 CALL ZGERV2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ),
1033 $ LDW, 0, NEICOL )
1034*
1035 CALL ZGEMM( 'n', 'n', BW, NRHS, BN,
1036 $ -CONE, AF(BBPTR), LDBB,
1037 $ WORK(LN+1), LDW,
1038 $ CONE, WORK(NB+BWU+BW+1), LDW )
1039*
1040*
1041 IF( MYCOL .GT. NPSTR ) THEN
1042*
1043 CALL ZGEMM( 'n', 'n', BW, NRHS, BW,
1044 $ -CONE, AF(BBPTR+2*BW*LDBB), LDBB,
1045 $ WORK(LN+BW+1), LDW,
1046 $ CONE, WORK(NB+BWU+BW+1), LDW )
1047*
1048 ENDIF
1049*
1050 CALL ZTRSM('l','u','n','n', BW, NRHS, CONE,
1051 $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+BW+1), LDW)
1052*
1053* Send new solution to neighbor
1054*
1055 CALL ZGESD2D( ICTXT, BW, NRHS,
1056 $ WORK( NB+BWU+BW+1 ), LDW, 0, NEICOL )
1057*
1058* Copy new solution into expected place
1059*
1060 CALL ZLAMOV( 'g', BW, NRHS, WORK(NB+BWU+1+BW),
1061 $ LDW, WORK(LN+BW+1), LDW )
1062*
1063 ELSE
1064*
1065* Solve with local diagonal block
1066*
1067 CALL ZTRSM( 'l','u','n','n', BN+BNN, NRHS, CONE,
1068 $ AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW)
1069*
1070* Send new solution to neighbor
1071*
1072 CALL ZGESD2D( ICTXT, BW, NRHS,
1073 $ WORK(NB+BWU+1), LDW, 0, NEICOL )
1074*
1075* Shift solutions into expected positions
1076*
1077 CALL ZLAMOV( 'g', BNN+BN-BW, NRHS, WORK(NB+BWU+1+BW),
1078 $ LDW, WORK(LN+1), LDW )
1079*
1080*
1081 IF( (NB+BWU+1) .NE. (LN+1+BW) ) THEN
1082*
1083* Copy one row at a time since spaces may overlap
1084*
1085 DO 1064 J=1, BW
1086 CALL ZCOPY( NRHS, WORK(NB+BWU+J), LDW,
1087 $ WORK(LN+BW+J), LDW )
1088 1064 CONTINUE
1089*
1090 ENDIF
1091*
1092 ENDIF
1093*
1094 ENDIF
1095*
1096 GOTO 2200
1097*
1098 2300 CONTINUE
1099* End of loop over levels
1100*
1101********************************************************************
1102* PHASE 1: (Almost) Local computation phase -- Solve U*Y = X
1103********************************************************************
1104*
1105* Reset BM to value it had before reduced system frontsolve...
1106*
1107 IF (MYCOL .NE. NPCOL-1) THEN
1108 BM = BW - LBWU
1109 ELSE
1110 BM = MIN(BW,ODD_SIZE) + BWU
1111 ENDIF
1112*
1113* First metastep is to account for the fillin blocks AF
1114*
1115 IF( MYCOL .LT. NPCOL-1 ) THEN
1116*
1117 CALL ZGESD2D( ICTXT, BW, NRHS, WORK( NB-BW+1 ),
1118 $ LDW, 0, MYCOL+1 )
1119*
1120 ENDIF
1121*
1122 IF( MYCOL .GT. 0 ) THEN
1123*
1124 CALL ZGERV2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ),
1125 $ LDW, 0, MYCOL-1 )
1126*
1127* Modify local right hand sides with received rhs's
1128*
1129 CALL ZGEMM( 'n', 'n', LM-BM, NRHS, BW, -CONE,
1130 $ AF( 1 ), LM, WORK( NB+BWU+1 ), LDW, CONE,
1131 $ WORK( 1 ), LDW )
1132*
1133 ENDIF
1134*
1135 DO 2021 J = LN, 1, -1
1136*
1137 LMJ = MIN( BW, ODD_SIZE-1 )
1138*
1139 LPTR = BW-1+J*LLDA+APTR
1140*
1141* In the following, the TRANS=T option is used to reverse
1142* the order of multiplication, not as a true transpose
1143*
1144 CALL ZGEMV( 't', LMJ, NRHS, -CONE, WORK( J+1), LDW,
1145 $ A( LPTR ), LLDA-1, CONE, WORK( J ), LDW )
1146*
1147* Divide by diagonal element
1148*
1149 CALL ZSCAL( NRHS, CONE/A( LPTR-LLDA+1 ),
1150 $ WORK( J ), LDW )
1151 2021 CONTINUE
1152*
1153*
1154*
1155 CALL ZLAMOV( 'g', ODD_SIZE, NRHS, WORK( 1 ), LDW,
1156 $ B( 1 ), LLDB )
1157*
1158* Free BLACS space used to hold standard-form grid.
1159*
1160 ICTXT = ICTXT_SAVE
1161 IF( ICTXT .NE. ICTXT_NEW ) THEN
1162 CALL BLACS_GRIDEXIT( ICTXT_NEW )
1163 ENDIF
1164*
1165 1234 CONTINUE
1166*
1167* Restore saved input parameters
1168*
1169 NP = NP_SAVE
1170*
1171* Output worksize
1172*
1173 WORK( 1 ) = WORK_SIZE_MIN
1174*
1175 RETURN
1176*
1177* End of PZGBTRS
1178*
1179 END