Actual source code: bvlapack.c
slepc-3.15.0 2021-03-31
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV private kernels that use LAPACK
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepcblaslapack.h>
17: /*
18: Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
19: */
20: SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
21: {
22: PetscBLASInt i,n=*len;
23: PetscReal *x = (PetscReal*)in,*y = (PetscReal*)inout;
26: if (PetscUnlikely(*datatype!=MPIU_REAL)) {
27: (*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
28: MPI_Abort(MPI_COMM_WORLD,1);
29: }
30: for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
31: PetscFunctionReturnVoid();
32: }
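/*
   Editorial note (not part of the original file): a reduction function with this
   MPI_User_function signature is normally attached to an MPI_Op via MPI_Op_create()
   and then passed to an Allreduce. A minimal sketch with hypothetical names
   (lapy2_op, lnrm, nrm, comm); SLEPc's own operator built this way is the MPIU_LAPY2
   used in BVNorm_LAPACK_Private() below.

     MPI_Op lapy2_op;
     MPI_Op_create((MPI_User_function*)SlepcPythag,1,&lapy2_op);  // commutative: hypot(x,y)=hypot(y,x)
     MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,lapy2_op,comm);          // nrm = sqrt(sum over ranks of lnrm^2)
     MPI_Op_free(&lapy2_op);
*/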
34: /*
35: Compute ||A|| for an mxn matrix
36: */
37: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
38: {
40: PetscBLASInt m,n,i,j;
41: PetscMPIInt len;
42: PetscReal lnrm,*rwork=NULL,*rwork2=NULL;
45: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
46: PetscBLASIntCast(m_,&m);
47: PetscBLASIntCast(n_,&n);
48: if (type==NORM_FROBENIUS || type==NORM_2) {
49: lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
50: if (mpi) {
51: MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
52: } else *nrm = lnrm;
53: PetscLogFlops(2.0*m*n);
54: } else if (type==NORM_1) {
55: if (mpi) {
56: BVAllocateWork_Private(bv,2*n_);
57: rwork = (PetscReal*)bv->work;
58: rwork2 = rwork+n_;
59: PetscArrayzero(rwork,n_);
60: PetscArrayzero(rwork2,n_);
61: for (j=0;j<n_;j++) {
62: for (i=0;i<m_;i++) {
63: rwork[j] += PetscAbsScalar(A[i+j*m_]);
64: }
65: }
66: PetscMPIIntCast(n_,&len);
67: MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
68: *nrm = 0.0;
69: for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
70: } else {
71: *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
72: }
73: PetscLogFlops(1.0*m*n);
74: } else if (type==NORM_INFINITY) {
75: BVAllocateWork_Private(bv,m_);
76: rwork = (PetscReal*)bv->work;
77: lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
78: if (mpi) {
79: MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
80: } else *nrm = lnrm;
81: PetscLogFlops(1.0*m*n);
82: }
83: PetscFPTrapPop();
84: return(0);
85: }
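/*
   Editorial note (not part of the original file): how the local norms above combine into
   the global one when mpi is true. For NORM_FROBENIUS/NORM_2, ||A||_F^2 is the sum of the
   squared local norms of the row-distributed blocks, so reducing the local values with
   MPIU_LAPY2 (hypot) gives the global norm; e.g. local norms 3 and 4 on two processes
   yield sqrt(3^2+4^2)=5. For NORM_1 the local column sums are added across processes and
   the maximum column sum is taken afterwards. For NORM_INFINITY each row is stored entirely
   on one process, so the global norm is the maximum of the local row-sum maxima.
*/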
87: /*
88: Normalize the columns of an mxn matrix A
89: */
90: PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscScalar *eigi,PetscBool mpi)
91: {
93: PetscBLASInt m,j,k,info,zero=0;
94: PetscMPIInt len;
95: PetscReal *norms,*rwork=NULL,*rwork2=NULL,done=1.0;
98: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
99: PetscBLASIntCast(m_,&m);
100: BVAllocateWork_Private(bv,2*n_);
101: rwork = (PetscReal*)bv->work;
102: rwork2 = rwork+n_;
103: /* compute local norms */
104: for (j=0;j<n_;j++) {
105: k = 1;
106: #if !defined(PETSC_USE_COMPLEX)
107: if (eigi && eigi[j] != 0.0) k = 2;
108: #endif
109: rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*m_),&m,rwork2);
110: if (k==2) { rwork[j+1] = rwork[j]; j++; }
111: }
112: /* reduction to get global norms */
113: if (mpi) {
114: PetscMPIIntCast(n_,&len);
115: PetscArrayzero(rwork2,n_);
116: MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
117: norms = rwork2;
118: } else norms = rwork;
119: /* scale columns */
120: for (j=0;j<n_;j++) {
121: k = 1;
122: #if !defined(PETSC_USE_COMPLEX)
123: if (eigi && eigi[j] != 0.0) k = 2;
124: #endif
125: PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*m_),&m,&info));
126: SlepcCheckLapackInfo("lascl",info);
127: if (k==2) j++;
128: }
129: PetscLogFlops(3.0*m*n_);
130: PetscFPTrapPop();
131: return(0);
132: }
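/*
   Editorial note (not part of the original file): in real arithmetic a complex conjugate
   eigenvalue pair (eigi[j]!=0) occupies two consecutive columns holding the real and
   imaginary parts. The loops above therefore treat such a pair as one m x 2 block: a single
   Frobenius norm is computed for the block and both columns are divided by it (lascl scales
   by done/norms[j]), so the pair is normalized jointly and stays consistent.
*/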
134: /*
135: Compute the upper Cholesky factor in R and its inverse in S.
136: If S == R then the inverse overwrites the Cholesky factor.
137: */
138: PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
139: {
141: PetscInt i,k,l,n,m,ld,lds;
142: PetscScalar *pR,*pS;
143: PetscBLASInt info,n_ = 0,m_ = 0,ld_,lds_;
146: l = bv->l;
147: k = bv->k;
148: MatGetSize(R,&m,NULL);
149: n = k-l;
150: PetscBLASIntCast(m,&m_);
151: PetscBLASIntCast(n,&n_);
152: ld = m;
153: ld_ = m_;
154: MatDenseGetArray(R,&pR);
156: if (S==R) {
157: BVAllocateWork_Private(bv,m*k);
158: pS = bv->work;
159: lds = ld;
160: lds_ = ld_;
161: } else {
162: MatDenseGetArray(S,&pS);
163: MatGetSize(S,&lds,NULL);
164: PetscBLASIntCast(lds,&lds_);
165: }
167: /* save a copy of matrix in S */
168: for (i=l;i<k;i++) {
169: PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
170: }
172: /* compute upper Cholesky factor in R */
173: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
174: PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
175: PetscLogFlops((1.0*n*n*n)/3.0);
177: if (info) { /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
178: for (i=l;i<k;i++) {
179: PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n);
180: pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
181: }
182: PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
183: SlepcCheckLapackInfo("potrf",info);
184: PetscLogFlops((1.0*n*n*n)/3.0);
185: }
187: /* compute S = inv(R) */
188: if (S==R) {
189: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
190: } else {
191: PetscArrayzero(pS+l*lds,(k-l)*k);
192: for (i=l;i<k;i++) {
193: PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
194: }
195: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
196: }
197: SlepcCheckLapackInfo("trtri",info);
198: PetscFPTrapPop();
199: PetscLogFlops(0.33*n*n*n);
201: /* Zero out entries below the diagonal */
202: for (i=l;i<k-1;i++) {
203: PetscArrayzero(pR+i*ld+i+1,(k-i-1));
204: if (S!=R) { PetscArrayzero(pS+i*lds+i+1,(k-i-1)); }
205: }
206: MatDenseRestoreArray(R,&pR);
207: if (S!=R) { MatDenseRestoreArray(S,&pS); }
208: return(0);
209: }
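/*
   Editorial note (not part of the original file): why these two factors are useful. If the
   active block of R holds the Gram matrix V'*V and its upper Cholesky factor satisfies
   V'*V = R'*R, then S = inv(R) gives (V*S)'*(V*S) = S'*(V'*V)*S = I, i.e. post-multiplying
   the basis by S orthonormalizes its active columns. The retry with a diagonal perturbation
   of 50*PETSC_MACHINE_EPSILON lets potrf succeed when V'*V is numerically semidefinite.
*/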
211: /*
212: Compute the inverse of an upper triangular matrix R, store it in S.
213: If S == R then the inverse overwrites R.
214: */
215: PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
216: {
218: PetscInt i,k,l,n,m,ld,lds;
219: PetscScalar *pR,*pS;
220: PetscBLASInt info,n_,m_ = 0,ld_,lds_;
223: l = bv->l;
224: k = bv->k;
225: MatGetSize(R,&m,NULL);
226: n = k-l;
227: PetscBLASIntCast(m,&m_);
228: PetscBLASIntCast(n,&n_);
229: ld = m;
230: ld_ = m_;
231: MatDenseGetArray(R,&pR);
233: if (S==R) {
234: BVAllocateWork_Private(bv,m*k);
235: pS = bv->work;
236: lds = ld;
237: lds_ = ld_;
238: } else {
239: MatDenseGetArray(S,&pS);
240: MatGetSize(S,&lds,NULL);
241: PetscBLASIntCast(lds,&lds_);
242: }
244: /* compute S = inv(R) */
245: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
246: if (S==R) {
247: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
248: } else {
249: PetscArrayzero(pS+l*lds,(k-l)*k);
250: for (i=l;i<k;i++) {
251: PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
252: }
253: PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
254: }
255: SlepcCheckLapackInfo("trtri",info);
256: PetscFPTrapPop();
257: PetscLogFlops(0.33*n*n*n);
259: MatDenseRestoreArray(R,&pR);
260: if (S!=R) { MatDenseRestoreArray(S,&pS); }
261: return(0);
262: }
264: /*
265: Compute the matrix to be used for post-multiplying the basis in the SVQB
266: block orthogonalization method.
267: On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
268: the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
269: If S == R then the result overwrites R.
270: */
271: PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
272: {
274: PetscInt i,j,k,l,n,m,ld,lds;
275: PetscScalar *pR,*pS,*D,*work,a;
276: PetscReal *eig,dummy;
277: PetscBLASInt info,lwork,n_,m_ = 0,ld_,lds_;
278: #if defined(PETSC_USE_COMPLEX)
279: PetscReal *rwork,rdummy;
280: #endif
283: l = bv->l;
284: k = bv->k;
285: MatGetSize(R,&m,NULL);
286: n = k-l;
287: PetscBLASIntCast(m,&m_);
288: PetscBLASIntCast(n,&n_);
289: ld = m;
290: ld_ = m_;
291: MatDenseGetArray(R,&pR);
293: if (S==R) {
294: pS = pR;
295: lds = ld;
296: lds_ = ld_;
297: } else {
298: MatDenseGetArray(S,&pS);
299: MatGetSize(S,&lds,NULL);
300: PetscBLASIntCast(lds,&lds_);
301: }
302: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
304: /* workspace query and memory allocation */
305: lwork = -1;
306: #if defined(PETSC_USE_COMPLEX)
307: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
308: PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork);
309: PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork);
310: #else
311: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
312: PetscBLASIntCast((PetscInt)a,&lwork);
313: PetscMalloc3(n,&eig,n,&D,lwork,&work);
314: #endif
316: /* copy and scale matrix */
317: for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
318: for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
319: for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];
321: /* compute eigendecomposition */
322: #if defined(PETSC_USE_COMPLEX)
323: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
324: #else
325: PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
326: #endif
327: SlepcCheckLapackInfo("syev",info);
329: if (S!=R) { /* R = U' */
330: for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
331: }
333: /* compute S = D*U*Lambda^{-1/2} */
334: for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
335: for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);
337: if (S!=R) { /* compute R = inv(S) = Lambda^{1/2}*U'/D */
338: for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
339: for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
340: }
342: #if defined(PETSC_USE_COMPLEX)
343: PetscFree4(eig,D,work,rwork);
344: #else
345: PetscFree3(eig,D,work);
346: #endif
347: PetscLogFlops(9.0*n*n*n);
348: PetscFPTrapPop();
350: MatDenseRestoreArray(R,&pR);
351: if (S!=R) { MatDenseRestoreArray(S,&pS); }
352: return(0);
353: }
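/*
   Editorial note (not part of the original file): why S = D*U*Lambda^{-1/2} orthonormalizes.
   With D = diag(R)^{-1/2}, the scaled matrix D*R*D is symmetric with unit diagonal and has
   eigendecomposition D*R*D = U*Lambda*U', so
       S'*R*S = Lambda^{-1/2}*U'*(D*R*D)*U*Lambda^{-1/2} = I.
   Hence if R = V'*V on input, V*S has orthonormal columns. The factor stored in R when
   S!=R, namely Lambda^{1/2}*U'/D, is exactly inv(S), since the product R*S telescopes to I.
*/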
355: /*
356: QR factorization of an mxn matrix via parallel TSQR
357: */
358: PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
359: {
361: PetscInt level,plevel,nlevels,powtwo,lda,worklen;
362: PetscBLASInt m,n,i,j,k,l,s = 0,nb,sz,lwork,info;
363: PetscScalar *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
364: PetscMPIInt rank,size,count,stride;
365: MPI_Datatype tmat;
368: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
369: PetscBLASIntCast(m_,&m);
370: PetscBLASIntCast(n_,&n);
371: k = PetscMin(m,n);
372: nb = 16;
373: lda = 2*n;
374: MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
375: MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
376: nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
377: powtwo = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
378: worklen = n+n*nb;
379: if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
380: BVAllocateWork_Private(bv,worklen);
381: tau = bv->work;
382: work = bv->work+n;
383: PetscBLASIntCast(n*nb,&lwork);
384: if (nlevels) {
385: A = bv->work+n+n*nb;
386: QQ = bv->work+n+n*nb+n*lda;
387: C = bv->work+n+n*nb+n*lda+n*lda*nlevels;
388: }
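/*
   Editorial note (not part of the original file): layout of the bv->work buffer of length
   n+n*nb+n*lda*(nlevels+2), with lda=2*n:
     tau  (n entries)          Householder scalars from geqrf
     work (n*nb entries)       LAPACK workspace, block size nb=16
     A    (n*lda entries)      stacked pair of n x n triangular factors at the current level
     QQ   (n*lda*nlevels)      small orthogonal factor generated at each tree level
     C    (n*lda entries)      scratch for the final gemm accumulation
*/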
390: /* Compute QR */
391: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
392: SlepcCheckLapackInfo("geqrf",info);
394: /* Extract R */
395: if (R || nlevels) {
396: for (j=0;j<n;j++) {
397: for (i=0;i<=PetscMin(j,m-1);i++) {
398: if (nlevels) A[i+j*lda] = Q[i+j*m];
399: else R[i+j*ldr] = Q[i+j*m];
400: }
401: for (i=PetscMin(j,m-1)+1;i<n;i++) {
402: if (nlevels) A[i+j*lda] = 0.0;
403: else R[i+j*ldr] = 0.0;
404: }
405: }
406: }
408: /* Compute orthogonal matrix in Q */
409: PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&m,tau,work,&lwork,&info));
410: SlepcCheckLapackInfo("orgqr",info);
412: if (nlevels) {
414: PetscMPIIntCast(n,&count);
415: PetscMPIIntCast(lda,&stride);
416: PetscBLASIntCast(lda,&l);
417: MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat);
418: MPI_Type_commit(&tmat);
420: for (level=nlevels;level>=1;level--) {
422: plevel = PetscPowInt(2,level);
423: PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);
425: /* Stack triangular matrices */
426: if (rank<s && s<size) { /* send top part, receive bottom part */
427: MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
428: } else if (s<size) { /* copy top to bottom, receive top part */
429: MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
430: MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
431: }
432: if (level<nlevels && size!=powtwo) { /* for cases when size is not a power of 2 */
433: if (rank<size-powtwo) { /* send bottom part */
434: MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv));
435: } else if (rank>=powtwo) { /* receive bottom part */
436: MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
437: }
438: }
439: /* Compute QR and build orthogonal matrix */
440: if (level<nlevels || (level==nlevels && s<size)) {
441: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
442: SlepcCheckLapackInfo("geqrf",info);
443: PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda);
444: PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
445: SlepcCheckLapackInfo("orgqr",info);
446: for (j=0;j<n;j++) {
447: for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
448: }
449: } else if (level==nlevels) { /* only one triangular matrix, set Q=I */
450: PetscArrayzero(QQ+(level-1)*n*lda,n*lda);
451: for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
452: }
453: }
455: /* Extract R */
456: if (R) {
457: for (j=0;j<n;j++) {
458: for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
459: for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
460: }
461: }
463: /* Accumulate orthogonal matrices */
464: for (level=1;level<=nlevels;level++) {
465: plevel = PetscPowInt(2,level);
466: PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);
467: Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
468: if (level<nlevels) {
469: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
470: PetscArraycpy(QQ+level*n*lda,C,n*lda);
471: } else {
472: for (i=0;i<m/l;i++) {
473: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&m,Qhalf,&l,&zero,C,&l));
474: for (j=0;j<n;j++) { PetscArraycpy(Q+i*l+j*m,C+j*l,l); }
475: }
476: sz = m%l;
477: if (sz) {
478: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&m,Qhalf,&l,&zero,C,&l));
479: for (j=0;j<n;j++) { PetscArraycpy(Q+(m/l)*l+j*m,C+j*l,sz); }
480: }
481: }
482: }
484: MPI_Type_free(&tmat);
485: }
487: PetscLogFlops(3.0*m*n*n);
488: PetscFPTrapPop();
489: return(0);
490: }
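/*
   Editorial note (not part of the original file): the routine above is the standard TSQR
   tree. Each process first factors its local m x n block (geqrf+orgqr). Then, over
   ceil(log2(size)) levels, partner processes stack their two n x n triangular factors into
   a 2n x n matrix and refactor it; after the last level the n x n triangle left in A is an
   R factor of the whole distributed matrix. The global Q is recovered in the final loop by
   multiplying the local orthogonal factor by the chain of small factors saved in QQ, one
   gemm per level, applied to the local rows block by block at the last level.
*/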
492: /*
493: Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
494: all matrices are upper triangular stored in packed format
495: */
496: SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
497: {
499: PetscBLASInt n,i,j,k,one=1;
500: PetscMPIInt tsize;
501: PetscScalar v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
502: PetscReal c;
505: MPI_Type_size(*datatype,&tsize); /* we assume len=1 */
506: tsize /= sizeof(PetscScalar);
507: n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
508: for (j=0;j<n;j++) {
509: for (i=0;i<=j;i++) {
510: LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
511: R1[(2*n-j-1)*j/2+j] = v;
512: k = n-j-1;
513: if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
514: }
515: }
516: PetscFunctionReturnVoid();
517: }
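/*
   Editorial note (not part of the original file): packed storage used above. The n x n upper
   triangular matrices are stored row by row keeping only the entries with i<=j, so element
   (i,j) lives at offset (2*n-i-1)*i/2+j and the total length is n*(n+1)/2 (the value s in
   BVOrthogonalize_LAPACK_TSQR_OnlyR below). For n=3 the offsets are
       row 0: 0 1 2
       row 1:   3 4
       row 2:     5
   Each lartg call generates a Givens rotation that annihilates entry (i,j) of R2 against the
   diagonal entry (j,j) of R1, and rot then applies it to the trailing parts of row j of R1
   and row i of R2.
*/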
519: /*
520: Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
521: */
522: PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
523: {
525: PetscInt worklen;
526: PetscBLASInt m,n,i,j,s,nb,lwork,info;
527: PetscScalar *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
528: PetscMPIInt size,count;
529: MPI_Datatype tmat;
532: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
533: PetscBLASIntCast(m_,&m);
534: PetscBLASIntCast(n_,&n);
535: nb = 16;
536: s = n+n*(n-1)/2; /* length of packed triangular matrix */
537: MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
538: worklen = n+n*nb+2*s+m*n;
539: BVAllocateWork_Private(bv,worklen);
540: tau = bv->work;
541: work = bv->work+n;
542: R1 = bv->work+n+n*nb;
543: R2 = bv->work+n+n*nb+s;
544: A = bv->work+n+n*nb+2*s;
545: PetscBLASIntCast(n*nb,&lwork);
546: PetscArraycpy(A,Q,m*n);
548: /* Compute QR */
549: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info));
550: SlepcCheckLapackInfo("geqrf",info);
552: if (size==1) {
553: /* Extract R */
554: for (j=0;j<n;j++) {
555: for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*m];
556: for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
557: }
558: } else {
559: /* Use MPI reduction operation to obtain global R */
560: PetscMPIIntCast(s,&count);
561: MPI_Type_contiguous(count,MPIU_SCALAR,&tmat);
562: MPI_Type_commit(&tmat);
563: for (i=0;i<n;i++) {
564: for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*m]:0.0;
565: }
566: MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv));
567: for (i=0;i<n;i++) {
568: for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
569: for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
570: }
571: MPI_Type_free(&tmat);
572: }
574: PetscLogFlops(3.0*m*n*n);
575: PetscFPTrapPop();
576: return(0);
577: }
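/*
   Editorial note (not part of the original file): since only R is required, the explicit
   tree of BVOrthogonalize_LAPACK_TSQR is replaced by a single MPIU_Allreduce on the packed
   local triangular factors (length n*(n+1)/2). The reduction operator MPIU_TSQR is the SLEPc
   operator built from SlepcGivensPacked above, so each pairwise reduction replaces Rin1 with
   the triangular factor of qr([Rin1;Rin2]) and the final result is an R factor of the whole
   distributed matrix.
*/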