Actual source code: fnexp.c
slepc-3.18.3 2023-03-24
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h>
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
19: *y = PetscExpScalar(x);
20: return 0;
21: }
23: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
24: {
25: *y = PetscExpScalar(x);
26: return 0;
27: }
29: #define MAX_PADE 6
30: #define SWAP(a,b,t) {t=a;a=b;b=t;}
32: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
33: {
34: PetscBLASInt n=0,ld,ld2,*ipiv,info,inc=1;
35: PetscInt m,j,k,sexp;
36: PetscBool odd;
37: const PetscInt p=MAX_PADE;
38: PetscReal c[MAX_PADE+1],s,*rwork;
39: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
40: PetscScalar *Ba,*As,*A2,*Q,*P,*W,*aux;
41: const PetscScalar *Aa;
43: MatDenseGetArrayRead(A,&Aa);
44: MatDenseGetArray(B,&Ba);
45: MatGetSize(A,&m,NULL);
46: PetscBLASIntCast(m,&n);
47: ld = n;
48: ld2 = ld*ld;
49: P = Ba;
50: PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
51: PetscArraycpy(As,Aa,ld2);
53: /* Pade' coefficients */
54: c[0] = 1.0;
55: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
57: /* Scaling */
58: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
59: PetscLogFlops(1.0*n*n);
60: if (s>0.5) {
61: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
62: scale = PetscPowRealInt(2.0,-sexp);
63: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
64: PetscLogFlops(1.0*n*n);
65: } else sexp = 0;
67: /* Horner evaluation */
68: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
69: PetscLogFlops(2.0*n*n*n);
70: PetscArrayzero(Q,ld2);
71: PetscArrayzero(P,ld2);
72: for (j=0;j<n;j++) {
73: Q[j+j*ld] = c[p];
74: P[j+j*ld] = c[p-1];
75: }
77: odd = PETSC_TRUE;
78: for (k=p-1;k>0;k--) {
79: if (odd) {
80: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
81: SWAP(Q,W,aux);
82: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
83: odd = PETSC_FALSE;
84: } else {
85: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
86: SWAP(P,W,aux);
87: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
88: odd = PETSC_TRUE;
89: }
90: PetscLogFlops(2.0*n*n*n);
91: }
92: /*if (odd) {
93: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
94: SWAP(Q,W,aux);
95: PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
96: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
97: SlepcCheckLapackInfo("gesv",info);
98: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
99: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
100: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
101: } else {*/
102: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
103: SWAP(P,W,aux);
104: PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
105: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
106: SlepcCheckLapackInfo("gesv",info);
107: PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
108: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
109: /*}*/
110: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
112: for (k=1;k<=sexp;k++) {
113: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
114: PetscArraycpy(P,W,ld2);
115: }
116: if (P!=Ba) PetscArraycpy(Ba,P,ld2);
117: PetscLogFlops(2.0*n*n*n*sexp);
119: PetscFree6(Q,W,As,A2,rwork,ipiv);
120: MatDenseRestoreArrayRead(A,&Aa);
121: MatDenseRestoreArray(B,&Ba);
122: return 0;
123: }
125: /*
126: * Set scaling factor (s) and Pade degree (k,m)
127: */
128: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
129: {
130: if (nrm>1) {
131: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
132: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
133: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
134: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
135: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
136: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
137: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
138: else {*s = 1; *k = 1; *m = *k+1;}
139: } else { /* nrm<1 */
140: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
141: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
142: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
143: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
144: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
145: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
146: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
147: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
148: else {*s = 0; *k = 1; *m = 0;}
149: }
150: return 0;
151: }
153: #if defined(PETSC_HAVE_COMPLEX)
154: /*
155: * Partial fraction form coefficients.
156: * If query, the function returns the size necessary to store the coefficients.
157: */
158: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
159: {
160: PetscInt i;
161: const PetscComplex /* m == k+1 */
162: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
163: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
164: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
165: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
166: -2.733432894659307e+02 },
167: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
168: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
169: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
170: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
171: 6.286704751729261e+00 },
172: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
173: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
174: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
175: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
176: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
177: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
178: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
179: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
180: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
181: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
182: -1.829749817484586e+01 },
183: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
184: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
185: 3.637834252744491e+00 },
186: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
187: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
188: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
189: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
190: const PetscComplex /* m == k-1 */
191: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
192: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
193: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
194: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
195: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
196: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
197: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
198: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
199: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
200: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
201: -1.734353918633177e+02 },
202: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
203: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
204: 5.648485971016893e+00 },
205: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
206: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
207: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
208: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
209: const PetscScalar /* m == k-1 */
210: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
211: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
212: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
213: m1remain2[2] = {-0.5, -3.5},
214: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
215: remain2[3] = {1.0/2.0, 1, 1};
217: if (query) { /* query about buffer's size */
218: if (m==k+1) {
219: *remain = 0;
220: *r = *q = k+1;
221: return 0; /* quick return */
222: }
223: if (m==k-1) {
224: *remain = 2;
225: if (k==5) *r = *q = 4;
226: else if (k==4) *r = *q = 3;
227: else if (k==3) *r = *q = 2;
228: else if (k==2) *r = *q = 1;
229: }
230: if (m==0) {
231: *r = *q = 0;
232: *remain = k+1;
233: }
234: } else {
235: if (m==k+1) {
236: if (k==4) {
237: for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
238: } else if (k==3) {
239: for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
240: } else if (k==2) {
241: for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
242: } else if (k==1) {
243: for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
244: }
245: return 0; /* quick return */
246: }
247: if (m==k-1) {
248: if (k==5) {
249: for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
250: for (i=0;i<2;i++) remain[i] = m1remain5[i];
251: } else if (k==4) {
252: for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
253: for (i=0;i<2;i++) remain[i] = m1remain4[i];
254: } else if (k==3) {
255: for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
256: } else if (k==2) {
257: r[0] = -13.5; q[0] = 3;
258: for (i=0;i<2;i++) remain[i] = m1remain2[i];
259: }
260: }
261: if (m==0) {
262: r = q = NULL;
263: if (k==3) {
264: for (i=0;i<4;i++) remain[i] = remain3[i];
265: } else if (k==2) {
266: for (i=0;i<3;i++) remain[i] = remain2[i];
267: }
268: }
269: }
270: return 0;
271: }
273: /*
274: * Product form coefficients.
275: * If query, the function returns the size necessary to store the coefficients.
276: */
277: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
278: {
279: PetscInt i;
280: const PetscComplex /* m == k+1 */
281: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
282: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
283: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
284: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
285: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
286: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
287: 6.286704751729261e+00 ,
288: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
289: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
290: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
291: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
292: -5.648485971016893e+00 },
293: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
294: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
295: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
296: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
297: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
298: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
299: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
300: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
301: 3.637834252744491e+00 },
302: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
303: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
304: const PetscComplex /* m == k-1 */
305: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
306: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
307: -6.286704751729261e+00 ,
308: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
309: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
310: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
311: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
312: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
313: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
314: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
315: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
316: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
317: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
318: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
319: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
320: 5.648485971016893e+00 },
321: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
322: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
323: -3.637834252744491e+00 },
324: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
325: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
326: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
327: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
329: if (query) {
330: if (m == k+1) {
331: *mult = 1;
332: *p = k;
333: *q = k+1;
334: return 0;
335: }
336: if (m==k-1) {
337: *mult = 1;
338: *p = k;
339: *q = k-1;
340: }
341: } else {
342: if (m == k+1) {
343: *mult = PetscPowInt(-1,m);
344: *mult *= m;
345: if (k==4) {
346: for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
347: q[4] = p1q4[4];
348: } else if (k==3) {
349: for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
350: q[3] = p1q3[3];
351: } else if (k==2) {
352: for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
353: q[2] = p1q2[2];
354: } else if (k==1) {
355: p[0] = -3;
356: for (i=0;i<2;i++) q[i] = p1q1[i];
357: }
358: return 0;
359: }
360: if (m==k-1) {
361: *mult = PetscPowInt(-1,m);
362: *mult /= k;
363: if (k==5) {
364: for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
365: p[4] = m1p5[4];
366: } else if (k==4) {
367: for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
368: p[3] = m1p4[3];
369: } else if (k==3) {
370: for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
371: p[2] = m1p3[2];
372: } else if (k==2) {
373: for (i=0;i<2;i++) p[i] = m1p2[i];
374: q[0] = 3;
375: }
376: }
377: }
378: return 0;
379: }
380: #endif /* PETSC_HAVE_COMPLEX */
382: #if defined(PETSC_USE_COMPLEX)
383: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
384: {
385: PetscInt i;
387: *result=PETSC_TRUE;
388: for (i=0;i<n&&*result;i++) {
389: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
390: }
391: return 0;
392: }
393: #endif
395: /*
396: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
397: * and Yuji Nakatsukasa
398: *
399: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
400: * Approximation for the Matrix Exponential",
401: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
402: * https://doi.org/10.1137/15M1027553
403: */
404: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
405: {
406: #if !defined(PETSC_HAVE_COMPLEX)
407: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
408: #else
409: PetscInt i,j,n_,s,k,m,mod;
410: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
411: PetscReal nrm,shift=0.0;
412: #if defined(PETSC_USE_COMPLEX)
413: PetscReal *rwork=NULL;
414: #endif
415: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
416: PetscScalar *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
417: const PetscScalar *Aa;
418: PetscBool isreal,flg;
419: PetscBLASInt query=-1;
420: PetscScalar work1,*work;
422: MatGetSize(A,&n_,NULL);
423: PetscBLASIntCast(n_,&n);
424: MatDenseGetArrayRead(A,&Aa);
425: MatDenseGetArray(B,&Ba);
426: Ba2 = Ba;
427: PetscBLASIntCast(n*n,&n2);
429: PetscMalloc2(n2,&sMaux,n2,&Maux);
430: Maux2 = Maux;
431: PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
432: if (!flg) {
433: PetscMalloc2(n,&wr,n,&wi);
434: PetscArraycpy(sMaux,Aa,n2);
435: /* estimate rightmost eigenvalue and shift A with it */
436: #if !defined(PETSC_USE_COMPLEX)
437: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
438: SlepcCheckLapackInfo("geev",info);
439: PetscBLASIntCast((PetscInt)work1,&lwork);
440: PetscMalloc1(lwork,&work);
441: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
442: PetscFree(work);
443: #else
444: PetscArraycpy(Maux,Aa,n2);
445: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
446: SlepcCheckLapackInfo("geev",info);
447: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
448: PetscMalloc2(2*n,&rwork,lwork,&work);
449: PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
450: PetscFree2(rwork,work);
451: #endif
452: SlepcCheckLapackInfo("geev",info);
453: PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
455: shift = PetscRealPart(wr[0]);
456: for (i=1;i<n;i++) {
457: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
458: }
459: PetscFree2(wr,wi);
460: }
461: /* shift so that largest real part is (about) 0 */
462: PetscArraycpy(sMaux,Aa,n2);
463: if (shift) {
464: for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
465: PetscLogFlops(1.0*n);
466: }
467: #if defined(PETSC_USE_COMPLEX)
468: PetscArraycpy(Maux,Aa,n2);
469: if (shift) {
470: for (i=0;i<n;i++) Maux[i+i*n] -= shift;
471: PetscLogFlops(1.0*n);
472: }
473: #endif
475: /* estimate norm(A) and select the scaling factor */
476: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
477: PetscLogFlops(1.0*n*n);
478: sexpm_params(nrm,&s,&k,&m);
479: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
480: if (shift) expshift = PetscExpReal(shift);
481: for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
482: if (shift) {
483: PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
484: PetscLogFlops(1.0*(n+n2));
485: } else PetscLogFlops(1.0*n);
486: PetscArraycpy(Ba,sMaux,n2);
487: PetscFree2(sMaux,Maux);
488: MatDenseRestoreArrayRead(A,&Aa);
489: MatDenseRestoreArray(B,&Ba);
490: return 0; /* quick return */
491: }
493: PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
494: expmA2 = expmA; RR2 = RR;
495: /* scale matrix */
496: #if !defined(PETSC_USE_COMPLEX)
497: for (i=0;i<n2;i++) {
498: As[i] = sMaux[i];
499: }
500: #else
501: PetscArraycpy(As,sMaux,n2);
502: #endif
503: scale = 1.0/PetscPowRealInt(2.0,s);
504: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
505: SlepcLogFlopsComplex(1.0*n2);
507: /* evaluate Pade approximant (partial fraction or product form) */
508: if (fn->method==3 || !m) { /* partial fraction */
509: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
510: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
511: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
512: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
513: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
514: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
516: PetscArrayzero(expmA,n2);
517: #if !defined(PETSC_USE_COMPLEX)
518: isreal = PETSC_TRUE;
519: #else
520: getisreal(n2,Maux,&isreal);
521: #endif
522: if (isreal) {
523: rsizediv2 = irsize/2;
524: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
525: PetscArraycpy(Maux,As,n2);
526: PetscArrayzero(RR,n2);
527: for (j=0;j<n;j++) {
528: Maux[j+j*n] -= p[2*i];
529: RR[j+j*n] = r[2*i];
530: }
531: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
532: SlepcCheckLapackInfo("gesv",info);
533: for (j=0;j<n2;j++) {
534: expmA[j] += RR[j] + PetscConj(RR[j]);
535: }
536: /* loop(n) + gesv + loop(n2) */
537: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
538: }
540: mod = ipsize % 2;
541: if (mod) {
542: PetscArraycpy(Maux,As,n2);
543: PetscArrayzero(RR,n2);
544: for (j=0;j<n;j++) {
545: Maux[j+j*n] -= p[ipsize-1];
546: RR[j+j*n] = r[irsize-1];
547: }
548: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
549: SlepcCheckLapackInfo("gesv",info);
550: for (j=0;j<n2;j++) {
551: expmA[j] += RR[j];
552: }
553: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
554: }
555: } else { /* complex */
556: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
557: PetscArraycpy(Maux,As,n2);
558: PetscArrayzero(RR,n2);
559: for (j=0;j<n;j++) {
560: Maux[j+j*n] -= p[i];
561: RR[j+j*n] = r[i];
562: }
563: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
564: SlepcCheckLapackInfo("gesv",info);
565: for (j=0;j<n2;j++) {
566: expmA[j] += RR[j];
567: }
568: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
569: }
570: }
571: for (i=0;i<iremainsize;i++) {
572: if (!i) {
573: PetscArrayzero(RR,n2);
574: for (j=0;j<n;j++) {
575: RR[j+j*n] = remainterm[iremainsize-1];
576: }
577: } else {
578: PetscArraycpy(RR,As,n2);
579: for (j=1;j<i;j++) {
580: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
581: SWAP(RR,Maux,aux);
582: SlepcLogFlopsComplex(2.0*n*n*n);
583: }
584: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
585: SlepcLogFlopsComplex(1.0*n2);
586: }
587: for (j=0;j<n2;j++) {
588: expmA[j] += RR[j];
589: }
590: SlepcLogFlopsComplex(1.0*n2);
591: }
592: PetscFree3(r,p,remainterm);
593: } else { /* product form, default */
594: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
595: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
596: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
597: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
598: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
600: PetscArrayzero(expmA,n2);
601: for (i=0;i<n;i++) { /* initialize */
602: expmA[i+i*n] = 1.0;
603: }
604: minlen = PetscMin(irsize,ipsize);
605: for (i=0;i<minlen;i++) {
606: PetscArraycpy(RR,As,n2);
607: for (j=0;j<n;j++) {
608: RR[j+j*n] -= rootp[i];
609: }
610: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
611: SWAP(expmA,Maux,aux);
612: PetscArraycpy(RR,As,n2);
613: for (j=0;j<n;j++) {
614: RR[j+j*n] -= rootq[i];
615: }
616: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
617: SlepcCheckLapackInfo("gesv",info);
618: /* loop(n) + gemm + loop(n) + gesv */
619: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
620: }
621: /* extra numerator */
622: for (i=minlen;i<irsize;i++) {
623: PetscArraycpy(RR,As,n2);
624: for (j=0;j<n;j++) {
625: RR[j+j*n] -= rootp[i];
626: }
627: PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
628: SWAP(expmA,Maux,aux);
629: SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
630: }
631: /* extra denominator */
632: for (i=minlen;i<ipsize;i++) {
633: PetscArraycpy(RR,As,n2);
634: for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
635: PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
636: SlepcCheckLapackInfo("gesv",info);
637: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
638: }
639: PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
640: SlepcLogFlopsComplex(1.0*n2);
641: PetscFree2(rootp,rootq);
642: }
644: #if !defined(PETSC_USE_COMPLEX)
645: for (i=0;i<n2;i++) {
646: Ba2[i] = PetscRealPartComplex(expmA[i]);
647: }
648: #else
649: PetscArraycpy(Ba2,expmA,n2);
650: #endif
652: /* perform repeated squaring */
653: for (i=0;i<s;i++) { /* final squaring */
654: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
655: SWAP(Ba2,sMaux,saux);
656: PetscLogFlops(2.0*n*n*n);
657: }
658: if (Ba2!=Ba) {
659: PetscArraycpy(Ba,Ba2,n2);
660: sMaux = Ba2;
661: }
662: if (shift) {
663: expshift = PetscExpReal(shift);
664: PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
665: PetscLogFlops(1.0*n2);
666: }
668: /* restore pointers */
669: Maux = Maux2; expmA = expmA2; RR = RR2;
670: PetscFree2(sMaux,Maux);
671: PetscFree4(expmA,As,RR,piv);
672: MatDenseRestoreArrayRead(A,&Aa);
673: MatDenseRestoreArray(B,&Ba);
674: return 0;
675: #endif
676: }
678: #define SMALLN 100
680: /*
681: * Function needed to compute optimal parameters (required workspace is 3*n*n)
682: */
683: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
684: {
685: PetscScalar *Ascaled=work;
686: PetscReal nrm,alpha,beta,rwork[1];
687: PetscInt t;
688: PetscBLASInt i,j;
690: beta = PetscPowReal(coeff,1.0/(2*m+1));
691: for (i=0;i<n;i++)
692: for (j=0;j<n;j++)
693: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
694: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
695: PetscLogFlops(2.0*n*n);
696: SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
697: alpha /= nrm;
698: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
699: return t;
700: }
702: /*
703: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
704: */
705: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
706: {
707: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
708: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
709: PetscBLASInt n_=0,n2,one=1;
710: PetscRandom rand;
711: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
712: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
713: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
714: 2.539398330063230e-001, /* m = 5 */
715: 9.504178996162932e-001, /* m = 7 */
716: 2.097847961257068e+000, /* m = 9 */
717: 5.371920351148152e+000 }; /* m = 13 */
719: *s = 0;
720: *m = 13;
721: PetscBLASIntCast(n,&n_);
722: PetscRandomCreate(PETSC_COMM_SELF,&rand);
723: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
724: if (d4==0.0) { /* safeguard for the case A = 0 */
725: *m = 3;
726: goto done;
727: }
728: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
729: PetscLogFlops(2.0*n*n);
730: eta1 = PetscMax(d4,d6);
731: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
732: *m = 3;
733: goto done;
734: }
735: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
736: *m = 5;
737: goto done;
738: }
739: if (n<SMALLN) {
740: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
741: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
742: PetscLogFlops(2.0*n*n*n+1.0*n*n);
743: } else {
744: SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
745: d8 = PetscPowReal(d8,1.0/8.0);
746: }
747: eta3 = PetscMax(d6,d8);
748: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
749: *m = 7;
750: goto done;
751: }
752: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
753: *m = 9;
754: goto done;
755: }
756: if (n<SMALLN) {
757: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
758: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
759: PetscLogFlops(2.0*n*n*n+1.0*n*n);
760: } else {
761: SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
762: d10 = PetscPowReal(d10,1.0/10.0);
763: }
764: eta4 = PetscMax(d8,d10);
765: eta5 = PetscMin(eta3,eta4);
766: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
767: if (*s) {
768: Ascaled = work+3*n*n;
769: n2 = n_*n_;
770: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
771: sfactor = PetscPowRealInt(2.0,-(*s));
772: PetscCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
773: PetscLogFlops(1.0*n*n);
774: } else Ascaled = A;
775: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
776: done:
777: PetscRandomDestroy(&rand);
778: return 0;
779: }
781: /*
782: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
783: *
784: * N. J. Higham, "The scaling and squaring method for the matrix exponential
785: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
786: */
787: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
788: {
789: PetscBLASInt n_=0,n2,*ipiv,info,one=1;
790: PetscInt n,m,j,s;
791: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
792: PetscScalar *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
793: const PetscScalar *Aa,*c;
794: const PetscScalar c3[4] = { 120, 60, 12, 1 };
795: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
796: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
797: const PetscScalar c9[10] = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
798: 2162160, 110880, 3960, 90, 1 };
799: const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
800: 1187353796428800.0, 129060195264000.0, 10559470521600.0,
801: 670442572800.0, 33522128640.0, 1323241920.0,
802: 40840800, 960960, 16380, 182, 1 };
804: MatDenseGetArrayRead(A,&Aa);
805: MatDenseGetArray(B,&Ba);
806: MatGetSize(A,&n,NULL);
807: PetscBLASIntCast(n,&n_);
808: n2 = n_*n_;
809: PetscMalloc2(8*n*n,&work,n,&ipiv);
811: /* Matrix powers */
812: Apowers[0] = work; /* Apowers[0] = A */
813: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
814: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
815: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
816: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
818: PetscArraycpy(Apowers[0],Aa,n2);
819: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
820: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
821: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
822: PetscLogFlops(6.0*n*n*n);
824: /* Compute scaling parameter and order of Pade approximant */
825: expm_params(n,Apowers,&s,&m,Apowers[4]);
827: if (s) { /* rescale */
828: for (j=0;j<4;j++) {
829: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
830: PetscCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
831: }
832: PetscLogFlops(4.0*n*n);
833: }
835: /* Evaluate the Pade approximant */
836: switch (m) {
837: case 3: c = c3; break;
838: case 5: c = c5; break;
839: case 7: c = c7; break;
840: case 9: c = c9; break;
841: case 13: c = c13; break;
842: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
843: }
844: P = Ba;
845: Q = Apowers[4] + n*n;
846: W = Q + n*n;
847: switch (m) {
848: case 3:
849: case 5:
850: case 7:
851: case 9:
852: if (m==9) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
853: PetscArrayzero(P,n2);
854: PetscArrayzero(Q,n2);
855: for (j=0;j<n;j++) {
856: P[j+j*n] = c[1];
857: Q[j+j*n] = c[0];
858: }
859: for (j=m;j>=3;j-=2) {
860: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
861: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
862: PetscLogFlops(4.0*n*n);
863: }
864: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
865: PetscLogFlops(2.0*n*n*n);
866: SWAP(P,W,aux);
867: break;
868: case 13:
869: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
870: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
871: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
872: PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
873: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
874: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
875: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
876: PetscLogFlops(5.0*n*n+2.0*n*n*n);
877: PetscArrayzero(P,n2);
878: for (j=0;j<n;j++) P[j+j*n] = c[1];
879: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
880: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
881: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
882: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
883: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
884: PetscLogFlops(7.0*n*n+2.0*n*n*n);
885: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
886: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
887: PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
888: PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
889: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
890: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
891: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
892: PetscLogFlops(5.0*n*n+2.0*n*n*n);
893: PetscArrayzero(Q,n2);
894: for (j=0;j<n;j++) Q[j+j*n] = c[0];
895: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
896: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
897: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
898: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
899: PetscLogFlops(7.0*n*n);
900: break;
901: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
902: }
903: PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
904: PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
905: SlepcCheckLapackInfo("gesv",info);
906: PetscCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
907: for (j=0;j<n;j++) P[j+j*n] += 1.0;
908: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
910: /* Squaring */
911: for (j=1;j<=s;j++) {
912: PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
913: SWAP(P,W,aux);
914: }
915: if (P!=Ba) PetscArraycpy(Ba,P,n2);
916: PetscLogFlops(2.0*n*n*n*s);
918: PetscFree2(work,ipiv);
919: MatDenseRestoreArrayRead(A,&Aa);
920: MatDenseRestoreArray(B,&Ba);
921: return 0;
922: }
924: #if defined(PETSC_HAVE_CUDA)
925: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
926: #include <slepccublas.h>
928: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
929: {
930: PetscBLASInt n=0,ld,ld2,*d_ipiv,*d_info,info,one=1;
931: PetscInt m,k,sexp;
932: PetscBool odd;
933: const PetscInt p=MAX_PADE;
934: PetscReal c[MAX_PADE+1],s;
935: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
936: const PetscScalar *Aa;
937: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
938: cublasHandle_t cublasv2handle;
940: PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
941: PetscCUBLASGetHandle(&cublasv2handle);
942: MatGetSize(A,&m,NULL);
943: PetscBLASIntCast(m,&n);
944: ld = n;
945: ld2 = ld*ld;
946: if (A==B) {
947: cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);
948: MatDenseCUDAGetArrayRead(A,&Aa);
949: cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
950: MatDenseCUDARestoreArrayRead(A,&Aa);
951: } else MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As);
952: MatDenseCUDAGetArrayWrite(B,&d_Ba);
954: cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);
955: cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);
956: cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);
957: cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld);
958: cudaMalloc((void **)&d_info,sizeof(PetscBLASInt));
959: cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*));
960: cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*));
962: PetscMalloc1(1,&ppP);
963: PetscMalloc1(1,&ppQ);
965: d_P = d_Ba;
966: PetscLogGpuTimeBegin();
968: /* Pade' coefficients */
969: c[0] = 1.0;
970: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
972: /* Scaling */
973: cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);
974: if (s>0.5) {
975: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
976: scale = PetscPowRealInt(2.0,-sexp);
977: cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
978: PetscLogGpuFlops(1.0*n*n);
979: } else sexp = 0;
981: /* Horner evaluation */
982: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);
983: PetscLogGpuFlops(2.0*n*n*n);
984: cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2);
985: cudaMemset(d_P,0,sizeof(PetscScalar)*ld2);
986: set_diagonal(n,d_Q,ld,c[p]);
987: set_diagonal(n,d_P,ld,c[p-1]);
989: odd = PETSC_TRUE;
990: for (k=p-1;k>0;k--) {
991: if (odd) {
992: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);
993: SWAP(d_Q,d_W,aux);
994: shift_diagonal(n,d_Q,ld,c[k-1]);
995: odd = PETSC_FALSE;
996: } else {
997: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);
998: SWAP(d_P,d_W,aux);
999: shift_diagonal(n,d_P,ld,c[k-1]);
1000: odd = PETSC_TRUE;
1001: }
1002: PetscLogGpuFlops(2.0*n*n*n);
1003: }
1004: if (odd) {
1005: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);
1006: SWAP(d_Q,d_W,aux);
1007: cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);
1009: ppQ[0] = d_Q;
1010: ppP[0] = d_P;
1011: cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);
1012: cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);
1014: cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);
1015: cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);
1018: cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);
1021: cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1022: shift_diagonal(n,d_P,ld,sone);
1023: cublasXscal(cublasv2handle,ld2,&smone,d_P,one);
1024: } else {
1025: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);
1026: SWAP(d_P,d_W,aux);
1027: cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);
1029: ppQ[0] = d_Q;
1030: ppP[0] = d_P;
1031: cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);
1032: cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);
1034: cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);
1035: cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);
1038: cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);
1041: cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1042: shift_diagonal(n,d_P,ld,sone);
1043: }
1044: PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
1046: for (k=1;k<=sexp;k++) {
1047: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);
1048: cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1049: }
1050: PetscLogGpuFlops(2.0*n*n*n*sexp);
1052: PetscLogGpuTimeEnd();
1053: cudaFree(d_Q);
1054: cudaFree(d_W);
1055: cudaFree(d_A2);
1056: cudaFree(d_ipiv);
1057: cudaFree(d_info);
1058: cudaFree(d_ppP);
1059: cudaFree(d_ppQ);
1061: PetscFree(ppP);
1062: PetscFree(ppQ);
1064: if (d_P!=d_Ba) cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1065: if (A!=B) {
1066: if (s>0.5) { /* undo scaling */
1067: scale = 1.0/scale;
1068: cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
1069: }
1070: MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As);
1071: } else cudaFree(d_As);
1072: MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1073: return 0;
1074: }
1076: #if defined(PETSC_HAVE_MAGMA)
1077: #include <slepcmagma.h>
1079: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1080: {
1081: PetscBLASInt n=0,ld,ld2,*piv,one=1;
1082: PetscInt m,k,sexp;
1083: PetscBool odd;
1084: const PetscInt p=MAX_PADE;
1085: PetscReal c[MAX_PADE+1],s;
1086: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1087: const PetscScalar *Aa;
1088: PetscScalar *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1089: cublasHandle_t cublasv2handle;
1091: PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
1092: PetscCUBLASGetHandle(&cublasv2handle);
1093: SlepcMagmaInit();
1094: MatGetSize(A,&m,NULL);
1095: PetscBLASIntCast(m,&n);
1096: ld = n;
1097: ld2 = ld*ld;
1098: if (A==B) {
1099: cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);
1100: MatDenseCUDAGetArrayRead(A,&Aa);
1101: cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1102: MatDenseCUDARestoreArrayRead(A,&Aa);
1103: } else MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As);
1104: MatDenseCUDAGetArrayWrite(B,&d_Ba);
1106: cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);
1107: cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);
1108: cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);
1110: PetscMalloc1(n,&piv);
1112: d_P = d_Ba;
1113: PetscLogGpuTimeBegin();
1115: /* Pade' coefficients */
1116: c[0] = 1.0;
1117: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
1119: /* Scaling */
1120: cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);
1121: PetscLogGpuFlops(1.0*n*n);
1123: if (s>0.5) {
1124: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1125: scale = PetscPowRealInt(2.0,-sexp);
1126: cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
1127: PetscLogGpuFlops(1.0*n*n);
1128: } else sexp = 0;
1130: /* Horner evaluation */
1131: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);
1132: PetscLogGpuFlops(2.0*n*n*n);
1133: cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2);
1134: cudaMemset(d_P,0,sizeof(PetscScalar)*ld2);
1135: set_diagonal(n,d_Q,ld,c[p]);
1136: set_diagonal(n,d_P,ld,c[p-1]);
1138: odd = PETSC_TRUE;
1139: for (k=p-1;k>0;k--) {
1140: if (odd) {
1141: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);
1142: SWAP(d_Q,d_W,aux);
1143: shift_diagonal(n,d_Q,ld,c[k-1]);
1144: odd = PETSC_FALSE;
1145: } else {
1146: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);
1147: SWAP(d_P,d_W,aux);
1148: shift_diagonal(n,d_P,ld,c[k-1]);
1149: odd = PETSC_TRUE;
1150: }
1151: PetscLogGpuFlops(2.0*n*n*n);
1152: }
1153: if (odd) {
1154: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);
1155: SWAP(d_Q,d_W,aux);
1156: cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);
1157: PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1158: cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1159: shift_diagonal(n,d_P,ld,sone);
1160: cublasXscal(cublasv2handle,ld2,&smone,d_P,one);
1161: } else {
1162: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);
1163: SWAP(d_P,d_W,aux);
1164: cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);
1165: PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1166: cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1167: shift_diagonal(n,d_P,ld,sone);
1168: }
1169: PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
1171: for (k=1;k<=sexp;k++) {
1172: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);
1173: cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1174: }
1175: PetscLogGpuFlops(2.0*n*n*n*sexp);
1177: PetscLogGpuTimeEnd();
1178: cudaFree(d_Q);
1179: cudaFree(d_W);
1180: cudaFree(d_A2);
1181: PetscFree(piv);
1183: if (d_P!=d_Ba) cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1184: if (A!=B) {
1185: if (s>0.5) { /* undo scaling */
1186: scale = 1.0/scale;
1187: cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
1188: }
1189: MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As);
1190: } else cudaFree(d_As);
1191: MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1192: return 0;
1193: }
1195: /*
1196: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1197: *
1198: * N. J. Higham, "The scaling and squaring method for the matrix exponential
1199: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1200: */
1201: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1202: {
1203: PetscBLASInt n_=0,n2,*ipiv,one=1;
1204: PetscInt n,m,j,s;
1205: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1206: PetscScalar *d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1207: const PetscScalar *Aa,*c;
1208: const PetscScalar c3[4] = { 120, 60, 12, 1 };
1209: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
1210: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1211: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1212: 2162160, 110880, 3960, 90, 1 };
1213: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1214: 1187353796428800, 129060195264000, 10559470521600,
1215: 670442572800, 33522128640, 1323241920,
1216: 40840800, 960960, 16380, 182, 1 };
1217: cublasHandle_t cublasv2handle;
1219: PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
1220: PetscCUBLASGetHandle(&cublasv2handle);
1221: SlepcMagmaInit();
1222: MatGetSize(A,&n,NULL);
1223: PetscBLASIntCast(n,&n_);
1224: n2 = n_*n_;
1225: PetscMalloc2(8*n*n,&work,n,&ipiv);
1226: /* Matrix powers */
1227: Apowers[0] = work; /* Apowers[0] = A */
1228: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
1229: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
1230: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
1231: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
1232: if (A==B) {
1233: cudaMalloc((void**)&d_work,7*n*n*sizeof(PetscScalar));
1234: d_Apowers[0] = d_work; /* d_Apowers[0] = A */
1235: d_Apowers[1] = d_Apowers[0] + n*n; /* d_Apowers[1] = A^2 */
1236: MatDenseCUDAGetArrayRead(A,&Aa);
1237: cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1238: MatDenseCUDARestoreArrayRead(A,&Aa);
1239: } else {
1240: cudaMalloc((void**)&d_work,6*n*n*sizeof(PetscScalar));
1241: MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Apowers[0]);
1242: d_Apowers[1] = d_work; /* d_Apowers[1] = A^2 */
1243: }
1244: MatDenseCUDAGetArrayWrite(B,&d_Ba);
1245: d_Apowers[2] = d_Apowers[1] + n*n; /* d_Apowers[2] = A^4 */
1246: d_Apowers[3] = d_Apowers[2] + n*n; /* d_Apowers[3] = A^6 */
1247: d_Apowers[4] = d_Apowers[3] + n*n; /* d_Apowers[4] = A^8 */
1248: d_Q = d_Apowers[4] + n*n;
1249: d_W = d_Q + n*n;
1251: PetscLogGpuTimeBegin();
1253: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_);
1254: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_);
1255: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_);
1256: PetscLogGpuFlops(6.0*n*n*n);
1258: cudaMemcpy(Apowers[0],d_Apowers[0],n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1259: cudaMemcpy(Apowers[1],d_Apowers[1],3*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1260: PetscLogGpuToCpu(4*n2*sizeof(PetscScalar));
1261: /* Compute scaling parameter and order of Pade approximant */
1262: expm_params(n,Apowers,&s,&m,Apowers[4]);
1264: if (s) { /* rescale */
1265: for (j=0;j<4;j++) {
1266: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1267: cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one);
1268: }
1269: PetscLogGpuFlops(4.0*n*n);
1270: }
1272: /* Evaluate the Pade approximant */
1273: switch (m) {
1274: case 3: c = c3; break;
1275: case 5: c = c5; break;
1276: case 7: c = c7; break;
1277: case 9: c = c9; break;
1278: case 13: c = c13; break;
1279: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1280: }
1281: d_P = d_Ba;
1282: switch (m) {
1283: case 3:
1284: case 5:
1285: case 7:
1286: case 9:
1287: if (m==9) cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_);
1288: cudaMemset(d_P,0,sizeof(PetscScalar)*n2);
1289: cudaMemset(d_Q,0,sizeof(PetscScalar)*n2);
1290: set_diagonal(n,d_P,n,c[1]);
1291: set_diagonal(n,d_Q,n,c[0]);
1292: for (j=m;j>=3;j-=2) {
1293: cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one);
1294: cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one);
1295: PetscLogGpuFlops(4.0*n*n);
1296: }
1297: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_);
1298: PetscLogGpuFlops(2.0*n*n*n);
1299: SWAP(d_P,d_W,aux);
1300: break;
1301: case 13:
1302: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1303: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
1304: cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1305: cublasXscal(cublasv2handle,n2,&c[13],d_P,one);
1306: cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one);
1307: cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one);
1308: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_);
1309: PetscLogGpuFlops(5.0*n*n+2.0*n*n*n);
1311: cudaMemset(d_P,0,sizeof(PetscScalar)*n2);
1312: set_diagonal(n,d_P,n,c[1]);
1313: cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one);
1314: cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one);
1315: cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one);
1316: cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one);
1317: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_);
1318: PetscLogGpuFlops(7.0*n*n+2.0*n*n*n);
1319: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1320: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1321: cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1322: cublasXscal(cublasv2handle,n2,&c[12],d_Q,one);
1323: cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one);
1324: cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one);
1325: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_);
1326: PetscLogGpuFlops(5.0*n*n+2.0*n*n*n);
1327: cudaMemset(d_Q,0,sizeof(PetscScalar)*n2);
1328: set_diagonal(n,d_Q,n,c[0]);
1329: cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one);
1330: cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one);
1331: cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one);
1332: cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one);
1333: PetscLogGpuFlops(7.0*n*n);
1334: break;
1335: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1336: }
1337: cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one);
1339: PetscCallMAGMA(magma_xgesv_gpu,n_,n_,d_Q,n_,ipiv,d_P,n_);
1341: cublasXscal(cublasv2handle,n2,&stwo,d_P,one);
1342: shift_diagonal(n,d_P,n,sone);
1343: PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n);
1345: /* Squaring */
1346: for (j=1;j<=s;j++) {
1347: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_);
1348: SWAP(d_P,d_W,aux);
1349: }
1350: PetscLogGpuFlops(2.0*n*n*n*s);
1351: PetscLogGpuTimeEnd();
1353: PetscFree2(work,ipiv);
1354: if (d_P!=d_Ba) cudaMemcpy(d_Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1355: if (A!=B) {
1356: if (s>0.5) { /* undo scaling */
1357: scale = 1.0/PetscPowRealInt(2.0,-s);
1358: cublasXscal(cublasv2handle,n2,&scale,d_Apowers[0],one);
1359: }
1360: MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Apowers[0]);
1361: }
1362: MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1363: cudaFree(d_work);
1364: return 0;
1365: }
1367: /*
1368: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1369: * and Yuji Nakatsukasa
1370: *
1371: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1372: * Approximation for the Matrix Exponential",
1373: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1374: * https://doi.org/10.1137/15M1027553
1375: */
1376: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1377: {
1378: PetscInt i,j,n_,s,k,m,mod;
1379: PetscBLASInt n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,*piv,minlen,lwork=0,one=1;
1380: PetscReal nrm,shift=0.0,rone=1.0,rzero=0.0;
1381: #if defined(PETSC_USE_COMPLEX)
1382: PetscReal *rwork=NULL;
1383: #endif
1384: PetscComplex *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1385: PetscScalar *d_Aa,*d_Ba,*d_Ba2,*Maux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1386: const PetscScalar *Aa;
1387: PetscBool isreal,*d_isreal,flg;
1388: cublasHandle_t cublasv2handle;
1390: PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
1391: PetscCUBLASGetHandle(&cublasv2handle);
1392: SlepcMagmaInit();
1393: MatGetSize(A,&n_,NULL);
1394: PetscBLASIntCast(n_,&n);
1395: PetscBLASIntCast(n*n,&n2);
1397: if (A==B) {
1398: cudaMalloc((void **)&d_Aa,sizeof(PetscScalar)*n2);
1399: MatDenseCUDAGetArrayRead(A,&Aa);
1400: cudaMemcpy(d_Aa,Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1401: MatDenseCUDARestoreArrayRead(A,&Aa);
1402: } else MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Aa);
1403: MatDenseCUDAGetArrayWrite(B,&d_Ba);
1404: d_Ba2 = d_Ba;
1406: cudaMalloc((void **)&d_isreal,sizeof(PetscBool));
1407: cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2);
1408: cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2);
1410: PetscLogGpuTimeBegin();
1411: d_Maux2 = d_Maux;
1412: PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
1413: if (!flg) {
1414: PetscMalloc2(n,&wr,n,&wi);
1415: /* estimate rightmost eigenvalue and shift A with it */
1416: PetscMalloc1(n2,&Maux);
1417: MatDenseGetArrayRead(A,&Aa);
1418: PetscArraycpy(Maux,Aa,n2);
1419: MatDenseRestoreArrayRead(A,&Aa);
1420: #if !defined(PETSC_USE_COMPLEX)
1421: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,&work1,query);
1422: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1423: PetscMalloc1(lwork,&work);
1424: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,work,lwork);
1425: PetscFree(work);
1426: #else
1427: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork);
1428: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1429: PetscMalloc2(2*n,&rwork,lwork,&work);
1430: PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork);
1431: PetscFree2(rwork,work);
1432: #endif
1433: PetscFree(Maux);
1434: PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
1436: shift = PetscRealPart(wr[0]);
1437: for (i=1;i<n;i++) {
1438: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1439: }
1440: PetscFree2(wr,wi);
1441: }
1442: /* shift so that largest real part is (about) 0 */
1443: cudaMemcpy(d_sMaux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1444: if (shift) {
1445: shift_diagonal(n,d_sMaux,n,-shift);
1446: PetscLogGpuFlops(1.0*n);
1447: }
1448: #if defined(PETSC_USE_COMPLEX)
1449: cudaMemcpy(d_Maux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1450: if (shift) {
1451: shift_diagonal(n,d_Maux,n,-shift);
1452: PetscLogGpuFlops(1.0*n);
1453: }
1454: #endif
1455: if (A!=B) MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Aa);
1456: else cudaFree(d_Aa);
1458: /* estimate norm(A) and select the scaling factor */
1459: cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm);
1460: PetscLogGpuFlops(2.0*n*n);
1461: sexpm_params(nrm,&s,&k,&m);
1462: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1463: if (shift) expshift = PetscExpReal(shift);
1464: shift_Cdiagonal(n,d_Maux,n,rone,rzero);
1465: if (shift) {
1466: cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one);
1467: PetscLogGpuFlops(1.0*(n+n2));
1468: } else PetscLogGpuFlops(1.0*n);
1469: cudaMemcpy(d_Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1470: cudaFree(d_isreal);
1471: cudaFree(d_sMaux);
1472: cudaFree(d_Maux);
1473: MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1474: return 0; /* quick return */
1475: }
1477: cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2);
1478: cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2);
1479: cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2);
1480: d_expmA2 = d_expmA; d_RR2 = d_RR;
1481: PetscMalloc1(n,&piv);
1482: /* scale matrix */
1483: #if !defined(PETSC_USE_COMPLEX)
1484: copy_array2D_S2C(n,n,d_As,n,d_sMaux,n);
1485: #else
1486: cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1487: #endif
1488: scale = 1.0/PetscPowRealInt(2.0,s);
1489: cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one);
1490: SlepcLogGpuFlopsComplex(1.0*n2);
1492: /* evaluate Pade approximant (partial fraction or product form) */
1493: if (fn->method==3 || !m) { /* partial fraction */
1494: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
1495: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1496: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1497: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
1498: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
1499: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
1501: cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2);
1502: #if !defined(PETSC_USE_COMPLEX)
1503: isreal = PETSC_TRUE;
1504: #else
1505: getisreal_array2D(n,n,d_Maux,n,d_isreal);
1506: cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost);
1507: #endif
1508: if (isreal) {
1509: rsizediv2 = irsize/2;
1510: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1511: cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1512: cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1513: shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i]));
1514: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i]));
1515: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1516: add_array2D_Conj(n,n,d_RR,n);
1517: cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1518: /* shift(n) + gesv + axpy(n2) */
1519: SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
1520: }
1522: mod = ipsize % 2;
1523: if (mod) {
1524: cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1525: cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1526: shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1]));
1527: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1]));
1528: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1529: cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1530: SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1531: }
1532: } else { /* complex */
1533: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1534: cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1535: cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1536: shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i]));
1537: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i]));
1538: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1539: cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1540: SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1541: }
1542: }
1543: for (i=0;i<iremainsize;i++) {
1544: if (!i) {
1545: cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1546: set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1]));
1547: } else {
1548: cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1549: for (j=1;j<i;j++) {
1550: cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n);
1551: SWAP(d_RR,d_Maux,aux);
1552: SlepcLogGpuFlopsComplex(2.0*n*n*n);
1553: }
1554: cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one);
1555: SlepcLogGpuFlopsComplex(1.0*n2);
1556: }
1557: cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1558: SlepcLogGpuFlopsComplex(1.0*n2);
1559: }
1560: PetscFree3(r,p,remainterm);
1561: } else { /* product form, default */
1562: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
1563: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1564: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1565: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
1566: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
1568: cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2);
1569: set_Cdiagonal(n,d_expmA,n,rone,rzero); /* initialize */
1570: minlen = PetscMin(irsize,ipsize);
1571: for (i=0;i<minlen;i++) {
1572: cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1573: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1574: cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);
1575: SWAP(d_expmA,d_Maux,aux);
1576: cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1577: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1578: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1579: /* shift(n) + gemm + shift(n) + gesv */
1580: SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1581: }
1582: /* extra enumerator */
1583: for (i=minlen;i<irsize;i++) {
1584: cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1585: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1586: cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);
1587: SWAP(d_expmA,d_Maux,aux);
1588: SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n);
1589: }
1590: /* extra denominator */
1591: for (i=minlen;i<ipsize;i++) {
1592: cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1593: shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1594: PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1595: SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1596: }
1597: cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one);
1598: SlepcLogGpuFlopsComplex(1.0*n2);
1599: PetscFree2(rootp,rootq);
1600: }
1602: #if !defined(PETSC_USE_COMPLEX)
1603: copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n);
1604: #else
1605: cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1606: #endif
1608: /* perform repeated squaring */
1609: for (i=0;i<s;i++) { /* final squaring */
1610: cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n);
1611: SWAP(d_Ba2,d_sMaux,saux);
1612: PetscLogGpuFlops(2.0*n*n*n);
1613: }
1614: if (d_Ba2!=d_Ba) {
1615: cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1616: d_sMaux = d_Ba2;
1617: }
1618: if (shift) {
1619: expshift = PetscExpReal(shift);
1620: cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one);
1621: PetscLogGpuFlops(1.0*n2);
1622: }
1624: PetscLogGpuTimeEnd();
1626: /* restore pointers */
1627: d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1628: MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1629: cudaFree(d_isreal);
1630: cudaFree(d_sMaux);
1631: cudaFree(d_Maux);
1632: cudaFree(d_expmA);
1633: cudaFree(d_As);
1634: cudaFree(d_RR);
1635: PetscFree(piv);
1636: return 0;
1637: }
1638: #endif /* PETSC_HAVE_MAGMA */
1639: #endif /* PETSC_HAVE_CUDA */
1641: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1642: {
1643: PetscBool isascii;
1644: char str[50];
1645: const char *methodname[] = {
1646: "scaling & squaring, [m/m] Pade approximant (Higham)",
1647: "scaling & squaring, [6/6] Pade approximant",
1648: "scaling & squaring, subdiagonal Pade approximant (product form)",
1649: "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1650: };
1651: const int nmeth=PETSC_STATIC_ARRAY_LENGTH(methodname);
1653: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1654: if (isascii) {
1655: if (fn->beta==(PetscScalar)1.0) {
1656: if (fn->alpha==(PetscScalar)1.0) PetscViewerASCIIPrintf(viewer," exponential: exp(x)\n");
1657: else {
1658: SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1659: PetscViewerASCIIPrintf(viewer," exponential: exp(%s*x)\n",str);
1660: }
1661: } else {
1662: SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE);
1663: if (fn->alpha==(PetscScalar)1.0) PetscViewerASCIIPrintf(viewer," exponential: %s*exp(x)\n",str);
1664: else {
1665: PetscViewerASCIIPrintf(viewer," exponential: %s",str);
1666: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1667: SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1668: PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1669: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1670: }
1671: }
1672: if (fn->method<nmeth) PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
1673: }
1674: return 0;
1675: }
1677: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1678: {
1679: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1680: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1681: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1682: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1683: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1684: fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1685: #if defined(PETSC_HAVE_CUDA)
1686: fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1687: #if defined(PETSC_HAVE_MAGMA)
1688: fn->ops->evaluatefunctionmatcuda[0] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1689: fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1690: fn->ops->evaluatefunctionmatcuda[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1691: fn->ops->evaluatefunctionmatcuda[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1692: #endif
1693: #endif
1694: fn->ops->view = FNView_Exp;
1695: return 0;
1696: }