Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/**
*
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2015 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @file cuda_zgelqt.c
*
* MORSE cudablas kernel
* MORSE is a software package provided by Univ. of Tennessee,
* Univ. of California Berkeley and Univ. of Colorado Denver,
* and INRIA Bordeaux Sud-Ouest
*
* @author Florent Pruvost
* @date 2015-09-16
* @precisions normal z -> c d s
*
**/
#include "cudablas/include/cudablas.h"
#if defined(CHAMELEON_USE_MAGMA)
int CUDA_zgelqt(
magma_int_t m, magma_int_t n, magma_int_t nb,
magmaDoubleComplex *da, magma_int_t ldda,
magmaDoubleComplex *v, magma_int_t ldv,
magmaDoubleComplex *dt, magma_int_t lddt,
magmaDoubleComplex *t, magma_int_t ldt,
magmaDoubleComplex *dd,
magmaDoubleComplex *d, magma_int_t ldd,
magmaDoubleComplex *tau,
magmaDoubleComplex *hwork,
magmaDoubleComplex *dwork,
CUstream stream)
{
#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
#define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1))
#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1))
int i, k, ib, lddwork, old_i, old_ib, rows, cols;
double _Complex one=1.;
if (m < 0) {
return -1;
} else if (n < 0) {
return -2;
} else if (ldda < max(1,m)) {
return -4;
}
k = min(m,n);
if (k == 0) {
hwork[0] = *((magmaDoubleComplex*) &one);
return MAGMA_SUCCESS;
}
lddwork= m;
/* lower parts of little T must be zero: memset to 0 for simplicity */
memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
/* copy first panel of A on the host */
cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex),
da_ref(0, 0), ldda,
v, ldv);
/* Use blocked code initially */
for (i = 0; i < k; i += nb) {
ib = min(k-i, nb);
if (i+nb >= m) ib = min(m-i, nb);
cols = n-i;
if (i > 0){
/* copy panel of A from device to host */
cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex),
da_ref(i, 0), ldda,
v, ldv);
/* Apply H' to A(i+2*ib:m, i:n) from the right */
rows = m-old_i-2*old_ib;
if (rows > 0){
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, n-old_i, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i+2*old_ib, old_i), ldda,
dwork, lddwork);
}
/* copy the lower diag tile into d_A */
CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
dd, ldd, da_ref(old_i, old_i), ldda, stream);
}
/* Form the triangular factor of the block reflector on the host
H = H'(i+ib-1) . . . H(i+1) H(i) */
CORE_zgelqt(ib, cols, ib,
(double _Complex*) v_ref(0,i), ldv,
(double _Complex*) t_ref(0,0), ldt,
(double _Complex*) tau+i,
(double _Complex*) hwork);
if ( i + ib < m ){
/* put 0s in the lower triangular part of a panel (and 1s on the
diagonal); copy the lower triangular in d */
CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib),
(double _Complex*) v_ref(0,i), ldv,
(double _Complex*) d, ldd);
/* copy from host to device a tile diag */
cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex),
d, ldd, dd, ldd );
}
/* Send the triangular factor T to the GPU */
cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
t_ref(0,0), ldt, dt_ref(0,i), lddt );
/* A panel (with zeros in lower tri of its diag) is ready to be used
in input of zlarfb_gpu: we send the panel to the gpu */
cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex),
v_ref(0,i), ldv, da_ref(i,i), ldda );
if (i + ib < m) {
if (i+2*ib < m){
rows = ib;
}
else{
rows = m-i-ib;
}
/* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
lddt, da_ref(i+ib,i), ldda, dwork, lddwork);

PRUVOST Florent
committed
cudaThreadSynchronize();
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
old_i = i;
old_ib = ib;
if (i+nb >= k){
/* Apply H' to A(i+2*ib:m, i:n) from the right */
rows = m-old_i-2*old_ib;
if (rows > 0){
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, cols, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i+2*old_ib, old_i), ldda,
dwork, lddwork);
}
/* copy the upper diag tile into d_A */
CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
dd, ldd, da_ref(old_i, old_i), ldda, stream);
}
}
}
#undef da_ref
#undef v_ref
#undef dt_ref
#undef t_ref
return MORSE_SUCCESS;
}
#endif