diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index 4609398fb0e3b265a72b013aa9a0fdf734d0add0..f8969bc0bcedc0cb57cb9e8f5eecad5ea3968ae6 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -104,7 +104,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, /* necessary to avoid dependencies between tslqt and unmlq tasks regarding the diag tile */ DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb, A->p, A->q); + morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q); for (k = 0; k < min(A->mt, A->nt); k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; diff --git a/coreblas/compute/core_zgelqt.c b/coreblas/compute/core_zgelqt.c index 8268be84117d9f19fc705a0f3c9c288dd0270ace..c7ca8c12d7ea786e17afb83a0f705993b3bd4f8e 100644 --- a/coreblas/compute/core_zgelqt.c +++ b/coreblas/compute/core_zgelqt.c @@ -156,6 +156,22 @@ int CORE_zgelqt(int M, int N, int IB, WORK, M-i-sb); } } + // print the matrices +// int i1, i2; +// printf("\nV \n"); +// for (i1=0; i1<M; i1++){ +// for (i2=0; i2<N; i2++){ +// printf("%f ", A[i1+i2*LDA]); +// } +// printf("\n"); +// } +// printf("\nT \n"); +// for (i1=0; i1<IB; i1++){ +// for (i2=0; i2<N; i2++){ +// printf("%f ", T[i1+i2*LDT]); +// } +// printf("\n"); +// } return MORSE_SUCCESS; } diff --git a/cudablas/compute/cuda_zgelqt.c b/cudablas/compute/cuda_zgelqt.c index 17f9168436a516c320f27aa9ded3a838c01f5ffe..ff18cc171eba1573d417cedcd925400e719d737a 100644 --- a/cudablas/compute/cuda_zgelqt.c +++ b/cudablas/compute/cuda_zgelqt.c @@ -43,7 +43,7 @@ int CUDA_zgelqt( #define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1)) #define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1)) - int i, k, ib, lddwork, old_i, old_ib, rows, cols; + int i, k, ib, old_i, old_ib, rows, cols; double _Complex one=1.; if (m < 0) { @@ -60,107 +60,110 @@ int CUDA_zgelqt( return MAGMA_SUCCESS; } - lddwork= m; - /* lower parts of little T must be zero: memset to 0 for simplicity */ memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex)); - cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex)); - - /* copy first panel of A on the host */ - cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex), - da_ref(0, 0), ldda, - v, ldv); - - /* Use blocked code initially */ - for (i = 0; i < k; i += nb) { - - ib = min(k-i, nb); - if (i+nb >= m) ib = min(m-i, nb); - cols = n-i; - - if (i > 0){ - - /* copy panel of A from device to host */ - cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex), - da_ref(i, 0), ldda, - v, ldv); - - /* Apply H' to A(i+2*ib:m, i:n) from the right */ - rows = m-old_i-2*old_ib; - if (rows > 0){ - magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, - rows, n-old_i, old_ib, - da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, - da_ref(old_i+2*old_ib, old_i), ldda, - dwork, lddwork); - } - - /* copy the lower diag tile into d_A */ - CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib, - dd, ldd, da_ref(old_i, old_i), ldda, stream); - - } - - /* Form the triangular factor of the block reflector on the host - H = H'(i+ib-1) . . . H(i+1) H(i) */ - CORE_zgelqt(ib, cols, ib, - (double _Complex*) v_ref(0,i), ldv, - (double _Complex*) t_ref(0,0), ldt, - (double _Complex*) tau+i, - (double _Complex*) hwork); - - if ( i + ib < m ){ - /* put 0s in the lower triangular part of a panel (and 1s on the - diagonal); copy the lower triangular in d */ - CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib), - (double _Complex*) v_ref(0,i), ldv, - (double _Complex*) d, ldd); - - /* copy from host to device a tile diag */ - cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex), - d, ldd, dd, ldd ); - } - - /* Send the triangular factor T to the GPU */ - cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex), - t_ref(0,0), ldt, dt_ref(0,i), lddt ); - - /* A panel (with zeros in lower tri of its diag) is ready to be used - in input of zlarfb_gpu: we send the panel to the gpu */ - cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex), - v_ref(0,i), ldv, da_ref(i,i), ldda ); - - if (i + ib < m) { - - if (i+2*ib < m){ - rows = ib; - } - else{ - rows = m-i-ib; - } - /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */ - magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, - rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i), - lddt, da_ref(i+ib,i), ldda, dwork, lddwork); - cudaThreadSynchronize(); - old_i = i; - old_ib = ib; - if (i+nb >= k){ - /* Apply H' to A(i+2*ib:m, i:n) from the right */ - rows = m-old_i-2*old_ib; - if (rows > 0){ - magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, - rows, cols, old_ib, - da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, - da_ref(old_i+2*old_ib, old_i), ldda, - dwork, lddwork); - } - /* copy the upper diag tile into d_A */ - CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib, - dd, ldd, da_ref(old_i, old_i), ldda, stream); - } + cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream); + + if ( (nb > 1) && (nb < k) ) { + /* Use blocked code initially */ + old_i = 0; old_ib = nb; + for (i = 0; i < k-nb; i += nb) { + + ib = min(k-i, nb); + cols = n-i; + magma_zgetmatrix_async( ib, cols, + da_ref(i,i), ldda, + v_ref(0,i), ib, stream ); + + if (i>0){ + /* Apply H' to A(i+2*ib:m, i:n) from the right */ + rows = m-old_i-2*old_ib; + magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, + rows, n-old_i, old_ib, + da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, + da_ref(old_i+2*old_ib, old_i), ldda, + dwork, rows); + + /* store the diagonal */ + magma_zsetmatrix_async( old_ib, old_ib, + d, old_ib, + da_ref(old_i, old_i), ldda, stream ); + } + + magma_queue_sync( stream ); + /* Form the triangular factor of the block reflector on the host + H = H'(i+ib-1) . . . H(i+1) H(i) */ + CORE_zgelqt(ib, cols, ib, + (double _Complex*) v_ref(0,i), ib, + (double _Complex*) t_ref(0,0), ib, + (double _Complex*) tau+i, + (double _Complex*) hwork); + + /* put 0s in the lower triangular part of a panel (and 1s on the + diagonal); copy the lower triangular in d */ + CORE_zgesplit(MorseRight, MorseUnit, ib, min(ib,cols), + (double _Complex*) v_ref(0,i), ib, + (double _Complex*) d, ib); + + /* send the custom panel to the GPU */ + magma_zsetmatrix( ib, cols, + v_ref(0, i), ib, + da_ref(i, i), ldda ); + + if ( i + ib < n ){ + /* Send the triangular factor T to the GPU */ + magma_zsetmatrix( ib, ib, + t_ref(0, 0), ib, + dt_ref(0, i), lddt ); + + if (i+nb < k-nb) { + /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */ + magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, + ib, cols, ib, + da_ref(i, i), ldda, dt_ref(0,i), lddt, + da_ref(i+ib,i), ldda, dwork, ib); + } + else { + rows = m-i-ib; + magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, + rows, cols, ib, + da_ref(i, i), ldda, dt_ref(0,i), lddt, + da_ref(i+ib,i), ldda, dwork, rows); + cudaThreadSynchronize(); + /* Fix the diagonal block */ + magma_zsetmatrix_async( ib, ib, + d, ib, + da_ref(i, i), ldda, + stream ); + } + old_i = i; + old_ib = ib; + } + } + } else { + i = 0; } + /* Use unblocked code to factor the last or only block. */ + if (i < k) { + ib = m-i; + cols = n-i; + magma_zgetmatrix( ib, cols, + da_ref(i,i), ldda, + v_ref(0,i), ib ); + CORE_zgelqt(ib, cols, ib, + (double _Complex*) v_ref(0,i), ib, + (double _Complex*) t_ref(0,0), ib, + (double _Complex*) tau+i, + (double _Complex*) hwork); + /* send the last factorized panel to the GPU */ + magma_zsetmatrix( ib, cols, + v_ref(0, i), ib, + da_ref(i, i), ldda ); + /* Send the triangular factor T to the GPU */ + magma_zsetmatrix( ib, cols, + t_ref(0, 0), ib, + dt_ref(0, i), lddt ); } #undef da_ref diff --git a/cudablas/compute/cuda_ztslqt.c b/cudablas/compute/cuda_ztslqt.c index aeba03dce7755cb8a0e988e08a6169f959baff45..a129d69ea4ca205312c7146b3e859140463a211c 100644 --- a/cudablas/compute/cuda_ztslqt.c +++ b/cudablas/compute/cuda_ztslqt.c @@ -24,7 +24,7 @@ **/ #include "cudablas/include/cudablas.h" -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_MAGMA) && 0 int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, @@ -70,16 +70,16 @@ int CUDA_ztslqt( memset(t, 0, nb*n*sizeof(magmaDoubleComplex)); cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex)); - k = min(m, nb); // m can be lower than IB + //k = min(m, nb); // m can be lower than IB /* copy the first diag tile of A1 from device to host: da1 -> d */ - cublasGetMatrix(k, k, sizeof(magmaDoubleComplex), + cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex), da1_ref(0, 0), ldda1, - d, ldd); + d, nb); /* copy first panel of A2 from device to host: da2 -> a2 */ - cublasGetMatrix(k, n, sizeof(magmaDoubleComplex), + cublasGetMatrix(nb, n, sizeof(magmaDoubleComplex), da2_ref(0, 0), ldda2, - a2, lda2); + a2, nb); /* This is only blocked code for now */ for (i = 0; i < m; i += nb) { @@ -95,12 +95,12 @@ int CUDA_ztslqt( /* copy the diag tile of A1 from device to host: da1 -> d */ cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex), da1_ref(i, i), ldda1, - d, ldd); + d, ib); /* copy panel of A2 from device to host: da2 -> a2 */ cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex), da2_ref(i, 0), ldda2, - a2, lda2); + a2, ib); /* Apply H' to A(i+2*ib:m,i:n) from the left */ rows = m-old_i-2*old_ib; @@ -121,25 +121,25 @@ int CUDA_ztslqt( /* compute LQ factorization of the panel of A2 ib x cols */ CORE_ztslqt(ib, cols, ib, - (double _Complex*) d, ldd, - (double _Complex*) a2, lda2, - (double _Complex*) t, ldt, + (double _Complex*) d, ib, + (double _Complex*) a2, ib, + (double _Complex*) t, ib, (double _Complex*) tau, (double _Complex*) hwork); /* Send the panel from A2 back to the GPU */ cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex), - a2, lda2, + a2, ib, da2_ref(i, 0), ldda2); /* Send the triangular factor T from hwork to the GPU */ - cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), - t, ldt, + cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex), + t, ib, dt_ref(0, i), lddt); /* get back the diag tile in A1 from host to device: d -> da1 */ cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), - d, ldd, + d, ib, da1_ref(i, i), ldda1); /* tsmlq update on one panel forward (look ahead 1) */ diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c index 264d50c51079179ea2a239fa3b319cf0adb2c1bc..4cf727d5002e1b630b7d8f5401c6ffe0835b31f6 100644 --- a/runtime/starpu/codelets/codelet_ztslqt.c +++ b/runtime/starpu/codelets/codelet_ztslqt.c @@ -170,7 +170,7 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg) CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); } -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_MAGMA) && 0 static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -216,7 +216,7 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) || defined(CHAMELEON_SIMULATION) +#if (defined(CHAMELEON_USE_MAGMA) && 0) || defined(CHAMELEON_SIMULATION) CODELETS(ztslqt, 4, cl_ztslqt_cpu_func, cl_ztslqt_cuda_func, 0) #else CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func) diff --git a/timing/CMakeLists.txt b/timing/CMakeLists.txt index 1183d03483c80c80be2cb8406e7a9d8f2713f086..b098bda368a03d5d01d7ce67850112712134976a 100644 --- a/timing/CMakeLists.txt +++ b/timing/CMakeLists.txt @@ -83,6 +83,8 @@ set(ZSRC time_zgels_tile.c time_zgeqrf.c time_zgeqrf_tile.c + time_zgelqf.c + time_zgelqf_tile.c time_zgeqrs_tile.c time_zgetrf_incpiv.c time_zgetrf_incpiv_tile.c diff --git a/timing/CTestLists.cmake b/timing/CTestLists.cmake index dc2093c527ecff9696921debe5530389e4849fab..56dc2d7f89d89bf0cbc228bc582fd897936f77a9 100644 --- a/timing/CTestLists.cmake +++ b/timing/CTestLists.cmake @@ -23,6 +23,7 @@ set(TESTLIST getrf_incpiv getrf_nopiv geqrf + gelqf posv potrf potri diff --git a/timing/time_zgelqf.c b/timing/time_zgelqf.c new file mode 100644 index 0000000000000000000000000000000000000000..bf891e5dfa0570f494187c782c83f1a3529d8adf --- /dev/null +++ b/timing/time_zgelqf.c @@ -0,0 +1,80 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @precisions normal z -> c d s + * + **/ +#define _TYPE MORSE_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "MORSE_zgelqf" +/* See Lawn 41 page 120 */ +#define _FMULS FMULS_GELQF(M, N) +#define _FADDS FADDS_GELQF(M, N) + +#include "./timing.c" +#include "timing_zauxiliary.h" + +static int +RunTest(int *iparam, double *dparam, morse_time_t *t_) +{ + MORSE_desc_t *T; + PASTE_CODE_IPARAM_LOCALS( iparam ); + + if ( M != N && check ) { + fprintf(stderr, "Check cannot be perfomed with M != N\n"); + check = 0; + } + + /* Allocate Data */ + PASTE_CODE_ALLOCATE_MATRIX( A, 1, MORSE_Complex64_t, LDA, N ); + + /* Initialize Data */ + MORSE_zplrnt(M, N, A, LDA, 3456); + + /* Allocate Workspace */ + MORSE_Alloc_Workspace_zgels(M, N, &T, P, Q); + memset(T->mat, 0, (T->llm*T->lln)*sizeof(MorseComplexDouble)); + + /* Save AT in lapack layout for check */ + PASTE_CODE_ALLOCATE_COPY( Acpy, check, MORSE_Complex64_t, A, LDA, N ); + + START_TIMING(); + MORSE_zgelqf( M, N, A, LDA, T ); + STOP_TIMING(); + + /* Check the solution */ + if ( check ) + { + PASTE_CODE_ALLOCATE_MATRIX( X, 1, MORSE_Complex64_t, LDB, NRHS ); + MORSE_zplrnt( N, NRHS, X, LDB, 5673 ); + PASTE_CODE_ALLOCATE_COPY( B, 1, MORSE_Complex64_t, X, LDB, NRHS ); + + MORSE_zgelqs(M, N, NRHS, A, LDA, T, X, LDB); + + dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, Acpy, LDA, B, X, LDB, + &(dparam[IPARAM_ANORM]), + &(dparam[IPARAM_BNORM]), + &(dparam[IPARAM_XNORM])); + + free( Acpy ); + free( B ); + free( X ); + } + + /* Free Workspace */ + MORSE_Dealloc_Workspace( &T ); + free( A ); + + return 0; +} diff --git a/timing/time_zgelqf_tile.c b/timing/time_zgelqf_tile.c new file mode 100644 index 0000000000000000000000000000000000000000..72efa19e1a0beef4b2d34caaa7dabe6f4603ef3b --- /dev/null +++ b/timing/time_zgelqf_tile.c @@ -0,0 +1,81 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @precisions normal z -> c d s + * + **/ +#define _TYPE MORSE_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "MORSE_zgelqf_Tile" +/* See Lawn 41 page 120 */ +#define _FMULS FMULS_GELQF( M, N ) +#define _FADDS FADDS_GELQF( M, N ) + +#include "./timing.c" + +static int +RunTest(int *iparam, double *dparam, morse_time_t *t_) +{ + MORSE_desc_t *descT; + PASTE_CODE_IPARAM_LOCALS( iparam ); + + /* Allocate Data */ + PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, MORSE_Complex64_t, MorseComplexDouble, LDA, M, N ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descX, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDA, M, N ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS ); + + MORSE_zplrnt_Tile( descA, 5373 ); + + /* Save A for check */ + if (check == 1 && M == N){ + MORSE_zlacpy_Tile(MorseUpperLower, descA, descAC); + } + + /* Allocate Workspace */ + MORSE_Alloc_Workspace_zgels_Tile(M, N, &descT, P, Q); + memset(descT->mat, 0, (descT->llm*descT->lln)*sizeof(MorseComplexDouble)); + + /* MORSE ZGEQRF */ + START_TIMING(); + MORSE_zgelqf_Tile( descA, descT ); + STOP_TIMING(); + + /* Check the solution */ + if ( check && M == N ) + { + /* Initialize and save B */ + MORSE_zplrnt_Tile( descX, 2264 ); + MORSE_zlacpy_Tile(MorseUpperLower, descX, descB); + + /* Compute the solution */ + MORSE_zgelqs_Tile( descA, descT, descX ); + + /* Check solution */ + dparam[IPARAM_ANORM] = MORSE_zlange_Tile(MorseInfNorm, descAC); + dparam[IPARAM_BNORM] = MORSE_zlange_Tile(MorseInfNorm, descB); + dparam[IPARAM_XNORM] = MORSE_zlange_Tile(MorseInfNorm, descX); + MORSE_zgemm_Tile( MorseNoTrans, MorseNoTrans, 1.0, descAC, descX, -1.0, descB ); + dparam[IPARAM_RES] = MORSE_zlange_Tile(MorseInfNorm, descB); + PASTE_CODE_FREE_MATRIX( descX ) + PASTE_CODE_FREE_MATRIX( descAC ) + PASTE_CODE_FREE_MATRIX( descB ) + } + + /* Free data */ + MORSE_Dealloc_Workspace(&descT); + PASTE_CODE_FREE_MATRIX( descA ) + + return 0; +} diff --git a/timing/time_zgeqrf_tile.c b/timing/time_zgeqrf_tile.c index 9980e56afdf16ec6f3ba82c5b84c58d251390f2c..af1241da638262d664c29270c1b009460cc65ff0 100644 --- a/timing/time_zgeqrf_tile.c +++ b/timing/time_zgeqrf_tile.c @@ -23,6 +23,7 @@ #define _FADDS FADDS_GEQRF( M, N ) #include "./timing.c" +#include <starpu.h> static int RunTest(int *iparam, double *dparam, morse_time_t *t_) @@ -36,6 +37,14 @@ RunTest(int *iparam, double *dparam, morse_time_t *t_) PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDA, M, N ); PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS ); + +// RUNTIME_zlocality_onerestrict( MORSE_GEQRT, STARPU_CUDA ); +// RUNTIME_zlocality_onerestrict( MORSE_GEQRT, STARPU_CPU ); +// RUNTIME_zlocality_onerestrict( MORSE_UNMQR, STARPU_CUDA ); +// RUNTIME_zlocality_onerestrict( MORSE_TSQRT, STARPU_CPU ); +// RUNTIME_zlocality_onerestrict( MORSE_TSMQR, STARPU_CUDA ); +// RUNTIME_zlocality_allrestrict( STARPU_CUDA ); + MORSE_zplrnt_Tile( descA, 5373 ); /* Save A for check */