diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index 6bf814a62bd42c1b1107eb759f89fdb69e2e808e..313b83eebd7e3815e2a35dcea4f7d9c8ab2f4727 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -19,7 +19,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -65,10 +65,10 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D } /* - * zgelqt = A->nb * (ib+1) - * zunmlq = A->nb * ib - * ztslqt = A->nb * (ib+1) - * ztsmlq = A->nb * ib + * zgelqt = A->nb * (ib+1) + * zunmlq = A->nb * ib + * ztplqt = A->nb * (ib+1) + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * (ib+1); @@ -76,8 +76,8 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztsmqr = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index c91cea637129374915d28d8a9ee0743e82c7acda..7cc655e6deef5bd566445956ecf19886095816f4 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2017-05-17 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -69,10 +69,10 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_CUDA) /* - * zunmqr = A->nb * ib - * ztpmqrt = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index 
d9b77e5487e11f6c23e95bc347dc7943a5439ca5..7cec545bbf0318e47fe5920993876608e8c50cb7 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -20,7 +20,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -62,10 +62,10 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM } /* - * zgelqt = A->nb * (ib+1) - * zunmlq = A->nb * ib - * ztplqt = A->nb * (ib+1) - * ztpmlq = A->nb * ib + * zgelqt = A->nb * (ib+1) + * zunmlq = A->nb * ib + * ztplqt = A->nb * (ib+1) + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * (ib+1); @@ -73,10 +73,10 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztpmqr = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 650e769a022a38c2066aadd75dfa0d776aa62f78..4ee0c4dabb4d13afdb94ea51bdce23f6b64d9fe8 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -19,7 +19,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -60,10 +60,10 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D } /* - * zgeqrt = A->nb * (ib+1) - * zunmqr = A->nb * ib - * ztsqrt = A->nb * (ib+1) - * ztsmqr = A->nb * ib + * zgeqrt = A->nb * (ib+1) + * zunmqr = A->nb * ib + * ztpqrt = A->nb * (ib+1) + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * (ib+1); @@ -71,8 +71,8 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztsmqr = 2 * 
A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index b297e598ff0e0bf40899be877d4942ab172dd7c6..72dcc8e5775b315a2e22f383f57e153fd1ee186e 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2017-05-17 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -74,10 +74,10 @@ void chameleon_pzgeqrf_param( int genD, int K, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_CUDA) /* - * zunmqr = A->nb * ib - * ztpmqrt = 2 * A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index b075a0ef19c860c3952220cf58d0f826b8b394ab..74d153ae80f99c8f19bc0913470c69759a9d45f6 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -20,7 +20,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -62,10 +62,10 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM } /* - * zgeqrt = A->nb * (ib+1) - * zunmqr = A->nb * ib - * ztpqrt = A->nb * (ib+1) - * ztpmqr = A->nb * ib + * zgeqrt = A->nb * (ib+1) + * zunmqr = A->nb * ib + * ztpqrt = A->nb * (ib+1) + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * (ib+1); @@ -73,10 +73,10 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztpmqr = 2 * A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + 
ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c index 711651d15bfcc711bb46315a7d936f4c64c18c74..623a25517808356acaa7b61373e831a9b043452b 100644 --- a/compute/pzhetrd_he2hb.c +++ b/compute/pzhetrd_he2hb.c @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Hatem Ltaief * @author Azzam Haidar - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -74,9 +74,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib + * zunmqr = A->nb * ib * ztsmqr = 2 * A->nb * ib - * zherfb = A->nb * ib + * zherfb = A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c index ec1880018e8b2b0f2089e7a9cc6364d135095b5b..52d99f42850f531dc8177d30f73c12169040272b 100644 --- a/compute/pztpgqrt.c +++ b/compute/pztpgqrt.c @@ -14,7 +14,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -66,9 +66,9 @@ void chameleon_pztpgqrt( int KT, int L, #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * ztpmqrt = 2 * Q1->nb * ib + * ztpmqrt = 3 * Q1->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * Q1->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * Q1->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c index 37de659fee2f50ac169db02532dd00082b38bc1c..0783d14da4e55f294f3b784349039485952da66d 100644 --- a/compute/pztpqrt.c +++ b/compute/pztpqrt.c @@ -14,7 +14,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -61,9 +61,9 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_CUDA) /* - * 
ztpmqrt = 2 * A->nb * ib + * ztpmqrt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzunglq.c b/compute/pzunglq.c index 05ad8cdef0ae7fb131376b631559f8f14dfb7f42..492843fc3710220d31abda8728dfbaf0903555b7 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -19,7 +19,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -67,8 +67,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T } /* - * zunmlq = A->nb * ib - * ztpmlq = A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * ib; @@ -76,8 +76,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmlq = A->nb * ib - * ztpmlq = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c index d98a98ac70c4c7909f937e99bd588d1a95107eb2..b70080358fb49e66f3176097d98ecb9422ec44f2 100644 --- a/compute/pzunglq_param.c +++ b/compute/pzunglq_param.c @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2017-05-17 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -63,18 +63,18 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t } /* - * zunmqr = A->nb * ib - * ztpmqr = A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztpmqr = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = 
chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index 95d6a7df46b5125504e2986c54e15f95ea96f94f..d31203a9eec0ad3c51fe633a2469eb065fad568e 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -18,7 +18,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2011-05-24 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -64,18 +64,18 @@ void chameleon_pzunglqrh( int genD, int BS, } /* - * zunmqr = A->nb * ib - * ztpmqr = A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztpmqr = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzungqr.c b/compute/pzungqr.c index f9cc9182224acb9d6b01366f863b3d1c54fa779c..a190027288631727f1a08a2e4b1bcbe2fb69af19 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -19,7 +19,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -68,8 +68,8 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, } /* - * zunmqr = A->nb * ib - * ztsmqr = A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; @@ -77,8 +77,8 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztsmqr = 2 * A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index 
6f550160137776a091b9ed685ebf24dd64b1c992..8bcb901c13a962a4e199a8e0c2d64966e81ca3ea 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2017-05-17 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -60,17 +60,17 @@ void chameleon_pzungqr_param( int genD, int K, } /* - * zunmqr = A->nb * ib - * ztpmqr = A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_CUDA) /* - * ztpmqrt = 2 * A->nb * ib + * ztpmqrt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index 8fbfb9f489647bac8526a48b824380231cadb2ea..23d5e29db53f570516eee7231e4b326fde04e140 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -20,7 +20,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -67,8 +67,7 @@ void chameleon_pzungqrrh( int genD, int BS, } /* - * zunmqr = A->nb * ib - * ztsmqr = A->nb * ib + * zunmqr = A->nb * ib * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; @@ -76,10 +75,10 @@ void chameleon_pzungqrrh( int genD, int BS, #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztsmqr = 2 * A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index faff5895e38d393a431996019f483a0ee2a9a4ef..a5d23bf2b1db09789830af95372ffd67d46d76da 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -20,7 +20,7 
@@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -70,16 +70,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, } /* - * zunmlq = A->mb * ib - * ztsmlq = A->mb * ib + * zunmlq = A->mb * ib + * ztpmlqt = A->mb * ib */ ws_worker = A->mb * ib; #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmlq = A->mb * ib - * ztsmlq = 2 * A->mb * ib + * zunmlq = A->mb * ib + * ztpmlqt = 2 * A->mb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->mb * 2 ); #endif diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index 01c34e52b5a9a2f61b322737125a53322394b7c8..48dbc13eaab3b75b2a75b01c567c68524041680e 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2017-05-17 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -63,19 +63,18 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, } /* - * zunmlq = A->nb * ib - * ztsmlq = A->nb * ib - * zttmlq = A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmlq = A->nb * ib - * ztsmlq = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 23d0381769f2878af18f28f992f446dce18e1b97..26130cb69a7071245bc341f2310b01067e35f5de 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -20,7 +20,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -65,19 +65,18 @@ void chameleon_pzunmlqrh( int genD, int 
BS, cham_side_t side, cham_trans_t trans } /* - * zunmlq = A->nb * ib - * ztsmlq = A->nb * ib - * zttmlq = A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmlq = A->nb * ib - * ztsmlq = 2 * A->nb * ib + * zunmlq = A->nb * ib + * ztpmlqt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index 9000c51582d83b89c351f94ac5533f88d792f0d6..4c15ba7fd0ac6ec475583f73c190c34caae6f1bb 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -20,7 +20,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -70,16 +70,16 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, } /* - * zunmqr = A->nb * ib - * ztsmqr = A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztsmqr = 2 * A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index 7a9b2b6e11765a47f3aaaad0e654765ca25b2b1a..772bfdf48f4310ef9a96a7de427fdc7c35a09f49 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2017-05-17 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -63,8 +63,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, } /* - * zunmqr = A->nb * ib - * ztsmqr = A->nb * ib + * zunmqr = A->nb * ib * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; @@ -72,10 +71,10 @@ void chameleon_pzunmqr_param( int genD, const 
libhqr_tree_t *qrtree, #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztsmqr = 2 * A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 1073db6ac13142e4e71f6bbb761de68d4402470a..e8429f8cfeb7ae92709228dd8fab734364149944 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -20,7 +20,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> s d c * */ @@ -66,8 +66,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans } /* - * zunmqr = A->nb * ib - * ztsmqr = A->nb * ib + * zunmqr = A->nb * ib * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; @@ -75,10 +74,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib - * ztsmqr = 2 * A->nb * ib + * zunmqr = A->nb * ib + * ztpmqrt = 3 * A->nb * ib */ - ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); #endif ws_worker *= sizeof(CHAMELEON_Complex64_t); diff --git a/coreblas/compute/core_zparfb.c b/coreblas/compute/core_zparfb.c index 5199173acfadea031f5d656407638c7b0e9a8c18..a359402d6b90c8aa0484e1d1b587c413060561c4 100644 --- a/coreblas/compute/core_zparfb.c +++ b/coreblas/compute/core_zparfb.c @@ -18,7 +18,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2011-06-14 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -139,7 +139,8 @@ */ /* This kernel is never traced so return type on previous line for convert2eztrace.pl script */ int -CORE_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, 
cham_store_t storev, +CORE_zparfb(cham_side_t side, cham_trans_t trans, + cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, CHAMELEON_Complex64_t *A1, int LDA1, CHAMELEON_Complex64_t *A2, int LDA2, diff --git a/coreblas/compute/core_ztpmlqt.c b/coreblas/compute/core_ztpmlqt.c index 72e54b1ad14c6590adb0a034f7d1c86d22df9737..bfc53b98dba91a9ca56fbf71e2b6b3faf1828c99 100644 --- a/coreblas/compute/core_ztpmlqt.c +++ b/coreblas/compute/core_ztpmlqt.c @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -24,9 +24,11 @@ * * @ingroup CORE_CHAMELEON_Complex64_t * - * CORE_ztpmlqt applies a complex orthogonal matrix Q obtained from a - * "triangular-pentagonal" complex block reflector H to a general complex matrix - * C, which consists of two blocks A and B. + * @brief Applies a complex orthogonal matrix Q. + * + * The matrix Q is obtained from a "triangular-pentagonal" complex block + * reflector H to a general complex matrix C, which consists of two blocks A and + * B. 
* ******************************************************************************* * @@ -128,9 +130,8 @@ * ******************************************************************************* * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value * */ diff --git a/coreblas/compute/core_zttmlq.c b/coreblas/compute/core_zttmlq.c index 69a7004c6d9d9cd148766f822025fcdbe7f441a9..5b6ee0261ec8e920f4883847931526fc864ddf76 100644 --- a/coreblas/compute/core_zttmlq.c +++ b/coreblas/compute/core_zttmlq.c @@ -19,7 +19,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -127,9 +127,9 @@ int CORE_zttmlq(cham_side_t side, cham_trans_t trans, const CHAMELEON_Complex64_t *T, int LDT, CHAMELEON_Complex64_t *WORK, int LDWORK) { - int i, i1, i3, l; + int i, i1, i3; int NW; - int kb; + int kb, l; int ic = 0; int jc = 0; int mi1 = M1; @@ -205,11 +205,13 @@ int CORE_zttmlq(cham_side_t side, cham_trans_t trans, } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans == ChamNoTrans)) - || ((side == ChamRight) && (trans != ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) || + ((side == ChamRight) && (trans != ChamNoTrans)) ) + { i1 = 0; i3 = IB; } @@ -248,13 +250,11 @@ int CORE_zttmlq(cham_side_t side, cham_trans_t trans, CORE_zparfb( side, trans, ChamDirForward, ChamRowwise, mi1, ni1, mi2, ni2, kb, l, - &A1[LDA1*jc+ic], LDA1, + A1 + LDA1 * jc + ic, LDA1, A2, LDA2, - &V[i], LDV, - &T[LDT*i], LDT, + V + i, LDV, + T + LDT * i, LDT, WORK, LDWORK); } return CHAMELEON_SUCCESS; } - - diff --git 
a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index d9859a604616b7b4c8e79f7bc8a32238b68eca40..4a1f8559a9c62bbad84f79c545c89075dd71229d 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -19,7 +19,7 @@ # # @version 1.0.0 # @author Florent Pruvost -# @date 2015-09-16 +# @date 2018-11-09 # ### @@ -38,11 +38,13 @@ set(ZSRC cuda_zsymm.c cuda_zsyr2k.c cuda_zsyrk.c + cuda_ztpmlqt.c cuda_ztpmqrt.c cuda_ztrmm.c cuda_ztrsm.c cuda_ztsmlq.c cuda_ztsmqr.c + cuda_zttmlq.c cuda_zttmqr.c cuda_zunmlqt.c cuda_zunmqrt.c diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c index b44b22ca9225b87dae0bc7cacf0de17050a6a2ef..51fb0f3c70c33d84ea01d9499f51eeac053a0791 100644 --- a/cudablas/compute/cuda_zlarfb.c +++ b/cudablas/compute/cuda_zlarfb.c @@ -15,21 +15,21 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" int -CUDA_zlarfb(cham_side_t side, cham_trans_t trans, - cham_dir_t direct, cham_store_t storev, - int M, int N, int K, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *C, int LDC, - cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) +CUDA_zlarfb( cham_side_t side, cham_trans_t trans, + cham_dir_t direct, cham_store_t storev, + int M, int N, int K, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *C, int LDC, + cuDoubleComplex *WORK, int LDWORK, + CUBLAS_STREAM_PARAM ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -67,20 +67,25 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans, } /* Quick return */ - if ((M == 0) || (N == 0) || (K == 0)) + if ((M == 0) || (N == 0) || (K == 0)) { return CHAMELEON_SUCCESS; + } // opposite of trans - if (trans == ChamNoTrans) + if (trans == ChamNoTrans) { transT = ChamConjTrans; - else + } + else { transT = 
ChamNoTrans; + } // whether T is upper or lower triangular - if (direct == ChamDirForward) + if (direct == ChamDirForward) { uplo = ChamUpper; - else + } + else { uplo = ChamLower; + } if (storev == ChamColumnwise) { notransV = ChamNoTrans; @@ -106,8 +111,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans, // W = W T^H = C^H V T^H CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit, N, K, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + WORK, LDWORK, CUBLAS_STREAM_VALUE ); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C @@ -133,8 +138,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans, // W = W T = C V T CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit, M, K, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + WORK, LDWORK, CUBLAS_STREAM_VALUE ); // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index 292ac3b647cebbee27d15f7336ed80a21c8de9b0..bcef47797c5892accf0073aa9e2fd67b2cb6ab1b 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -120,40 +120,32 @@ * The leading dimension of the array T. LDT >= K. * * @param[in,out] WORK - * Workspace of dimension LDWORK-by-N1 if side == ChamLeft, LDWORK-by-K - * otherwise. + * Workspace of dimension at least: + * - K * (M2 + N2). + * If L > 0, it is recommended to extend it to + * - K * (2 * M2 + N2 ) if side == ChamLeft. + * - K * (M2 + 2 * N2 ) if side == ChamRight. * - * @param[in] LDWORK - * The leading dimension of the array WORK: LDWORK >= K, if side == - * ChamLeft, LDWORK >= M1 otehrwise. - * - * @param[in,out] WORKC - * Optionnal additional workspace to replace the TRMM operation by a GEMM kernel. - * This workspace is of dimension LDWORK-by-K if side == ChamLeft, LDWORK-by-N2 - * otherwise. 
- * - * @param[in] LDWORKC - * The leading dimension of the array WORKC: LDWORKC >= M2, if side == - * ChamLeft, LDWORK >= K otehrwise. + * @param[in] LWORK + * The dimension of the array WORK. If LWORK < 0, returns immediately + * the recommended workspace size. * ******************************************************************************* * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * @retval The recommended LWORK value, if LWORK == -1 on entry. */ int -CUDA_zparfb(cham_side_t side, cham_trans_t trans, - cham_dir_t direct, cham_store_t storev, - int M1, int N1, int M2, int N2, int K, int L, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, - CUBLAS_STREAM_PARAM ) +CUDA_zparfb( cham_side_t side, cham_trans_t trans, + cham_dir_t direct, cham_store_t storev, + int M1, int N1, int M2, int N2, int K, int L, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -165,9 +157,13 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, double mzone = -1.0; #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + cuDoubleComplex *workW, *workC, *workV; + int ldW, ldC, ldV; int j; cham_trans_t transW; cham_trans_t transA2; + int wssize = 0; + int wrsize = 0; CUBLAS_GET_STREAM; @@ -201,19 +197,30 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, if (K < 0) { return -9; } - if ( ((LDWORK < K ) && (side == ChamLeft )) || - ((LDWORK < M1) && (side == ChamRight)) ) { + + if 
(direct == ChamDirForward) { + wssize = K * (M2 + N2); + wrsize = wssize; + if ( L > 0 ) { + wrsize += (side == ChamLeft) ? M2 * K : K * N2; + } + } + + if ( LWORK < 0 ) { + return wrsize; + } + else if ( LWORK < wssize ) { + cudablas_error(20, "Illegal value of LWORK"); return -20; } - /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) { return CHAMELEON_SUCCESS; + } if (direct == ChamDirForward) { if (side == ChamLeft) { - /* * Column or Rowwise / Forward / Left * ---------------------------------- @@ -222,76 +229,137 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, * ( A2 ) */ + /* + * Store in WORK (N1 == N2): + * - Workspace W for the copy of A1 + V' * A2 (K x N1) + * - Workspace C for the copy of V * T (M2 x K ) + * - Workspace V for the copy of V (M2 x K ) + */ + workW = WORK; + ldW = K; + + workC = workW + K * N1; + ldC = M2; + + if ( L == 0 ) { + workV = (cuDoubleComplex*)V; + ldV = LDV; + } + else { + if ( LWORK < wrsize ) { + workC = NULL; + workV = workW + K * N1; + } + else { + workV = workC + M2 * K; + } + + if ( storev == ChamColumnwise ) { + ldV = M2; + + /* + * Backup V, and put 0 in the lower part + */ + cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), + V, LDV * sizeof(cuDoubleComplex), + M2 * sizeof(cuDoubleComplex), K, + cudaMemcpyDeviceToDevice, stream ); + + for(j = 1; j < K; j++) { + cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j, + 0, + (L - j) * sizeof(cuDoubleComplex), + stream ); + } + } + else { + ldV = K; + + /* + * Backup V, and put 0 in the lower part + */ + cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), + V, LDV * sizeof(cuDoubleComplex), + K * sizeof(cuDoubleComplex), M2, + cudaMemcpyDeviceToDevice, stream ); + + for(j = 1; j < K; j++) { + cudaMemsetAsync( workV + ldV * ( M2 - L + j ), + 0, + j * sizeof(cuDoubleComplex), + stream ); + } + } + } + /* * W = A1 + V' * A2: * W = A1 * W = W + V' * A2 * */ - 
cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), + cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), K * sizeof(cuDoubleComplex), N1, cudaMemcpyDeviceToDevice, stream ); transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), - K, N1, M2, - CUBLAS_SADDR(zone), - V /* K*M2 */, LDV, - A2 /* M2*N1 */, LDA2, - CUBLAS_SADDR(zone), - WORK /* K*N1 */, LDWORK); - - if (WORKC == NULL) { + cublasZgemm( CUBLAS_HANDLE + chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), + K, N1, M2, + CUBLAS_SADDR(zone), workV /* M2*K */, ldV, + A2 /* M2*N2 */, LDA2, + CUBLAS_SADDR(zone), workW /* K *N2 */, ldW ); + + if ( workC == NULL ) { /* W = op(T) * W */ CUDA_ztrmm( ChamLeft, ChamUpper, trans, ChamNonUnit, K, N2, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + workW, ldW, CUBLAS_STREAM_VALUE ); /* A1 = A1 - W = A1 - op(T) * W */ for(j = 0; j < N1; j++) { - cublasZaxpy(CUBLAS_HANDLE - K, CUBLAS_SADDR(mzone), - (WORK + LDWORK*j), 1, - (A1 + LDA1*j), 1); + cublasZaxpy( CUBLAS_HANDLE + K, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); } /* A2 = A2 - op(V) * W */ - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), V /* M2*K */, LDV, - WORK /* K*N2 */, LDWORK, - CUBLAS_SADDR(zone), A2 /* m2*N2 */, LDA2); + cublasZgemm( CUBLAS_HANDLE + chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV, + workW /* K * N2 */, ldW, + CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); } else { /* Wc = V * op(T) */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(transA2), chameleon_cublas_const(trans), M2, K, K, - CUBLAS_SADDR(zone), V, LDV, - 
T, LDT, - CUBLAS_SADDR(zzero), WORKC, LDWORKC ); + CUBLAS_SADDR(zone), workV, ldV, + T, LDT, + CUBLAS_SADDR(zzero), workC, ldC ); /* A1 = A1 - opt(T) * W */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), K, N1, K, - CUBLAS_SADDR(mzone), T, LDT, - WORK, LDWORK, - CUBLAS_SADDR(zone), A1, LDA1 ); + CUBLAS_SADDR(mzone), T, LDT, + workW, ldW, + CUBLAS_SADDR(zone), A1, LDA1 ); /* A2 = A2 - Wc * W */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), M2, N2, K, - CUBLAS_SADDR(mzone), WORKC, LDWORKC, - WORK, LDWORK, + CUBLAS_SADDR(mzone), workC, ldC, + workW, ldW, CUBLAS_SADDR(zone), A2, LDA2 ); } } @@ -304,14 +372,77 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, * */ + /* + * Store in WORK (M1 == M2): + * - Workspace W for the copy of A1 + A2 * V' (M1 x K ) + * - Workspace C for the copy of V * T (K x N2) + * - Workspace V for the copy of V (K x N2) + */ + workW = WORK; + ldW = M1; + + workC = workW + M1 * K; + ldC = K; + + if ( L == 0 ) { + workV = (cuDoubleComplex*)V; + ldV = LDV; + } + else { + if ( LWORK < wrsize ) { + workC = NULL; + workV = workW + M2 * K; + } + else { + workV = workC + K * N2; + } + + if ( storev == ChamColumnwise ) { + ldV = N2; + + /* + * Backup V, and put 0 in the lower part + */ + cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), + V, LDV * sizeof(cuDoubleComplex), + N2 * sizeof(cuDoubleComplex), K, + cudaMemcpyDeviceToDevice, stream ); + + for(j = 1; j < K; j++) { + cudaMemsetAsync( workV + (j-1) * ldV + N2 - L + j, + 0, + (L - j) * sizeof(cuDoubleComplex), + stream ); + } + } + else { + ldV = K; + + /* + * Backup V, and put 0 in the upper part + */ + cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), + V, LDV * sizeof(cuDoubleComplex), + K * sizeof(cuDoubleComplex), N2, + cudaMemcpyDeviceToDevice, stream ); + + for(j = 1; j < K; j++) { + cudaMemsetAsync( workV + ldV * ( N2 - L + j ), + 0, + j * sizeof(cuDoubleComplex), + 
stream ); + } + } + } + /* * W = A1 + A2 * V': * W = A1 * W = W + A2 * V' * */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), + cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), M1 * sizeof(cuDoubleComplex), K, cudaMemcpyDeviceToDevice, stream ); @@ -321,40 +452,40 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, cublasZgemm(CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), M1, K, N2, - CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, - V /* N2*K */, LDV, - CUBLAS_SADDR(zone), WORK /* M1*K */, LDWORK); + CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, + workV /* K *N2 */, ldV, + CUBLAS_SADDR(zone), workW /* M1*K */, ldW); - if (WORKC == NULL) { + if ( workC == NULL ) { /* W = W * op(T) */ CUDA_ztrmm( ChamRight, ChamUpper, trans, ChamNonUnit, M2, K, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + workW, ldW, CUBLAS_STREAM_VALUE ); /* A1 = A1 - W = A1 - W * op(T) */ for(j = 0; j < K; j++) { - cublasZaxpy(CUBLAS_HANDLE - M1, CUBLAS_SADDR(mzone), - (WORK + LDWORK*j), 1, - (A1 + LDA1*j), 1); + cublasZaxpy( CUBLAS_HANDLE + M1, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); } /* A2 = A2 - W * op(V) */ cublasZgemm(CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), M2, N2, K, - CUBLAS_SADDR(mzone), WORK /* M2*K */, LDWORK, - V /* K*N2 */, LDV, - CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); + CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, + workV /* K *N2 */, ldV, + CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); } else { /* A1 = A1 - W * opt(T) */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), M1, K, K, - CUBLAS_SADDR(mzone), WORK, LDWORK, + CUBLAS_SADDR(mzone), workW, ldW, T, LDT, CUBLAS_SADDR(zone), A1, LDA1 ); @@ -363,15 +494,15 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, chameleon_cublas_const(trans), chameleon_cublas_const(transA2), K, 
N2, K, CUBLAS_SADDR(zone), T, LDT, - V, LDV, - CUBLAS_SADDR(zzero), WORKC, LDWORKC ); + workV, ldV, + CUBLAS_SADDR(zzero), workC, ldC ); /* A2 = A2 - W * Wc */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), M2, N2, K, - CUBLAS_SADDR(mzone), WORK, LDWORK, - WORKC, LDWORKC, + CUBLAS_SADDR(mzone), workW, ldW, + workC, ldC, CUBLAS_SADDR(zone), A2, LDA2 ); } } diff --git a/cudablas/compute/cuda_ztpmlqt.c b/cudablas/compute/cuda_ztpmlqt.c new file mode 100644 index 0000000000000000000000000000000000000000..4f01e0e28cf34690cc3e3daa1823c418e9557fac --- /dev/null +++ b/cudablas/compute/cuda_ztpmlqt.c @@ -0,0 +1,184 @@ +/** + * + * @file cuda_ztpmlqt.c + * + * @copyright 2009-2016 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_ztpmlqt GPU kernel + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2018-11-09 + * @precisions normal z -> c d s + * + */ +#include "cudablas.h" + +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * @brief Applies a complex orthogonal matrix Q. + * + * The matrix Q is obtained from a "triangular-pentagonal" complex block + * reflector H to a general complex matrix C, which consists of two blocks A and + * B. + * + ******************************************************************************* + * + * @param[in] side + * @arg ChamLeft : apply Q or Q**H from the Left; + * @arg ChamRight : apply Q or Q**H from the Right. + * + * @param[in] trans + * @arg ChamNoTrans : No transpose, apply Q; + * @arg ChamConjTrans : ConjTranspose, apply Q**H. + * + * @param[in] M + * The number of rows of the tile B. M >= 0. + * + * @param[in] N + * The number of columns of the tile B. N >= 0. 
+ * + * @param[in] K + * The number of elementary reflectors whose product defines + * the matrix Q. + * + * @param[in] L + * The number of rows of the upper trapezoidal part of V. + * K >= L >= 0. See Further Details. + * + * @param[in] IB + * The inner-blocking size. IB >= 0. + * + * @param[in] V + * The i-th row must contain the vector which defines the + * elementary reflector H(i), for i = 1,2,...,k, as returned by + * CORE_ZTPLQT in the first k rows of its array argument V. + * + * @param[in] LDV + * The leading dimension of the array V. LDV >= max(1,K). + * + * @param[in] T + * The IB-by-N1 triangular factor T of the block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] LDT + * The leading dimension of the array T. LDT >= IB. + * + * @param[in,out] A + * A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft + * or (LDA,K) if side = ChamRight + * On entry, the K-by-N or M-by-K matrix A. + * On exit, A is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDA + * The leading dimension of the array A. + * If side = ChamLeft, LDA >= max(1,K); + * If side = ChamRight, LDA >= max(1,M). + * + * @param[in,out] B + * On entry, the M-by-N tile B. + * On exit, B is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDB + * The leading dimension of the tile B. LDB >= max(1,M). + * + * @param[out] WORK + * Workspace array of size LDWORK-by-NB. + * LDWORK = N if side = ChamLeft, or M if side = ChamRight.
+ * + ******************************************************************************* + * + * @par Further Details: + * ===================== + * + * The columns of the pentagonal matrix V contain the elementary reflectors + * H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a + * trapezoidal block V2: + * + * V = [V1] [V2]. + * + * The size of the trapezoidal block V2 is determined by the parameter L, + * where 0 <= L <= K; V2 is lower trapezoidal, consisting of the first L + * rows of a K-by-K upper triangular matrix. If L=K, V2 is lower triangular; + * if L=0, there is no trapezoidal block, hence V = V1 is rectangular. + * + * If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is K-by-M. + * [B] + * + * If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is K-by-N. + * + * The complex orthogonal matrix Q is formed from V and T. + * + * If trans='N' and side='L', C is on exit replaced with Q * C. + * + * If trans='C' and side='L', C is on exit replaced with Q**H * C. + * + * If trans='N' and side='R', C is on exit replaced with C * Q. + * + * If trans='C' and side='R', C is on exit replaced with C * Q**H. 
+ * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ +int +CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, + int M, int N, int K, int L, int IB, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *A, int LDA, + cuDoubleComplex *B, int LDB, + cuDoubleComplex *WORK, int lwork, + CUBLAS_STREAM_PARAM ) +{ + int m1, n1; + + /* Check input arguments */ + if ((side != ChamLeft) && (side != ChamRight)) { + cudablas_error(1, "Illegal value of side"); + return -1; + } + + if ( side == ChamLeft ) { + m1 = K; + n1 = N; + } + else { + m1 = M; + n1 = K; + } + + /* TS case */ + if (L == 0) { + CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB, + A, LDA, B, LDB, V, LDV, T, LDT, + WORK, lwork, + CUBLAS_STREAM_VALUE ); + } + /* TT case */ + else if( L == N ) { + CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB, + A, LDA, B, LDB, V, LDV, T, LDT, + WORK, lwork, + CUBLAS_STREAM_VALUE ); + } + else { + cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" ); + return -6; + } + + return CHAMELEON_SUCCESS; +} diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c index 2719edd32b1fef478dd150f67b89ba0c8dc465e2..c7a19fd76726491b982c0da9be7c0ff444fd829c 100644 --- a/cudablas/compute/cuda_ztpmqrt.c +++ b/cudablas/compute/cuda_ztpmqrt.c @@ -13,12 +13,128 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * @brief Applies a complex orthogonal matrix Q. + * + * The matrix Q is obtained from a "triangular-pentagonal" complex block + * reflector H to a general complex matrix C, which consists of two blocks A and + * B. 
+ * + ******************************************************************************* + * + * @param[in] side + * @arg ChamLeft : apply Q or Q**H from the Left; + * @arg ChamRight : apply Q or Q**H from the Right. + * + * @param[in] trans + * @arg ChamNoTrans : No transpose, apply Q; + * @arg ChamConjTrans : ConjTranspose, apply Q**H. + * + * @param[in] M + * The number of rows of the tile B. M >= 0. + * + * @param[in] N + * The number of columns of the tile B. N >= 0. + * + * @param[in] K + * The number of elementary reflectors whose product defines + * the matrix Q. + * + * @param[in] L + * The number of rows of the upper trapezoidal part of V. + * K >= L >= 0. See Further Details. + * + * @param[in] IB + * The inner-blocking size. IB >= 0. + * + * @param[in] V + * The i-th column must contain the vector which defines the + * elementary reflector H(i), for i = 1,2,...,k, as returned by + * CORE_ZTPQRT in the first k columns of its array argument V. + * + * @param[in] LDV + * The leading dimension of the array V. If side = ChamLeft, LDV >= max(1,M); if side = ChamRight, LDV >= max(1,N). + * + * @param[in] T + * The IB-by-N1 triangular factor T of the block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] LDT + * The leading dimension of the array T. LDT >= IB. + * + * @param[in,out] A + * A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft + * or (LDA,K) if side = ChamRight + * On entry, the K-by-N or M-by-K matrix A. + * On exit, A is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDA + * The leading dimension of the array A. + * If side = ChamLeft, LDA >= max(1,K); + * If side = ChamRight, LDA >= max(1,M). + * + * @param[in,out] B + * On entry, the M-by-N tile B. + * On exit, B is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDB + * The leading dimension of the tile B.
LDB >= max(1,M). + * + * @param[out] WORK + * Workspace array of size LDWORK-by-NB. + * LDWORK = N if side = ChamLeft, or M if side = ChamRight. + * + ******************************************************************************* + * + * @par Further Details: + * ===================== + * + * The columns of the pentagonal matrix V contain the elementary reflectors + * H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a + * trapezoidal block V2: + * + * V = [V1] + * [V2]. + * + * The size of the trapezoidal block V2 is determined by the parameter L, + * where 0 <= L <= K; V2 is upper trapezoidal, consisting of the first L + * rows of a K-by-K upper triangular matrix. If L=K, V2 is upper triangular; + * if L=0, there is no trapezoidal block, hence V = V1 is rectangular. + * + * If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is M-by-K. + * [B] + * + * If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is N-by-K. + * + * The complex orthogonal matrix Q is formed from V and T. + * + * If trans='N' and side='L', C is on exit replaced with Q * C. + * + * If trans='C' and side='L', C is on exit replaced with Q**H * C. + * + * If trans='N' and side='R', C is on exit replaced with C * Q. + * + * If trans='C' and side='R', C is on exit replaced with C * Q**H. 
+ * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, @@ -26,10 +142,10 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, - cuDoubleComplex *WORK, + cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ) { - int m1, n1, ldwork, ldworkc, ws; + int m1, n1; /* Check input arguments */ if ((side != ChamLeft) && (side != ChamRight)) { @@ -40,30 +156,24 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, if ( side == ChamLeft ) { m1 = K; n1 = N; - ldwork = IB; - ldworkc = M; - ws = ldwork * n1; } else { m1 = M; n1 = K; - ldwork = chameleon_max( K, chameleon_max( M, N ) ); - ldworkc = IB; - ws = ldwork * IB; } /* TS case */ if (L == 0) { CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, - WORK, ldwork, WORK + ws, ldworkc, + WORK, lwork, CUBLAS_STREAM_VALUE ); } /* TT case */ else if( L == M ) { CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, - WORK, ldwork, WORK + ws, ldworkc, + WORK, lwork, CUBLAS_STREAM_VALUE ); } else { diff --git a/cudablas/compute/cuda_ztrmm.c b/cudablas/compute/cuda_ztrmm.c index 390311e0834fd5b126f912b033b2dc5748c6b2e9..b2809c779ce0e00003a3d5b22e26e1fefb63805f 100644 --- a/cudablas/compute/cuda_ztrmm.c +++ b/cudablas/compute/cuda_ztrmm.c @@ -20,13 +20,13 @@ #include "cudablas.h" int CUDA_ztrmm( - cham_side_t side, cham_uplo_t uplo, - cham_trans_t transa, cham_diag_t diag, - int m, int n, - cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) + cham_side_t side, cham_uplo_t uplo, + cham_trans_t transa, cham_diag_t diag, + int m, int n, + cuDoubleComplex *alpha, + const cuDoubleComplex *A, 
int lda, + cuDoubleComplex *B, int ldb, + CUBLAS_STREAM_PARAM) { #if defined(CHAMELEON_USE_CUBLAS_V2) diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c index 7d2e3c4a3fbc19d5ec319991a9d56a537608016b..2dcfc24158a77a0427ed00bd294a72b55a97f238 100644 --- a/cudablas/compute/cuda_ztsmlq.c +++ b/cudablas/compute/cuda_ztsmlq.c @@ -13,27 +13,26 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @author Mathieu Faverge + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" int CUDA_ztsmlq( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, - CUBLAS_STREAM_PARAM) + cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM) { int i, i1, i3; - int NW; int kb; int ic = 0; int jc = 0; @@ -45,14 +44,6 @@ int CUDA_ztsmlq( return -1; } - /* NW is the minimum dimension of WORK */ - if (side == ChamLeft) { - NW = IB; - } - else { - NW = N1; - } - if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) { return -2; } @@ -90,21 +81,20 @@ int CUDA_ztsmlq( if (LDT < chameleon_max(1,IB)){ return -16; } - if (LDWORK < chameleon_max(1,NW)){ - return -18; - } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans == ChamNoTrans)) - || ((side == ChamRight) && (trans != ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) || + 
((side == ChamRight) && (trans != ChamNoTrans)) ) + { i1 = 0; i3 = IB; } else { - i1 = ((K-1) / IB)*IB; + i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } @@ -115,7 +105,7 @@ int CUDA_ztsmlq( trans = ChamNoTrans; } - for(i = i1; (i > -1) && (i < K); i += i3) { + for (i = i1; (i > -1) && (i < K); i+=i3) { kb = chameleon_min(IB, K-i); if (side == ChamLeft) { @@ -137,13 +127,13 @@ int CUDA_ztsmlq( * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) */ CUDA_zparfb( - side, trans, ChamDirForward, ChamRowwise, - mi, ni, M2, N2, kb, 0, - A1 + LDA1*jc+ic, LDA1, - A2, LDA2, - V + i, LDV, - T + LDT*i, LDT, - WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE ); + side, trans, ChamDirForward, ChamRowwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + i, LDV, + T + LDT*i, LDT, + WORK, LWORK, CUBLAS_STREAM_VALUE ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c index 4b07b9b9ce16397eabf8e358d81b7499eeab2c39..e731dfbf451bb06ab0d9f519292b529c04bfe00a 100644 --- a/cudablas/compute/cuda_ztsmqr.c +++ b/cudablas/compute/cuda_ztsmqr.c @@ -13,27 +13,27 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @author Mathieu Faverge + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" int CUDA_ztsmqr( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, - CUBLAS_STREAM_PARAM) + cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM) { int i, i1, i3; - int NQ, NW; + int NQ; int kb; int ic 
= 0; int jc = 0; @@ -48,11 +48,9 @@ int CUDA_ztsmqr( /* NQ is the order of Q */ if (side == ChamLeft) { NQ = M2; - NW = IB; } else { NQ = N2; - NW = M1; } if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) { @@ -92,25 +90,24 @@ int CUDA_ztsmqr( if (LDT < chameleon_max(1,IB)){ return -16; } - if (LDWORK < chameleon_max(1,NW)){ - return -18; - } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans != ChamNoTrans)) - || ((side == ChamRight) && (trans == ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) || + ((side == ChamRight) && (trans == ChamNoTrans)) ) + { i1 = 0; i3 = IB; } else { - i1 = ((K-1) / IB)*IB; + i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } - for(i = i1; (i > -1) && (i < K); i += i3) { + for (i = i1; (i > -1) && (i < K); i+=i3) { kb = chameleon_min(IB, K-i); if (side == ChamLeft) { @@ -127,17 +124,18 @@ int CUDA_ztsmqr( ni = N1 - i; jc = i; } + /* * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) */ CUDA_zparfb( - side, trans, ChamDirForward, ChamColumnwise, - mi, ni, M2, N2, kb, 0, - A1 + LDA1*jc+ic, LDA1, - A2, LDA2, - V + LDV*i, LDV, - T + LDT*i, LDT, - WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE ); + side, trans, ChamDirForward, ChamColumnwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + LDV*i, LDV, + T + LDT*i, LDT, + WORK, LWORK, CUBLAS_STREAM_VALUE ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zttmlq.c b/cudablas/compute/cuda_zttmlq.c new file mode 100644 index 0000000000000000000000000000000000000000..c194adfe7f7c76847a4b64feaa9b0d5b560eb78b --- /dev/null +++ b/cudablas/compute/cuda_zttmlq.c @@ -0,0 +1,140 @@ +/** + * + * @file cuda_zttmlq.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. 
+ * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_zttmlq GPU kernel + * + * @version 1.0.0 + * @author Florent Pruvost + * @author Mathieu Faverge + * @date 2018-11-09 + * @precisions normal z -> c d s + * + */ +#include "cudablas.h" + +int CUDA_zttmlq( + cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM) +{ + int i, i1, i3; + int kb, l; + int ic = 0; + int jc = 0; + int mi1 = M1; + int mi2 = M2; + int ni1 = N1; + int ni2 = N2; + + /* Check input arguments */ + if ((side != ChamLeft) && (side != ChamRight)) { + return -1; + } + + if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) { + return -2; + } + if (M1 < 0) { + return -3; + } + if (N1 < 0) { + return -4; + } + if ( (M2 < 0) || + ( (M2 != M1) && (side == ChamRight) ) ){ + return -5; + } + if ( (N2 < 0) || + ( (N2 != N1) && (side == ChamLeft) ) ){ + return -6; + } + if ((K < 0) || + ( (side == ChamLeft) && (K > M1) ) || + ( (side == ChamRight) && (K > N1) ) ) { + return -7; + } + if (IB < 0) { + return -8; + } + if (LDA1 < chameleon_max(1,M1)){ + return -10; + } + if (LDA2 < chameleon_max(1,M2)){ + return -12; + } + if (LDV < chameleon_max(1,K)){ + return -14; + } + if (LDT < chameleon_max(1,IB)){ + return -16; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { + return CHAMELEON_SUCCESS; + } + + if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) || + ((side == ChamRight) && (trans != ChamNoTrans)) ) + { + i1 = 0; + i3 = IB; + } + else { + i1 = ( ( K-1 ) / IB )*IB; + i3 = -IB; + } + + /* Transpose */ + if (trans == ChamNoTrans) { + trans = ChamConjTrans; + } + else { + trans = ChamNoTrans; + } + + for (i = i1; (i > 
-1) && (i < K); i+=i3) { + kb = chameleon_min(IB, K-i); + + if (side == ChamLeft) { + mi1 = kb; + mi2 = chameleon_min(i+kb, M2); + l = chameleon_min(kb, chameleon_max(0, M2-i)); + ic = i; + } + else { + ni1 = kb; + ni2 = chameleon_min(i+kb, N2); + l = chameleon_min(kb, chameleon_max(0, N2-i)); + jc = i; + } + + /* + * Apply H or H' (NOTE: CORE_zparfb used to be CORE_zttrfb) + */ + CUDA_zparfb( + side, trans, ChamDirForward, ChamRowwise, + mi1, ni1, mi2, ni2, kb, l, + A1 + LDA1 * jc + ic, LDA1, + A2, LDA2, + V + i, LDV, + T + LDT * i, LDT, + WORK, LWORK, CUBLAS_STREAM_VALUE ); + } + return CHAMELEON_SUCCESS; +} diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c index 236405cdf080fd3c7941aa62426a70cde2e5d8f7..8664d0675afdb82b60e92bf054caa45122c7ec89 100644 --- a/cudablas/compute/cuda_zttmqr.c +++ b/cudablas/compute/cuda_zttmqr.c @@ -6,6 +6,7 @@ * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
+ * *** * * @brief Chameleon cuda_zttmqr GPU kernel @@ -13,7 +14,7 @@ * @version 1.0.0 * @author Florent Pruvost * @author Mathieu Faverge - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -28,13 +29,12 @@ int CUDA_zttmqr( cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, + cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM) { - int i, i1, i3, l; - int NQ, NW; - int kb; + int i, i1, i3; + int NQ; + int kb, l; int ic = 0; int jc = 0; int mi1 = M1; @@ -50,11 +50,9 @@ int CUDA_zttmqr( /* NQ is the order of Q */ if (side == ChamLeft) { NQ = M2; - NW = IB; } else { NQ = N2; - NW = M1; } if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) { @@ -94,25 +92,24 @@ int CUDA_zttmqr( if (LDT < chameleon_max(1,IB)){ return -16; } - if (LDWORK < chameleon_max(1,NW)){ - return -18; - } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans != ChamNoTrans)) - || ((side == ChamRight) && (trans == ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) || + ((side == ChamRight) && (trans == ChamNoTrans)) ) + { i1 = 0; i3 = IB; } else { - i1 = ((K-1) / IB)*IB; + i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } - for(i = i1; (i > -1) && (i < K); i += i3) { + for (i = i1; (i > -1) && (i < K); i+=i3) { kb = chameleon_min(IB, K-i); if (side == ChamLeft) { @@ -138,8 +135,7 @@ int CUDA_zttmqr( A2, LDA2, V + LDV*i, LDV, T + LDT*i, LDT, - WORK, LDWORK, - WORKC, LDWORKC, CUBLAS_STREAM_VALUE ); + WORK, LWORK, CUBLAS_STREAM_VALUE ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/eztrace_module/cudablas_eztrace_module b/cudablas/eztrace_module/cudablas_eztrace_module index 
7ec370d3f3ad568264dc17c045c1c1f90de71eec..c631193b1fcedd7597a645dad9d208b29c1faed8 100644 --- a/cudablas/eztrace_module/cudablas_eztrace_module +++ b/cudablas/eztrace_module/cudablas_eztrace_module @@ -273,6 +273,30 @@ int CUDA_ctsmqr( void *WORK, void* LDWORK, void *WORKC, void* LDWORKC, void* stream); +int CUDA_cttmlq( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + void *A1, void* LDA1, + void *A2, void* LDA2, + void *V, void* LDV, + void *T, void* LDT, + void *WORK, void* LDWORK, + void *WORKC, void* LDWORKC, + void* stream); +int CUDA_cttmqr( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + void *A1, void* LDA1, + void *A2, void* LDA2, + void *V, void* LDV, + void *T, void* LDT, + void *WORK, void* LDWORK, + void *WORKC, void* LDWORKC, + void* stream); int CUDA_ctsqrt( void* m, void* n, void* nb, void *da1, void* ldda1, @@ -528,6 +552,30 @@ int CUDA_dtsmqr( double *WORK, void* LDWORK, double *WORKC, void* LDWORKC, void* stream); +int CUDA_dttmlq( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + double *A1, void* LDA1, + double *A2, void* LDA2, + const double *V, void* LDV, + const double *T, void* LDT, + double *WORK, void* LDWORK, + double *WORKC, void* LDWORKC, + void* stream); +int CUDA_dttmqr( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + double *A1, void* LDA1, + double *A2, void* LDA2, + const double *V, void* LDV, + const double *T, void* LDT, + double *WORK, void* LDWORK, + double *WORKC, void* LDWORKC, + void* stream); int CUDA_dtsqrt( void* m, void* n, void* nb, double *da1, void* ldda1, @@ -783,6 +831,30 @@ int CUDA_stsmqr( float *WORK, void* LDWORK, float *WORKC, void* LDWORKC, void* stream); +int CUDA_sttmlq( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + float *A1, void* LDA1, + float *A2, void* LDA2, + const float *V, 
void* LDV, + const float *T, void* LDT, + float *WORK, void* LDWORK, + float *WORKC, void* LDWORKC, + void* stream); +int CUDA_sttmqr( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + float *A1, void* LDA1, + float *A2, void* LDA2, + const float *V, void* LDV, + const float *T, void* LDT, + float *WORK, void* LDWORK, + float *WORKC, void* LDWORKC, + void* stream); int CUDA_stsqrt( void* m, void* n, void* nb, float *da1, void* ldda1, @@ -1090,6 +1162,30 @@ int CUDA_ztsmqr( void *WORK, void* LDWORK, void *WORKC, void* LDWORKC, void* stream); +int CUDA_zttmlq( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + void *A1, void* LDA1, + void *A2, void* LDA2, + void *V, void* LDV, + void *T, void* LDT, + void *WORK, void* LDWORK, + void *WORKC, void* LDWORKC, + void* stream); +int CUDA_zttmqr( + void* side, void* trans, + void* M1, void* N1, + void* M2, void* N2, + void* K, void* IB, + void *A1, void* LDA1, + void *A2, void* LDA2, + void *V, void* LDV, + void *T, void* LDT, + void *WORK, void* LDWORK, + void *WORKC, void* LDWORKC, + void* stream); int CUDA_ztsqrt( void* m, void* n, void* nb, void *da1, void* ldda1, diff --git a/cudablas/include/cudablas/cudablas_z.h b/cudablas/include/cudablas/cudablas_z.h index 8e96d463ca451c74b17f91c22c2b6c03a27a101e..8895ff6485b33c423bd80b5b99599883eec58ce7 100644 --- a/cudablas/include/cudablas/cudablas_z.h +++ b/cudablas/include/cudablas/cudablas_z.h @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -31,16 +31,18 @@ int CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleCom int CUDA_zherfb( cham_uplo_t uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM ); int CUDA_zherk( cham_uplo_t uplo, 
cham_trans_t trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); int CUDA_zlarfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, 
cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM ); +int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ); +int CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ); int CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); int CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); -int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); -int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); -int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, 
cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); +int CUDA_zttmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); +int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); int CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index bb1794316b74504f8629d2fb3e9a40ac50d18879..b8f91fb211f98a417a9c722794df6c45c441dcdc 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * 
@author Cedric Augonnet - * @date 2011-06-01 + * @date 2018-11-08 * */ #ifndef _chameleon_tasks_h_ @@ -54,16 +54,12 @@ typedef enum chameleon_tasktype_e { TASK_ORMQR, TASK_POTRF, TASK_SSSSM, + TASK_TPLQT, + TASK_TPMLQT, + TASK_TPMQRT, + TASK_TPQRT, TASK_TRTRI, - TASK_TSLQT, - TASK_TSMLQ, - TASK_TSMQR, - TASK_TSQRT, TASK_TSTRF, - TASK_TTLQT, - TASK_TTMLQ, - TASK_TTMQR, - TASK_TTQRT, TASK_UNMLQ, TASK_UNMQR, @@ -86,6 +82,15 @@ typedef enum chameleon_tasktype_e { TASK_NBKERNELS } cham_tasktype_t; +#define TASK_TSLQT TASK_TPLQT +#define TASK_TSMLQ TASK_TPMLQT +#define TASK_TSMQR TASK_TPMQRT +#define TASK_TSQRT TASK_TPQRT +#define TASK_TTLQT TASK_TPLQT +#define TASK_TTMLQ TASK_TPMLQT +#define TASK_TTMQR TASK_TPMQRT +#define TASK_TTQRT TASK_TPQRT + typedef int (*cham_unary_operator_t)( const CHAM_desc_t *desc, cham_uplo_t uplo, int m, int n, void *data, void *op_args ); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 28aeaa1b889e0eb5fe16618b02d9bebd73de905a..8265e990686f22639acc23d0919348d017947cb2 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -20,7 +20,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-07 * @precisions normal z -> c d s * */ @@ -31,435 +31,499 @@ * Declarations of QUARK wrappers (called by CHAMELEON) - alphabetical order */ void INSERT_TASK_dzasum( const RUNTIME_option_t *options, - cham_store_t storev, cham_uplo_t uplo, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn ); + cham_store_t storev, cham_uplo_t uplo, int M, int N, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, - cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + 
cham_trans_t trans, int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_zlascal( const RUNTIME_option_t *options, - cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *A, int Am, int An, int lda ); + cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, + const CHAM_desc_t *A, int Am, int An, int lda ); void INSERT_TASK_zbrdalg( const RUNTIME_option_t *options, - cham_uplo_t uplo, - int N, int NB, - const CHAM_desc_t *A, - const CHAM_desc_t *C, int Cm, int Cn, - const CHAM_desc_t *S, int Sm, int Sn, - int i, int j, int m, int grsiz, int BAND, - int *PCOL, int *ACOL, int *MCOL ); + cham_uplo_t uplo, + int N, int NB, + const CHAM_desc_t *A, + const CHAM_desc_t *C, int Cm, int Cn, + const CHAM_desc_t *S, int Sm, int Sn, + int i, int j, int m, int grsiz, int BAND, + int *PCOL, int *ACOL, int *MCOL ); void INSERT_TASK_zgelqt( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + int m, int n, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ); void INSERT_TASK_zgemm( const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); -void INSERT_TASK_zgemm2( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *B, int Bm, int Bn, int ldb, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc 
); +void INSERT_TASK_zgemm2( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zgemm_f2( const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc, - const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1, - const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 ); + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc, + const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1, + const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 ); void INSERT_TASK_zgemm_p2( const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAMELEON_Complex64_t **B, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAMELEON_Complex64_t **B, int ldb, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zgemm_p2f1( const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, 
int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAMELEON_Complex64_t **B, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc, - const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1 ); + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAMELEON_Complex64_t **B, int ldb, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc, + const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1 ); void INSERT_TASK_zgemm_p3( const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t **C, int ldc ); + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb, + CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t **C, int ldc ); void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + int m, int n, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ); void INSERT_TASK_zgessm( const RUNTIME_option_t *options, - int m, int n, int k, int ib, int nb, - int *IPIV, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, - const CHAM_desc_t *D, int Dm, int Dn, int ldd, - const CHAM_desc_t *A, int Am, int An, int lda ); + int m, int n, int k, int ib, int nb, + int *IPIV, + const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *D, int Dm, int Dn, int ldd, + const 
CHAM_desc_t *A, int Am, int An, int lda ); void INSERT_TASK_zgessq( const RUNTIME_option_t *options, - int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); + int m, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zgetrf( const RUNTIME_option_t *options, - int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - int *IPIV, - cham_bool_t check_info, int iinfo ); + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + int *IPIV, + cham_bool_t check_info, int iinfo ); void INSERT_TASK_zgetrf_incpiv( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, - int *IPIV, - cham_bool_t check_info, int iinfo ); + int m, int n, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *L, int Lm, int Ln, int ldl, + int *IPIV, + cham_bool_t check_info, int iinfo ); void INSERT_TASK_zgetrf_nopiv( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, int iinfo ); + int m, int n, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, int iinfo ); void INSERT_TASK_zgetrf_reclap( const RUNTIME_option_t *options, - int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - int *IPIV, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + int *IPIV, - cham_bool_t check_info, int iinfo, - int nbthread ); + cham_bool_t check_info, int iinfo, + int nbthread ); void INSERT_TASK_zgetrf_rectil( const RUNTIME_option_t *options, - const CHAM_desc_t A, const CHAM_desc_t *Amn, int Amnm, int Amnn, int size, - int *IPIV, + const CHAM_desc_t A, const CHAM_desc_t *Amn, int Amnm, int Amnn, int size, + int *IPIV, - cham_bool_t check_info, int iinfo, - int 
nbthread ); + cham_bool_t check_info, int iinfo, + int nbthread ); void INSERT_TASK_zgetrip( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA ); + int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA ); void INSERT_TASK_zgetrip_f1( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA, - const CHAM_desc_t *fake, int fakem, int faken, int szeF, int paramF ); + int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA, + const CHAM_desc_t *fake, int fakem, int faken, int szeF, int paramF ); void INSERT_TASK_zgetrip_f2( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA, - const CHAM_desc_t *fake1, int fake1m, int fake1n, int szeF1, int paramF1, - const CHAM_desc_t *fake2, int fake2m, int fake2n, int szeF2, int paramF2 ); + int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA, + const CHAM_desc_t *fake1, int fake1m, int fake1n, int szeF1, int paramF1, + const CHAM_desc_t *fake2, int fake2m, int fake2n, int szeF2, int paramF2 ); void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options, - cham_uplo_t uplo, - int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + cham_uplo_t uplo, + int m, int n, int mb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_zhemm( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb, + CHAMELEON_Complex64_t beta, const CHAM_desc_t 
*C, int Cm, int Cn, int ldc ); void INSERT_TASK_zhegst( const RUNTIME_option_t *options, - int itype, cham_uplo_t uplo, int N, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn, int LDB, - int iinfo ); + int itype, cham_uplo_t uplo, int N, + const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *B, int Bm, int Bn, int LDB, + int iinfo ); void INSERT_TASK_zherk( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - double alpha, const CHAM_desc_t *A, int Am, int An, int lda, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); -void INSERT_TASK_zher2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int LDB, + double alpha, const CHAM_desc_t *A, int Am, int An, int lda, double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); +void INSERT_TASK_zher2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int LDB, + double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zherfb( const RUNTIME_option_t *options, - cham_uplo_t uplo, - int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + cham_uplo_t uplo, + int n, int k, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *T, int Tm, int Tn, int ldt, + const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); -void 
INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - int displA, const CHAM_desc_t *A, int Am, int An, int lda, - int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ); +void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, int mb, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_zlange( const RUNTIME_option_t *options, - cham_normtype_t norm, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn ); + cham_normtype_t norm, int M, int N, int NB, + const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlange_max( const RUNTIME_option_t *options, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zhessq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); + cham_uplo_t uplo, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zlanhe( const RUNTIME_option_t *options, - cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn ); + cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, + const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlansy( const RUNTIME_option_t *options, - cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, 
int Bn ); + cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, + const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlantr( const RUNTIME_option_t *options, - cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, - int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn ); + cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, + int M, int N, int NB, + const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlaset( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea ); -void INSERT_TASK_zlaset2( const RUNTIME_option_t *options, cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea ); + CHAMELEON_Complex64_t beta, const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea ); +void INSERT_TASK_zlaset2( const RUNTIME_option_t *options, + cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, + const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea ); void INSERT_TASK_zlaswp( const RUNTIME_option_t *options, - int n, const CHAM_desc_t *A, int Am, int An, int lda, - int i1, int i2, int *ipiv, int inc ); + int n, const CHAM_desc_t *A, int Am, int An, int lda, + int i1, int i2, int *ipiv, int inc ); void INSERT_TASK_zlaswp_f2( const RUNTIME_option_t *options, - int n, const CHAM_desc_t *A, int Am, int An, int lda, - int i1, int i2, int *ipiv, int inc, - const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1, - const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 ); + int n, const CHAM_desc_t *A, int Am, int An, int lda, + int i1, int i2, int *ipiv, int inc, + const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1, + 
const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 ); void INSERT_TASK_zlaswp_ontile( const RUNTIME_option_t *options, - const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An, - int i1, int i2, int *ipiv, int inc, CHAMELEON_Complex64_t *fakepanel ); -void INSERT_TASK_zlaswp_ontile_f2( const RUNTIME_option_t *options, - const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An, - int i1, int i2, int *ipiv, int inc, - const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1, - const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 ); -void INSERT_TASK_zlaswpc_ontile( const RUNTIME_option_t *options, const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An, int i1, int i2, int *ipiv, int inc, CHAMELEON_Complex64_t *fakepanel ); +void INSERT_TASK_zlaswp_ontile_f2( const RUNTIME_option_t *options, + const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An, + int i1, int i2, int *ipiv, int inc, + const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1, + const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 ); +void INSERT_TASK_zlaswpc_ontile( const RUNTIME_option_t *options, + const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An, + int i1, int i2, int *ipiv, int inc, CHAMELEON_Complex64_t *fakepanel ); void INSERT_TASK_zlatro( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_zlauum( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda ); + cham_uplo_t uplo, int n, int nb, + const CHAM_desc_t *A, int Am, int An, int lda ); void INSERT_TASK_zplghe( const 
RUNTIME_option_t *options, - double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, - int bigM, int m0, int n0, unsigned long long int seed ); + double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + int bigM, int m0, int n0, unsigned long long int seed ); void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, - CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, - int bigM, int m0, int n0, unsigned long long int seed ); + CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + int bigM, int m0, int n0, unsigned long long int seed ); void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, - int bigM, int m0, int n0, unsigned long long int seed ); + int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + int bigM, int m0, int n0, unsigned long long int seed ); void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + cham_uplo_t uplo, int n, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, - int iinfo ); + int iinfo ); void INSERT_TASK_zshift( const RUNTIME_option_t *options, - int s, int m, int n, int L, - CHAMELEON_Complex64_t *A ); + int s, int m, int n, int L, + CHAMELEON_Complex64_t *A ); void INSERT_TASK_zshiftw( const RUNTIME_option_t *options, - int s, int cl, int m, int n, int L, - const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t *W ); + int s, int cl, int m, int n, int L, + const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t *W ); void INSERT_TASK_zssssm( const RUNTIME_option_t *options, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *L1, int L1m, int L1n, int ldl1, - const CHAM_desc_t *L2, int L2m, int L2n, int ldl2, - const 
int *IPIV ); + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *L1, int L1m, int L1n, int ldl1, + const CHAM_desc_t *L2, int L2m, int L2n, int ldl2, + const int *IPIV ); void INSERT_TASK_zsymm( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); -void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int LDB, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); +void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int LDB, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int 
SCALESUMSQm, int SCALESUMSQn ); + cham_uplo_t uplo, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - int iinfo ); + cham_uplo_t uplo, int n, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + int iinfo ); void INSERT_TASK_zswpab( const RUNTIME_option_t *options, - int i, int n1, int n2, - const CHAM_desc_t *A, int Am, int An, int szeA ); + int i, int n1, int n2, + const CHAM_desc_t *A, int Am, int An, int szeA ); void INSERT_TASK_zswptr_ontile( const RUNTIME_option_t *options, - const CHAM_desc_t descA, const CHAM_desc_t *Aij, int Aijm, int Aijn, - int i1, int i2, int *ipiv, int inc, - const CHAM_desc_t *Akk, int Akkm, int Akkn, int ldak ); + const CHAM_desc_t descA, const CHAM_desc_t *Aij, int Aijm, int Aijn, + int i1, int i2, int *ipiv, int inc, + const CHAM_desc_t *Akk, int Akkm, int Akkn, int ldak ); void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, - int m, int n, int l, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); -void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, + int m, int n, int l, int ib, int nb, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + const CHAM_desc_t *B, int Bm, int Bn, int ldb, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ); +void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, + cham_side_t side, cham_trans_t trans, + int M, int N, int K, int L, int ib, int nb, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int 
Tm, int Tn, int ldt, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m, int n, int k, int l, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + cham_side_t side, cham_trans_t trans, + int m, int n, int k, int l, int ib, int nb, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, - int m, int n, int l, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + int m, int n, int l, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ); void INSERT_TASK_ztrdalg( const RUNTIME_option_t *options, - cham_uplo_t uplo, - int N, int NB, - const CHAM_desc_t *A, - const CHAM_desc_t *C, int Cm, int Cn, - const CHAM_desc_t *S, int Sm, int Sn, - int i, int j, int m, int grsiz, int BAND, - int *PCOL, int *ACOL, int *MCOL ); + cham_uplo_t uplo, + int N, int NB, + const CHAM_desc_t *A, + const CHAM_desc_t *C, int Cm, int Cn, + const CHAM_desc_t *S, int Sm, int Sn, + int i, int j, int m, int grsiz, int BAND, + int *PCOL, int *ACOL, int *MCOL ); void INSERT_TASK_ztradd( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + cham_uplo_t uplo, cham_trans_t 
trans, int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_ztrasm( const RUNTIME_option_t *options, - cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn ); + cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_ztrmm_p2( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t **B, int ldb ); + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, + CHAMELEON_Complex64_t **B, int ldb ); void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t 
alpha, const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_diag_t diag, - int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); + cham_uplo_t uplo, cham_diag_t diag, + int m, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_diag_t diag, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + cham_uplo_t uplo, cham_diag_t diag, int n, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, - int iinfo ); -void INSERT_TASK_ztslqt( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); -void INSERT_TASK_ztsmlq( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + int iinfo ); void INSERT_TASK_ztsmlq_hetra1( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); -void INSERT_TASK_ztsmqr( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const 
CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + cham_side_t side, cham_trans_t trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ); void INSERT_TASK_ztsmqr_hetra1( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); -void INSERT_TASK_ztsqrt( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + cham_side_t side, cham_trans_t trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ); void INSERT_TASK_ztstrf( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *U, int Um, int Un, int ldu, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, - int *IPIV, - cham_bool_t check_info, int iinfo ); -void INSERT_TASK_zttmqr( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, 
int Tn, int ldt ); -void INSERT_TASK_zttqrt( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); -void INSERT_TASK_zttmlq( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, + int m, int n, int ib, int nb, + const CHAM_desc_t *U, int Um, int Un, int ldu, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *L, int Lm, int Ln, int ldl, + int *IPIV, + cham_bool_t check_info, int iinfo ); +void INSERT_TASK_zpamm( const RUNTIME_option_t *options, + int op, cham_side_t side, cham_store_t storev, + int m, int n, int k, int l, const CHAM_desc_t *A1, int A1m, int A1n, int lda1, const CHAM_desc_t *A2, int A2m, int A2n, int lda2, const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); -void INSERT_TASK_zttlqt( const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); -void INSERT_TASK_zpamm( const RUNTIME_option_t *options, - int op, cham_side_t side, cham_store_t storev, - int m, int n, int k, int l, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *W, int Wm, int Wn, int ldw ); + const CHAM_desc_t *W, int Wm, int Wn, int ldw ); void INSERT_TASK_zplssq( const RUNTIME_option_t *options, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, - const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ); + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, + const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ); void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, - 
const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ); + const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ); void INSERT_TASK_zunmlq( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m, int n, int ib, int nb, int k, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + cham_side_t side, cham_trans_t trans, + int m, int n, int ib, int nb, int k, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *T, int Tm, int Tn, int ldt, + const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zunmqr( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + cham_side_t side, cham_trans_t trans, + int m, int n, int k, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *T, int Tm, int Tn, int ldt, + const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zbuild( const RUNTIME_option_t *options, - const CHAM_desc_t *A, int Am, int An, int lda, - void *user_data, void* user_build_callback ); + const CHAM_desc_t *A, int Am, int An, int lda, + void *user_data, void* user_build_callback ); + + +/** + * Keep these insert_task for retro-compatibility + */ +static inline void +INSERT_TASK_ztslqt( const RUNTIME_option_t *options, + int m, int n, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztplqt( options, m, n, 0, ib, nb, + A1, A1m, A1n, lda1, + A2, A2m, A2n, lda2, + T, Tm, Tn, ldt ); +} + +static inline void +INSERT_TASK_ztsqrt( const RUNTIME_option_t *options, + int m, int n, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int 
lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztpqrt( options, m, n, 0, ib, nb, + A1, A1m, A1n, lda1, + A2, A2m, A2n, lda2, + T, Tm, Tn, ldt ); +} + +static inline void +INSERT_TASK_zttlqt( const RUNTIME_option_t *options, + int m, int n, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztplqt( options, m, n, n, ib, nb, + A1, A1m, A1n, lda1, + A2, A2m, A2n, lda2, + T, Tm, Tn, ldt ); +} + +static inline void +INSERT_TASK_zttqrt( const RUNTIME_option_t *options, + int m, int n, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztpqrt( options, m, n, m, ib, nb, + A1, A1m, A1n, lda1, + A2, A2m, A2n, lda2, + T, Tm, Tn, ldt ); +} + +static inline void +INSERT_TASK_ztsmlq( const RUNTIME_option_t *options, + cham_side_t side, cham_trans_t trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, 0, ib, nb, + V, Vm, Vn, ldv, T, Tm, Tn, ldt, + A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); +} + +static inline void +INSERT_TASK_ztsmqr( const RUNTIME_option_t *options, + cham_side_t side, cham_trans_t trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, 0, ib, 
nb, + V, Vm, Vn, ldv, T, Tm, Tn, ldt, + A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); +} + +static inline void +INSERT_TASK_zttmlq( const RUNTIME_option_t *options, + cham_side_t side, cham_trans_t trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, n2, ib, nb, + V, Vm, Vn, ldv, T, Tm, Tn, ldt, + A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); +} + +static inline void +INSERT_TASK_zttmqr( const RUNTIME_option_t *options, + cham_side_t side, cham_trans_t trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const CHAM_desc_t *A1, int A1m, int A1n, int lda1, + const CHAM_desc_t *A2, int A2m, int A2n, int lda2, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt ) +{ + return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, m2, ib, nb, + V, Vm, Vn, ldv, T, Tm, Tn, ldt, + A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); +} #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index f14ef4838888f5481b52774b5b63116825fc00d8..73503ee0ce89aff056eb5971f4904bdbe0787315 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -21,7 +21,7 @@ # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge -# @date 2012-07-13 +# @date 2018-11-07 # ### @@ -86,17 +86,9 @@ set(CODELETS_ZSRC codelets/codelet_ztrasm.c codelets/codelet_ztrssq.c codelets/codelet_ztrtri.c - codelets/codelet_ztslqt.c - codelets/codelet_ztsmlq.c - codelets/codelet_ztsmqr.c codelets/codelet_ztsmlq_hetra1.c codelets/codelet_ztsmqr_hetra1.c - codelets/codelet_ztsqrt.c codelets/codelet_ztstrf.c - codelets/codelet_zttlqt.c - codelets/codelet_zttmlq.c - codelets/codelet_zttmqr.c - codelets/codelet_zttqrt.c 
codelets/codelet_zunmlq.c codelets/codelet_zunmqr.c ################## diff --git a/runtime/parsec/codelets/codelet_ztslqt.c b/runtime/parsec/codelets/codelet_ztslqt.c deleted file mode 100644 index 89c8721131a948e63657ebbd08eea7c553dca5c2..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_ztslqt.c +++ /dev/null @@ -1,70 +0,0 @@ -/** - * - * @file parsec/codelet_ztslqt.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon ztslqt PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_ztslqt_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - parsec_dtd_unpack_args( - this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK ); - - CORE_ztslqt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK ); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - -void INSERT_TASK_ztslqt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_ztslqt_parsec, options->priority, "tslqt", - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( 
A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/parsec/codelets/codelet_ztsmlq.c b/runtime/parsec/codelets/codelet_ztsmlq.c deleted file mode 100644 index 56b86887aec4bcf6c3ad6685d6f027a40b188491..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_ztsmlq.c +++ /dev/null @@ -1,89 +0,0 @@ -/** - * - * @file parsec/codelet_ztsmlq.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon ztsmlq PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_ztsmlq_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - parsec_dtd_unpack_args( - this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork ); - - CORE_ztsmlq( side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - -void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? 
ib : nb; - - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_ztsmlq_parsec, options->priority, "tsmlq", - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT, - sizeof(int), &ldv, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/parsec/codelets/codelet_ztsmqr.c b/runtime/parsec/codelets/codelet_ztsmqr.c deleted file mode 100644 index e8059bde8466f5957cd435dbc0e1a7b3b55714eb..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_ztsmqr.c +++ /dev/null @@ -1,89 +0,0 @@ -/** - * - * @file parsec/codelet_ztsmqr.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon ztsmqr PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_ztsmqr_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - parsec_dtd_unpack_args( - this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork ); - - CORE_ztsmqr( side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - -void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? 
ib : nb; - - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_ztsmqr_parsec, options->priority, "tsmqr", - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT, - sizeof(int), &ldv, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/parsec/codelets/codelet_ztsqrt.c b/runtime/parsec/codelets/codelet_ztsqrt.c deleted file mode 100644 index a8edb3c0fdf0562f9c374bdb2100919d6f825b97..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_ztsqrt.c +++ /dev/null @@ -1,70 +0,0 @@ -/** - * - * @file parsec/codelet_ztsqrt.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon ztsqrt PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_ztsqrt_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - parsec_dtd_unpack_args( - this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK ); - - CORE_ztsqrt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK ); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - -void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_ztsqrt_parsec, options->priority, "tsqrt", - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/parsec/codelets/codelet_zttlqt.c b/runtime/parsec/codelets/codelet_zttlqt.c deleted 
file mode 100644 index 1a72dd5cffefc3c4cc929558042fa10d5343aa33..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_zttlqt.c +++ /dev/null @@ -1,71 +0,0 @@ -/** - * - * @file parsec/codelet_zttlqt.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttlqt PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_zttlqt_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - parsec_dtd_unpack_args( - this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK ); - - CORE_zttlqt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK ); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - -void INSERT_TASK_zttlqt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_zttlqt_parsec, options->priority, "ttlqt", - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, 
A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/parsec/codelets/codelet_zttmlq.c b/runtime/parsec/codelets/codelet_zttmlq.c deleted file mode 100644 index b0788876c79e5197cbc30ffa4880b77a68531ff5..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_zttmlq.c +++ /dev/null @@ -1,89 +0,0 @@ -/** - * - * @file parsec/codelet_zttmlq.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttmlq PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_zttmlq_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - parsec_dtd_unpack_args( - this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork ); - - CORE_zttmlq( side, trans, m1, n1, m2, n2, k, ib, A1, lda1, - A2, lda2, V, ldv, T, ldt, WORK, ldwork); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - -void INSERT_TASK_zttmlq(const RUNTIME_option_t *options, - cham_side_t side, 
cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? ib : nb; - - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_zttmlq_parsec, options->priority, "ttmlq", - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT, - sizeof(int), &ldv, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/parsec/codelets/codelet_zttmqr.c b/runtime/parsec/codelets/codelet_zttmqr.c deleted file mode 100644 index f8a8b8f6bc9fb415323554d7eb8efea1dab0725d..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_zttmqr.c +++ /dev/null @@ -1,90 +0,0 @@ -/** - * - * @file parsec/codelet_zttmqr.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. 
All rights reserved. - * - *** - * - * @brief Chameleon zttmqr PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_zttmqr_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - parsec_dtd_unpack_args( - this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork ); - - CORE_zttmqr( side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - - -void INSERT_TASK_zttmqr(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? 
ib : nb; - - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_zttmqr_parsec, options->priority, "ttmqr", - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT, - sizeof(int), &ldv, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/parsec/codelets/codelet_zttqrt.c b/runtime/parsec/codelets/codelet_zttqrt.c deleted file mode 100644 index 6b22180076622b164425b06089774675fc873cfb..0000000000000000000000000000000000000000 --- a/runtime/parsec/codelets/codelet_zttqrt.c +++ /dev/null @@ -1,70 +0,0 @@ -/** - * - * @file parsec/codelet_zttqrt.c - * - * @copyright 2009-2015 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon zttqrt PaRSEC codelet - * - * @version 1.0.0 - * @author Reazul Hoque - * @precisions normal z -> c d s - * - */ -#include "chameleon_parsec.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_zttqrt_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - parsec_dtd_unpack_args( - this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK ); - - CORE_zttqrt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK ); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} - -void INSERT_TASK_zttqrt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_zttqrt_parsec, options->priority, "ttqrt", - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, - PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, - PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - PARSEC_DTD_ARG_END ); -} diff --git a/runtime/quark/codelets/codelet_ztplqt.c b/runtime/quark/codelets/codelet_ztplqt.c index 
5b3f15ab4527f3d753d3e74d1d65932f6ce84e34..f0e51b3754d6460ea9d3be4a8e9d4f826c8a997f 100644 --- a/runtime/quark/codelets/codelet_ztplqt.c +++ b/runtime/quark/codelets/codelet_ztplqt.c @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -50,7 +50,7 @@ void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, const CHAM_desc_t *T, int Tm, int Tn, int ldt ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSLQT; + DAG_CORE_TPLQT; int shapeB = ( L == 0 ) ? 0 : (QUARK_REGION_L | QUARK_REGION_D); diff --git a/runtime/quark/codelets/codelet_ztpmlqt.c b/runtime/quark/codelets/codelet_ztpmlqt.c index e82f40c4dbe58525926f2381eef3cea39fb6ccdc..fa435550dbb7cea349898f43b5e8fa9ec71c8ef4 100644 --- a/runtime/quark/codelets/codelet_ztpmlqt.c +++ b/runtime/quark/codelets/codelet_ztpmlqt.c @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -57,7 +57,7 @@ void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSMQR; + DAG_CORE_TPMLQT; int shapeV = ( L == 0 ) ? 
0 : (QUARK_REGION_L | QUARK_REGION_D); diff --git a/runtime/quark/codelets/codelet_ztpmqrt.c b/runtime/quark/codelets/codelet_ztpmqrt.c index 933ca3f1327d999f671f786002e6f559eb0130f8..bdf6627273065b4bdf18d93f07384628d5b113f7 100644 --- a/runtime/quark/codelets/codelet_ztpmqrt.c +++ b/runtime/quark/codelets/codelet_ztpmqrt.c @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -57,7 +57,7 @@ void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSMQR; + DAG_CORE_TPMQRT; int shapeV = ( L == 0 ) ? 0 : (QUARK_REGION_U | QUARK_REGION_D); diff --git a/runtime/quark/codelets/codelet_ztpqrt.c b/runtime/quark/codelets/codelet_ztpqrt.c index 50470ac8e806aed4479679a577629091f573b4cd..24ce98e124023f90184379c211200d21693ed503 100644 --- a/runtime/quark/codelets/codelet_ztpqrt.c +++ b/runtime/quark/codelets/codelet_ztpqrt.c @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -50,7 +50,7 @@ void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, const CHAM_desc_t *T, int Tm, int Tn, int ldt ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSQRT; + DAG_CORE_TPQRT; int shapeB = ( L == 0 ) ? 0 : (QUARK_REGION_U | QUARK_REGION_D); diff --git a/runtime/quark/codelets/codelet_ztslqt.c b/runtime/quark/codelets/codelet_ztslqt.c deleted file mode 100644 index 4efb19be91de19cb3819cf674e4a08a9f1defaaf..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_ztslqt.c +++ /dev/null @@ -1,143 +0,0 @@ -/** - * - * @file quark/codelet_ztslqt.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. 
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon ztslqt Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_ztslqt_quark(Quark *quark) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); - CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztslqt computes a LQ factorization of a rectangular matrix - * formed by coupling side-by-side a complex M-by-M - * lower triangular tile A1 and a complex M-by-N tile A2: - * - * | A1 A2 | = L * Q - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). - * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A2(i,1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A1 and A2. M >= 0. - * The number of columns of the tile A1. - * - * @param[in] N - * The number of columns of the tile A2. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. 
- * - * @param[in,out] A1 - * On entry, the M-by-M tile A1. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-M lower trapezoidal tile L; - * the elements above the diagonal are not referenced. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M). - * - * @param[in,out] A2 - * On entry, the M-by-N tile A2. - * On exit, all the elements with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). - * - * @param[out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_ztslqt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSLQT; - QUARK_Insert_Task(opt->quark, CORE_ztslqt_quark, (Quark_Task_Flags*)opt, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_L | QUARK_REGION_D, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, - sizeof(int), &lda2, VALUE, - 
sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - 0); -} diff --git a/runtime/quark/codelets/codelet_ztsmlq.c b/runtime/quark/codelets/codelet_ztsmlq.c deleted file mode 100644 index b3003d130f6c08c8e5b60130b13b1baf5653ed04..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_ztsmlq.c +++ /dev/null @@ -1,190 +0,0 @@ -/** - * - * @file quark/codelet_ztsmlq.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon ztsmlq Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Azzam Haidar - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_ztsmlq_quark(Quark *quark) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztsmlq overwrites the general 
complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 A2 | * Q - * | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 A2 | * Q**H - * | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)' - * - * as returned by CORE_ZTSLQT. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * M2 = M1 if side == ChamRight. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * N2 = N1 if side == ChamLeft. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTSLQT in the first k rows of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[in] T - * The IB-by-N1 triangular factor T of the block reflector. 
- * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] WORK - * Workspace array of size - * LDWORK-by-M1 if side == ChamLeft - * LDWORK-by-IB if side == ChamRight - * - * @param[in] LDWORK - * The leading dimension of the array WORK. - * LDWORK >= max(1,IB) if side == ChamLeft - * LDWORK >= max(1,N1) if side == ChamRight - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ -void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? 
ib : nb; - - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSMLQ; - QUARK_Insert_Task(opt->quark, CORE_ztsmlq_quark, (Quark_Task_Flags*)opt, - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - 0); -} diff --git a/runtime/quark/codelets/codelet_ztsmqr.c b/runtime/quark/codelets/codelet_ztsmqr.c deleted file mode 100644 index afcde5dfa02d9f67e8363f57698b47bd256ea59c..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_ztsmqr.c +++ /dev/null @@ -1,190 +0,0 @@ -/** - * - * @file quark/codelet_ztsmqr.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon ztsmqr Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Azzam Haidar - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_ztsmqr_quark(Quark *quark) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztsmqr overwrites the general complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 A2 | * Q - * | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 A2 | * Q**H - * | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_ZTSQRT. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. 
- * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * M2 = M1 if side == ChamRight. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * N2 = N1 if side == ChamLeft. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTSQRT in the first k columns of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[in] T - * The IB-by-N1 triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] WORK - * Workspace array of size - * LDWORK-by-N1 if side == ChamLeft - * LDWORK-by-IB if side == ChamRight - * - * @param[in] LDWORK - * The leading dimension of the array WORK. 
- * LDWORK >= max(1,IB) if side == ChamLeft - * LDWORK >= max(1,M1) if side == ChamRight - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ -void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? ib : nb; - - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSMQR; - QUARK_Insert_Task(opt->quark, CORE_ztsmqr_quark, (Quark_Task_Flags*)opt, - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - 0); -} diff --git a/runtime/quark/codelets/codelet_ztsqrt.c b/runtime/quark/codelets/codelet_ztsqrt.c deleted file mode 100644 index 44457debb9112a93fd00ad3a6f6585d7f9d49ef0..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_ztsqrt.c +++ /dev/null @@ -1,131 +0,0 @@ -/** - * - * @file 
quark/codelet_ztsqrt.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon ztsqrt Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_ztsqrt_quark(Quark *quark) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); - CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztsqrt computes a QR factorization of a rectangular matrix - * formed by coupling a complex N-by-N upper triangular tile A1 - * on top of a complex M-by-N tile A2: - * - * | A1 | = Q * R - * | A2 | - * - ******************************************************************************* - * - * @param[in] M - * The number of columns of the tile A2. M >= 0. - * - * @param[in] N - * The number of rows of the tile A1. - * The number of columns of the tiles A1 and A2. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the N-by-N tile A1. - * On exit, the elements on and above the diagonal of the array - * contain the N-by-N upper trapezoidal tile R; - * the elements below the diagonal are not referenced. 
- * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,N). - * - * @param[in,out] A2 - * On entry, the M-by-N tile A2. - * On exit, all the elements with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). - * - * @param[out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ -void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TSQRT; - QUARK_Insert_Task(opt->quark, CORE_ztsqrt_quark, (Quark_Task_Flags*)opt, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_U | QUARK_REGION_D, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - 0); -} diff --git 
a/runtime/quark/codelets/codelet_zttlqt.c b/runtime/quark/codelets/codelet_zttlqt.c deleted file mode 100644 index 85eb8e3d80859879501a3592eff0ec4628c88e13..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_zttlqt.c +++ /dev/null @@ -1,143 +0,0 @@ -/** - * - * @file quark/codelet_zttlqt.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttlqt Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_zttlqt_quark(Quark *quark) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); - CORE_zttlqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttlqt computes a LQ factorization of a rectangular matrix - * formed by coupling side-by-side a complex M-by-M lower triangular tile A1 - * and a complex M-by-N lower triangular tile A2: - * - * | A1 A2 | = L * Q - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). 
- * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A2(i,1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A1 and A2. M >= 0. - * The number of columns of the tile A1. - * - * @param[in] N - * The number of columns of the tile A2. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M-by-M tile A1. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-M lower trapezoidal tile L; - * the elements above the diagonal are not referenced. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,N). - * - * @param[in,out] A2 - * On entry, the M-by-N lower triangular tile A2. - * On exit, the elements on and below the diagonal of the array - * with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the array A2. LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[in,out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ -void INSERT_TASK_zttlqt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TTLQT; - QUARK_Insert_Task(opt->quark, CORE_zttlqt_quark, (Quark_Task_Flags*)opt, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_L | QUARK_REGION_D, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | QUARK_REGION_L | QUARK_REGION_D | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - 0); -} diff --git a/runtime/quark/codelets/codelet_zttmlq.c b/runtime/quark/codelets/codelet_zttmlq.c deleted file mode 100644 index f3701869ca32a8a1376a8cf04d500eb8049a36f0..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_zttmlq.c +++ /dev/null @@ -1,182 +0,0 @@ -/** - * - * @file quark/codelet_zttmlq.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon zttmlq Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_zttmlq_quark(Quark *quark) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, - A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttmlq overwrites the general complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 (N1 == N2) with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 | * Q - * | A2 | | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 | * Q**H - * | A2 | | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_zttqrt. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. 
- * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTTQRT in the first k rows of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[in] T - * The IB-by-N1 triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] WORK - * Workspace array of size LDWORK-by-N1. - * - * @param[in] LDWORK - * The dimension of the array WORK. LDWORK >= max(1,IB). 
- * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ -void INSERT_TASK_zttmlq(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? ib : nb; - - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TTMLQ; - QUARK_Insert_Task(opt->quark, CORE_zttmlq_quark, (Quark_Task_Flags*)opt, - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT | QUARK_REGION_L | QUARK_REGION_D, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - 0); -} diff --git a/runtime/quark/codelets/codelet_zttmqr.c b/runtime/quark/codelets/codelet_zttmqr.c deleted file mode 100644 index e106a34ce42948c3786f10f3af9930368299a781..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_zttmqr.c +++ /dev/null @@ -1,183 +0,0 @@ -/** - * - * @file quark/codelet_zttmqr.c - * - * @copyright 2009-2014 The University of 
Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttmqr Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static void -CORE_zttmqr_quark( Quark *quark ) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttmqr overwrites the general complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 (N1 == N2) with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 | * Q - * | A2 | | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 | * Q**H - * | A2 | | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_zttqrt. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. 
- * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTTQRT in the first k rows of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[in] T - * The IB-by-N1 triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] WORK - * Workspace array of size LDWORK-by-N1. - * - * @param[in] LDWORK - * The dimension of the array WORK. LDWORK >= max(1,IB). 
- * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ -void INSERT_TASK_zttmqr(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - int ldwork = side == ChamLeft ? ib : nb; - - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TTMQR; - QUARK_Insert_Task(opt->quark, CORE_zttmqr_quark, (Quark_Task_Flags*)opt, - sizeof(int), &side, VALUE, - sizeof(int), &trans, VALUE, - sizeof(int), &m1, VALUE, - sizeof(int), &n1, VALUE, - sizeof(int), &m2, VALUE, - sizeof(int), &n2, VALUE, - sizeof(int), &k, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT | QUARK_REGION_U | QUARK_REGION_D, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - sizeof(int), &ldwork, VALUE, - 0); -} diff --git a/runtime/quark/codelets/codelet_zttqrt.c b/runtime/quark/codelets/codelet_zttqrt.c deleted file mode 100644 index d5f62c44110d6d8f39baecfd9ca2ba75c611f0b1..0000000000000000000000000000000000000000 --- a/runtime/quark/codelets/codelet_zttqrt.c +++ /dev/null @@ -1,143 +0,0 @@ -/** - * - * @file quark/codelet_zttqrt.c - * - * @copyright 2009-2014 The University of 
Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttqrt Quark codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_quark.h" -#include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_zttqrt_quark(Quark *quark) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); - CORE_zttqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttqrt computes a QR factorization of a rectangular matrix - * formed by coupling a complex N-by-N upper triangular tile A1 - * on top of a complex M-by-N upper trapezoidal tile A2: - * - * | A1 | = Q * R - * | A2 | - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(1) H(2) . . . H(k), where k = min(M,N). - * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A2(1:m,i), - * and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A2. M >= 0. - * - * @param[in] N - * The number of columns of the tile A1 and A2. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. 
- * - * @param[in,out] A1 - * On entry, the N-by-N tile A1. - * On exit, the elements on and above the diagonal of the array - * contain the N-by-N upper trapezoidal tile R; - * the elements below the diagonal are not referenced. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,N). - * - * @param[in,out] A2 - * On entry, the M-by-N upper triangular tile A2. - * On exit, the elements on and above the diagonal of the array - * with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the array A2. LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[in,out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ -void INSERT_TASK_zttqrt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - quark_option_t *opt = (quark_option_t*)(options->schedopt); - DAG_CORE_TTQRT; - QUARK_Insert_Task(opt->quark, CORE_zttqrt_quark, (Quark_Task_Flags*)opt, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_U | QUARK_REGION_D, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | QUARK_REGION_U | QUARK_REGION_D | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, - sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, - 0); -} diff --git a/runtime/quark/include/core_blas_dag.h b/runtime/quark/include/core_blas_dag.h index fbee7539cdfcaa3066a71b5b85eb243a903325dc..649330aa1871b6e60cd322ba34f49ba9d4839af8 100644 --- a/runtime/quark/include/core_blas_dag.h +++ b/runtime/quark/include/core_blas_dag.h @@ -14,7 +14,7 @@ * @version 1.0.0 * @author Mathieu Faverge * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-08 * */ #ifndef _core_blas_dag_h_ @@ -71,16 +71,21 @@ #define DAG_CORE_TRSM DAG_SET_PROPERTIES( "TRSM" , "cyan" ) #define DAG_CORE_TRSSQ DAG_SET_PROPERTIES( "TRSSQ" , "white" ) #define DAG_CORE_TRTRI DAG_SET_PROPERTIES( "TRTRI" , "white" ) -#define DAG_CORE_TSLQT DAG_SET_PROPERTIES( "TSLQT" , "red" ) 
-#define DAG_CORE_TSMLQ DAG_SET_PROPERTIES( "TSMLQ" , "yellow" ) -#define DAG_CORE_TSMQR DAG_SET_PROPERTIES( "TSMQR" , "yellow" ) -#define DAG_CORE_TSQRT DAG_SET_PROPERTIES( "TSQRT" , "red" ) +#define DAG_CORE_TPLQT DAG_SET_PROPERTIES( "TPLQT" , "red" ) +#define DAG_CORE_TPMLQT DAG_SET_PROPERTIES( "TPMLQT" , "yellow" ) +#define DAG_CORE_TPMQRT DAG_SET_PROPERTIES( "TPMQRT" , "yellow" ) +#define DAG_CORE_TPQRT DAG_SET_PROPERTIES( "TPQRT" , "red" ) #define DAG_CORE_TSTRF DAG_SET_PROPERTIES( "TSTRF" , "red" ) -#define DAG_CORE_TTLQT DAG_SET_PROPERTIES( "TTLQT" , "pink" ) -#define DAG_CORE_TTMLQ DAG_SET_PROPERTIES( "TTMLQ" , "magenta" ) -#define DAG_CORE_TTMQR DAG_SET_PROPERTIES( "TTMQR" , "magenta" ) -#define DAG_CORE_TTQRT DAG_SET_PROPERTIES( "TTQRT" , "pink" ) #define DAG_CORE_UNMLQ DAG_SET_PROPERTIES( "UNMLQ" , "cyan" ) #define DAG_CORE_UNMQR DAG_SET_PROPERTIES( "UNMQR" , "cyan" ) +#define DAG_CORE_TSLQT DAG_CORE_TPLQT +#define DAG_CORE_TSMLQ DAG_CORE_TPMLQT +#define DAG_CORE_TSMQR DAG_CORE_TPMQRT +#define DAG_CORE_TSQRT DAG_CORE_TPQRT +#define DAG_CORE_TTLQT DAG_CORE_TPLQT +#define DAG_CORE_TTMLQ DAG_CORE_TPMLQT +#define DAG_CORE_TTMQR DAG_CORE_TPMQRT +#define DAG_CORE_TTQRT DAG_CORE_TPQRT + #endif /* _core_blas_dag_h_ */ diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index c42035d967b99c896705c8341a70155e26678025..745b122e3f11a4c9d92a126369fb736d9b8bb4fe 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -15,7 +15,7 @@ * @author Mathieu Faverge * @author Cedric Augonnet * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-08 * @precisions normal z -> c d s * */ @@ -69,16 +69,8 @@ CHAMELEON_CL_CB(ztrasm, starpu_matrix_get_nx(task->handles[0]), starpu_ma CHAMELEON_CL_CB(ztrmm, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0, M*M*N) CHAMELEON_CL_CB(ztrsm, starpu_matrix_get_nx(task->handles[1]), 
starpu_matrix_get_ny(task->handles[1]), 0, M*M*N) CHAMELEON_CL_CB(ztrtri, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (1./3.)*M *M*M) -CHAMELEON_CL_CB(ztslqt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M) -CHAMELEON_CL_CB(ztsmlq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) -CHAMELEON_CL_CB(ztsmqr, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) CHAMELEON_CL_CB(ztsmlq_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) CHAMELEON_CL_CB(ztsmqr_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) -CHAMELEON_CL_CB(ztsqrt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M) CHAMELEON_CL_CB(ztstrf, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), M* M*M) -CHAMELEON_CL_CB(zttlqt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 1. 
*M* M*M) -CHAMELEON_CL_CB(zttmlq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) -CHAMELEON_CL_CB(zttmqr, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) -CHAMELEON_CL_CB(zttqrt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 1. *M* M*M) CHAMELEON_CL_CB(zunmlq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M) CHAMELEON_CL_CB(zunmqr, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M) diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c index 714b45762488101d571b7ed4a7f1132a256d5301..8dffa4ff2292c448c710713a7556bec3920f3d73 100644 --- a/runtime/starpu/codelets/codelet_ztpmlqt.c +++ b/runtime/starpu/codelets/codelet_ztpmlqt.c @@ -11,7 +11,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-07 * @precisions normal z -> s d c * */ @@ -37,6 +37,7 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *B; int ldb; CHAMELEON_Complex64_t *WORK; + size_t lwork; V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -45,13 +46,15 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg) WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); CORE_ztpmlqt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, WORK ); + + (void)lwork; } 
-#if defined(CHAMELEON_USE_CUDA) && 0 +#if defined(CHAMELEON_USE_CUDA) static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) { cham_side_t side; @@ -70,6 +73,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *B; int ldb; cuDoubleComplex *W; + size_t lwork; V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -78,14 +82,14 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); RUNTIME_getStream(stream); CUDA_ztpmlqt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, - W, stream ); + W, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -97,8 +101,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -CODELETS_CPU(ztpmlqt, 5, cl_ztpmlqt_cpu_func) -//CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC) +CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, @@ -136,6 +139,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, STARPU_VALUE, &lda, sizeof(int), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), STARPU_VALUE, &ldb, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c index 40f83ab2d137bfe6c9de11d58a4ad05307edb78e..6684e59f82d579cd129a1ff40c25dd377bc6166e 100644 --- a/runtime/starpu/codelets/codelet_ztpmqrt.c +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -11,7 +11,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 
2016-12-15 + * @date 2018-11-07 * @precisions normal z -> s d c * */ @@ -37,6 +37,7 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *B; int ldb; CHAMELEON_Complex64_t *WORK; + size_t lwork; V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -45,10 +46,12 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); CORE_ztpmqrt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, WORK ); + + (void)lwork; } @@ -71,22 +74,23 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *B; int ldb; cuDoubleComplex *W; + size_t lwork; V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ + W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 3*ib*nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); RUNTIME_getStream(stream); CUDA_ztpmqrt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, - W, stream ); + W, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -102,12 +106,12 @@ CODELETS(ztpmqrt, 5, cl_ztpmqrt_cpu_func, cl_ztpmqrt_cuda_func, STARPU_CUDA_ASYN void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int 
Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + cham_side_t side, cham_trans_t trans, + int M, int N, int K, int L, int ib, int nb, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { struct starpu_codelet *codelet = &cl_ztpmqrt; void (*callback)(void*) = options->profiling ? cl_ztpmqrt_callback : NULL; @@ -136,6 +140,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, STARPU_VALUE, &lda, sizeof(int), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), STARPU_VALUE, &ldb, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c deleted file mode 100644 index 870e1349af880473ae1e92787e84e0a6ad3df035..0000000000000000000000000000000000000000 --- a/runtime/starpu/codelets/codelet_ztslqt.c +++ /dev/null @@ -1,174 +0,0 @@ -/** - * - * @file starpu/codelet_ztslqt.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon ztslqt StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztslqt computes a LQ factorization of a rectangular matrix - * formed by coupling side-by-side a complex M-by-M - * lower triangular tile A1 and a complex M-by-N tile A2: - * - * | A1 A2 | = L * Q - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). - * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A2(i,1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A1 and A2. M >= 0. - * The number of columns of the tile A1. - * - * @param[in] N - * The number of columns of the tile A2. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M-by-M tile A1. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-M lower trapezoidal tile L; - * the elements above the diagonal are not referenced. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M). - * - * @param[in,out] A2 - * On entry, the M-by-N tile A2. - * On exit, all the elements with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the tile A2. 
LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). - * - * @param[out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_ztslqt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - (void)nb; - struct starpu_codelet *codelet = &cl_ztslqt; - void (*callback)(void*) = options->profiling ? cl_ztslqt_callback : NULL; - CHAMELEON_starpu_ws_t *h_work = (CHAMELEON_starpu_ws_t*)(options->ws_host); - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_W(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_VALUE, &ldt, sizeof(int), - /* max( nb * (ib+1), ib * (ib+nb) ) */ - STARPU_SCRATCH, options->ws_worker, - /* /\* 2 * ib * (nb+ib) + nb *\/ */ - STARPU_VALUE, &h_work, sizeof(CHAMELEON_starpu_ws_t *), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - 
STARPU_NAME, "ztslqt", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg) -{ - CHAMELEON_starpu_ws_t *h_work; - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU, *WORK; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - TAU= (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb + ib*nb */ - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work); - - WORK = TAU + chameleon_max( m, n ); - CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func) diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c deleted file mode 100644 index b0a2e38ec8ead1eb249c0e29ba5f98f8a8e91a90..0000000000000000000000000000000000000000 --- a/runtime/starpu/codelets/codelet_ztsmlq.c +++ /dev/null @@ -1,266 +0,0 @@ -/** - * - * @file starpu/codelet_ztsmlq.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. 
- * - *** - * - * @brief Chameleon ztsmlq StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Azzam Haidar - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztsmlq overwrites the general complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 A2 | * Q - * | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 A2 | * Q**H - * | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)' - * - * as returned by CORE_ZTSLQT. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * M2 = M1 if side == ChamRight. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * N2 = N1 if side == ChamLeft. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. 
LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTSLQT in the first k rows of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[in] T - * The IB-by-N1 triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] WORK - * Workspace array of size - * LDWORK-by-M1 if side == ChamLeft - * LDWORK-by-IB if side == ChamRight - * - * @param[in] LDWORK - * The leading dimension of the array WORK. - * LDWORK >= max(1,IB) if side == ChamLeft - * LDWORK >= max(1,N1) if side == ChamRight - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - struct starpu_codelet *codelet = &cl_ztsmlq; - void (*callback)(void*) = options->profiling ? cl_ztsmlq_callback : NULL; - int ldwork = side == ChamLeft ? 
ib : nb; - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_R(V, Vm, Vn); - CHAMELEON_ACCESS_R(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &m1, sizeof(int), - STARPU_VALUE, &n1, sizeof(int), - STARPU_VALUE, &m2, sizeof(int), - STARPU_VALUE, &n2, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_VALUE, &ldv, sizeof(int), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_VALUE, &ldt, sizeof(int), - /* max( ib*nb, 2*ib*nb ) */ - STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &ldwork, sizeof(int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "ztsmlq", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ - - starpu_codelet_unpack_args(cl_arg, &side, &trans, 
&m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} - -#if defined(CHAMELEON_USE_CUDA) -static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - cuDoubleComplex *A1; - int lda1; - cuDoubleComplex *A2; - int lda2; - cuDoubleComplex *V; - int ldv; - cuDoubleComplex *T; - int ldt; - cuDoubleComplex *W, *WC; - int ldwork; - int ldworkc; - - A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ - - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - WC = W + ib * ldwork; - ldworkc = (side == ChamLeft) ? 
m1 : ib; - - RUNTIME_getStream(stream); - - CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, - W, ldwork, WC, ldworkc, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif -} -#endif /* defined(CHAMELEON_USE_CUDA) */ -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS(ztsmlq, 5, cl_ztsmlq_cpu_func, cl_ztsmlq_cuda_func, STARPU_CUDA_ASYNC) diff --git a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c index 8996121be9fbce09fb360a4cbe0c7ec262cff5fc..d68e2bebf803ac1c449c30435dec5ea1883ba86e 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c @@ -15,7 +15,7 @@ * @author Hatem Ltaief * @author Mathieu Faverge * @author Azzam Haidar - * @date 2010-11-15 + * @date 2018-11-07 * @precisions normal z -> c d s * */ @@ -106,8 +106,8 @@ static void cl_ztsmlq_hetra1_cpu_func(void *descr[], void *cl_arg) T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, - &ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, + &ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork); CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); } diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c deleted file mode 100644 index c38a8fbd129656fc5681a25a00bd7a57cefdb6e9..0000000000000000000000000000000000000000 --- a/runtime/starpu/codelets/codelet_ztsmqr.c +++ /dev/null @@ -1,271 +0,0 @@ -/** - * - * @file starpu/codelet_ztsmqr.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. 
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon ztsmqr StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Azzam Haidar - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztsmqr overwrites the general complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 A2 | * Q - * | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 A2 | * Q**H - * | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_ZTSQRT. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * M2 = M1 if side == ChamRight. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * N2 = N1 if side == ChamLeft. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. 
- * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTSQRT in the first k columns of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[in] T - * The IB-by-N1 triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] WORK - * Workspace array of size - * LDWORK-by-N1 if side == ChamLeft - * LDWORK-by-IB if side == ChamRight - * - * @param[in] LDWORK - * The leading dimension of the array WORK. - * LDWORK >= max(1,IB) if side == ChamLeft - * LDWORK >= max(1,M1) if side == ChamRight - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - (void)nb; - struct starpu_codelet *codelet = &cl_ztsmqr; - void (*callback)(void*) = options->profiling ? cl_ztsmqr_callback : NULL; - int ldwork = side == ChamLeft ? 
ib : nb; - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_R(V, Vm, Vn); - CHAMELEON_ACCESS_R(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &m1, sizeof(int), - STARPU_VALUE, &n1, sizeof(int), - STARPU_VALUE, &m2, sizeof(int), - STARPU_VALUE, &n2, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_VALUE, &ldv, sizeof(int), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_VALUE, &ldt, sizeof(int), - /* max( ib*nb, 2*ib*nb ) */ - STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &ldwork, sizeof(int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_USE_MPI) - STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n), -#endif -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "ztsmqr", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t 
*)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ - - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} - -#if defined(CHAMELEON_USE_CUDA) -static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - cuDoubleComplex *A1; - int lda1; - cuDoubleComplex *A2; - int lda2; - cuDoubleComplex *V; - int ldv; - cuDoubleComplex *T; - int ldt; - cuDoubleComplex *W, *WC; - int ldwork; - int ldworkc; - - A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ - - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - WC = W + ib * (side == ChamLeft ? m1 : n1); - ldworkc = (side == ChamLeft) ? 
m2 : ib; - - RUNTIME_getStream(stream); - - CUDA_ztsmqr( - side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, - W, ldwork, WC, ldworkc, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif -} -#endif /* defined(CHAMELEON_USE_CUDA) */ -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS(ztsmqr, 5, cl_ztsmqr_cpu_func, cl_ztsmqr_cuda_func, STARPU_CUDA_ASYNC) diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c deleted file mode 100644 index cb93ba7c22c3f832f365f293545366f03863bd6c..0000000000000000000000000000000000000000 --- a/runtime/starpu/codelets/codelet_ztsqrt.c +++ /dev/null @@ -1,166 +0,0 @@ -/** - * - * @file starpu/codelet_ztsqrt.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon ztsqrt StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztsqrt computes a QR factorization of a rectangular matrix - * formed by coupling a complex N-by-N upper triangular tile A1 - * on top of a complex M-by-N tile A2: - * - * | A1 | = Q * R - * | A2 | - * - ******************************************************************************* - * - * @param[in] M - * The number of columns of the tile A2. M >= 0. - * - * @param[in] N - * The number of rows of the tile A1. - * The number of columns of the tiles A1 and A2. N >= 0. 
- * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the N-by-N tile A1. - * On exit, the elements on and above the diagonal of the array - * contain the N-by-N upper trapezoidal tile R; - * the elements below the diagonal are not referenced. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,N). - * - * @param[in,out] A2 - * On entry, the M-by-N tile A2. - * On exit, all the elements with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). - * - * @param[out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - (void)nb; - struct starpu_codelet *codelet = &cl_ztsqrt; - void (*callback)(void*) = options->profiling ? 
cl_ztsqrt_callback : NULL; - CHAMELEON_starpu_ws_t *h_work = (CHAMELEON_starpu_ws_t*)(options->ws_host); - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_W(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn ), - STARPU_VALUE, &ldt, sizeof(int), - /* max( nb * (ib+1), ib * (ib+nb) ) */ - STARPU_SCRATCH, options->ws_worker, - /* 2 * ib * (nb+ib) + nb */ - STARPU_VALUE, &h_work, sizeof(CHAMELEON_starpu_ws_t *), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_USE_MPI) - STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n), -#endif -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "ztsqrt", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg) -{ - CHAMELEON_starpu_ws_t *h_work; - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU, *WORK; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - TAU= (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb + ib*nb */ - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work); - - WORK = TAU + chameleon_max( m, n ); - CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet 
definition - */ -CODELETS_CPU(ztsqrt, 4, cl_ztsqrt_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zttlqt.c b/runtime/starpu/codelets/codelet_zttlqt.c deleted file mode 100644 index a673832082e5aaa466b31b53230f0f6421c77333..0000000000000000000000000000000000000000 --- a/runtime/starpu/codelets/codelet_zttlqt.c +++ /dev/null @@ -1,173 +0,0 @@ -/** - * - * @file starpu/codelet_zttlqt.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttlqt StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttlqt computes a LQ factorization of a rectangular matrix - * formed by coupling side-by-side a complex M-by-M lower triangular tile A1 - * and a complex M-by-N lower triangular tile A2: - * - * | A1 A2 | = L * Q - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). - * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A2(i,1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A1 and A2. M >= 0. - * The number of columns of the tile A1. - * - * @param[in] N - * The number of columns of the tile A2. N >= 0. 
- * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M-by-M tile A1. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-M lower trapezoidal tile L; - * the elements above the diagonal are not referenced. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,N). - * - * @param[in,out] A2 - * On entry, the M-by-N lower triangular tile A2. - * On exit, the elements on and below the diagonal of the array - * with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the array A2. LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). - * - * @param[in,out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_zttlqt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - (void)nb; - struct starpu_codelet *codelet = &cl_zttlqt; - void (*callback)(void*) = options->profiling ? 
cl_zttlqt_callback : NULL; - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_W(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_VALUE, &ldt, sizeof(int), - /* nb * (ib+1) */ - STARPU_SCRATCH, options->ws_worker, - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zttlqt", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_zttlqt_cpu_func(void *descr[], void *cl_arg) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb * (ib+1) */ - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt); - - WORK = TAU + chameleon_max( m, n ); - - CORE_zttlqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS_CPU(zttlqt, 4, cl_zttlqt_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zttmlq.c b/runtime/starpu/codelets/codelet_zttmlq.c deleted file mode 100644 index c2924eafcef47724732584b52ab235208c578a78..0000000000000000000000000000000000000000 --- 
a/runtime/starpu/codelets/codelet_zttmlq.c +++ /dev/null @@ -1,212 +0,0 @@ -/** - * - * @file starpu/codelet_zttmlq.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttmlq StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttmlq overwrites the general complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 (N1 == N2) with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 | * Q - * | A2 | | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 | * Q**H - * | A2 | | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_zttqrt. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. 
- * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTTQRT in the first k rows of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[out] T - * The IB-by-N1 triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] WORK - * Workspace array of size LDWORK-by-N1. - * - * @param[in] LDWORK - * The dimension of the array WORK. LDWORK >= max(1,IB). - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_zttmlq(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - (void)nb; - struct starpu_codelet *codelet = &cl_zttmlq; - void (*callback)(void*) = options->profiling ? 
cl_zttmlq_callback : NULL; - int ldwork = side == ChamLeft ? ib : nb; - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_R(V, Vm, Vn); - CHAMELEON_ACCESS_R(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &m1, sizeof(int), - STARPU_VALUE, &n1, sizeof(int), - STARPU_VALUE, &m2, sizeof(int), - STARPU_VALUE, &n2, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_VALUE, &ldv, sizeof(int), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_VALUE, &ldt, sizeof(int), - /* nb * ib */ - STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &ldwork, sizeof(int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zttmlq", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_zttmlq_cpu_func(void *descr[], void *cl_arg) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* nb * ib */ - - 
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, - A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS_CPU(zttmlq, 5, cl_zttmlq_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zttmqr.c b/runtime/starpu/codelets/codelet_zttmqr.c deleted file mode 100644 index d485d16b9bcf582e1a86489612ad61b7724eb004..0000000000000000000000000000000000000000 --- a/runtime/starpu/codelets/codelet_zttmqr.c +++ /dev/null @@ -1,269 +0,0 @@ -/** - * - * @file starpu/codelet_zttmqr.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttmqr StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttmqr overwrites the general complex M1-by-N1 tile A1 and - * M2-by-N2 tile A2 (N1 == N2) with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * | A1 | | A1 | * Q - * | A2 | | A2 | - * - * TRANS = 'C': Q**H * | A1 | | A1 | * Q**H - * | A2 | | A2 | - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_zttqrt. 
- * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q**H from the Left; - * @arg ChamRight : apply Q or Q**H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : ConjTranspose, apply Q**H. - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2. M2 >= 0. - * M2 = M1 if side == ChamRight. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * N2 = N1 if side == ChamLeft. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is overwritten by the application of Q. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is overwritten by the application of Q. - * - * @param[in] LDA2 - * The leading dimension of the tile A2. LDA2 >= max(1,M2). - * - * @param[in] V - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_ZTTQRT in the first k columns of its array argument V. - * - * @param[in] LDV - * The leading dimension of the array V. LDV >= max(1,K). - * - * @param[in] T - * The IB-by-N1 triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. 
- * - * @param[out] WORK - * Workspace array of size - * LDWORK-by-N1 if side == ChamLeft - * LDWORK-by-IB if side == ChamRight - * - * @param[in] LDWORK - * The leading dimension of the array WORK. - * LDWORK >= max(1,IB) if side == ChamLeft - * LDWORK >= max(1,M1) if side == ChamRight - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_zttmqr(const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - (void)nb; - struct starpu_codelet *codelet = &cl_zttmqr; - void (*callback)(void*) = options->profiling ? cl_zttmqr_callback : NULL; - int ldwork = side == ChamLeft ? 
ib : nb; - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_R(V, Vm, Vn); - CHAMELEON_ACCESS_R(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &m1, sizeof(int), - STARPU_VALUE, &n1, sizeof(int), - STARPU_VALUE, &m2, sizeof(int), - STARPU_VALUE, &n2, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_VALUE, &ldv, sizeof(int), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_VALUE, &ldt, sizeof(int), - /* max( ib*nb, 2*ib*nb ) */ - STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &ldwork, sizeof(int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_USE_MPI) - STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n), -#endif -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zttmqr", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *WORK; - int ldwork; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t 
*)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ - - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); -} - -#if defined(CHAMELEON_USE_CUDA) -static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg) -{ - cham_side_t side; - cham_trans_t trans; - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - cuDoubleComplex *A1; - int lda1; - cuDoubleComplex *A2; - int lda2; - cuDoubleComplex *V; - int ldv; - cuDoubleComplex *T; - int ldt; - cuDoubleComplex *W, *WC; - int ldwork; - int ldworkc; - - A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ - - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - WC = W + ib * (side == ChamLeft ? m1 : n1); - ldworkc = (side == ChamLeft) ? 
m2 : ib; - - RUNTIME_getStream(stream); - - CUDA_zttmqr( - side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, - W, ldwork, WC, ldworkc, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif -} -#endif /* defined(CHAMELEON_USE_CUDA) */ -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS(zttmqr, 5, cl_zttmqr_cpu_func, cl_zttmqr_cuda_func, STARPU_CUDA_ASYNC) diff --git a/runtime/starpu/codelets/codelet_zttqrt.c b/runtime/starpu/codelets/codelet_zttqrt.c deleted file mode 100644 index 39d52185fdc68654e966b34db700b31733cde5c6..0000000000000000000000000000000000000000 --- a/runtime/starpu/codelets/codelet_zttqrt.c +++ /dev/null @@ -1,176 +0,0 @@ -/** - * - * @file starpu/codelet_zttqrt.c - * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. - * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, - * Univ. Bordeaux. All rights reserved. - * - *** - * - * @brief Chameleon zttqrt StarPU codelet - * - * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 1.0.0 - * @author Hatem Ltaief - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2010-11-15 - * @precisions normal z -> c d s - * - */ -#include "chameleon_starpu.h" -#include "runtime_codelet_z.h" - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zttqrt computes a QR factorization of a rectangular matrix - * formed by coupling a complex N-by-N upper triangular tile A1 - * on top of a complex M-by-N upper trapezoidal tile A2: - * - * | A1 | = Q * R - * | A2 | - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(1) H(2) . . . H(k), where k = min(M,N). 
- * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A2(1:m,i), - * and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A2. M >= 0. - * - * @param[in] N - * The number of columns of the tile A1 and A2. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the N-by-N tile A1. - * On exit, the elements on and above the diagonal of the array - * contain the N-by-N upper trapezoidal tile R; - * the elements below the diagonal are not referenced. - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,N). - * - * @param[in,out] A2 - * On entry, the M-by-N upper triangular tile A2. - * On exit, the elements on and above the diagonal of the array - * with the array TAU, represent - * the unitary tile Q as a product of elementary reflectors - * (see Further Details). - * - * @param[in] LDA2 - * The leading dimension of the array A2. LDA2 >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[in,out] WORK - * - ******************************************************************************* - * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_zttqrt(const RUNTIME_option_t *options, - int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) -{ - (void)nb; - struct starpu_codelet *codelet = &cl_zttqrt; - void (*callback)(void*) = options->profiling ? cl_zttqrt_callback : NULL; - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A1, A1m, A1n); - CHAMELEON_ACCESS_RW(A2, A2m, A2n); - CHAMELEON_ACCESS_W(T, Tm, Tn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_VALUE, &lda1, sizeof(int), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_VALUE, &lda2, sizeof(int), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_VALUE, &ldt, sizeof(int), - /* nb * (ib+1) */ - STARPU_SCRATCH, options->ws_worker, - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_USE_MPI) - STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n), -#endif -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zttqrt", -#endif - 0); -} - - -#if !defined(CHAMELEON_SIMULATION) -static void cl_zttqrt_cpu_func(void *descr[], void *cl_arg) -{ - int m; - int n; - int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *TAU; - CHAMELEON_Complex64_t *WORK; - - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t 
*)STARPU_MATRIX_GET_PTR(descr[1]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb * (ib+1) */ - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt); - - WORK = TAU + chameleon_max( m, n ); - - CORE_zttqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); -} -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS_CPU(zttqrt, 4, cl_zttqrt_cpu_func) diff --git a/runtime/starpu/control/runtime_zlocality.c b/runtime/starpu/control/runtime_zlocality.c index 0b2a7bc282dd6d39e852cb6d2679367d8b213f31..025e300c62775d79287593f316fb5671dea40345 100644 --- a/runtime/starpu/control/runtime_zlocality.c +++ b/runtime/starpu/control/runtime_zlocality.c @@ -15,7 +15,7 @@ * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede - * @date 2011-06-01 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -58,23 +58,15 @@ void RUNTIME_zlocality_allrestrict( uint32_t where ) /* QR */ cl_zgeqrt_restrict_where( where ); - cl_ztsqrt_restrict_where( where ); + cl_ztpqrt_restrict_where( where ); cl_zunmqr_restrict_where( where ); - cl_ztsmqr_restrict_where( where ); - - /* QR-RH */ -/* cl_zttqrt_restrict_where( where ); */ -/* cl_zttmqr_restrict_where( where ); */ + cl_ztpmqrt_restrict_where( where ); /* LQ */ cl_zgelqt_restrict_where( where ); - cl_ztslqt_restrict_where( where ); + cl_ztplqt_restrict_where( where ); cl_zunmlq_restrict_where( where ); - cl_ztsmlq_restrict_where( where ); - - /* LQ-RH */ -/* cl_zttlqt_restrict_where( where ); */ -/* cl_zttmlq_restrict_where( where ); */ + cl_ztpmlqt_restrict_where( where ); } @@ -112,23 +104,15 @@ void RUNTIME_zlocality_onerestrict( cham_tasktype_t kernel, uint32_t where ) /* QR */ case TASK_GEQRT: cl_zgeqrt_restrict_where( where ); break; + case TASK_TPQRT: cl_ztpqrt_restrict_where( where ); break; case TASK_UNMQR: cl_zunmqr_restrict_where( where ); break; - case TASK_TSMQR: 
cl_ztsmqr_restrict_where( where ); break; - case TASK_TSQRT: cl_ztsqrt_restrict_where( where ); break; - - /* QR-RH */ -/* case TASK_TTMQR: cl_zttmqr_restrict_where( where ); break; */ -/* case TASK_TTQRT: cl_zttqrt_restrict_where( where ); break; */ + case TASK_TPMQRT: cl_ztpmqrt_restrict_where( where ); break; /* LQ */ case TASK_GELQT: cl_zgelqt_restrict_where( where ); break; + case TASK_TPLQT: cl_ztplqt_restrict_where( where ); break; case TASK_UNMLQ: cl_zunmlq_restrict_where( where ); break; - case TASK_TSMLQ: cl_ztsmlq_restrict_where( where ); break; - case TASK_TSLQT: cl_ztslqt_restrict_where( where ); break; - - /* LQ-RH */ -/* case TASK_TTMLQ: cl_zttmlq_restrict_where( where ); break; */ -/* case TASK_TTLQT: cl_zttlqt_restrict_where( where ); break; */ + case TASK_TPMLQT: cl_ztpmlqt_restrict_where( where ); break; default: return; @@ -167,23 +151,15 @@ void RUNTIME_zlocality_allrestore( ) /* QR */ cl_zgeqrt_restore_where(); - cl_ztsqrt_restore_where(); + cl_ztpqrt_restore_where(); cl_zunmqr_restore_where(); - cl_ztsmqr_restore_where(); - - /* QR-RH */ -/* cl_zttqrt_restore_where(); */ -/* cl_zttmqr_restore_where(); */ + cl_ztpmqrt_restore_where(); /* LQ */ cl_zgelqt_restore_where(); - cl_ztslqt_restore_where(); + cl_ztplqt_restore_where(); cl_zunmlq_restore_where(); - cl_ztsmlq_restore_where(); - - /* LQ-RH */ -/* cl_zttlqt_restore_where(); */ -/* cl_zttmlq_restore_where(); */ + cl_ztpmlqt_restore_where(); } @@ -221,23 +197,15 @@ void RUNTIME_zlocality_onerestore( cham_tasktype_t kernel ) /* QR */ case TASK_GEQRT: cl_zgeqrt_restore_where(); break; + case TASK_TPQRT: cl_ztpqrt_restore_where(); break; case TASK_UNMQR: cl_zunmqr_restore_where(); break; - case TASK_TSMQR: cl_ztsmqr_restore_where(); break; - case TASK_TSQRT: cl_ztsqrt_restore_where(); break; - - /* QR-RH */ -/* case TASK_TTMQR: cl_zttmqr_restore_where(); break; */ -/* case TASK_TTQRT: cl_zttqrt_restore_where(); break; */ + case TASK_TPMQRT: cl_ztpmqrt_restore_where(); break; /* LQ */ case 
TASK_GELQT: cl_zgelqt_restore_where(); break; + case TASK_TPLQT: cl_ztplqt_restore_where(); break; case TASK_UNMLQ: cl_zunmlq_restore_where(); break; - case TASK_TSMLQ: cl_ztsmlq_restore_where(); break; - case TASK_TSLQT: cl_ztslqt_restore_where(); break; - - /* LQ-RH */ -/* case TASK_TTMLQ: cl_zttmlq_restore_where(); break; */ -/* case TASK_TTLQT: cl_zttlqt_restore_where(); break; */ + case TASK_TPMLQT: cl_ztpmlqt_restore_where(); break; default: return; diff --git a/runtime/starpu/control/runtime_zprofiling.c b/runtime/starpu/control/runtime_zprofiling.c index 82af2b32836b8e3d6400f17cadd86b8f4f02de00..848746b7f5e4a9c64f060d06393983382907e0f1 100644 --- a/runtime/starpu/control/runtime_zprofiling.c +++ b/runtime/starpu/control/runtime_zprofiling.c @@ -15,7 +15,7 @@ * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede - * @date 2011-06-01 + * @date 2018-11-08 * @precisions normal z -> s d c * */ @@ -43,21 +43,18 @@ void RUNTIME_zdisplay_allprofile() profiling_display_zgelqt_info(); profiling_display_zgeqrt_info(); profiling_display_zgessm_info(); - profiling_display_zgetrf_info(); profiling_display_zgetrf_incpiv_info(); + profiling_display_zgetrf_info(); profiling_display_zgetrf_nopiv_info(); profiling_display_zlauum_info(); profiling_display_zpotrf_info(); profiling_display_zssssm_info(); + profiling_display_ztplqt_info(); + profiling_display_ztpmlqt_info(); + profiling_display_ztpmqrt_info(); + profiling_display_ztpqrt_info(); profiling_display_ztrtri_info(); - profiling_display_ztslqt_info(); - profiling_display_ztsmqr_info(); - profiling_display_ztsqrt_info(); profiling_display_ztstrf_info(); - profiling_display_zttlqt_info(); - profiling_display_zttmlq_info(); - profiling_display_zttmqr_info(); - profiling_display_zttqrt_info(); profiling_display_zunmlq_info(); profiling_display_zunmqr_info(); @@ -78,7 +75,7 @@ void RUNTIME_zdisplay_oneprofile( cham_tasktype_t kernel ) case TASK_SYMM: profiling_display_zsymm_info(); break; case 
TASK_SYR2K: profiling_display_zsyr2k_info(); break; case TASK_SYRK: profiling_display_zsyrk_info(); break; - case TASK_TRMM: profiling_display_ztrmm_info(); break; + case TASK_TRMM: profiling_display_ztrmm_info(); break; case TASK_TRSM: profiling_display_ztrsm_info(); break; /* Lapack */ @@ -92,14 +89,13 @@ void RUNTIME_zdisplay_oneprofile( cham_tasktype_t kernel ) case TASK_POTRF: profiling_display_zpotrf_info(); break; case TASK_SSSSM: profiling_display_zssssm_info(); break; case TASK_TRTRI: profiling_display_ztrtri_info(); break; - case TASK_TSLQT: profiling_display_ztslqt_info(); break; - case TASK_TSMQR: profiling_display_ztsmqr_info(); break; - case TASK_TSQRT: profiling_display_ztsqrt_info(); break; case TASK_TSTRF: profiling_display_ztstrf_info(); break; - case TASK_TTLQT: profiling_display_zttlqt_info(); break; - case TASK_TTMLQ: profiling_display_zttmlq_info(); break; - case TASK_TTMQR: profiling_display_zttmqr_info(); break; - case TASK_TTQRT: profiling_display_zttqrt_info(); break; + + case TASK_TPLQT: profiling_display_ztplqt_info(); break; + case TASK_TPMLQT: profiling_display_ztpmlqt_info(); break; + case TASK_TPMQRT: profiling_display_ztpmqrt_info(); break; + case TASK_TPQRT: profiling_display_ztpqrt_info(); break; + case TASK_UNMLQ: profiling_display_zunmlq_info(); break; case TASK_UNMQR: profiling_display_zunmqr_info(); break; diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 30b5f282db608afbb653daadf4b2c5159d2142be..509abacfc9a3dd9c2fd09729f8a7e7a351778476 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -15,7 +15,7 @@ * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede - * @date 2011-06-01 + * @date 2018-11-08 * @precisions normal z -> c d s * */ @@ -78,17 +78,9 @@ ZCODELETS_HEADER(tplqt) ZCODELETS_HEADER(tpqrt) ZCODELETS_HEADER(tpmlqt) ZCODELETS_HEADER(tpmqrt) -ZCODELETS_HEADER(tslqt) 
-ZCODELETS_HEADER(tsmlq) -ZCODELETS_HEADER(tsmqr) ZCODELETS_HEADER(tsmlq_hetra1) ZCODELETS_HEADER(tsmqr_hetra1) -ZCODELETS_HEADER(tsqrt) ZCODELETS_HEADER(tstrf) -ZCODELETS_HEADER(ttlqt) -ZCODELETS_HEADER(ttmlq) -ZCODELETS_HEADER(ttmqr) -ZCODELETS_HEADER(ttqrt) ZCODELETS_HEADER(unmlq) ZCODELETS_HEADER(unmqr)