diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 165801efb0e1bf4cfc5f5f36edf33def54063797..1d93acc5ed3ec411fea4396c85012689e6560ee3 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -24,6 +24,50 @@ #define A(m,n) A, m, n #define U(m,n) &(ws->U), m, n +#define IPIV(m) IPIV, m, 1 + +/* + * Static variable to know how to handle the data within the kernel + * This assumes that only one runtime is enabled at a time. + */ +static RUNTIME_id_t zgetrf_runtime_id = RUNTIME_SCHED_STARPU; + +static inline int +zgetrf_ipiv_init( const CHAM_desc_t *descIPIV, + cham_uplo_t uplo, int m, int n, + CHAM_tile_t *tileIPIV, void *op_args ) +{ + int *IPIV; + (void)op_args; + + if ( zgetrf_runtime_id == RUNTIME_SCHED_PARSEC ) { + IPIV = (int*)tileIPIV; + } + else { + IPIV = CHAM_tile_get_ptr( tileIPIV ); + } + +#if !defined(CHAMELEON_SIMULATION) + { + int tempmm = m == descIPIV->mt-1 ? descIPIV->m - m * descIPIV->mb : descIPIV->mb; + int i; + + for( i=0; i<tempmm; i++ ) { + IPIV[i] = m * descIPIV->mb + i + 1; + } + } +#endif + + return 0; +} + +static inline void +chameleon_pzgetrf_ipiv_init( CHAM_desc_t *IPIV, + RUNTIME_sequence_t *sequence, + RUNTIME_request_t *request ) +{ + chameleon_pmap( ChamW, ChamUpperLower, IPIV, zgetrf_ipiv_init, NULL, sequence, request ); +} /* * All the functions below are panel factorization variant. @@ -183,6 +227,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, */ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, + CHAM_desc_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { @@ -198,6 +243,9 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } RUNTIME_options_init( &options, chamctxt, sequence, request ); + /* Initialize IPIV */ + chameleon_pzgetrf_ipiv_init( IPIV, sequence, request ); + for (k = 0; k < min_mnt; k++) { RUNTIME_iteration_push( chamctxt, k ); diff --git a/compute/zgetrf.c b/compute/zgetrf.c index bcb8ee0c8c560863cf27248f180ce6af0d11e628..72c595373df2f4c95bf844597993b1c7ed8aec38 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -167,7 +167,7 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) * */ int -CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int *IPIV, int LDA ) +CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) { int NB; int status; @@ -271,7 +271,7 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int *IPIV, int LDA ) * */ int -CHAMELEON_zgetrf_Tile( CHAM_desc_t *A ) +CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_desc_t *IPIV ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; @@ -287,8 +287,7 @@ CHAMELEON_zgetrf_Tile( CHAM_desc_t *A ) chameleon_sequence_create( chamctxt, &sequence ); ws = CHAMELEON_zgetrf_WS_Alloc( A ); - CHAMELEON_zgetrf_Tile_Async( A, ws, sequence, &request ); - + CHAMELEON_zgetrf_Tile_Async( A, IPIV, ws, sequence, &request ); CHAMELEON_Desc_Flush( A, sequence ); chameleon_sequence_wait( chamctxt, sequence ); @@ -334,6 +333,7 @@ CHAMELEON_zgetrf_Tile( CHAM_desc_t *A ) */ int CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, + CHAM_desc_t *IPIV, void *user_ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) @@ -375,12 +375,24 @@ CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, chameleon_error( "CHAMELEON_zgetrf_Tile", "invalid first descriptor" ); return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); } + if ( chameleon_desc_check( IPIV ) != CHAMELEON_SUCCESS ) { + chameleon_error( "CHAMELEON_zgetrf_Tile", "invalid second descriptor" ); + return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); + } /* Check input arguments */ if ( A->nb != A->mb ) { chameleon_error( "CHAMELEON_zgetrf_Tile", "only square tiles supported" ); return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); } + if ( IPIV->mb != A->mb ) { + chameleon_error( "CHAMELEON_zgetrf_Tile", "IPIV tiles must have the number of rows as tiles of A" ); + return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); + } + if ( IPIV->nb != 1 ) { + chameleon_error( "CHAMELEON_zgetrf_Tile", "IPIV tiles must be vectore with only one column per tile" ); + return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); + } if ( user_ws == NULL ) { ws = CHAMELEON_zgetrf_WS_Alloc( A ); @@ -389,7 +401,7 @@ CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, ws = user_ws; } - chameleon_pzgetrf( user_ws, A, sequence, request ); + chameleon_pzgetrf( user_ws, A, IPIV, sequence, request ); if ( user_ws == NULL ) { CHAMELEON_Desc_Flush( A, sequence ); diff --git a/control/compute_z.h b/control/compute_z.h index 1dd9b13055672181c935ebf67fe8ff1db6b0199e..634bd2d5cf8088a8cab84724a9c1d82498d70610 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -87,7 +87,7 @@ void chameleon_pzgepdf_qdwh( cham_mtxtype_t trans, CHAM_desc_t *descU, CHAM_desc void chameleon_pzgepdf_qr( int genD, int doqr, int optid, const libhqr_tree_t *qrtreeT, const libhqr_tree_t *qrtreeB, CHAM_desc_t *A1, CHAM_desc_t *TS1, CHAM_desc_t *TT1, CHAM_desc_t *D1, CHAM_desc_t *Q1, CHAM_desc_t *A2, CHAM_desc_t *TS2, CHAM_desc_t *TT2, CHAM_desc_t *D2, CHAM_desc_t *Q2, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_desc_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgetrf_incpiv(CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgetrf_reclap(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/control/descriptor.c b/control/descriptor.c index 8c054d98d43b96f9e15c864452c933d3eec00762..1005a04cff833e3b38bb8134cfa5b47ebef40fd5 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -376,7 +376,8 @@ int chameleon_desc_check(const CHAM_desc_t *desc) chameleon_error("chameleon_desc_check", "NULL matrix pointer"); return CHAMELEON_ERR_UNALLOCATED; } - if (desc->dtyp != ChamRealFloat && + if (desc->dtyp != ChamInteger && + desc->dtyp != ChamRealFloat && desc->dtyp != ChamRealDouble && desc->dtyp != ChamComplexFloat && desc->dtyp != ChamComplexDouble ) { diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 1f04c61aad249f13d4653e94886eba891236da35..da5a5bc651ad2e4803f2c05425ad923bb58c8daf 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -53,7 +53,7 @@ int CHAMELEON_zgesvd(cham_job_t jobu, cham_job_t jobvt, int M, int N, CHAMELEON_ //int CHAMELEON_zgetrf(int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV); int CHAMELEON_zgetrf_incpiv(int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descL, int *IPIV); int CHAMELEON_zgetrf_nopiv(int M, int N, CHAMELEON_Complex64_t *A, int LDA); -int CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA ); +int CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ); //int CHAMELEON_zgetri(int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV); //int CHAMELEON_zgetrs(cham_trans_t trans, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgetrs_incpiv(cham_trans_t trans, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descL, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); @@ -134,7 +134,7 @@ int CHAMELEON_zgesvd_Tile(cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, dou //int CHAMELEON_zgetrf_Tile(CHAM_desc_t *A, int *IPIV); int CHAMELEON_zgetrf_incpiv_Tile(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV); int CHAMELEON_zgetrf_nopiv_Tile(CHAM_desc_t *A); -int CHAMELEON_zgetrf_Tile( CHAM_desc_t *A ); +int CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_desc_t *IPIV ); //int CHAMELEON_zgetri_Tile(CHAM_desc_t *A, int *IPIV); //int CHAMELEON_zgetrs_Tile(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B); int CHAMELEON_zgetrs_incpiv_Tile(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B); @@ -211,7 +211,7 @@ int CHAMELEON_zgesvd_Tile_Async(cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t * //int CHAMELEON_zgetrf_Tile_Async(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrf_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrf_nopiv_Tile_Async(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -int CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +int CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *IPIV, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); //int CHAMELEON_zgetri_Tile_Async(CHAM_desc_t *A, int *IPIV, CHAM_desc_t *W, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); //int CHAMELEON_zgetrs_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/testing/testing_zgetrf.c b/testing/testing_zgetrf.c index b41ee056dfbcdfb89590841041c11014b9f296d6..5657d49b6435619644d70292096978e5d6ec2521 100644 --- a/testing/testing_zgetrf.c +++ b/testing/testing_zgetrf.c @@ -38,12 +38,13 @@ testing_zgetrf_desc( run_arg_list_t *args, int check ) int N = run_arg_get_int( args, "N", 1000 ); int M = run_arg_get_int( args, "M", N ); int LDA = run_arg_get_int( args, "LDA", M ); - int seedA = run_arg_get_int( args, "seedA", random() ); - cham_diag_t diag = run_arg_get_diag( args, "diag", ChamNonUnit ); + int seedA = run_arg_get_int( args, "seedA", testing_ialea() ); + cham_diag_t diag = run_arg_get_diag( args, "diag", ChamUnit ); int Q = parameters_compute_q( P ); + int minMN = chameleon_min( M, N ); /* Descriptors */ - CHAM_desc_t *descA; + CHAM_desc_t *descA, *descIPIV; void *ws = NULL; CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); @@ -51,11 +52,13 @@ testing_zgetrf_desc( run_arg_list_t *args, int check ) /* Creates the matrices */ CHAMELEON_Desc_Create( &descA, (void*)(-mtxfmt), ChamComplexDouble, nb, nb, nb * nb, LDA, N, 0, 0, M, N, P, Q ); + CHAMELEON_Desc_Create( + &descIPIV, CHAMELEON_MAT_ALLOC_TILE, ChamInteger, nb, 1, nb, minMN, 1, 0, 0, minMN, 1, P, Q ); /* Fills the matrix with random values */ if ( diag == ChamUnit ) { - CHAMELEON_zplgtr_Tile( 0, ChamUpper, descA, seedA ); - CHAMELEON_zplgtr_Tile( 0, ChamLower, descA, seedA+1 ); + CHAMELEON_zplgtr_Tile( 0, ChamUpper, descA, seedA ); + CHAMELEON_zplgtr_Tile( minMN, ChamLower, descA, seedA+1 ); } else { CHAMELEON_zplrnt_Tile( descA, seedA ); @@ -68,11 +71,12 @@ testing_zgetrf_desc( run_arg_list_t *args, int check ) /* Calculates the solution */ testing_start( &test_data ); if ( async ) { - hres = CHAMELEON_zgetrf_Tile_Async( descA, ws, test_data.sequence, &test_data.request ); + hres = CHAMELEON_zgetrf_Tile_Async( descA, descIPIV, ws, test_data.sequence, &test_data.request ); CHAMELEON_Desc_Flush( descA, test_data.sequence ); + CHAMELEON_Desc_Flush( descIPIV, test_data.sequence ); } else { - hres = CHAMELEON_zgetrf_Tile( descA ); + hres = CHAMELEON_zgetrf_Tile( descA, descIPIV ); } test_data.hres = hres; testing_stop( &test_data, flops_zgetrf( M, N ) ); @@ -80,40 +84,41 @@ testing_zgetrf_desc( run_arg_list_t *args, int check ) /* Checks the factorization and residual */ #if !defined(CHAMELEON_SIMULATION) if ( check ) { - CHAM_desc_t *descA0c; - CHAM_desc_t *descA0 = CHAMELEON_Desc_Copy( descA, NULL ); + CHAM_desc_t *descA0c, *descIPIVc; + CHAM_desc_t *descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE ); + int *ipiv; - /* Create A0c as local to rank 0 on all nodes */ + /* Create A0c as local to rank 0 on all nodes to gather the matrix */ CHAMELEON_Desc_Create_User( &descA0c, (void*)CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, nb, nb, nb*nb, M, N, 0, 0, M, N, 1, 1, chameleon_getaddr_cm, chameleon_getblkldd_cm, NULL ); + CHAMELEON_Desc_Create_User( + &descIPIVc, (void*)CHAMELEON_MAT_ALLOC_GLOBAL, ChamInteger, + nb, 1, nb, M, 1, 0, 0, M, 1, 1, 1, + chameleon_getaddr_cm, chameleon_getblkldd_cm, NULL ); if ( diag == ChamUnit ) { - CHAMELEON_zplgtr_Tile( 0, ChamUpper, descA0, seedA ); - CHAMELEON_zplgtr_Tile( 0, ChamLower, descA0, seedA+1 ); + CHAMELEON_zplgtr_Tile( 0, ChamUpper, descA0c, seedA ); + CHAMELEON_zplgtr_Tile( minMN, ChamLower, descA0c, seedA+1 ); } else { - CHAMELEON_zplrnt_Tile( descA0, seedA ); + CHAMELEON_zplrnt_Tile( descA0c, seedA ); } -#if 0 + /* Cheat code: float (s) is the same size as int */ + CHAMELEON_slacpy_Tile( ChamUpperLower, descIPIV, descIPIVc ); + ipiv = descIPIVc->mat; + /* Compute the permutation of A0: P * A0 */ if ( CHAMELEON_Comm_rank() == 0 ) { - int i, j; - int *ipiv = (int *)calloc( M, sizeof(int) ); - - for (i = 0; i < M; i++) { - ipiv[i] = i+1; - } - LAPACKE_zlaswp( LAPACK_COL_MAJOR, N, descA0c->mat, M, 1, M, ipiv, 1 ); - free( ipiv ); } CHAMELEON_zlacpy_Tile( ChamUpperLower, descA0c, descA0 ); -#endif CHAMELEON_Desc_Destroy( &descA0c ); + CHAMELEON_Desc_Destroy( &descIPIVc ); + hres += check_zxxtrf( args, ChamGeneral, ChamUpperLower, descA0, descA ); @@ -126,6 +131,7 @@ testing_zgetrf_desc( run_arg_list_t *args, int check ) } CHAMELEON_Desc_Destroy( &descA ); + CHAMELEON_Desc_Destroy( &descIPIV ); return hres; }