diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 64552d74636c4919a6283f4a856a10eb5c2f05f2..99152bc04575e72ab83819df54472161b57406f6 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -118,7 +118,7 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, RUNTIME_option_t *options ) { int m, h; - int tempkm, tempkn, minmn; + int tempkm, tempkn, tempmm, minmn; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -133,14 +133,15 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, for (h=0; h<=minmn; h++){ INSERT_TASK_zgetrf_percol_diag( options, - h, k * A->mb, + tempkm, tempkn, h, k * A->mb, A(k, k), ipiv ); for (m = k+1; m < A->mt; m++) { + tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_percol_offdiag( options, - h, m * A->mb, + tempmm, tempkn, h, m * A->mb, A(m, k), ipiv ); } @@ -164,7 +165,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, RUNTIME_option_t *options ) { int m, h, b, nbblock; - int tempkm, tempkn, minmn; + int tempkm, tempkn, tempmm, minmn; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -185,14 +186,15 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, INSERT_TASK_zgetrf_blocked_diag( options, - j, k * A->mb, ws->ib, + tempkm, tempkn, j, k * A->mb, ws->ib, A(k, k), Up(k, k), ipiv ); for (m = k+1; m < A->mt; m++) { + tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_blocked_offdiag( options, - j, m * A->mb, ws->ib, + tempmm, tempkn, j, m * A->mb, ws->ib, A(m, k), Up(k, k), ipiv ); } diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 3d2d6de0dda97e18c0ab12a91dc680412372c610..eb855ec34be8d002a9d99ab63276b32c722edb4b 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -508,23 +508,23 @@ void INSERT_TASK_zgetrf_nopiv_percol_trsm( const RUNTIME_option_t *options, const CHAM_desc_t *U, int Um, int Un ); void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, - int h, int m0, + int m, int n, int h, int m0, CHAM_desc_t *A, int Am, int An, CHAM_ipiv_t *ws ); void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, - int h, int m0, + int m, int n, int h, int m0, CHAM_desc_t *A, int Am, int An, CHAM_ipiv_t *ws ); void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, - int h, int m0, int ib, + int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ws ); void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, - int h, int m0, int ib, + int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ws ); diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c index 8df527007f8d004fdc73019de014e2361b6954a0..2c6daa18d9bda1f7ff433305aa98ad77f648b4b5 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c +++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c @@ -29,7 +29,7 @@ CHAMELEON_CL_CB( zgetrf_blocked_trsm, cti_handle_get_m(task->handles[0]), 0, #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) { - int h, m0, ib; + int m, n, h, m0, ib; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; CHAM_tile_t *tileA; @@ -40,7 +40,7 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *U = NULL; int ldu = -1;; - starpu_codelet_unpack_args( cl_arg, &h, &m0, &ib, + starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib, &sequence, &request ); tileA = cti_interface_get(descr[0]); @@ -67,7 +67,7 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) nextpiv->h = h; nextpiv->has_diag = 1; - CORE_zgetrf_panel_diag( tileA->m, tileA->n, h, m0, ib, + CORE_zgetrf_panel_diag( m, n, h, m0, ib, CHAM_tile_get_ptr( tileA ), tileA->ld, U, ldu, ipiv, &(nextpiv->pivot), &(prevpiv->pivot) ); @@ -87,7 +87,7 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU( zgetrf_blocked_diag, cl_zgetrf_blocked_diag_cpu_func ); void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, - int h, int m0, int ib, + int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) @@ -123,6 +123,8 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, rt_starpu_insert_task( codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &h, sizeof(int), STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, &ib, sizeof(int), @@ -146,7 +148,7 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) { - int h, m0, ib; + int m, n, h, m0, ib; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; CHAM_tile_t *tileA; @@ -156,7 +158,7 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *U = NULL; int ldu = -1;; - starpu_codelet_unpack_args( cl_arg, &h, &m0, &ib, &sequence, &request ); + starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib, &sequence, &request ); tileA = cti_interface_get(descr[0]); nextpiv = (cppi_interface_t*) descr[1]; @@ -169,7 +171,7 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) nextpiv->h = h; /* Initialize in case it uses a copy */ - CORE_zgetrf_panel_offdiag( tileA->m, tileA->n, h, m0, ib, + CORE_zgetrf_panel_offdiag( m, n, h, m0, ib, CHAM_tile_get_ptr(tileA), tileA->ld, U, ldu, &(nextpiv->pivot), &(prevpiv->pivot) ); @@ -182,7 +184,7 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU(zgetrf_blocked_offdiag, cl_zgetrf_blocked_offdiag_cpu_func) void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, - int h, int m0, int ib, + int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) @@ -206,6 +208,8 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, rt_starpu_insert_task( codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &h, sizeof(int), STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, &ib, sizeof(int), diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c index 104eee81e51ab032f78534e6b37f9b1340c23f96..5d3f83b6ce046a72135c8f513c8cc23822159595 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_percol.c +++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c @@ -28,7 +28,7 @@ CHAMELEON_CL_CB( zgetrf_percol_offdiag, cti_handle_get_m(task->handles[0]), 0, 0 #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) { - int h, m0; + int m, n, h, m0; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; CHAM_tile_t *tileA; @@ -36,8 +36,7 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) cppi_interface_t *nextpiv; cppi_interface_t *prevpiv; - starpu_codelet_unpack_args( cl_arg, &h, &m0, - &sequence, &request ); + starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &sequence, &request ); tileA = cti_interface_get(descr[0]); ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]); @@ -58,7 +57,7 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) nextpiv->h = h; nextpiv->has_diag = 1; - CORE_zgetrf_panel_diag( tileA->m, tileA->n, h, m0, tileA->n, + CORE_zgetrf_panel_diag( m, n, h, m0, tileA->n, CHAM_tile_get_ptr( tileA ), tileA->ld, NULL, -1, ipiv, &(nextpiv->pivot), &(prevpiv->pivot) ); @@ -66,7 +65,7 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) if ( h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv after call: " ); } - if ( h < tileA->n ) { + if ( h < n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv after call: " ); } } @@ -78,7 +77,7 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU( zgetrf_percol_diag, cl_zgetrf_percol_diag_cpu_func ); void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, - int h, int m0, + int m, int n, int h, int m0, CHAM_desc_t *A, int Am, int An, CHAM_ipiv_t *ipiv ) { @@ -101,6 +100,8 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, rt_starpu_insert_task( codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &h, sizeof(int), STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), @@ -122,14 +123,14 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg) { - int h, m0; + int m, n, h, m0; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; CHAM_tile_t *tileA; cppi_interface_t *nextpiv; cppi_interface_t *prevpiv; - starpu_codelet_unpack_args( cl_arg, &h, &m0, &sequence, &request ); + starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &sequence, &request ); tileA = cti_interface_get(descr[0]); nextpiv = (cppi_interface_t*) descr[1]; @@ -137,7 +138,7 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg) nextpiv->h = h; /* Initialize in case it uses a copy */ - CORE_zgetrf_panel_offdiag( tileA->m, tileA->n, h, m0, tileA->n, + CORE_zgetrf_panel_offdiag( m, n, h, m0, tileA->n, CHAM_tile_get_ptr(tileA), tileA->ld, NULL, -1, &(nextpiv->pivot), &(prevpiv->pivot) ); @@ -150,7 +151,7 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU(zgetrf_percol_offdiag, cl_zgetrf_percol_offdiag_cpu_func) void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, - int h, int m0, + int m, int n, int h, int m0, CHAM_desc_t *A, int Am, int An, CHAM_ipiv_t *ipiv ) { @@ -170,6 +171,8 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, rt_starpu_insert_task( codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &h, sizeof(int), STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t *),