diff --git a/include/chameleon/flops.h b/include/chameleon/flops.h index dacb47113a618dfaf49ab20f9133d4cfb19da720..0635491bfbdc3c93046b5bd9fa13b418ac8b728d 100644 --- a/include/chameleon/flops.h +++ b/include/chameleon/flops.h @@ -71,10 +71,12 @@ #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.)) #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.)) - #define FMULS_TRMM(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FMULS_TRMM_2((__m), (__n)) : FMULS_TRMM_2((__n), (__m)) ) #define FADDS_TRMM(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FADDS_TRMM_2((__m), (__n)) : FADDS_TRMM_2((__n), (__m)) ) +#define FMULS_TRSM_UNIT_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.)) /* FMULS_TRMM_2 with (m-1) in place of (m+1) */ +#define FMULS_TRSM_UNIT(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FMULS_TRSM_UNIT_2((__m), (__n)) : FMULS_TRSM_UNIT_2((__n), (__m)) ) + #define FMULS_TRSM FMULS_TRMM #define FADDS_TRSM FADDS_TRMM @@ -236,6 +238,11 @@ static inline double flops_ctrsm( cham_side_t __side, double __m, double __n) { static inline double flops_dtrsm( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } static inline double flops_strsm( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } +static inline double flops_ztrsm_unit( cham_side_t __side, double __m, double __n) { double flops = (6. * FMULS_TRSM_UNIT(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ); return flops; } /* complex variants weight multiplications by 6 and additions by 2 */ +static inline double flops_ctrsm_unit( cham_side_t __side, double __m, double __n) { double flops = (6. 
* FMULS_TRSM_UNIT(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ); return flops; } +static inline double flops_dtrsm_unit( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM_UNIT(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } +static inline double flops_strsm_unit( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM_UNIT(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } + /* * Lapack */ @@ -347,10 +354,68 @@ static inline double flops_cgebrd( double __m, double __n) { double flops = (6. static inline double flops_dgebrd( double __m, double __n) { double flops = ( FMULS_GEBRD((__m), (__n)) + FADDS_GEBRD((__m), (__n)) ); return flops; } static inline double flops_sgebrd( double __m, double __n) { double flops = ( FMULS_GEBRD((__m), (__n)) + FADDS_GEBRD((__m), (__n)) ); return flops; } +static inline double flops_zscal( double __m ) { double flops = (6. * (double)(__m)); return flops; } +static inline double flops_cscal( double __m ) { double flops = (6. 
* (double)(__m)); return flops; } +static inline double flops_dscal( double __m ) { double flops = ( (double)(__m)); return flops; } +static inline double flops_sscal( double __m ) { double flops = ( (double)(__m)); return flops; } + /* * Norms */ #define FMULS_LANGE(__m, __n) ((double)(__m) * (double)(__n)) #define FADDS_LANGE(__m, __n) ((double)(__m) * (double)(__n)) +/* + * Getrf with partial pivoting + */ +#define FLOPS_GETRF_BLOCKED_OFFDIAG( _prec_ ) /* emits flops_Xgetrf_blocked_offdiag() for precision X: 0 when h == 0, else one scal on m plus one gemm( m, nn, kk ) */ \ + static inline double flops_##_prec_##getrf_blocked_offdiag( int m, int n, int h, int ib ) \ + { \ + double flops = 0.; \ + int kk, nn; \ + if ( h == 0 ) { \ + return 0.; \ + } \ + /* scal */ \ + flops += flops_##_prec_##scal( m ); \ + /* blas 3 gemm */ \ + if ( h % ib == 0 ) { \ + kk = ib; \ + nn = n - h; \ + } \ + /* blas 2 geru */ \ + else { \ + kk = 1; \ + nn = ib - h % ib; \ + } \ + flops += flops_##_prec_##gemm( m, nn, kk ); \ + return flops; \ + } + +FLOPS_GETRF_BLOCKED_OFFDIAG( z ) +FLOPS_GETRF_BLOCKED_OFFDIAG( c ) +FLOPS_GETRF_BLOCKED_OFFDIAG( d ) +FLOPS_GETRF_BLOCKED_OFFDIAG( s ) + +/* +1 for the 1/pivot */ +static inline double flops_zgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_zgetrf_blocked_offdiag( m-h, n, h, ib ) + 1. * 6.; } +static inline double flops_cgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_cgetrf_blocked_offdiag( m-h, n, h, ib ) + 1. * 6.; } +static inline double flops_dgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_dgetrf_blocked_offdiag( m-h, n, h, ib ) + 1.; } +static inline double flops_sgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_sgetrf_blocked_offdiag( m-h, n, h, ib ) + 1.; } + +static inline double flops_zgetrf_percol_diag( int m, int n, int h ){ return flops_zgetrf_blocked_offdiag( m-h, n, h, n ) + 1. * 6.; } /* percol variants pass n as the block size ib */ +static inline double flops_cgetrf_percol_diag( int m, int n, int h ){ return flops_cgetrf_blocked_offdiag( m-h, n, h, n ) + 1. 
* 6.; } +static inline double flops_dgetrf_percol_diag( int m, int n, int h ){ return flops_dgetrf_blocked_offdiag( m-h, n, h, n ) + 1.; } +static inline double flops_sgetrf_percol_diag( int m, int n, int h ){ return flops_sgetrf_blocked_offdiag( m-h, n, h, n ) + 1.; } + +static inline double flops_zgetrf_percol_offdiag( int m, int n, int h ){ return flops_zgetrf_blocked_offdiag( m, n, h, n ); } +static inline double flops_cgetrf_percol_offdiag( int m, int n, int h ){ return flops_cgetrf_blocked_offdiag( m, n, h, n ); } +static inline double flops_dgetrf_percol_offdiag( int m, int n, int h ){ return flops_dgetrf_blocked_offdiag( m, n, h, n ); } +static inline double flops_sgetrf_percol_offdiag( int m, int n, int h ){ return flops_sgetrf_blocked_offdiag( m, n, h, n ); } + +static inline double flops_zgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_ztrsm_unit( ChamLeft, ib, n-h ) : 0.; } /* m is unused; 0 when no column remains after h */ +static inline double flops_cgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_ctrsm_unit( ChamLeft, ib, n-h ) : 0.; } +static inline double flops_dgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_dtrsm_unit( ChamLeft, ib, n-h ) : 0.; } +static inline double flops_sgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? 
flops_strsm_unit( ChamLeft, ib, n-h ) : 0.; } + #endif /* _flops_h_ */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c index 40a1b443c0bf1218952bba9e18571dfb08235db2..0ff4ed9854228109928e30ae4b34013338a32a5c 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_batched.c +++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c @@ -36,6 +36,29 @@ struct cl_zgetrf_batched_args_s { struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE]; }; +static inline double flops_zgetrf_percol_batched( int *m, int *n, int h, int t ) /* sum of flops_zgetrf_percol_offdiag over the t batched tiles */ +{ + double flops = 0.; + int k; + for ( k = 0; k < t; k ++ ) { + flops += flops_zgetrf_percol_offdiag( m[k], n[k], h ); + } + return flops; +} + +static inline double flops_zgetrf_blocked_batched( int *m, int *n, int h, int ib, int d, int t ) /* flops of t batched tiles; tile 0 is counted as the diagonal tile when d == 1 */ +{ + double flops = 0.; + int k; + if ( d == 1 ) { + flops += flops_zgetrf_blocked_diag( m[0], n[0], h, ib ); /* flops_zgetrf_blocked_diag removes the h rows itself */ + } + for ( k = d; k < t; k ++ ) { + flops += flops_zgetrf_blocked_offdiag( m[k], n[k], h, ib ); /* full tile height, as in INSERT_TASK_zgetrf_blocked_offdiag */ + } + return flops; +} + #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[], @@ -201,7 +224,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); /* Flops */ - // task->flops = TODO; + task->flops = flops_zgetrf_percol_batched( myclargs->m, myclargs->n, myclargs->h, myclargs->tasks_nbr ); ret = starpu_task_submit( task ); if ( ret == -ENODEV ) { @@ -482,7 +505,8 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); /* Flops */ - // task->flops = TODO; + task->flops = flops_zgetrf_blocked_batched( myclargs->m, myclargs->n, myclargs->h, myclargs->ib, + myclargs->diag, myclargs->tasks_nbr ); ret = starpu_task_submit( task ); if ( ret == -ENODEV ) { diff --git 
a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c index 63ccf116a5ac1b1871d8a9c7aad78dc98af3bca3..f1df48f3cc7c3b6d460f859bc064e841bd4f5dc7 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c +++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c @@ -245,7 +245,7 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_diag_callback ); /* Flops */ - // task->flops = TODO; + task->flops = flops_zgetrf_blocked_diag( m, n, h, ib ); /* Refine name */ task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); @@ -450,7 +450,7 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_offdiag_callback ); /* Flops */ - // task->flops = TODO; + task->flops = flops_zgetrf_blocked_offdiag( m, n, h, ib ); /* Refine name */ task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); @@ -596,7 +596,7 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_trsm_callback ); /* Flops */ - // task->flops = TODO; + task->flops = flops_zgetrf_trsm( m, n, h, ib ); /* Refine name */ task->name = chameleon_codelet_name( cl_name, 1, U->get_blktile( U, Um, Un ) ); diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c index 8e6f541a0a6e4e2c31339dd235f2a9b260dbd58f..9a0ec048b78b68569974267edb5c62aa97ce65d2 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_percol.c +++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c @@ -193,7 +193,7 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_percol_diag_callback ); /* Flops */ - // task->flops = TODO; + task->flops 
= flops_zgetrf_percol_diag( m, n, h ); /* Refine name */ task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); @@ -343,7 +343,7 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_percol_offdiag_callback ); /* Flops */ - // task->flops = TODO; + task->flops = flops_zgetrf_percol_offdiag( m, n, h ); /* Refine name */ task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );