@c -*-texinfo-*-

@c This file is part of the MORSE Handbook.
@c Copyright (C) 2014 Inria
@c Copyright (C) 2014 The University of Tennessee
@c Copyright (C) 2014 King Abdullah University of Science and Technology
7
@c See the file ../chameleon.texi for copying conditions.
8 9

@menu
10 11 12
* Using CHAMELEON executables::
* Linking an external application with CHAMELEON libraries::
* CHAMELEON API::
13 14
@end menu

15 16
@node Using CHAMELEON executables
@section Using CHAMELEON executables
17

18
CHAMELEON provides several test executables that are compiled and linked with
the CHAMELEON stack of dependencies.
20
Instructions about the arguments to give to executables are accessible thanks
21
to the option @option{-[-]help} or @option{-[-]h}.
22
This set of binaries is separated into three categories and can be found in
three different directories:

@itemize @bullet

  @item example

  contains examples of API usage and more specifically the
  sub-directory lapack_to_morse/ provides a tutorial that explains how to use
  CHAMELEON functionalities starting from a full LAPACK code, see
32
@ref{Tutorial LAPACK to CHAMELEON}
33 34 35

  @item testing

36
  contains testing drivers to check numerical correctness of
37
  CHAMELEON linear algebra routines with a wide range of parameters
38 39 40 41 42
  @example
  ./testing/stesting 4 1 LANGE 600 100 700
  @end example
  The first two arguments are the numbers of cores and GPUs to use.
  The third one is the name of the algorithm to test.
43
  The other arguments depend on the algorithm; here they stand for the number
  of rows, columns and leading dimension of the problem.
45

  Name of algorithms available for testing are:
  @itemize @bullet
    @item LANGE: norms of matrices Infinite, One, Max, Frobenius
    @item GEMM: general matrix-matrix multiply
    @item HEMM: hermitian matrix-matrix multiply
    @item HERK: hermitian matrix-matrix rank k update
    @item HER2K: hermitian matrix-matrix rank 2k update
    @item SYMM: symmetric matrix-matrix multiply
    @item SYRK: symmetric matrix-matrix rank k update
    @item SYR2K: symmetric matrix-matrix rank 2k update
56
    @item PEMV: matrix-vector multiply with pentadiagonal matrix
57 58 59
    @item TRMM: triangular matrix-matrix multiply
    @item TRSM: triangular solve, multiple rhs
    @item POSV: solve linear systems with symmetric positive-definite matrix
60
    @item GESV_INCPIV: solve linear systems with general matrix
61 62 63 64 65
    @item GELS: linear least squares with general matrix
  @end itemize

  @item timing

66
  contains timing drivers to assess performances of CHAMELEON routines.
67
  There are two sets of executables, those who do not use the tile interface
68
and those who do (with _tile in the name of the executable).
69 70
  Executables without tile interface allocate data following LAPACK
conventions and these data can be given as arguments to CHAMELEON routines
as you would do with LAPACK.
72
  Executables with tile interface generate directly the data in the format
73
  CHAMELEON tile algorithms used to submit tasks to the runtime system.
74
  Executables with tile interface should be more performant because no data
copy from LAPACK matrix layout to tile matrix layout is necessary.
  Calling example:
  @example
78 79
  ./timing/time_dpotrf --n_range=1000:10000:1000 --nb=320
                       --threads=9 --gpus=3
80 81
                       --nowarmup
  @end example
82

83 84 85
  List of main options that can be used in timing:
  @itemize @bullet
    @item @option{--help}: show usage
    @item Machine parameters
    @itemize @bullet
       @item @option{-t x, --threads=x}: Number of CPU workers (default: automatic detection through runtime)
       @item @option{-g x, --gpus=x}: Number of GPU workers (default: @option{0})
       @item @option{-P x, --P=x}:  Rows (P) in the PxQ process grid (default: @option{1})
       @item @option{--nocpu}: All GPU kernels are exclusively executed on GPUs (default: @option{0})
    @end itemize
    @item Matrix parameters
    @itemize @bullet
      @item @option{-m x, --m=x, --M=x}: Dimension (M) of the matrices (default: @option{N})
      @item @option{-n x, --n=x, --N=x}: Dimension (N) of the matrices
      @item @option{-N R, --n_range=R}: Range of N values to time with R=Start:Stop:Step (default: @option{500:5000:500})
      @item @option{-k x, --k=x, --K=x, --nrhs=x}: Dimension (K) of the matrices or number of right-hand sides (default: @option{1}). This is useful for GEMM like algorithms (k is the shared dimension and must be defined >1 to consider matrices and not vectors)
      @item @option{-b x, --nb=x}: NB size. (default: @option{320})
      @item @option{-i x, --ib=x}: IB size. (default: @option{32})
    @end itemize
    @item Check/prints
    @itemize @bullet
      @item @option{--niter=x}: number of iterations performed for each test (default: @option{1})
      @item @option{-W, --nowarnings}: Do not show warnings
      @item @option{-w, --nowarmup}: Cancel the warmup run to pre-load libraries
      @item @option{-c, --check}: Check result
      @item @option{-C, --inv}: Check on inverse
      @item @option{--mode=x}: Change the xLATMS matrix mode generation for SVD/EVD (default: @option{4}). It must be between 0 and 20 included.
    @end itemize
    @item Profiling parameters
    @itemize @bullet
      @item @option{-T, --trace}: Enable trace generation
      @item @option{--progress}: Display progress indicator
      @item @option{-d, --dag}: Enable DAG generation. Generates a dot_dag_file.dot.
      @item @option{-p, --profile}: Print profiling informations
    @end itemize
    @item HQR parameters
    @itemize @bullet
      @item @option{-a x, --qr_a=x, --rhblk=x}: Define the size of the local TS trees in householder reduction trees for QR and LQ factorization. N is the size of each subdomain (default: @option{-1})
      @item @option{-l x, --llvl=x}: Tree used for low level reduction inside nodes (default: @option{-1})
      @item @option{-L x, --hlvl=x}: Tree used for high level reduction between nodes, only if P > 1 (default: @option{-1}). Possible values are -1: Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: Replicated greedy.
      @item @option{-D, --domino}: Enable the domino between upper and lower trees
    @end itemize
    @item Advanced options
    @itemize @bullet
      @item @option{--nobigmat}: Disable single large matrix allocation for multiple tiled allocations
      @item @option{-s, --sync}: Enable synchronous calls in wrapper function such as POTRI
      @item @option{-o, --ooc}: Enable out-of-core (available only with StarPU)
      @item @option{-G, --gemm3m}: Use gemm3m complex method
      @item @option{--bound}: Compare result to area bound
    @end itemize
133

134 135 136 137
  List of timing algorithms available:
  @itemize @bullet
    @item LANGE: norms of matrices
    @item GEMM: general matrix-matrix multiply
138 139
    @item TRSM: triangular solve
    @item POTRF: Cholesky factorization with a symmetric
140 141 142
positive-definite matrix
    @item POSV: solve linear systems with symmetric positive-definite matrix
    @item GETRF_NOPIV: LU factorization of a general matrix
143
using the tile LU algorithm without row pivoting
144
    @item GESV_NOPIV: solve linear system for a general matrix
145
using the tile LU algorithm without row pivoting
146
    @item GETRF_INCPIV: LU factorization of a general matrix
147
using the tile LU algorithm with partial tile pivoting with row interchanges
148 149 150 151
    @item GESV_INCPIV: solve linear system for a general matrix
using the tile LU algorithm with partial tile pivoting with row interchanges
matrix
    @item GEQRF: QR factorization of a general matrix
152
    @item GELS: solves overdetermined or underdetermined linear systems
153 154
involving a general matrix using the QR or the LQ factorization
  @end itemize
155

156 157
@end itemize

158 159
@node Linking an external application with CHAMELEON libraries
@section Linking an external application with CHAMELEON libraries
160

161
Compilation and link with CHAMELEON libraries have been tested with
162 163 164 165 166
@strong{gcc/gfortran 4.8.1} and @strong{icc/ifort 14.0.2}.

@menu
* Static linking in C::
* Dynamic linking in C::
167
* Build a Fortran program with CHAMELEON::
168 169 170 171 172
@end menu

@node Static linking in C
@subsection Static linking in C

173
Let's imagine you have a file main.c that you want to link with CHAMELEON
174 175 176
static libraries.
Here could be your compilation command with gcc compiler:
@example
177
gcc -I/home/yourname/install/chameleon/include -o main.o -c main.c
178 179
@end example

180
Now if you want to link your application with CHAMELEON static libraries, you
181 182
could do:
@example
gcc main.o -o main                                         \
184 185
/home/yourname/install/chameleon/lib/libchameleon.a        \
/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
/home/yourname/install/chameleon/lib/libcoreblas.a         \
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64           \
188 189
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example
190 191 192
As you can see in this example, we also link with some dynamic libraries
@option{starpu-1.1}, @option{Intel MKL} libraries (for
BLAS/LAPACK/CBLAS/LAPACKE), @option{pthread}, @option{m} (math) and
193
@option{rt}.
194
These libraries will depend on the configuration of your CHAMELEON build.
195
You can find these dependencies in the .pc files we generate during
compilation, in the CHAMELEON install directory.
197
Note also that you could need to specify where to find these libraries with
198
@option{-L} option of your compiler/linker.
199

200
Before running your program, make sure that the paths of all shared libraries
your executable depends on are known.
Enter @code{ldd main} to check.
203 204
If some shared libraries paths are missing append them in the
@env{LD_LIBRARY_PATH} (for Linux systems) environment variable
205 206
(@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows).

207
@node Dynamic linking in C
208 209
@subsection Dynamic linking in C

210 211 212 213
For dynamic linking (need to build CHAMELEON with CMake
option @option{BUILD_SHARED_LIBS=ON}) it is similar to static compilation/link
but instead of specifying path to your static libraries you indicate the path
to dynamic libraries with @option{-L} option and you give the name of libraries
214 215 216
with @option{-l} option like this:
@example
gcc main.o -o main                               \
-L/home/yourname/install/chameleon/lib           \
-lchameleon -lchameleon_starpu -lcoreblas        \
219 220 221
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example
222

223 224
Note that an update of your environment variable
@env{LD_LIBRARY_PATH} (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows)
225
with the path of the libraries could be required before executing, example:
226
@example
227
export @env{LD_LIBRARY_PATH}=path/to/libs:path/to/chameleon/lib
228 229
@end example

230 231
@node Build a Fortran program with CHAMELEON
@subsection Build a Fortran program with CHAMELEON
232

233
CHAMELEON provides a Fortran interface to user functions. Example:
234 235 236 237 238 239 240 241 242 243 244 245 246 247
@example
call morse_version(major, minor, patch) !or
call MORSE_VERSION(major, minor, patch)
@end example

Build and link are very similar to the C case.

Compilation example:
@example
gfortran -o main.o -c main.c
@end example

Static linking example:
@example
gfortran main.o -o main                                    \
249 250
/home/yourname/install/chameleon/lib/libchameleon.a        \
/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
/home/yourname/install/chameleon/lib/libcoreblas.a         \
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64           \
253 254 255 256 257 258
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example

Dynamic linking example:
@example
gfortran main.o -o main                          \
-L/home/yourname/install/chameleon/lib           \
-lchameleon -lchameleon_starpu -lcoreblas        \
261 262 263 264
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example

265 266
@node CHAMELEON API
@section CHAMELEON API
267

268 269
CHAMELEON provides routines to solve dense general systems of linear
equations, symmetric positive definite systems of linear equations and linear
270
least squares problems, using LU, Cholesky, QR and LQ factorizations.
271
Real arithmetic and complex arithmetic are supported in both single precision
272 273 274 275 276 277 278
and double precision.
Routines that compute linear algebra are of the following form:
@example
MORSE_name[_Tile[_Async]]
@end example
@itemize @bullet
@item all user routines are prefixed with @code{MORSE}
279
@item @code{name} follows BLAS/LAPACK naming scheme for algorithms
280
(@emph{e.g.} sgemm for general matrix-matrix multiply simple precision)
281
@item CHAMELEON provides three interface levels
282
  @itemize @minus
283 284 285
  @item @code{MORSE_name}: simplest interface, very close to CBLAS and LAPACKE,
matrices are given following the LAPACK data layout (1-D array column-major).
It involves copy of data from LAPACK layout to tile layout and conversely (to
286
update LAPACK data), see @ref{Step1}.
287 288 289
  @item @code{MORSE_name_Tile}: the tile interface avoids copies between LAPACK
and tile layouts. It is the standard interface of CHAMELEON and it should
achieve better performance than the previous simplest interface. The data are
290
given through a specific structure called a descriptor, see @ref{Step2}.
291 292 293 294 295
  @item @code{MORSE_name_Tile_Async}: similar to the tile interface, it avoids
synchronization barriers normally called between @code{Tile} routines.
At the end of an @code{Async} function, completion of tasks is not guaranteed
and data are not necessarily up-to-date.
To ensure that tasks have been all executed a synchronization function has to
296 297 298 299
be called after the sequence of @code{Async} functions, see @ref{Step4}.
  @end itemize
@end itemize

300
MORSE routine calls have to be preceded by
301 302 303 304 305 306 307 308 309 310
@example
MORSE_Init( NCPU, NGPU );
@end example
to initialize MORSE and the runtime system and followed by
@example
MORSE_Finalize();
@end example
to free some data and finalize the runtime and/or MPI.

@menu
311
* Tutorial LAPACK to CHAMELEON::
312 313 314
* List of available routines::
@end menu

315 316
@node Tutorial LAPACK to CHAMELEON
@subsection Tutorial LAPACK to CHAMELEON
317

318
This tutorial is dedicated to the API usage of CHAMELEON.
319 320 321
The idea is to start from a simple code and step by step explain how to
use CHAMELEON routines.
The first step is a full BLAS/LAPACK code without dependencies to CHAMELEON,
322
a code that most users should easily understand.
323
Then, the different interfaces CHAMELEON provides are exposed, from the
324 325
simplest API (step1) to more complicated ones (until step4).
The way some important parameters are set is discussed in step5.
326 327 328
step6 is an example about distributed computation with MPI.
Finally step7 shows how to let Chameleon initialize user's data
(matrices/vectors) in parallel.

The source files of these steps can be found in the example/lapack_to_morse/
directory.
331
If CMake option @option{CHAMELEON_ENABLE_EXAMPLE} is @option{ON} then source
332 333 334 335 336 337
files are compiled with the project libraries.
The arithmetic precision is @code{double}.
To execute a step @samp{X}, enter the following command:
@example
./step@samp{X} --option1 --option2 ...
@end example
338
Instructions about the arguments to give to executables are accessible thanks
339 340 341 342
to the option @option{-[-]help} or @option{-[-]h}.
Note there exist default values for options.

For all steps, the program solves a linear system @math{Ax=B}.
343 344
The matrix values are randomly generated but ensure that matrix @math{A} is
symmetric positive definite so that @math{A} can be factorized in a @math{LL^T}
345 346 347 348 349
form using the Cholesky factorization.


Let's comment the different steps of the tutorial
@menu
350
* Step0:: a simple Cholesky example using the C interface of
351
BLAS/LAPACK
352
* Step1:: introduces the LAPACK equivalent interface of Chameleon
353
* Step2:: introduces the tile interface
354
* Step3:: indicates how to give your own tile matrix to Chameleon
355 356
* Step4:: introduces the tile async interface
* Step5:: shows how to set some important parameters
357 358
* Step6:: introduces how to benefit from MPI in Chameleon
* Step7:: introduces how to let Chameleon initialize the user's matrix data
359 360 361 362 363
@end menu

@node Step0
@subsubsection Step0

364 365 366
The C interfaces of BLAS and LAPACK, that is, CBLAS and
LAPACKE, are used to solve the system. The size of the system (matrix) and the
number of right hand-sides can be given as arguments to the executable (be
367
careful not to give huge numbers if you do not have an infinite amount of RAM!).
368
As for every step, the correctness of the solution is checked by calculating
369
the norm @math{||Ax-B||/(||A||||x||+||B||)}.
370 371
The time spent in factorization+solve is recorded and, because we know exactly
the number of operations of these algorithms, we deduce the number of
operations that have been processed per second (in GFlops/s).
The important part of the code that solves the problem is:
@verbatim
/* Cholesky factorization:
 * A is replaced by its factorization L or L^T depending on uplo */
LAPACKE_dpotrf( LAPACK_COL_MAJOR, 'U', N, A, N );
/* Solve:
 * B is stored in X on entry, X contains the result on exit.
 * Forward ...
 */
cblas_dtrsm(
    CblasColMajor,
    CblasLeft,
    CblasUpper,
    CblasConjTrans,
    CblasNonUnit,
    N, NRHS, 1.0, A, N, X, N);
/* ... and back substitution */
cblas_dtrsm(
    CblasColMajor,
    CblasLeft,
    CblasUpper,
    CblasNoTrans,
    CblasNonUnit,
    N, NRHS, 1.0, A, N, X, N);
@end verbatim

@node Step1
@subsubsection Step1

402
It introduces the simplest CHAMELEON interface which is equivalent to
403
CBLAS/LAPACKE.
404 405
The code is very similar to step0 but instead of calling CBLAS/LAPACKE
functions, we call CHAMELEON equivalent functions.
406 407 408 409 410 411 412 413
The solving code becomes:
@verbatim
/* Factorization: */
MORSE_dpotrf( UPLO, N, A, N );
/* Solve: */
MORSE_dpotrs(UPLO, N, NRHS, A, N, X, N);
@end verbatim
The API is almost the same so that it is easy to use for beginners.
414
It is important to keep in mind that before any call to MORSE routines,
415 416 417 418 419
@code{MORSE_Init} has to be invoked to initialize MORSE and the runtime system.
Example:
@verbatim
MORSE_Init( NCPU, NGPU );
@end verbatim
420
After all MORSE calls have been done, a call to @code{MORSE_Finalize} is
421 422 423 424
required to free some data and finalize the runtime and/or MPI.
@verbatim
MORSE_Finalize();
@end verbatim
425
We use MORSE routines with the LAPACK interface which means the routines
accept the same matrix format as LAPACK (1-D array column-major).
427 428
Note that we copy the matrix to get it in our own tile structures, see details
about this format here @ref{Tile Data Layout}.
429 430 431 432 433
This means you can get an overhead coming from copies.

@node Step2
@subsubsection Step2

434
This program is a copy of step1 but instead of using the LAPACK interface which
435 436
leads to copy LAPACK matrices inside MORSE routines we use the tile interface.
We will still use standard format of matrix but we will see how to give this
437
matrix to create a MORSE descriptor, a structure wrapping data on which we want
438 439 440 441 442 443 444 445
to apply sequential task-based algorithms.
The solving code becomes:
@verbatim
/* Factorization: */
MORSE_dpotrf_Tile( UPLO, descA );
/* Solve: */
MORSE_dpotrs_Tile( UPLO, descA, descX );
@end verbatim
446
To use the tile interface, a specific structure @code{MORSE_desc_t} must be
447 448 449
created.
This can be achieved from different ways.
@enumerate
450 451
@item Use the existing function @code{MORSE_Desc_Create}: means the
matrix data are considered contiguous in memory as it is considered in PLASMA
452
(@ref{Tile Data Layout}).
@item Use the existing function @code{MORSE_Desc_Create_OOC}: means the
matrix data is allocated on-demand in memory tile by tile, and possibly pushed
to disk if that does not fit memory.
456 457 458
@item Use the existing function @code{MORSE_Desc_Create_User}: it is more
flexible than @code{Desc_Create} because you can give your own way to access to
tile data so that your tiles can be allocated wherever you want in memory, see
459
next paragraph @ref{Step3}.
460 461
@item Create your own function to fill the descriptor.
If you understand well the meaning of each item of @code{MORSE_desc_t}, you
462 463 464 465 466 467
should be able to fill correctly the structure (good luck).
@end enumerate

In Step2, we use the first way to create the descriptor:
@verbatim
MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
468 469
                  NB, NB, NB*NB, N, N,
                  0, 0, N, N,
470 471 472 473 474 475 476 477
                  1, 1);
@end verbatim

@itemize @bullet

@item @code{descA} is the descriptor to create.

@item The second argument is a pointer to existing data.
478 479
The existing data must follow LAPACK/PLASMA matrix layout @ref{Tile Data
Layout} (1-D array column-major) if @code{MORSE_Desc_Create} is used to create
480
the descriptor.
481 482 483
The @code{MORSE_Desc_Create_User} function can be used if you have data
organized differently.
This is discussed in the next paragraph @ref{Step3}.
484
Giving a @code{NULL} pointer means you let the function allocate memory space.
485
This requires to copy your data in the memory allocated by the
486 487 488 489 490 491
@code{Desc_Create}.
This can be done with
@verbatim
MORSE_Lapack_to_Tile(A, N, descA);
@end verbatim

492
@item Third argument of @code{Desc_Create} is the datatype (used for memory
493 494
allocation).

495 496 497
@item Fourth argument until sixth argument stand for respectively, the number
of rows (@code{NB}), columns (@code{NB}) in each tile, the total number of
values in a tile (@code{NB*NB}), the number of rows (@code{N}), columns
498 499
(@code{N}) in the entire matrix.

500 501
@item Seventh argument until ninth argument stand for respectively, the
beginning row (@code{0}), column (@code{0}) indexes of the submatrix and the
502 503 504 505
number of rows (@code{N}), columns (@code{N}) in the submatrix.
These arguments are specific and used in precise cases.
If you do not consider submatrices, just use @code{0, 0, NROWS, NCOLS}.

506
@item Two last arguments are the parameter of the 2-D block-cyclic distribution
507
grid, see @uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK}.
508
To be able to use other data distribution over the nodes,
509 510 511 512 513 514 515 516
@code{MORSE_Desc_Create_User} function should be used.

@end itemize


@node Step3
@subsubsection Step3

517 518
This program makes use of the same interface as Step2 (tile interface) but
does not allocate LAPACK matrices anymore so that no copy between LAPACK matrix
519 520 521 522 523
layout and tile matrix layout are necessary to call MORSE routines.
To generate random right hand-sides you can use:
@verbatim
/* Allocate memory and initialize descriptor B */
MORSE_Desc_Create(&descB,  NULL, MorseRealDouble,
524
                  NB, NB,  NB*NB, N, NRHS,
525 526 527 528 529
                  0, 0, N, NRHS, 1, 1);
/* generate RHS with random values */
MORSE_dplrnt_Tile( descB, 5673 );
@end verbatim

530 531
The other important point is that it is possible to create a descriptor, the
necessary structure to call MORSE efficiently, by giving your own pointer to
532 533 534 535 536
tiles if your matrix is not organized as a 1-D array column-major.
This can be achieved with the @code{MORSE_Desc_Create_User} routine.
Here is an example:
@verbatim
MORSE_Desc_Create_User(&descA, matA, MorseRealDouble,
537 538 539 540 541
                       NB, NB, NB*NB, N, N,
                       0, 0, N, N, 1, 1,
                       user_getaddr_arrayofpointers,
                       user_getblkldd_arrayofpointers,
                       user_getrankof_zero);
542
@end verbatim
543
The first arguments are the same as those of the @code{MORSE_Desc_Create} routine.
544 545
The following arguments allow you to give pointers to functions that manage
the access to tiles from the structure given as second argument.
546
Here for example, @code{matA} is an array containing addresses to tiles.
The three functions you have to define for @code{Desc_Create_User} are:
@itemize @bullet
549 550 551 552
@item a function that returns address of tile @math{A(m,n)}, m and n standing
for the indexes of the tile in the global matrix. Lets consider a matrix
@math{4x4} with tile size @math{2x2}, the matrix contains four tiles of
indexes: @math{A(m=0,n=0)}, @math{A(m=0,n=1)}, @math{A(m=1,n=0)},
553 554 555 556
@math{A(m=1,n=1)}
@item a function that returns the leading dimension of tile @math{A(m,*)}
@item a function that returns MPI rank of tile @math{A(m,n)}
@end itemize
557
Note that the way we define these functions is related to the tile matrix
558
format and to the data distribution considered.
559 560
This example should not be used with MPI since all tiles are affected to
process @code{0}, which means a large amount of data will be
potentially transferred between nodes.

@node Step4
@subsubsection Step4
565
This program is a copy of step2 but instead of using the tile interface, it
566 567
uses the tile async interface.
The goal is to exhibit the runtime synchronization barriers.
568 569 570
Keep in mind that when the tile interface is called, like
@code{MORSE_dpotrf_Tile}, a synchronization function, waiting for the actual
execution and termination of all tasks, is called to ensure the
571
proper completion of the algorithm (i.e. data are up-to-date).
572
The code shows how to exploit the async interface to pipeline subsequent
algorithms so that less synchronisations are done.
The code becomes:
@verbatim
/* Morse structure containing parameters and a structure to interact with
 * the Runtime system */
MORSE_context_t *morse;
/* MORSE sequence uniquely identifies a set of asynchronous function calls
 * sharing common exception handling */
MORSE_sequence_t *sequence = NULL;
/* MORSE request uniquely identifies each asynchronous function call */
MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
int status;

...

morse_sequence_create(morse, &sequence);

/* Factorization: */
MORSE_dpotrf_Tile_Async( UPLO, descA, sequence, &request );

/* Solve: */
MORSE_dpotrs_Tile_Async( UPLO, descA, descX, sequence, &request);

/* Synchronization barrier (the runtime ensures that all submitted tasks
 * have been terminated */
RUNTIME_barrier(morse);
/* Ensure that all data processed on the gpus we are depending on are back
 * in main memory */
RUNTIME_desc_getoncpu(descA);
RUNTIME_desc_getoncpu(descX);

status = sequence->status;
605

606
@end verbatim
607 608
Here the sequence of @code{dpotrf} and @code{dpotrs} algorithms is processed
without synchronization so that some tasks of @code{dpotrf} and @code{dpotrs}
609 610
can be concurrently executed, which could increase performance.
The async interface is very similar to the tile one.
611
It is only necessary to give two new objects @code{MORSE_sequence_t} and
612 613 614
@code{MORSE_request_t} used to handle asynchronous function calls.

@center @image{potri_async,13cm,8cm}
615
POTRI (POTRF, TRTRI, LAUUM) algorithm with and without synchronization
616 617 618 619 620 621
barriers, courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.

@node Step5
@subsubsection Step5

Step5 shows how to set some important parameters.
622 623
This program is a copy of Step4 but some additional parameters are given by
the user.
624 625 626 627 628
The parameters that can be set are:
@itemize @bullet
@item number of Threads
@item number of GPUs

629
The number of workers can be given as argument to the executable with
630
@option{--threads=} and @option{--gpus=} options.
631
It is important to notice that we assign one thread per gpu to optimize data
632
transfer between main memory and devices memory.
633
The number of workers of each type @code{CPU} and @code{CUDA} must be given at
634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654
@code{MORSE_Init}.
@verbatim
if ( iparam[IPARAM_THRDNBR] == -1 ) {
    get_thread_count( &(iparam[IPARAM_THRDNBR]) );
    /* reserve one thread per cuda device to optimize memory transfers */
    iparam[IPARAM_THRDNBR] -= iparam[IPARAM_NCUDAS];
}
NCPU = iparam[IPARAM_THRDNBR];
NGPU = iparam[IPARAM_NCUDAS];

/* initialize MORSE with main parameters */
MORSE_Init( NCPU, NGPU );
@end verbatim

@item matrix size
@item number of right-hand sides
@item block (tile) size

The problem size is given with @option{--n=} and @option{--nrhs=} options.
The tile size is given with option @option{--nb=}.
These parameters are required to create descriptors.
655
The tile size @code{NB} is a key parameter to get performance since it
656
defines the granularity of tasks.
657 658
If @code{NB} is too large compared to @code{N}, there are few tasks to
schedule.
659
If the number of workers is large, this limits parallelism.
660 661
On the contrary, if @code{NB} is too small (@emph{i.e.} many small tasks),
workers could not be correctly fed and the runtime systems operations
662
could represent a substantial overhead.
663 664
A trade-off has to be found depending on many parameters: problem size,
algorithm (drive data dependencies), architecture (number of workers,
665
workers speed, workers uniformity, memory bus speed).
666 667
By default it is set to 128.
Do not hesitate to play with this parameter and compare performances on your
668 669 670 671 672
machine.

@item inner-blocking size

The inner-blocking size is given with option @option{--ib=}.
673 674
This parameter is used by kernels (optimized algorithms applied on tiles) to
perform subsequent operations with data block-size that fits the cache of
675 676 677 678 679 680 681
workers.
Parameters @code{NB} and @code{IB} can be given with @code{MORSE_Set} function:
@verbatim
MORSE_Set(MORSE_TILE_SIZE,        iparam[IPARAM_NB] );
MORSE_Set(MORSE_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
@end verbatim
@end itemize
682

683 684 685
@node Step6
@subsubsection Step6

686 687 688 689
This program is a copy of Step5 with some additional parameters to be set for
the data distribution.
To use this program properly, MORSE must use the StarPU runtime system and the
MPI option must be activated at configure time.
The data distribution used here is 2-D block-cyclic, see for example
@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK} for
explanation.
The user can enter the parameters of the distribution grid at execution with
the @option{--p=} option.
Example using OpenMPI on four nodes with one process per node:
@example
mpirun -np 4 ./step6 --n=10000 --nb=320 --ib=64 \
                     --threads=8 --gpus=2 --p=2
@end example

In this program we use the tile data layout from PLASMA so that the call
@verbatim
MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
                       NB, NB, NB*NB, N, N,
                       0, 0, N, N,
                       GRID_P, GRID_Q,
                       morse_getaddr_ccrb,
                       morse_getblkldd_ccrb,
                       morse_getrankof_2d);
@end verbatim
is equivalent to the following call
@verbatim
MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
                  NB, NB, NB*NB, N, N,
                  0, 0, N, N,
                  GRID_P, GRID_Q);
@end verbatim
functions @code{morse_getaddr_ccrb}, @code{morse_getblkldd_ccrb},
@code{morse_getrankof_2d} being used in @code{Desc_Create}.
It is interesting to notice that the code is almost the same as Step5.
The only additional information to give is the way tiles are distributed
through the third function given to @code{MORSE_Desc_Create_User}.
Here, because we have made experiments only with a 2-D block-cyclic
distribution, we have parameters P and Q in the interface of @code{Desc_Create}
but they make sense only for a 2-D block-cyclic distribution and then using the
@code{morse_getrankof_2d} function.
Of course it could be used with other distributions, the parameters being then
no more those of a 2-D block-cyclic grid but of another distribution.

@node Step7
@subsubsection Step7

This program is a copy of step6 with some additional calls to
build a matrix from within chameleon using a function provided by the user.
This can be seen as a replacement of functions like @code{MORSE_dplgsy_Tile()} that can be used
to fill the matrix with random data, @code{MORSE_dLapack_to_Tile()} to fill the matrix
with data stored in a lapack-like buffer, or @code{MORSE_Desc_Create_User()} that can be used
to describe an arbitrary tile matrix structure.
In this example, the build callback functions are just wrappers around @code{CORE_xxx()} functions, so the output
of the program step7 should be exactly similar to that of step6.
The difference is that the function used to fill the tiles is provided by the user,
and therefore this approach is much more flexible.

The new function to understand is @code{MORSE_dbuild_Tile}, e.g.
@verbatim
struct data_pl data_A={(double)N, 51, N};
MORSE_dbuild_Tile(MorseUpperLower, descA, (void*)&data_A, Morse_build_callback_plgsy);
@end verbatim
The idea here is to let Chameleon fill the matrix data in a task-based fashion
(parallel) by using a function given by the user.
First, the user should define if all the blocks must be entirely filled or just
the upper/lower part with, e.g. @code{MorseUpperLower}.
We still rely on the same structure @code{MORSE_desc_t} which must be
initialized with the proper parameters, by calling for example
@code{MORSE_Desc_Create}.
Then, an opaque pointer is used to let the user give some extra data used by
his function.
The last parameter is the pointer to the user's function.

@node List of available routines
@subsection List of available routines

@menu
* Auxiliary routines:: Init, Finalize, Version, etc
* Descriptor routines:: To handle descriptors
* Options routines:: To set options
* Sequences routines:: To manage asynchronous function calls
* Linear Algebra routines:: Computational routines
@end menu

@node Auxiliary routines
@subsubsection Auxiliary routines

Reports MORSE version number.
@verbatim
int MORSE_Version        (int *ver_major, int *ver_minor, int *ver_micro);
@end verbatim

Initialize MORSE: initialize some parameters, initialize the runtime and/or MPI.
@verbatim
int MORSE_Init           (int nworkers, int ncudas);
@end verbatim

Finalize MORSE: free some data and finalize the runtime and/or MPI.
@verbatim
int MORSE_Finalize       (void);
@end verbatim

Return the MPI rank of the calling process.
@verbatim
int MORSE_My_Mpi_Rank    (void);
@end verbatim

Suspend MORSE runtime to poll for new tasks, to avoid useless CPU consumption
when no tasks have to be executed by MORSE runtime system.
@verbatim
int MORSE_Pause          (void);
@end verbatim

Symmetrical call to MORSE_Pause, used to resume the workers polling for new tasks.
@verbatim
int MORSE_Resume         (void);
@end verbatim

Conversion from LAPACK layout to tile layout.
@verbatim
int MORSE_Lapack_to_Tile (void *Af77, int LDA, MORSE_desc_t *A);
@end verbatim

Conversion from tile layout to LAPACK layout.
@verbatim
int MORSE_Tile_to_Lapack (MORSE_desc_t *A, void *Af77, int LDA);
@end verbatim

@node Descriptor routines
@subsubsection Descriptor routines

@c /* Descriptor */
Create matrix descriptor, internal function.
@verbatim
int MORSE_Desc_Create  (MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
                        int mb, int nb, int bsiz, int lm, int ln,
                        int i, int j, int m, int n, int p, int q);
@end verbatim

Create matrix descriptor, user function.
@verbatim
int MORSE_Desc_Create_User(MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
                           int mb, int nb, int bsiz, int lm, int ln,
                           int i, int j, int m, int n, int p, int q,
                           void* (*get_blkaddr)( const MORSE_desc_t*, int, int),
                           int (*get_blkldd)( const MORSE_desc_t*, int ),
                           int (*get_rankof)( const MORSE_desc_t*, int, int ));
@end verbatim

Destroys matrix descriptor.
@verbatim
int MORSE_Desc_Destroy (MORSE_desc_t **desc);
@end verbatim

Ensure that all data are up-to-date in main memory (even if some tasks have
been processed on GPUs).
@verbatim
int MORSE_Desc_Flush(MORSE_desc_t  *desc, MORSE_sequence_t *sequence);
@end verbatim

@node Options routines
@subsubsection Options routines

@c /* Options */
Enable MORSE feature.
@verbatim
int MORSE_Enable  (MORSE_enum option);
@end verbatim
Feature to be enabled:
@itemize @bullet
@item @code{MORSE_WARNINGS}:   printing of warning messages,
@item @code{MORSE_ERRORS}:     printing of error messages,
@item @code{MORSE_AUTOTUNING}: autotuning for tile size and inner block size,
@item @code{MORSE_PROFILING_MODE}:  activate kernels profiling.
@end itemize

Disable MORSE feature.
@verbatim
int MORSE_Disable (MORSE_enum option);
@end verbatim
Symmetric to @code{MORSE_Enable}.

Set MORSE parameter.
@verbatim
int MORSE_Set     (MORSE_enum param, int  value);
@end verbatim
Parameters to be set:
@itemize @bullet
@item @code{MORSE_TILE_SIZE}:        size matrix tile,
@item @code{MORSE_INNER_BLOCK_SIZE}: size of tile inner block,
@item @code{MORSE_HOUSEHOLDER_MODE}: type of householder trees (FLAT or TREE),
@item @code{MORSE_HOUSEHOLDER_SIZE}: size of the groups in householder trees,
@item @code{MORSE_TRANSLATION_MODE}: related to the translation between LAPACK
layout and tile layout.
@end itemize

Get value of MORSE parameter.
@verbatim
int MORSE_Get     (MORSE_enum param, int *value);
@end verbatim

@node Sequences routines
@subsubsection Sequences routines

@c /* Sequences */
Create a sequence.
@verbatim
int MORSE_Sequence_Create  (MORSE_sequence_t **sequence);
@end verbatim

Destroy a sequence.
@verbatim
int MORSE_Sequence_Destroy (MORSE_sequence_t *sequence);
@end verbatim

Wait for the completion of a sequence.
@verbatim
int MORSE_Sequence_Wait    (MORSE_sequence_t *sequence);
@end verbatim

@node Linear Algebra routines
@subsubsection Linear Algebra routines

Routines computing linear algebra of the form
@code{MORSE_name[_Tile[_Async]]} (@code{name} follows the LAPACK naming scheme,
see @uref{http://www.netlib.org/lapack/lug/node24.html}) available:

@verbatim
/**
 *  Declarations of computational functions (LAPACK layout)
 **/
int MORSE_zgelqf(int M, int N, MORSE_Complex64_t *A, int LDA,
                 MORSE_desc_t *descT);

int MORSE_zgelqs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);

int MORSE_zgels(MORSE_enum trans, int M, int N, int NRHS,
                MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT,
                MORSE_Complex64_t *B, int LDB);

int MORSE_zgemm(MORSE_enum transA, MORSE_enum transB, int M, int N, int K,
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                MORSE_Complex64_t *C, int LDC);

int MORSE_zgeqrf(int M, int N, MORSE_Complex64_t *A, int LDA,
                 MORSE_desc_t *descT);

int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);

int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                       MORSE_desc_t *descL, int *IPIV,
                       MORSE_Complex64_t *B, int LDB);

int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                      MORSE_Complex64_t *B, int LDB);

int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA,
                        MORSE_desc_t *descL, int *IPIV);

int MORSE_zgetrf_nopiv(int M, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zgetrs_incpiv(MORSE_enum trans, int N, int NRHS,
                        MORSE_Complex64_t *A, int LDA,
                        MORSE_desc_t *descL, int *IPIV,
                        MORSE_Complex64_t *B, int LDB);

int MORSE_zgetrs_nopiv(MORSE_enum trans, int N, int NRHS,
                       MORSE_Complex64_t *A, int LDA,
                       MORSE_Complex64_t *B, int LDB);

int MORSE_zhemm(MORSE_enum side, MORSE_enum uplo, int M, int N,
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                MORSE_Complex64_t *C, int LDC);

int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K,
                double alpha, MORSE_Complex64_t *A, int LDA,
                double beta, MORSE_Complex64_t *C, int LDC);

int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K,
                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB, double beta,
                 MORSE_Complex64_t *C, int LDC);

int MORSE_zlacpy(MORSE_enum uplo, int M, int N,
                 MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB);

double MORSE_zlange(MORSE_enum norm, int M, int N,
                    MORSE_Complex64_t *A, int LDA);

double MORSE_zlanhe(MORSE_enum norm, MORSE_enum uplo, int N,
                    MORSE_Complex64_t *A, int LDA);

double MORSE_zlansy(MORSE_enum norm, MORSE_enum uplo, int N,
                    MORSE_Complex64_t *A, int LDA);

double MORSE_zlantr(MORSE_enum norm, MORSE_enum uplo, MORSE_enum diag,
                    int M, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zlaset(MORSE_enum uplo, int M, int N, MORSE_Complex64_t alpha,
                 MORSE_Complex64_t beta, MORSE_Complex64_t *A, int LDA);

int MORSE_zlauum(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zplghe( double bump, MORSE_enum uplo, int N,
                  MORSE_Complex64_t *A, int LDA,
                  unsigned long long int seed );

int MORSE_zplgsy( MORSE_Complex64_t bump, MORSE_enum uplo, int N,
                  MORSE_Complex64_t *A, int LDA,
                  unsigned long long int seed );

int MORSE_zplrnt( int M, int N, MORSE_Complex64_t *A, int LDA,
                  unsigned long long int seed );

int MORSE_zposv(MORSE_enum uplo, int N, int NRHS,
                MORSE_Complex64_t *A, int LDA,
                MORSE_Complex64_t *B, int LDB);

int MORSE_zpotrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zsytrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zpotri(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zpotrs(MORSE_enum uplo, int N, int NRHS,
                 MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB);

#if defined (PRECISION_c) || defined(PRECISION_z)
int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS,
                 MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB);
#endif

int MORSE_zsymm(MORSE_enum side, MORSE_enum uplo, int M, int N,
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                MORSE_Complex64_t *C, int LDC);

int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K,
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC);

int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K,
                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                 MORSE_Complex64_t *C, int LDC);

int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo,
                MORSE_enum transA, MORSE_enum diag,
                int N, int NRHS,
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                MORSE_Complex64_t *B, int LDB);

int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo,
                MORSE_enum transA, MORSE_enum diag,
                int N, int NRHS,
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                MORSE_Complex64_t *B, int LDB);

int MORSE_ztrsmpl(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                  MORSE_desc_t *descL, int *IPIV,
                  MORSE_Complex64_t *B, int LDB);

int MORSE_ztrsmrv(MORSE_enum side, MORSE_enum uplo,
                  MORSE_enum transA, MORSE_enum diag,
                  int N, int NRHS,
                  MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                  MORSE_Complex64_t *B, int LDB);

int MORSE_ztrtri(MORSE_enum uplo, MORSE_enum diag, int N,
                 MORSE_Complex64_t *A, int LDA);

int MORSE_zunglq(int M, int N, int K, MORSE_Complex64_t *A, int LDA,
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);

int MORSE_zungqr(int M, int N, int K, MORSE_Complex64_t *A, int LDA,
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);

int MORSE_zunmlq(MORSE_enum side, MORSE_enum trans, int M, int N, int K,
                 MORSE_Complex64_t *A, int LDA,
                 MORSE_desc_t *descT,
                 MORSE_Complex64_t *B, int LDB);

int MORSE_zunmqr(MORSE_enum side, MORSE_enum trans, int M, int N, int K,
                 MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT,
                 MORSE_Complex64_t *B, int LDB);

/**
 *  Declarations of computational functions (tile layout)
 **/
int MORSE_zgelqf_Tile(MORSE_desc_t *A, MORSE_desc_t *T);

int MORSE_zgelqs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);

int MORSE_zgels_Tile(MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *T,
                     MORSE_desc_t *B);

int MORSE_zgemm_Tile(MORSE_enum transA, MORSE_enum transB,
                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
                     MORSE_desc_t