@c -*-texinfo-*-

@c This file is part of the MORSE Handbook.
@c Copyright (C) 2014 Inria
@c Copyright (C) 2014 The University of Tennessee
@c Copyright (C) 2014 King Abdullah University of Science and Technology
@c See the file ../chameleon.texi for copying conditions.

@menu
* Using CHAMELEON executables::
* Linking an external application with CHAMELEON libraries::
* CHAMELEON API::
@end menu

@node Using CHAMELEON executables
@section Using CHAMELEON executables

CHAMELEON provides several test executables that are compiled and linked with
the CHAMELEON stack of dependencies.
Instructions about the arguments to give to executables are accessible thanks
to the option @option{-[-]help} or @option{-[-]h}.
This set of binaries is separated into three categories and can be found in
three different directories:

@itemize @bullet

  @item example

  contains examples of API usage and more specifically the
  sub-directory lapack_to_morse/ provides a tutorial that explains how to use
  CHAMELEON functionalities starting from a full LAPACK code, see
@ref{Tutorial LAPACK to CHAMELEON}.

  @item testing

  contains testing drivers to check numerical correctness of
  CHAMELEON linear algebra routines with a wide range of parameters
  @example
  ./testing/stesting 4 1 LANGE 600 100 700
  @end example
  The first two arguments are the number of cores and gpus to use.
  The third one is the name of the algorithm to test.
  The other arguments depend on the algorithm; here they stand for the number of
  rows, columns and leading dimension of the problem.
  
  Name of algorithms available for testing are:
  @itemize @bullet
    @item LANGE: norms of matrices Infinite, One, Max, Frobenius
    @item GEMM: general matrix-matrix multiply
    @item HEMM: hermitian matrix-matrix multiply
    @item HERK: hermitian matrix-matrix rank k update
    @item HER2K: hermitian matrix-matrix rank 2k update
    @item SYMM: symmetric matrix-matrix multiply
    @item SYRK: symmetric matrix-matrix rank k update
    @item SYR2K: symmetric matrix-matrix rank 2k update
    @item PEMV: matrix-vector multiply with pentadiagonal matrix    
    @item TRMM: triangular matrix-matrix multiply
    @item TRSM: triangular solve, multiple rhs
    @item POSV: solve linear systems with symmetric positive-definite matrix
    @item GESV_INCPIV: solve linear systems with general matrix    
    @item GELS: linear least squares with general matrix
  @end itemize

  @item timing

  contains timing drivers to assess performances of CHAMELEON routines.
  There are two sets of executables, those that do not use the tile interface
and those that do (with _tile in the name of the executable).
  Executables without the tile interface allocate data following LAPACK
conventions and these data can be given as arguments to CHAMELEON routines
as you would do with LAPACK.
  Executables with the tile interface generate directly the data in the format
  CHAMELEON tile algorithms use to submit tasks to the runtime system.
  Executables with the tile interface should be more performant because no data
copy from LAPACK matrix layout to tile matrix layout is necessary.
  Calling example:
  @example
  ./timing/time_dpotrf --n_range=1000:10000:1000 --nb=320 
                       --threads=9 --gpus=3 
                       --nowarmup
  @end example
  
  List of main options that can be used in timing:
  @itemize @bullet
    @item @option{--help}: show usage
    @item @option{--threads}: Number of CPU workers (default: 
@option{_SC_NPROCESSORS_ONLN})
    @item @option{--gpus}: number of GPU workers (default: @option{0})
    @item @option{--n_range=R}: range of N values, with 
@option{R=Start:Stop:Step}
(default: @option{500:5000:500})
    @item @option{--m=X}: dimension (M) of the matrices (default: @option{N})
    @item @option{--k=X}: dimension (K) of the matrices (default: @option{1}), 
useful for GEMM algorithm (k is the shared dimension and must be defined >1 to 
consider matrices and not vectors)
    @item @option{--nrhs=X}: number of right-hand sides (default: @option{1})
    @item @option{--nb=X}: block/tile size. (default: @option{128})
    @item @option{--ib=X}: inner-blocking/IB size. (default: @option{32})
    @item @option{--niter=X}: number of iterations performed for each test 
(default: @option{1})
    @item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ 
factorization. X is the size of each subdomain (default: @option{0})
    @item @option{--[no]check}: check result (default: @option{nocheck})
    @item @option{--[no]profile}: print profiling information (default: 
@option{noprofile})
    @item @option{--[no]trace}: enable/disable trace generation (default: 
@option{notrace})
    @item @option{--[no]dag}: enable/disable DAG generation (default: 
@option{nodag})
    @item @option{--[no]inv}: check on inverse (default: @option{noinv})
    @item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs 
(default: @option{0})
  @end itemize
  
  List of timing algorithms available:
  @itemize @bullet
    @item LANGE: norms of matrices
    @item GEMM: general matrix-matrix multiply
    @item TRSM: triangular solve    
    @item POTRF: Cholesky factorization with a symmetric 
positive-definite matrix
    @item POSV: solve linear systems with symmetric positive-definite matrix
    @item GETRF_NOPIV: LU factorization of a general matrix
using the tile LU algorithm without row pivoting 
    @item GESV_NOPIV: solve linear system for a general matrix
using the tile LU algorithm without row pivoting 
    @item GETRF_INCPIV: LU factorization of a general matrix
using the tile LU algorithm with partial tile pivoting with row interchanges 
    @item GESV_INCPIV: solve linear system for a general matrix
using the tile LU algorithm with partial tile pivoting with row interchanges
    @item GEQRF: QR factorization of a general matrix
    @item GELS: solves overdetermined or underdetermined linear systems 
involving a general matrix using the QR or the LQ factorization
  @end itemize
  
@end itemize

@node Linking an external application with CHAMELEON libraries
@section Linking an external application with CHAMELEON libraries

Compilation and link with CHAMELEON libraries have been tested with
@strong{gcc/gfortran 4.8.1} and @strong{icc/ifort 14.0.2}.

@menu
* Static linking in C::
* Dynamic linking in C::
* Build a Fortran program with CHAMELEON::
@end menu

@node Static linking in C
@subsection Static linking in C

Let's imagine you have a file main.c that you want to link with CHAMELEON
static libraries.
Let's consider @file{/home/yourname/install/chameleon} is the install directory
of CHAMELEON containing sub-directories @file{include/} and @file{lib/}.
Here could be your compilation command with the gcc compiler:
@example
gcc -I/home/yourname/install/chameleon/include -o main.o -c main.c
@end example

Now if you want to link your application with CHAMELEON static libraries, you
could do:
@example
gcc main.o -o main                                         \
/home/yourname/install/chameleon/lib/libchameleon.a        \
/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
/home/yourname/install/chameleon/lib/libcoreblas.a         \
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64           \
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example
As you can see in this example, we also link with dynamic libraries 
@option{starpu-1.1}, @option{Intel MKL} 
libraries (for BLAS/LAPACK/CBLAS/LAPACKE), @option{pthread}, @option{m} (math) 
and @option{rt}.
These libraries will depend on the configuration of your CHAMELEON build.
You can find these dependencies in .pc files we generate during compilation and
that are installed in the sub-directory @file{lib/pkgconfig} of your
CHAMELEON install directory.
Note also that you could need to specify where to find these libraries with
the @option{-L} option of your compiler/linker.
Before running your program, make sure that all dynamic library paths are
appended in the @env{LD_LIBRARY_PATH} (for Linux systems) environment variable
(@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows).

@node Dynamic linking in C 
@subsection Dynamic linking in C

For dynamic linking (need to build CHAMELEON with the CMake
option @option{BUILD_SHARED_LIBS=ON}) it is similar to static compilation/link
but instead of specifying path to your static libraries you indicate the path
to dynamic libraries with the @option{-L} option and you give the name of libraries
with the @option{-l} option like this:
@example
gcc main.o -o main                               \
-L/home/yourname/install/chameleon/lib           \
-lchameleon -lchameleon_starpu -lcoreblas        \
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example
Note that an update of your environment variable 
@env{LD_LIBRARY_PATH} (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows) 
with the path of the libraries is required before executing, example:
@example
export @env{LD_LIBRARY_PATH}=path/to/libs:path/to/chameleon/lib
@end example

@node Build a Fortran program with CHAMELEON
@subsection Build a Fortran program with CHAMELEON

CHAMELEON provides a Fortran interface to user functions. Example:
@example
call morse_version(major, minor, patch) !or
call MORSE_VERSION(major, minor, patch)
@end example

Build and link are very similar to the C case.

Compilation example:
@example
gfortran -o main.o -c main.c
@end example

Static linking example:
@example
gfortran main.o -o main                                    \
/home/yourname/install/chameleon/lib/libchameleon.a        \
/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
/home/yourname/install/chameleon/lib/libcoreblas.a         \
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64           \
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example

Dynamic linking example:
@example
gfortran main.o -o main                          \
-L/home/yourname/install/chameleon/lib           \
-lchameleon -lchameleon_starpu -lcoreblas        \
-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
-lmkl_sequential -lmkl_core -lpthread -lm -lrt
@end example

@node CHAMELEON API
@section CHAMELEON API

CHAMELEON provides routines to solve dense general systems of linear
equations, symmetric positive definite systems of linear equations and linear
least squares problems, using LU, Cholesky, QR and LQ factorizations.
Real arithmetic and complex arithmetic are supported in both single precision
and double precision.
Routines that compute linear algebra are of the following form:
@example
MORSE_name[_Tile[_Async]]
@end example
@itemize @bullet
@item all user routines are prefixed with @code{MORSE}
@item @code{name} follows BLAS/LAPACK naming scheme for algorithms 
(@emph{e.g.} sgemm for general matrix-matrix multiply simple precision)
@item CHAMELEON provides three interface levels
  @itemize @minus
  @item @code{MORSE_name}: simplest interface, very close to CBLAS and LAPACKE, 
matrices are given following the LAPACK data layout (1-D array column-major). 
It involves copy of data from LAPACK layout to tile layout and conversely (to 
update LAPACK data), see @ref{Step1}.
  @item @code{MORSE_name_Tile}: the tile interface avoids copies between LAPACK
and tile layouts. It is the standard interface of CHAMELEON and it should
achieve better performance than the previous simplest interface. The data are
given through a specific structure called a descriptor, see @ref{Step2}.
  @item @code{MORSE_name_Tile_Async}: similar to the tile interface, it avoids
the synchronization barrier normally called between @code{Tile} routines.
At the end of an @code{Async} function, completion of tasks is not guaranteed
and data are not necessarily up-to-date.
To ensure that tasks have all been executed, a synchronization function has to
be called after the sequence of @code{Async} functions, see @ref{Step4}.
  @end itemize
@end itemize

MORSE routine calls have to be preceded by
@example
MORSE_Init( NCPU, NGPU );
@end example
to initialize MORSE and the runtime system and followed by
@example
MORSE_Finalize();
@end example
to free some data and finalize the runtime and/or MPI.

@menu
* Tutorial LAPACK to CHAMELEON::
* List of available routines::
@end menu

@node Tutorial LAPACK to CHAMELEON
@subsection Tutorial LAPACK to CHAMELEON

This tutorial is dedicated to the API usage of CHAMELEON.
The idea is to start from a simple code and step by step explain how to
use CHAMELEON routines.
The first step is a full BLAS/LAPACK code without dependencies to CHAMELEON,
a code that most users should easily understand.
Then, the different interfaces CHAMELEON provides are exposed, from the
simplest API (step1) to more complicated ones (until step4).
The way some important parameters are set is discussed in step5.
Finally step6 is an example about distributed computation with MPI.

Source files can be found in the @file{example/lapack_to_morse/}
directory.
If CMake option @option{CHAMELEON_ENABLE_EXAMPLE} is @option{ON} then source
files are compiled with the project libraries.
The arithmetic precision is @code{double}.
To execute a step @samp{X}, enter the following command:
@example
./step@samp{X} --option1 --option2 ...
@end example
Instructions about the arguments to give to executables are accessible thanks 
to the option @option{-[-]help} or @option{-[-]h}.
Note there exist default values for options.

For all steps, the program solves a linear system @math{Ax=B}.
The matrix values are randomly generated but ensure that matrix @math{A} is 
symmetric positive definite so that @math{A} can be factorized in a @math{LL^T} 
form using the Cholesky factorization.


Let's comment the different steps of the tutorial:
@menu
* Step0:: a simple Cholesky example using the C interface of 
BLAS/LAPACK
* Step1:: introduces the LAPACK equivalent interface of MORSE
* Step2:: introduces the tile interface
* Step3:: indicates how to give your own tile matrix to MORSE
* Step4:: introduces the tile async interface
* Step5:: shows how to set some important parameters
* Step6:: introduces how to benefit from MPI in MORSE.
@end menu

@node Step0
@subsubsection Step0

The C interface of BLAS and LAPACK, that is, CBLAS and 
LAPACKE, are used to solve the system. The size of the system (matrix) and the 
number of right hand-sides can be given as arguments to the executable (be 
careful not to give huge numbers if you do not have an infinite amount of RAM!).
As for every step, the correctness of the solution is checked by calculating 
the norm @math{||Ax-B||/(||A||||x||+||B||)}.
The time spent in factorization+solve is recorded and, because we know exactly 
the number of operations of these algorithms, we deduce the number of 
operations that have been processed per second (in GFlops/s).
The important part of the code that solves the problem is:
@verbatim
/* Cholesky factorization:
 * A is replaced by its factorization L or L^T depending on uplo */
LAPACKE_dpotrf( LAPACK_COL_MAJOR, 'U', N, A, N );
/* Solve:
 * B is stored in X on entry, X contains the result on exit.
 * Forward ...
 */
cblas_dtrsm(
    CblasColMajor,
    CblasLeft,
    CblasUpper,
    CblasConjTrans,
    CblasNonUnit,
    N, NRHS, 1.0, A, N, X, N);
/* ... and back substitution */
cblas_dtrsm(
    CblasColMajor,
    CblasLeft,
    CblasUpper,
    CblasNoTrans,
    CblasNonUnit,
    N, NRHS, 1.0, A, N, X, N);
@end verbatim

@node Step1
@subsubsection Step1

It introduces the simplest CHAMELEON interface which is equivalent to
CBLAS/LAPACKE.
The code is very similar to step0 but instead of calling CBLAS/LAPACKE
functions, we call CHAMELEON equivalent functions.
The solving code becomes:
@verbatim
/* Factorization: */
MORSE_dpotrf( UPLO, N, A, N );
/* Solve: */
MORSE_dpotrs(UPLO, N, NRHS, A, N, X, N);
@end verbatim
The API is almost the same so that it is easy to use for beginners.
It is important to keep in mind that before any call to MORSE routines, 
@code{MORSE_Init} has to be invoked to initialize MORSE and the runtime system.
Example:
@verbatim
MORSE_Init( NCPU, NGPU );
@end verbatim
After all MORSE calls have been done, a call to @code{MORSE_Finalize} is 
required to free some data and finalize the runtime and/or MPI.
@verbatim
MORSE_Finalize();
@end verbatim
We use MORSE routines with the LAPACK interface which means the routines 
accepts the same matrix format as LAPACK (1-D array column-major).
Note that we copy the matrix to get it in our own tile structures, see details 
about this format here @ref{Tile Data Layout}. 
This means you can get an overhead coming from copies.

@node Step2
@subsubsection Step2

This program is a copy of step1 but instead of using the LAPACK interface which 
leads to copying LAPACK matrices inside MORSE routines, we use the tile interface.
We will still use standard format of matrix but we will see how to give this
matrix to create a MORSE descriptor, a structure wrapping data on which we want 
to apply sequential task-based algorithms.
The solving code becomes:
@verbatim
/* Factorization: */
MORSE_dpotrf_Tile( UPLO, descA );
/* Solve: */
MORSE_dpotrs_Tile( UPLO, descA, descX );
@end verbatim
To use the tile interface, a specific structure @code{MORSE_desc_t} must be 
created.
This can be achieved from different ways.
@enumerate
@item Use the existing function @code{MORSE_Desc_Create}: means the 
matrix data are considered contiguous in memory as it is considered in PLASMA 
(@ref{Tile Data Layout}).
@item Use the existing function @code{MORSE_Desc_Create_User}: it is more 
flexible than @code{Desc_Create} because you can give your own way to access to 
tile data so that your tiles can be allocated wherever you want in memory, see 
next paragraph @ref{Step3}.
@item Create your own function to fill the descriptor.
If you understand well the meaning of each item of @code{MORSE_desc_t}, you 
should be able to fill correctly the structure (good luck).
@end enumerate

In Step2, we use the first way to create the descriptor:
@verbatim
MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
                  NB, NB, NB*NB, N, N, 
                  0, 0, N, N, 
                  1, 1);
@end verbatim

@itemize @bullet

@item @code{descA} is the descriptor to create.

@item The second argument is a pointer to existing data.
The existing data must follow LAPACK/PLASMA matrix layout @ref{Tile Data 
Layout} (1-D array column-major) if @code{MORSE_Desc_Create} is used to create 
the descriptor.
The @code{MORSE_Desc_Create_User} function can be used if you have data 
organized differently. 
This is discussed in the next paragraph @ref{Step3}. 
Giving a @code{NULL} pointer means you let the function allocate memory space.
This requires to copy your data in the memory allocated by the 
@code{Desc_Create}.
This can be done with
@verbatim
MORSE_Lapack_to_Tile(A, N, descA);
@end verbatim

@item Third argument of @code{Desc_Create} is the datatype (used for memory 
allocation).

@item Fourth argument until sixth argument stand for respectively, the number 
of rows (@code{NB}), columns (@code{NB}) in each tile, the total number of 
values in a tile (@code{NB*NB}), the number of rows (@code{N}), columns
(@code{N}) in the entire matrix.

@item Seventh argument until ninth argument stand for respectively, the 
beginning row (@code{0}), column (@code{0}) indexes of the submatrix and the 
number of rows (@code{N}), columns (@code{N}) in the submatrix.
These arguments are specific and used in precise cases.
If you do not consider submatrices, just use @code{0, 0, NROWS, NCOLS}.

@item Two last arguments are the parameter of the 2-D block-cyclic distribution 
grid, see @uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK}.
To be able to use other data distribution over the nodes, 
@code{MORSE_Desc_Create_User} function should be used.

@end itemize


@node Step3
@subsubsection Step3

This program makes use of the same interface than Step2 (tile interface) but 
does not allocate LAPACK matrices anymore so that no copy between LAPACK matrix 
layout and tile matrix layout are necessary to call MORSE routines.
To generate random right hand-sides you can use:
@verbatim
/* Allocate memory and initialize descriptor B */
MORSE_Desc_Create(&descB,  NULL, MorseRealDouble,
                  NB, NB,  NB*NB, N, NRHS, 
                  0, 0, N, NRHS, 1, 1);
/* generate RHS with random values */
MORSE_dplrnt_Tile( descB, 5673 );
@end verbatim

The other important point is that it is possible to create a descriptor, the
necessary structure to call MORSE efficiently, by giving your own pointer to 
tiles if your matrix is not organized as a 1-D array column-major.
This can be achieved with the @code{MORSE_Desc_Create_User} routine.
Here is an example:
@verbatim
MORSE_Desc_Create_User(&descA, matA, MorseRealDouble,
		       NB, NB, NB*NB, N, N, 
		       0, 0, N, N, 1, 1,
		       user_getaddr_arrayofpointers,
		       user_getblkldd_arrayofpointers,
		       user_getrankof_zero);
@end verbatim
The first arguments are the same as for the @code{MORSE_Desc_Create} routine.
The following arguments allow you to give pointers to functions that manage the
access to tiles from the structure given as second argument.
Here for example, @code{matA} is an array containing addresses to tiles, see 
the function @code{allocate_tile_matrix} defined in @file{step3.h}.
The three functions you have to define for @code{Desc_Create_User} are:
@itemize @bullet
@item a function that returns address of tile @math{A(m,n)}, m and n standing 
for the indexes of the tile in the global matrix. Lets consider a matrix 
@math{4x4} with tile size @math{2x2}, the matrix contains four tiles of 
indexes: @math{A(m=0,n=0)}, @math{A(m=0,n=1)}, @math{A(m=1,n=0)}, 
@math{A(m=1,n=1)}
@item a function that returns the leading dimension of tile @math{A(m,*)}
@item a function that returns MPI rank of tile @math{A(m,n)}
@end itemize
Examples for these functions are visible in @file{step3.h}.
Note that the way we define these functions is related to the tile matrix 
format and to the data distribution considered.
This example should not be used with MPI since all tiles are assigned to
process @code{0}, which means a large amount of data will be
potentially transferred between nodes.

@node Step4
@subsubsection Step4
This program is a copy of step2 but instead of using the tile interface, it 
uses the tile async interface.
The goal is to exhibit the runtime synchronization barriers.
Keep in mind that when the tile interface is called, like 
@code{MORSE_dpotrf_Tile}, a synchronization function, waiting for the actual 
execution and termination of all tasks, is called to ensure the 
proper completion of the algorithm (i.e. data are up-to-date).
The code shows how to exploit the async interface to pipeline subsequent 
algorithms so that less synchronisations are done.
The code becomes:
@verbatim
/* Morse structure containing parameters and a structure to interact with
 * the Runtime system */
MORSE_context_t *morse;
/* MORSE sequence uniquely identifies a set of asynchronous function calls
 * sharing common exception handling */
MORSE_sequence_t *sequence = NULL;
/* MORSE request uniquely identifies each asynchronous function call */
MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
int status;

...

morse_sequence_create(morse, &sequence);

/* Factorization: */
MORSE_dpotrf_Tile_Async( UPLO, descA, sequence, &request );

/* Solve: */
MORSE_dpotrs_Tile_Async( UPLO, descA, descX, sequence, &request);

/* Synchronization barrier (the runtime ensures that all submitted tasks
 * have been terminated */
RUNTIME_barrier(morse);
/* Ensure that all data processed on the gpus we are depending on are back
 * in main memory */
RUNTIME_desc_getoncpu(descA);
RUNTIME_desc_getoncpu(descX);

status = sequence->status;
    
@end verbatim
Here the sequence of @code{dpotrf} and @code{dpotrs} algorithms is processed 
without synchronization so that some tasks of @code{dpotrf} and @code{dpotrs}
can be concurrently executed, which could increase performance.
The async interface is very similar to the tile one.
It is only necessary to give two new objects @code{MORSE_sequence_t} and 
@code{MORSE_request_t} used to handle asynchronous function calls.

@center @image{potri_async,13cm,8cm}
POTRI (POTRF, TRTRI, LAUUM) algorithm with and without synchronization 
barriers, courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.

@node Step5
@subsubsection Step5

Step5 shows how to set some important parameters.
This program is a copy of Step4 but some additional parameters are given by 
the user. 
The parameters that can be set are:
@itemize @bullet
@item number of Threads
@item number of GPUs

The number of workers can be given as argument to the executable with 
@option{--threads=} and @option{--gpus=} options.
It is important to notice that we assign one thread per gpu to optimize data 
transfer between main memory and devices memory.
The number of workers of each type @code{CPU} and @code{CUDA} must be given at 
@code{MORSE_Init}.
@verbatim
if ( iparam[IPARAM_THRDNBR] == -1 ) {
    get_thread_count( &(iparam[IPARAM_THRDNBR]) );
    /* reserve one thread per cuda device to optimize memory transfers */
    iparam[IPARAM_THRDNBR] -= iparam[IPARAM_NCUDAS];
}
NCPU = iparam[IPARAM_THRDNBR];
NGPU = iparam[IPARAM_NCUDAS];

/* initialize MORSE with main parameters */
MORSE_Init( NCPU, NGPU );
@end verbatim

@item matrix size
@item number of right-hand sides
@item block (tile) size

The problem size is given with @option{--n=} and @option{--nrhs=} options.
The tile size is given with option @option{--nb=}.
These parameters are required to create descriptors.
The size tile @code{NB} is a key parameter to get performances since it 
defines the granularity of tasks.
If @code{NB} is too large compared to @code{N}, there are few tasks to 
schedule. 
If the number of workers is large this leads to limit parallelism.
On the contrary, if @code{NB} is too small (@emph{i.e.} many small tasks), 
workers could not be correctly fed and the runtime systems operations 
could represent a substantial overhead.
A trade-off has to be found depending on many parameters: problem size, 
algorithm (drive data dependencies), architecture (number of workers, 
workers speed, workers uniformity, memory bus speed).
By default it is set to 128. 
Do not hesitate to play with this parameter and compare performances on your 
machine.

@item inner-blocking size

The inner-blocking size is given with option @option{--ib=}.
This parameter is used by kernels (optimized algorithms applied on tiles) to 
perform subsequent operations with data block-size that fits the cache of 
workers.
Parameters @code{NB} and @code{IB} can be given with @code{MORSE_Set} function:
@verbatim
MORSE_Set(MORSE_TILE_SIZE,        iparam[IPARAM_NB] );
MORSE_Set(MORSE_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
@end verbatim
@end itemize
 
@node Step6
@subsubsection Step6

This program is a copy of Step5 with some additional parameters to be set for 
the data distribution. 
To use this program properly MORSE must use StarPU Runtime system and MPI 
option must be activated at configure. 
The data distribution used here is 2-D block-cyclic, see for example
@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK} for 
explanation.
The user can enter the parameters of the distribution grid at execution with 
@option{--p=} option.
Example using OpenMPI on four nodes with one process per node:
@example
mpirun -np 4 ./step6 --n=10000 --nb=320 --ib=64 \
                     --threads=8 --gpus=2 --p=2
@end example

In this program we use the tile data layout from PLASMA so that the call 
@verbatim
MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
		       NB, NB, NB*NB, N, N, 
		       0, 0, N, N,
		       GRID_P, GRID_Q,
		       morse_getaddr_ccrb,
		       morse_getblkldd_ccrb,
		       morse_getrankof_2d);
@end verbatim
is equivalent to the following call
@verbatim
MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
                  NB, NB, NB*NB, N, N, 
                  0, 0, N, N, 
                  GRID_P, GRID_Q);
@end verbatim
functions @code{morse_getaddr_ccrb}, @code{morse_getblkldd_ccrb}, 
@code{morse_getrankof_2d} being used in @code{Desc_Create}.
It is interesting to notice that the code is almost the same as Step5.
The only additional information to give is the way tiles are distributed 
through the third function given to @code{MORSE_Desc_Create_User}.
Here, because we have made experiments only with a 2-D block-cyclic 
distribution, the parameters P and Q appear in the interface of 
@code{MORSE_Desc_Create}, but they make sense only for a 2-D block-cyclic 
distribution, i.e. when the @code{morse_getrankof_2d} function is used.
Of course @code{MORSE_Desc_Create_User} could be used with other 
distributions, in which case P and Q would no longer be the parameters of a 
2-D block-cyclic grid but of another distribution.

@node List of available routines
@subsection List of available routines

@menu
* Auxiliary routines:: Init, Finalize, Version, etc
* Descriptor routines:: To handle descriptors
* Options routines:: To set options
* Sequences routines:: To manage asynchronous function calls 
* Linear Algebra routines:: Computional routines
@end menu

@node Auxiliary routines
@subsubsection Auxiliary routines

Reports MORSE version number.
@verbatim
int MORSE_Version        (int *ver_major, int *ver_minor, int *ver_micro);
@end verbatim

Initialize MORSE: initialize some parameters, initialize the runtime and/or MPI.
@verbatim
int MORSE_Init           (int nworkers, int ncudas);
@end verbatim

Finalize MORSE: free some data and finalize the runtime and/or MPI.
@verbatim
int MORSE_Finalize       (void);
@end verbatim

Return the MPI rank of the calling process.
@verbatim
int MORSE_My_Mpi_Rank    (void);
@end verbatim

Conversion from LAPACK layout to tile layout.
@verbatim
int MORSE_Lapack_to_Tile (void *Af77, int LDA, MORSE_desc_t *A);
@end verbatim

Conversion from tile layout to LAPACK layout.
@verbatim
int MORSE_Tile_to_Lapack (MORSE_desc_t *A, void *Af77, int LDA);
@end verbatim

@node Descriptor routines
@subsubsection Descriptor routines

@c /* Descriptor */
Create matrix descriptor, internal function.
@verbatim
int MORSE_Desc_Create  (MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
                        int mb, int nb, int bsiz, int lm, int ln,
                        int i, int j, int m, int n, int p, int q);
@end verbatim

Create matrix descriptor, user function.
@verbatim
int MORSE_Desc_Create_User(MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
                           int mb, int nb, int bsiz, int lm, int ln,
                           int i, int j, int m, int n, int p, int q,
                           void* (*get_blkaddr)( const MORSE_desc_t*, int, int),
                           int (*get_blkldd)( const MORSE_desc_t*, int ),
                           int (*get_rankof)( const MORSE_desc_t*, int, int ));
@end verbatim

Destroys matrix descriptor.
@verbatim
int MORSE_Desc_Destroy (MORSE_desc_t **desc);
@end verbatim

Ensure that all data are up-to-date in main memory (even if some tasks have 
been processed on GPUs)
@verbatim
int MORSE_Desc_Getoncpu(MORSE_desc_t  *desc);
@end verbatim

@node Options routines
@subsubsection Options routines

@c /* Options */
Enable MORSE feature.
@verbatim
int MORSE_Enable  (MORSE_enum option);
@end verbatim
Feature to be enabled:
@itemize @bullet
@item @code{MORSE_WARNINGS}:   printing of warning messages,
@item @code{MORSE_ERRORS}:     printing of error messages,
@item @code{MORSE_AUTOTUNING}: autotuning for tile size and inner block size,
@item @code{MORSE_PROFILING_MODE}:  activate kernels profiling.
@end itemize

Disable MORSE feature.
@verbatim
int MORSE_Disable (MORSE_enum option);
@end verbatim
Symmetric to @code{MORSE_Enable}.

Set MORSE parameter.
@verbatim
int MORSE_Set     (MORSE_enum param, int  value);
@end verbatim
Parameters to be set:
@itemize @bullet
@item @code{MORSE_TILE_SIZE}:        size of the matrix tile,
@item @code{MORSE_INNER_BLOCK_SIZE}: size of tile inner block,
@item @code{MORSE_HOUSEHOLDER_MODE}: type of householder trees (FLAT or TREE),
@item @code{MORSE_HOUSEHOLDER_SIZE}: size of the groups in householder trees,
@item @code{MORSE_TRANSLATION_MODE}: related to the 
@code{MORSE_Lapack_to_Tile}, see @file{ztile.c}.
@end itemize

Get value of MORSE parameter.
@verbatim
int MORSE_Get     (MORSE_enum param, int *value);
@end verbatim

@node Sequences routines
@subsubsection Sequences routines

@c /* Sequences */
Create a sequence.
@verbatim
int MORSE_Sequence_Create  (MORSE_sequence_t **sequence);
@end verbatim

Destroy a sequence.
@verbatim
int MORSE_Sequence_Destroy (MORSE_sequence_t *sequence);
@end verbatim

Wait for the completion of a sequence.
@verbatim
int MORSE_Sequence_Wait    (MORSE_sequence_t *sequence);
@end verbatim

@node Linear Algebra routines
@subsubsection Linear Algebra routines

Linear algebra routines available in the form 
@code{MORSE_name[_Tile[_Async]]} (@code{name} follows the LAPACK naming 
scheme, see @uref{http://www.netlib.org/lapack/lug/node24.html}):

@verbatim
/** ********************************************************
 *  Declarations of computational functions (LAPACK layout) 
 **/
 
int MORSE_zgelqf(int M, int N, MORSE_Complex64_t *A, int LDA, 
                 MORSE_desc_t *descT);
                 
int MORSE_zgelqs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
                 
int MORSE_zgels(MORSE_enum trans, int M, int N, int NRHS, 
                MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, 
                MORSE_Complex64_t *B, int LDB);

int MORSE_zgemm(MORSE_enum transA, MORSE_enum transB, int M, int N, int K, 
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
                MORSE_Complex64_t *C, int LDC);

int MORSE_zgeqrf(int M, int N, MORSE_Complex64_t *A, int LDA, 
                 MORSE_desc_t *descT);
                 
int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);

int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
                       MORSE_desc_t *descL, int *IPIV, 
                       MORSE_Complex64_t *B, int LDB);

int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
                      MORSE_Complex64_t *B, int LDB);

int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA, 
                        MORSE_desc_t *descL, int *IPIV);

int MORSE_zgetrf_nopiv(int M, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zgetrs_incpiv(MORSE_enum trans, int N, int NRHS, 
                        MORSE_Complex64_t *A, int LDA, 
                        MORSE_desc_t *descL, int *IPIV, 
                        MORSE_Complex64_t *B, int LDB);

int MORSE_zgetrs_nopiv(MORSE_enum trans, int N, int NRHS, 
                       MORSE_Complex64_t *A, int LDA, 
                       MORSE_Complex64_t *B, int LDB);

#ifdef COMPLEX
int MORSE_zhemm(MORSE_enum side, MORSE_enum uplo, int M, int N, 
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
                MORSE_Complex64_t *C, int LDC);

int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
                double alpha, MORSE_Complex64_t *A, int LDA, 
                double beta, MORSE_Complex64_t *C, int LDC);

int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                 MORSE_Complex64_t *B, int LDB, double beta, 
                 MORSE_Complex64_t *C, int LDC);
#endif

int MORSE_zlacpy(MORSE_enum uplo, int M, int N, 
                 MORSE_Complex64_t *A, int LDA, 
                 MORSE_Complex64_t *B, int LDB);

double MORSE_zlange(MORSE_enum norm, int M, int N, 
                    MORSE_Complex64_t *A, int LDA);

#ifdef COMPLEX
double MORSE_zlanhe(MORSE_enum norm, MORSE_enum uplo, int N, 
                    MORSE_Complex64_t *A, int LDA);
#endif

double MORSE_zlansy(MORSE_enum norm, MORSE_enum uplo, int N, 
                    MORSE_Complex64_t *A, int LDA);

double MORSE_zlantr(MORSE_enum norm, MORSE_enum uplo, MORSE_enum diag, 
                    int M, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zlaset(MORSE_enum uplo, int M, int N, MORSE_Complex64_t alpha, 
                 MORSE_Complex64_t beta, MORSE_Complex64_t *A, int LDA);

int MORSE_zlauum(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

#ifdef COMPLEX
int MORSE_zplghe( double bump, int N, MORSE_Complex64_t *A, int LDA, 
                  unsigned long long int seed );
#endif

int MORSE_zplgsy( MORSE_Complex64_t bump, int N, 
                  MORSE_Complex64_t *A, int LDA, 
                  unsigned long long int seed );

int MORSE_zplrnt( int M, int N, MORSE_Complex64_t *A, int LDA, 
                  unsigned long long int seed );

int MORSE_zposv(MORSE_enum uplo, int N, int NRHS, 
                MORSE_Complex64_t *A, int LDA, 
                MORSE_Complex64_t *B, int LDB);

int MORSE_zpotrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zsytrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zpotri(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);

int MORSE_zpotrs(MORSE_enum uplo, int N, int NRHS, 
                 MORSE_Complex64_t *A, int LDA, 
                 MORSE_Complex64_t *B, int LDB);

#if defined (PRECISION_c) || defined(PRECISION_z)
int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS, 
                 MORSE_Complex64_t *A, int LDA, 
                 MORSE_Complex64_t *B, int LDB);
#endif

int MORSE_zsymm(MORSE_enum side, MORSE_enum uplo, int M, int N, 
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
                MORSE_Complex64_t *C, int LDC);

int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC);

int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                 MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
                 MORSE_Complex64_t *C, int LDC);

int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo, 
                MORSE_enum transA, MORSE_enum diag, 
                int N, int NRHS, 
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                MORSE_Complex64_t *B, int LDB);

int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo, 
                MORSE_enum transA, MORSE_enum diag, 
                int N, int NRHS, 
                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                MORSE_Complex64_t *B, int LDB);

int MORSE_ztrsmpl(int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
                  MORSE_desc_t *descL, int *IPIV, 
                  MORSE_Complex64_t *B, int LDB);

int MORSE_ztrsmrv(MORSE_enum side, MORSE_enum uplo, 
                  MORSE_enum transA, MORSE_enum diag, 
                  int N, int NRHS, 
                  MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
                  MORSE_Complex64_t *B, int LDB);

int MORSE_ztrtri(MORSE_enum uplo, MORSE_enum diag, int N, 
                 MORSE_Complex64_t *A, int LDA);

int MORSE_zunglq(int M, int N, int K, MORSE_Complex64_t *A, int LDA, 
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);

int MORSE_zungqr(int M, int N, int K, MORSE_Complex64_t *A, int LDA, 
                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);

int MORSE_zunmlq(MORSE_enum side, MORSE_enum trans, int M, int N, int K, 
                 MORSE_Complex64_t *A, int LDA, 
                 MORSE_desc_t *descT, 
                 MORSE_Complex64_t *B, int LDB);

int MORSE_zunmqr(MORSE_enum side, MORSE_enum trans, int M, int N, int K, 
                 MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, 
                 MORSE_Complex64_t *B, int LDB);

/** ******************************************************
 *  Declarations of computational functions (tile layout)
 **/
 
int MORSE_zgelqf_Tile(MORSE_desc_t *A, MORSE_desc_t *T);

int MORSE_zgelqs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);

int MORSE_zgels_Tile(MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *T, 
                     MORSE_desc_t *B);
                     
int MORSE_zgemm_Tile(MORSE_enum transA, MORSE_enum transB, 
                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                     MORSE_desc_t *B, MORSE_Complex64_t beta, 
                     MORSE_desc_t *C);
                     
int MORSE_zgeqrf_Tile(MORSE_desc_t *A, MORSE_desc_t *T);

int MORSE_zgeqrs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);

int MORSE_zgesv_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, 
                            MORSE_desc_t *B);

int MORSE_zgesv_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B);

int MORSE_zgetrf_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV);

int MORSE_zgetrf_nopiv_Tile(MORSE_desc_t *A);

int MORSE_zgetrs_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, 
                             MORSE_desc_t *B);
                             
int MORSE_zgetrs_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B);

#ifdef COMPLEX
int MORSE_zhemm_Tile(MORSE_enum side, MORSE_enum uplo, 
                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                     MORSE_desc_t *B, MORSE_Complex64_t beta, 
                     MORSE_desc_t *C);
                     
int MORSE_zherk_Tile(MORSE_enum uplo, MORSE_enum trans, 
                     double alpha, MORSE_desc_t *A, 
                     double beta, MORSE_desc_t *C);
                     
int MORSE_zher2k_Tile(MORSE_enum uplo, MORSE_enum trans, 
                      MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                      MORSE_desc_t *B, double beta, MORSE_desc_t *C);
#endif

int MORSE_zlacpy_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);

double MORSE_zlange_Tile(MORSE_enum norm, MORSE_desc_t *A);

#ifdef COMPLEX
double MORSE_zlanhe_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A);
#endif

double MORSE_zlansy_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A);

double MORSE_zlantr_Tile(MORSE_enum norm, MORSE_enum uplo, 
                         MORSE_enum diag, MORSE_desc_t *A);

int MORSE_zlaset_Tile(MORSE_enum uplo, MORSE_Complex64_t alpha, 
                      MORSE_Complex64_t beta, MORSE_desc_t *A);
                      
int MORSE_zlauum_Tile(MORSE_enum uplo, MORSE_desc_t *A);

#ifdef COMPLEX
int MORSE_zplghe_Tile(double bump, MORSE_desc_t *A, 
                      unsigned long long int seed);
#endif

int MORSE_zplgsy_Tile(MORSE_Complex64_t bump, MORSE_desc_t *A, 
                      unsigned long long int seed );
                      
int MORSE_zplrnt_Tile(MORSE_desc_t *A, unsigned long long int seed );

int MORSE_zposv_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);

int MORSE_zpotrf_Tile(MORSE_enum uplo, MORSE_desc_t *A);

int MORSE_zsytrf_Tile(MORSE_enum uplo, MORSE_desc_t *A);

int MORSE_zpotri_Tile(MORSE_enum uplo, MORSE_desc_t *A);

int MORSE_zpotrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);

#if defined (PRECISION_c) || defined(PRECISION_z)
int MORSE_zsytrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
#endif

int MORSE_zsymm_Tile(MORSE_enum side, MORSE_enum uplo, 
                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                     MORSE_desc_t *B, MORSE_Complex64_t beta, 
                     MORSE_desc_t *C);
                     
int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans, 
                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                     MORSE_Complex64_t beta, MORSE_desc_t *C);
                     
int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans, 
                      MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                      MORSE_desc_t *B, MORSE_Complex64_t beta, 
                      MORSE_desc_t *C);
                      
int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo, 
                     MORSE_enum transA, MORSE_enum diag, 
                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                     MORSE_desc_t *B);
                     
int MORSE_ztrsm_Tile(MORSE_enum side, MORSE_enum uplo, 
                     MORSE_enum transA, MORSE_enum diag, 
                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                     MORSE_desc_t *B);
                     
int MORSE_ztrsmpl_Tile(MORSE_desc_t *A, MORSE_desc_t *L, 
                       int *IPIV, MORSE_desc_t *B);
                       
int MORSE_ztrsmrv_Tile(MORSE_enum side, MORSE_enum uplo, 
                       MORSE_enum transA, MORSE_enum diag, 
                       MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                       MORSE_desc_t *B);
                       
int MORSE_ztrtri_Tile(MORSE_enum uplo, MORSE_enum diag, MORSE_desc_t *A);

int MORSE_zunglq_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);

int MORSE_zungqr_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);

int MORSE_zunmlq_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, 
                      MORSE_desc_t *T, MORSE_desc_t *B);

int MORSE_zunmqr_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, 
                      MORSE_desc_t *T, MORSE_desc_t *B);

/** ****************************************
 *  Declarations of computational functions 
 *  (tile layout, asynchronous execution)
 **/
 
int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_desc_t *B, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);
                            
int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A, 
                           MORSE_desc_t *T, MORSE_desc_t *B, 
                           MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_zgemm_Tile_Async(MORSE_enum transA, MORSE_enum transB, 
                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                           MORSE_desc_t *B, MORSE_Complex64_t beta, 
                           MORSE_desc_t *C, MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_desc_t *B, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zgesv_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, 
                                  int *IPIV, MORSE_desc_t *B, 
                                  MORSE_sequence_t *sequence, 
                                  MORSE_request_t *request);

int MORSE_zgesv_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, 
                                 MORSE_sequence_t *sequence, 
                                 MORSE_request_t *request);

int MORSE_zgetrf_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, 
                                   int *IPIV, MORSE_sequence_t *sequence, 
                                   MORSE_request_t *request);

int MORSE_zgetrf_nopiv_Tile_Async(MORSE_desc_t *A, 
                                  MORSE_sequence_t *sequence, 
                                  MORSE_request_t *request);

int MORSE_zgetrs_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, 
                                   int *IPIV, MORSE_desc_t *B, 
                                   MORSE_sequence_t *sequence, 
                                   MORSE_request_t *request);

int MORSE_zgetrs_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, 
                                  MORSE_sequence_t *sequence, 
                                  MORSE_request_t *request);

#ifdef COMPLEX
int MORSE_zhemm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                           MORSE_desc_t *B, MORSE_Complex64_t beta, 
                           MORSE_desc_t *C, MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_zherk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
                           double alpha, MORSE_desc_t *A, 
                           double beta, MORSE_desc_t *C, 
                           MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_zher2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
                            MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                            MORSE_desc_t *B, double beta, MORSE_desc_t *C, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);
#endif

int MORSE_zlacpy_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zlange_Tile_Async(MORSE_enum norm, MORSE_desc_t *A, double *value, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

#ifdef COMPLEX
int MORSE_zlanhe_Tile_Async(MORSE_enum norm, MORSE_enum uplo, 
                            MORSE_desc_t *A, double *value, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);
#endif

int MORSE_zlansy_Tile_Async(MORSE_enum norm, MORSE_enum uplo, 
                            MORSE_desc_t *A, double *value, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zlantr_Tile_Async(MORSE_enum norm, MORSE_enum uplo, 
                            MORSE_enum diag, MORSE_desc_t *A, double *value, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zlaset_Tile_Async(MORSE_enum uplo, MORSE_Complex64_t alpha, 
                            MORSE_Complex64_t beta, MORSE_desc_t *A, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zlauum_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

#ifdef COMPLEX
int MORSE_zplghe_Tile_Async(double bump, MORSE_desc_t *A, 
                            unsigned long long int seed, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request );
#endif

int MORSE_zplgsy_Tile_Async(MORSE_Complex64_t bump, MORSE_desc_t *A, 
                            unsigned long long int seed, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request );

int MORSE_zplrnt_Tile_Async(MORSE_desc_t *A, unsigned long long int seed, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request );

int MORSE_zposv_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                           MORSE_desc_t *B, 
                           MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_zpotrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zsytrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zpotri_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zpotrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

#if defined (PRECISION_c) || defined(PRECISION_z)
int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
                            MORSE_desc_t *B, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);
#endif

int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                           MORSE_desc_t *B, MORSE_Complex64_t beta, 
                           MORSE_desc_t *C, MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                           MORSE_Complex64_t beta, MORSE_desc_t *C, 
                           MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
                            MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                            MORSE_desc_t *B, MORSE_Complex64_t beta, 
                            MORSE_desc_t *C, MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
                           MORSE_enum transA, MORSE_enum diag, 
                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                           MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_ztrsm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
                           MORSE_enum transA, MORSE_enum diag, 
                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                           MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                           MORSE_request_t *request);

int MORSE_ztrsmpl_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, 
                             MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                             MORSE_request_t *request);

int MORSE_ztrsmrv_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
                             MORSE_enum transA, MORSE_enum diag, 
                             MORSE_Complex64_t alpha, MORSE_desc_t *A, 
                             MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                             MORSE_request_t *request);

int MORSE_ztrtri_Tile_Async(MORSE_enum uplo, MORSE_enum diag, 
                            MORSE_desc_t *A, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_desc_t *B, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_desc_t *B, 
                            MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans, 
                            MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

int MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans, 
                            MORSE_desc_t *A, MORSE_desc_t *T, 
                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
                            MORSE_request_t *request);

@end verbatim

@c -nofor_main