cuda_zgelqt.c

/**
 *
 * @copyright (c) 2009-2014 The University of Tennessee and The University
 *                          of Tennessee Research Foundation.
 *                          All rights reserved.
 * @copyright (c) 2012-2015 Inria. All rights reserved.
 * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
 *
 **/

/**
 *
 * @file cuda_zgelqt.c
 *
 *  MORSE cudablas kernel
 *  MORSE is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver,
 *  and INRIA Bordeaux Sud-Ouest
 *
 * @author Florent Pruvost
 * @date 2015-09-16
 * @precisions normal z -> c d s
 *
 **/
#include "cudablas/include/cudablas.h"

#if defined(CHAMELEON_USE_MAGMA)
int CUDA_zgelqt(
        magma_int_t m, magma_int_t n, magma_int_t nb,
        magmaDoubleComplex *da, magma_int_t ldda,
        magmaDoubleComplex *v,  magma_int_t ldv,
        magmaDoubleComplex *dt, magma_int_t lddt,
        magmaDoubleComplex *t,  magma_int_t ldt,
        magmaDoubleComplex *dd,
        magmaDoubleComplex *d,  magma_int_t ldd,
        magmaDoubleComplex *tau,
        magmaDoubleComplex *hwork,
        magmaDoubleComplex *dwork,
        CUstream stream)
{
#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
#define v_ref(a_1,a_2)  ( v+(a_2)*(ldv) + (a_1))
#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
#define t_ref(a_1,a_2)  ( t+(a_2)*(ldt) + (a_1))

    int i, k, ib, lddwork, old_i, old_ib, rows, cols;
    double _Complex one=1.;

    if (m < 0) {
    return -1;
    } else if (n < 0) {
    return -2;
    } else if (ldda < max(1,m)) {
    return -4;
    }

    k = min(m,n);
    if (k == 0) {
    hwork[0] = *((magmaDoubleComplex*) &one);
    return MAGMA_SUCCESS;
    }

    lddwork= m;

    /* lower parts of little T must be zero: memset to 0 for simplicity */
    memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
    cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));

    /* copy first panel of A on the host */
    cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex),
              da_ref(0, 0), ldda,
              v, ldv);

    /* Use blocked code initially */
    for (i = 0; i < k; i += nb) {

    ib = min(k-i, nb);
    if (i+nb >= m) ib = min(m-i, nb);
    cols = n-i;

    if (i > 0){

      /* copy panel of A from device to host */
      cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex),
                      da_ref(i, 0), ldda,
                      v, ldv);

      /* Apply H' to A(i+2*ib:m, i:n) from the right */
      rows = m-old_i-2*old_ib;
      if (rows > 0){
          magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
                            rows, n-old_i, old_ib,
                            da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
                            da_ref(old_i+2*old_ib, old_i), ldda,
                            dwork, lddwork);
      }

      /* copy the lower diag tile into d_A */
      CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
                    dd, ldd, da_ref(old_i, old_i), ldda, stream);

    }

    /* Form the triangular factor of the block reflector on the host
    H = H'(i+ib-1) . . . H(i+1) H(i) */
    CORE_zgelqt(ib, cols, ib,
              (double _Complex*) v_ref(0,i), ldv,
              (double _Complex*) t_ref(0,0), ldt,
              (double _Complex*) tau+i,
              (double _Complex*) hwork);

    if ( i + ib < m ){
      /* put 0s in the lower triangular part of a panel (and 1s on the
       diagonal); copy the lower triangular in d */
      CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib),
                    (double _Complex*) v_ref(0,i), ldv,
                    (double _Complex*) d, ldd);

      /* copy from host to device a tile diag */
      cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex),
                       d, ldd, dd, ldd );
    }

    /* Send the triangular factor T to the GPU */
    cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
                   t_ref(0,0), ldt, dt_ref(0,i), lddt );

    /* A panel (with zeros in lower tri of its diag) is ready to be used
    in input of zlarfb_gpu: we send the panel to the gpu */
    cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex),
                   v_ref(0,i), ldv, da_ref(i,i), ldda );

    if (i + ib < m) {

      if (i+2*ib < m){
          rows = ib;
      }
      else{
          rows = m-i-ib;
      }
      /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
      magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
                        rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
                        lddt, da_ref(i+ib,i), ldda, dwork, lddwork);
      cudaThreadSynchronize();
      old_i = i;
      old_ib = ib;
      if (i+nb >= k){
          /* Apply H' to A(i+2*ib:m, i:n) from the right */
          rows = m-old_i-2*old_ib;
          if (rows > 0){
              magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
                                rows, cols, old_ib,
                                da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
                                da_ref(old_i+2*old_ib, old_i), ldda,
                                dwork, lddwork);
          }
          /* copy the upper diag tile into d_A */
          CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
                        dd, ldd, da_ref(old_i, old_i), ldda, stream);
      }
    }

    }

#undef da_ref
#undef v_ref
#undef dt_ref
#undef t_ref

    return MORSE_SUCCESS;
}
#endif