Re: [eigen] Matrix product between some MatrixType and triangularView<UnitLower>

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


Here is my source code, the part of interest is at the top

(HHUtilities<..>).



The incrimined lines is the 47/48 : with line 47, everything works, but it

does a copy I would like to avoid.

With line 48, I do no copy, but it crashs.



"Derived" is Matrix<float,Dynamic,Dynamic> and OtherDerived is

Block<MatrixType> where MatrixType is the previous

Matrix<float,Dynamic,Dynamic> one.



On Tue, 15 Jun 2010 16:02:04 +0200, Gael Guennebaud

<gael.guennebaud@xxxxxxxxx> wrote:

> Hi,

> 

> could you resend your piece of code in a text file please, because

> inlined in the email, it is really hard to read !

> 

> Regarding the previous issues, what was it ?

> 

> 

> cheers,

> gael

> 

> 

> On Tue, Jun 15, 2010 at 3:42 PM,  <vincent.lejeune@xxxxxxxxxx> wrote:

>>

>> Hi,

>>

>>

>>

>> I fixed the issue I had that prevent me from getting a working

>>

>> implementation of QR Householder decomposition, and apply Householder

on

>>

>> the left/right function.

>>

>>

>>

>> However, Im' currently facing another issue : my implementation is

slower

>>

>> than the unblocked version of theses functions, because I'm relying on

>> the

>>

>> "triangularView" template.

>>

>> As I understand, matrix.triangularView() does not copy the matrix

object,

>>

>> and provides some comfort. However, the following

>>

>> applyBlockHouseholderOnTheLeft function does trigger a segfault :

>>

>>

>>

>>  template<typename Derived, typename OtherDerived = Matrix<

>>

>>      typename Derived::Scalar, Derived::ColsAtCompileTime, Dynamic> >

>>

>>    class HHutilities

>>

>>    {

>>

>>    public:

>>

>>      typedef Matrix<typename Derived::Scalar,

>> Derived::ColsAtCompileTime,

>>

>>          Derived::ColsAtCompileTime> TriangularFactorType;

>>

>>      typedef typename ei_plain_diag_type<Derived>::type CoeffVect;

>>

>>

>>

>>      TriangularFactorType

>>

>>      static

>>

>>      BlockHouseholderTriangularFactor(const Derived& V,

>>

>>          const CoeffVect& coeffs,const Derived& VV)

>>

>>      {

>>

>>        const int k = V.cols();

>>

>>        ei_assert(V.cols()==k && V.rows()>=k && "Vector storing Matrix

>>

>> must have same columns than coeffs size and more rows than columns");

>>

>>        TriangularFactorType ret = TriangularFactorType::Zero(k, k);

>>

>>        const int j = V.rows();

>>

>>

>>

>>        for (int i = 0; i < k; i++)

>>

>>          {

>>

>>            ret.block(0, i, i, 1) = -  coeffs(i)

>>

>>                * V.block(i, 0, j - i, i).transpose() * V.block(i, i, j

-

>>

>> i, 1);

>>

>>            ret.block(0, i, i, 1) = ret.block(0, 0, i, i) * ret.block(0,

>>

>> i, i,

>>

>>                1);

>>

>>            ret(i, i) = coeffs(i);

>>

>>          }

>>

>>

>>

>>        return ret;

>>

>>      }

>>

>>

>>

>>      static

>>

>>      void

>>

>>      applyBlockHouseholderOnTheLeft(OtherDerived& A, const Derived& V,

>>

>>          const CoeffVect& coeffs)

>>

>>      {

>>

>>          const TriangularView<Derived,UnitLower>& Vtmp = V.template

>>

>> triangularView<UnitLower> ();

>>

>>

>>

>>        const TriangularFactorType& T =

>>

>> BlockHouseholderTriangularFactor(Vtmp,

>>

>>            coeffs,V);

>>

>>        A -= Vtmp * T.transpose() * Vtmp.transpose() * A;

>>

>>      }

>>

>> }

>>

>>

>>

>> With OtherDerived=Block<MatrixType>, and

>>

>> MatrixType=Matrix<float,Dynamic,Dynamic>.

>>

>> I'm forced to make a copy of the TriangularView :

>>

>>

>>

>> const Derived& Vtmp(V.template triangularView<UnitLower> ());

>>

>>

>>

>> (instead of

>>

>> const TriangularView<Derived,UnitLower>& Vtmp = V.template

>>

>> triangularView<UnitLower> ()

>>

>> )

>>

>>

>>

>> to get my code to work. However, this is a bad workaround, as it makes

an

>>

>> additional copy.

>>

>> The overhead is not that big (0.1 s on an old P4 3Ghz for a 256x256

>>

>> matrix), but it should not be there, as theoricaly, blocked computation

>> are

>>

>> better on big matrix...So I hope to improve things if I can get rid of

>> this

>>

>> avoidable matrix copy.

>>

>>

>>

>> Thanks,

>>

>> Vincent

>>

>>

>>
#include <GPUmat.h>
#include <Eigen/Eigen>
using namespace Eigen;

#define NB 8

namespace GPU
{

  template<typename Derived, typename OtherDerived = Matrix<
      typename Derived::Scalar, Derived::ColsAtCompileTime, Dynamic> >
    class HHutilities
    {
    public:
      typedef Matrix<typename Derived::Scalar, Derived::ColsAtCompileTime,
          Derived::ColsAtCompileTime> TriangularFactorType;
      typedef typename ei_plain_diag_type<Derived>::type CoeffVect;

      TriangularFactorType
      static
      BlockHouseholderTriangularFactor(const Derived& V,
          const CoeffVect& coeffs,const Derived& VV)
      {
        const int k = V.cols();
        ei_assert(V.cols()==k && V.rows()>=k && "Vector storing Matrix must have same columns than coeffs size and more rows than columns");
        TriangularFactorType ret = TriangularFactorType::Zero(k, k);
        const int j = V.rows();

        for (int i = 0; i < k; i++)
          {
            ret.block(0, i, i, 1) = -  coeffs(i)
                * V.block(i, 0, j - i, i).transpose() * V.block(i, i, j - i, 1);
            ret.block(0, i, i, 1) = ret.block(0, 0, i, i) * ret.block(0, i, i,
                1);
            ret(i, i) = coeffs(i);
          }

        return ret;
      }

      static
      void
      applyBlockHouseholderOnTheLeft(OtherDerived& A, const Derived& V,
          const CoeffVect& coeffs)
      {

        const Derived& Vtmp(V.template triangularView<UnitLower> ());
  	  //const TriangularView<Derived,UnitLower>& Vtmp = V.template triangularView<UnitLower> ();

        const TriangularFactorType& T = BlockHouseholderTriangularFactor(Vtmp,
            coeffs,V);
        A -= Vtmp * T.transpose() * Vtmp.transpose() * A;
      }

      static
      void
      applyBlockHouseholderOnTheRight(OtherDerived& A, const Derived& V,
          const CoeffVect& coeffs)
      {

        //const Derived& Vtmp(V.template triangularView<UnitLower> ());
    	const  TriangularView<Derived,UnitLower>& Vtmp = V.template triangularView<UnitLower> ();

        const TriangularFactorType& T = BlockHouseholderTriangularFactor(Vtmp,
            coeffs);

        A -= A * Vtmp * T * Vtmp.transpose();
      }

    };








  template<typename MatrixType,int BLOCKSIZE>
    class HouseholderQR2 : public HouseholderQR<MatrixType>
    {
    public:
      typedef HouseholderQR<MatrixType> Base;

      typedef Block<MatrixType, Dynamic, Dynamic> BlockType;
      typedef typename MatrixType::Scalar RealScalar;
      typedef Matrix<typename MatrixType::Scalar, Dynamic, Dynamic>
          DynamicMatrix;
      typedef typename HHutilities<DynamicMatrix,DynamicMatrix>::TriangularFactorType
          TriangularFactorType;
      typedef typename HHutilities<DynamicMatrix,DynamicMatrix>::CoeffVect HCoeffsType;


      void
      unblockedCompute(Block<MatrixType>& m_qr)
      {
        int rows = m_qr.rows();
        int cols = m_qr.cols();
        int size = std::min(rows, cols);



        Base::m_temp.resize(cols);

        for (int k = 0; k < size; ++k)
          {
            int remainingRows = rows - k;
            int remainingCols = cols - k - 1;

            RealScalar beta;
            m_qr.col(k).tail(remainingRows).makeHouseholderInPlace(
            		Base::m_temp.coeffRef(k), beta);
            m_qr.coeffRef(k, k) = beta;


            // apply H to remaining part of m_qr from the left
            m_qr.bottomRightCorner(remainingRows, remainingCols) .applyHouseholderOnTheLeft(
                m_qr.col(k).tail(remainingRows - 1), Base::m_temp.coeffRef(k),
                &(Base::m_temp.coeffRef(k + 1)));
          }
      }

      HouseholderQR2<MatrixType,BLOCKSIZE>&
      compute(const MatrixType& matrix)
      {
        int rows = matrix.rows();
        int cols = matrix.cols();
        int size = std::min(rows, cols);

        Base::m_qr = matrix;

        Base::m_temp.resize(BLOCKSIZE);
        for (int k = 0; k < size; k += BLOCKSIZE)
          {
            int remainingRows = rows - k;
            int remainingCols = cols - k;



            BlockType tmp=Base::m_qr.block(k,k,rows-k,BLOCKSIZE);
            unblockedCompute(tmp);


            DynamicMatrix dtmp(tmp);
            BlockType dtmp2(Base::m_qr.block(k,k+BLOCKSIZE,rows-k,cols-k-BLOCKSIZE));

            HHutilities<DynamicMatrix,BlockType>::applyBlockHouseholderOnTheLeft(dtmp2,dtmp,Base::m_temp);


          }
        Base::m_isInitialized = true;
        return *this;
      }

    };

/*template<typename MatrixType>
 void QRFact(const MatrixType& mat)
 {
 MatrixType m_qr(mat);
 typedef typename MatrixType::Scalar Scalar;
 typedef typename GPUmatrix<MatrixType> GPTR;

 int m=mat.rows(),n=mat.cols();

 //T* TAU = static_cast<T*>(malloc(m*sizeof(T)));
 Scalar* TAU = new Scalar[m];

 GPTR gmat(m_qr);

 //    matrix_flushable<T> tmp(NB,NB);


 for (int J=1;J<min(m,n);J+=NB)
 {
 int JB=min(min(m,n)-J+1,NB);

 gmat.flush_to_cpu(J-1,J-1,m,J+NB-1);
 Block<MatrixType> tmp(m_qr.block(J-1,J-1,m-J,NB));
 tmp.HouseholderQR();
 step1(A,J,JB,TAU);
 if (J+JB<m)
 {
 step2(A,tmp,J,JB,TAU);

 step3(A,tmp,J,NB);
 A.move_gpu_cpu_pivot(0,NB);
 }
 }


 }*/

}

/*
 template<typename Tp>
 void step2(matrix_square_algorithm<Tp>& A,matrix_flushable<Tp>& B,int j,int jb,Tp* tau)
 {

 int sz=A.get_gpu_part_height();
 _larft('F','C',sz,jb,A.get_content_at(j,j),A.get_ld(),&tau[j-1],B.get_cpuptr(),B.get_ld());
 B.flush_to_gpu(jb,jb);
 }

 template<typename Tp>
 void step3(matrix_square_algorithm<Tp> & A,matrix_flushable<Tp> &T,int j,int jb)
 {
 A.move_gpu_cpu_pivot(-NB,0);
 int sz=A.get_gpu_part_height();
 int MI1=A.get_rows()-j+1;
 int NIIB1=A.get_columns()-j-jb+1;
 int lda=A.get_rows();
 matrix_flushable<Tp> WORKS(lda,jb);

 cublas_makelowerunit(jb,jb,A.get_ld(),A.get_gpuptr_at(j,j));

 cublas_gemm('T','T',jb,sz,jb,1.0f,T.get_gpuptr(),T.get_ld(),A.get_gpuptr_at(j,j),A.get_ld(),0,WORKS.get_gpuptr(),WORKS.get_ld());
 cublas_gemm('N','N',jb,sz-jb,sz,1.0f,WORKS.get_gpuptr(),WORKS.get_ld(),A.get_gpuptr_at(j,j+jb),A.get_ld(),0,WORKS.get_gpuptr(),WORKS.get_ld());
 cublas_gemm('N','N',sz,sz-jb,jb,-1.0f,A.get_gpuptr_at(j,j),A.get_ld(),WORKS.get_gpuptr(),WORKS.get_ld(),1.0f,A.get_gpuptr_at(j,j+jb),A.get_ld());
 A.move_gpu_cpu_pivot_without_sync(NB,0);
 }*/


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/