Re: [eigen] Matrix product between some MatrixType and triangularView<UnitLower> |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/eigen Archives
]
Here is my source code, the part of interest is at the top
(HHUtilities<..>).
The incrimined lines is the 47/48 : with line 47, everything works, but it
does a copy I would like to avoid.
With line 48, I do no copy, but it crashs.
"Derived" is Matrix<float,Dynamic,Dynamic> and OtherDerived is
Block<MatrixType> where MatrixType is the previous
Matrix<float,Dynamic,Dynamic> one.
On Tue, 15 Jun 2010 16:02:04 +0200, Gael Guennebaud
<gael.guennebaud@xxxxxxxxx> wrote:
> Hi,
>
> could you resend your piece of code in a text file please, because
> inlined in the email, it is really hard to read !
>
> Regarding the previous issues, what was it ?
>
>
> cheers,
> gael
>
>
> On Tue, Jun 15, 2010 at 3:42 PM, <vincent.lejeune@xxxxxxxxxx> wrote:
>>
>> Hi,
>>
>>
>>
>> I fixed the issue I had that prevent me from getting a working
>>
>> implementation of QR Householder decomposition, and apply Householder
on
>>
>> the left/right function.
>>
>>
>>
>> However, Im' currently facing another issue : my implementation is
slower
>>
>> than the unblocked version of theses functions, because I'm relying on
>> the
>>
>> "triangularView" template.
>>
>> As I understand, matrix.triangularView() does not copy the matrix
object,
>>
>> and provides some comfort. However, the following
>>
>> applyBlockHouseholderOnTheLeft function does trigger a segfault :
>>
>>
>>
>> template<typename Derived, typename OtherDerived = Matrix<
>>
>> typename Derived::Scalar, Derived::ColsAtCompileTime, Dynamic> >
>>
>> class HHutilities
>>
>> {
>>
>> public:
>>
>> typedef Matrix<typename Derived::Scalar,
>> Derived::ColsAtCompileTime,
>>
>> Derived::ColsAtCompileTime> TriangularFactorType;
>>
>> typedef typename ei_plain_diag_type<Derived>::type CoeffVect;
>>
>>
>>
>> TriangularFactorType
>>
>> static
>>
>> BlockHouseholderTriangularFactor(const Derived& V,
>>
>> const CoeffVect& coeffs,const Derived& VV)
>>
>> {
>>
>> const int k = V.cols();
>>
>> ei_assert(V.cols()==k && V.rows()>=k && "Vector storing Matrix
>>
>> must have same columns than coeffs size and more rows than columns");
>>
>> TriangularFactorType ret = TriangularFactorType::Zero(k, k);
>>
>> const int j = V.rows();
>>
>>
>>
>> for (int i = 0; i < k; i++)
>>
>> {
>>
>> ret.block(0, i, i, 1) = - coeffs(i)
>>
>> * V.block(i, 0, j - i, i).transpose() * V.block(i, i, j
-
>>
>> i, 1);
>>
>> ret.block(0, i, i, 1) = ret.block(0, 0, i, i) * ret.block(0,
>>
>> i, i,
>>
>> 1);
>>
>> ret(i, i) = coeffs(i);
>>
>> }
>>
>>
>>
>> return ret;
>>
>> }
>>
>>
>>
>> static
>>
>> void
>>
>> applyBlockHouseholderOnTheLeft(OtherDerived& A, const Derived& V,
>>
>> const CoeffVect& coeffs)
>>
>> {
>>
>> const TriangularView<Derived,UnitLower>& Vtmp = V.template
>>
>> triangularView<UnitLower> ();
>>
>>
>>
>> const TriangularFactorType& T =
>>
>> BlockHouseholderTriangularFactor(Vtmp,
>>
>> coeffs,V);
>>
>> A -= Vtmp * T.transpose() * Vtmp.transpose() * A;
>>
>> }
>>
>> }
>>
>>
>>
>> With OtherDerived=Block<MatrixType>, and
>>
>> MatrixType=Matrix<float,Dynamic,Dynamic>.
>>
>> I'm forced to make a copy of the TriangularView :
>>
>>
>>
>> const Derived& Vtmp(V.template triangularView<UnitLower> ());
>>
>>
>>
>> (instead of
>>
>> const TriangularView<Derived,UnitLower>& Vtmp = V.template
>>
>> triangularView<UnitLower> ()
>>
>> )
>>
>>
>>
>> to get my code to work. However, this is a bad workaround, as it makes
an
>>
>> additional copy.
>>
>> The overhead is not that big (0.1 s on an old P4 3Ghz for a 256x256
>>
>> matrix), but it should not be there, as theoricaly, blocked computation
>> are
>>
>> better on big matrix...So I hope to improve things if I can get rid of
>> this
>>
>> avoidable matrix copy.
>>
>>
>>
>> Thanks,
>>
>> Vincent
>>
>>
>>
#include <GPUmat.h>
#include <Eigen/Eigen>
using namespace Eigen;
#define NB 8
namespace GPU
{
template<typename Derived, typename OtherDerived = Matrix<
typename Derived::Scalar, Derived::ColsAtCompileTime, Dynamic> >
class HHutilities
{
public:
typedef Matrix<typename Derived::Scalar, Derived::ColsAtCompileTime,
Derived::ColsAtCompileTime> TriangularFactorType;
typedef typename ei_plain_diag_type<Derived>::type CoeffVect;
TriangularFactorType
static
BlockHouseholderTriangularFactor(const Derived& V,
const CoeffVect& coeffs,const Derived& VV)
{
const int k = V.cols();
ei_assert(V.cols()==k && V.rows()>=k && "Vector storing Matrix must have same columns than coeffs size and more rows than columns");
TriangularFactorType ret = TriangularFactorType::Zero(k, k);
const int j = V.rows();
for (int i = 0; i < k; i++)
{
ret.block(0, i, i, 1) = - coeffs(i)
* V.block(i, 0, j - i, i).transpose() * V.block(i, i, j - i, 1);
ret.block(0, i, i, 1) = ret.block(0, 0, i, i) * ret.block(0, i, i,
1);
ret(i, i) = coeffs(i);
}
return ret;
}
static
void
applyBlockHouseholderOnTheLeft(OtherDerived& A, const Derived& V,
const CoeffVect& coeffs)
{
const Derived& Vtmp(V.template triangularView<UnitLower> ());
//const TriangularView<Derived,UnitLower>& Vtmp = V.template triangularView<UnitLower> ();
const TriangularFactorType& T = BlockHouseholderTriangularFactor(Vtmp,
coeffs,V);
A -= Vtmp * T.transpose() * Vtmp.transpose() * A;
}
static
void
applyBlockHouseholderOnTheRight(OtherDerived& A, const Derived& V,
const CoeffVect& coeffs)
{
//const Derived& Vtmp(V.template triangularView<UnitLower> ());
const TriangularView<Derived,UnitLower>& Vtmp = V.template triangularView<UnitLower> ();
const TriangularFactorType& T = BlockHouseholderTriangularFactor(Vtmp,
coeffs);
A -= A * Vtmp * T * Vtmp.transpose();
}
};
template<typename MatrixType,int BLOCKSIZE>
class HouseholderQR2 : public HouseholderQR<MatrixType>
{
public:
typedef HouseholderQR<MatrixType> Base;
typedef Block<MatrixType, Dynamic, Dynamic> BlockType;
typedef typename MatrixType::Scalar RealScalar;
typedef Matrix<typename MatrixType::Scalar, Dynamic, Dynamic>
DynamicMatrix;
typedef typename HHutilities<DynamicMatrix,DynamicMatrix>::TriangularFactorType
TriangularFactorType;
typedef typename HHutilities<DynamicMatrix,DynamicMatrix>::CoeffVect HCoeffsType;
void
unblockedCompute(Block<MatrixType>& m_qr)
{
int rows = m_qr.rows();
int cols = m_qr.cols();
int size = std::min(rows, cols);
Base::m_temp.resize(cols);
for (int k = 0; k < size; ++k)
{
int remainingRows = rows - k;
int remainingCols = cols - k - 1;
RealScalar beta;
m_qr.col(k).tail(remainingRows).makeHouseholderInPlace(
Base::m_temp.coeffRef(k), beta);
m_qr.coeffRef(k, k) = beta;
// apply H to remaining part of m_qr from the left
m_qr.bottomRightCorner(remainingRows, remainingCols) .applyHouseholderOnTheLeft(
m_qr.col(k).tail(remainingRows - 1), Base::m_temp.coeffRef(k),
&(Base::m_temp.coeffRef(k + 1)));
}
}
HouseholderQR2<MatrixType,BLOCKSIZE>&
compute(const MatrixType& matrix)
{
int rows = matrix.rows();
int cols = matrix.cols();
int size = std::min(rows, cols);
Base::m_qr = matrix;
Base::m_temp.resize(BLOCKSIZE);
for (int k = 0; k < size; k += BLOCKSIZE)
{
int remainingRows = rows - k;
int remainingCols = cols - k;
BlockType tmp=Base::m_qr.block(k,k,rows-k,BLOCKSIZE);
unblockedCompute(tmp);
DynamicMatrix dtmp(tmp);
BlockType dtmp2(Base::m_qr.block(k,k+BLOCKSIZE,rows-k,cols-k-BLOCKSIZE));
HHutilities<DynamicMatrix,BlockType>::applyBlockHouseholderOnTheLeft(dtmp2,dtmp,Base::m_temp);
}
Base::m_isInitialized = true;
return *this;
}
};
/*template<typename MatrixType>
void QRFact(const MatrixType& mat)
{
MatrixType m_qr(mat);
typedef typename MatrixType::Scalar Scalar;
typedef typename GPUmatrix<MatrixType> GPTR;
int m=mat.rows(),n=mat.cols();
//T* TAU = static_cast<T*>(malloc(m*sizeof(T)));
Scalar* TAU = new Scalar[m];
GPTR gmat(m_qr);
// matrix_flushable<T> tmp(NB,NB);
for (int J=1;J<min(m,n);J+=NB)
{
int JB=min(min(m,n)-J+1,NB);
gmat.flush_to_cpu(J-1,J-1,m,J+NB-1);
Block<MatrixType> tmp(m_qr.block(J-1,J-1,m-J,NB));
tmp.HouseholderQR();
step1(A,J,JB,TAU);
if (J+JB<m)
{
step2(A,tmp,J,JB,TAU);
step3(A,tmp,J,NB);
A.move_gpu_cpu_pivot(0,NB);
}
}
}*/
}
/*
template<typename Tp>
void step2(matrix_square_algorithm<Tp>& A,matrix_flushable<Tp>& B,int j,int jb,Tp* tau)
{
int sz=A.get_gpu_part_height();
_larft('F','C',sz,jb,A.get_content_at(j,j),A.get_ld(),&tau[j-1],B.get_cpuptr(),B.get_ld());
B.flush_to_gpu(jb,jb);
}
template<typename Tp>
void step3(matrix_square_algorithm<Tp> & A,matrix_flushable<Tp> &T,int j,int jb)
{
A.move_gpu_cpu_pivot(-NB,0);
int sz=A.get_gpu_part_height();
int MI1=A.get_rows()-j+1;
int NIIB1=A.get_columns()-j-jb+1;
int lda=A.get_rows();
matrix_flushable<Tp> WORKS(lda,jb);
cublas_makelowerunit(jb,jb,A.get_ld(),A.get_gpuptr_at(j,j));
cublas_gemm('T','T',jb,sz,jb,1.0f,T.get_gpuptr(),T.get_ld(),A.get_gpuptr_at(j,j),A.get_ld(),0,WORKS.get_gpuptr(),WORKS.get_ld());
cublas_gemm('N','N',jb,sz-jb,sz,1.0f,WORKS.get_gpuptr(),WORKS.get_ld(),A.get_gpuptr_at(j,j+jb),A.get_ld(),0,WORKS.get_gpuptr(),WORKS.get_ld());
cublas_gemm('N','N',sz,sz-jb,jb,-1.0f,A.get_gpuptr_at(j,j),A.get_ld(),WORKS.get_gpuptr(),WORKS.get_ld(),1.0f,A.get_gpuptr_at(j,j+jb),A.get_ld());
A.move_gpu_cpu_pivot_without_sync(NB,0);
}*/