Re: [eigen] Re: How to use Eigen in CUDA kernels for eigendecomposition?

On Wed, 2017-07-05 at 15:28 +0200, Gael Guennebaud wrote:
> I did not pay attention, but currently only computeDirect is supported from CUDA kernels, and
> computeDirect supports only 2x2 and 3x3 real matrices.
> 
> To support the more general compute() function, all of its dependencies should be qualified with
> EIGEN_DEVICE_FUNC, but I cannot guarantee that all of them can be compiled on the device without
> some refactoring.  It's a shame that nvcc does not trigger a compile-time error when device code
> that calls host-only code actually gets instantiated.
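
For reference, a minimal sketch of the computeDirect path mentioned above, for a 3x3 real
self-adjoint matrix inside a kernel (the kernel and buffer names are illustrative, not taken from
the code in this thread):

#include <Eigen/Eigenvalues>

// Each thread builds a small fixed-size real matrix and uses the closed-form
// computeDirect() path, the only one currently usable from device code.
__global__ void direct3x3_kernel(float *out) {
  Eigen::Matrix3f m;
  m << 2, 1, 0,
       1, 2, 1,
       0, 1, 2;
  Eigen::SelfAdjointEigenSolver<Eigen::Matrix3f> solver;
  solver.computeDirect(m, Eigen::EigenvaluesOnly);  // closed-form path, no iterative compute()
  Eigen::Vector3f ev = solver.eigenvalues();
  for (int i = 0; i < 3; ++i)
    out[3 * threadIdx.x + i] = ev(i);
}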

Ugh, that is inconvenient.  The problem is that I need to compute the eigendecomposition of millions
of 8x8 matrices (which is the reason I want to use CUDA).

I am very interested in helping to debug this code and get it working.  Unfortunately, I am not very
proficient with CUDA debugging and have not yet worked with Eigen's expression template code.  Can
you perhaps provide some guidance on how to proceed, which functions to look at, and what tools to
use for debugging?

I appreciate your efforts very much!

Thanks again,
Henri

> 
> gael
> 
> On Mon, Jul 3, 2017 at 11:17 PM, Henri Menke <henrimenke@xxxxxxxxx> wrote:
> > On Mon, 2017-07-03 at 14:07 +0200, Gael Guennebaud wrote:
> > > Hi,
> > >
> > > Is your code running fine despite these warnings?  It could be that those warnings are
> > > generated from code that is never called.  Compiling with --expt-relaxed-constexpr will also
> > > help.
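
For reference, with the suggested flag, the compile line shown further down in this thread would
become something like:

$ nvcc -std=c++11 --expt-relaxed-constexpr -I/path/to/eigen/ test.cu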
> > 
> > Thanks for your reply!
> > 
> > The warnings don't seem to have an influence.  It is also possible to get rid of them by simply
> > using C++98 mode, in which case constexpr is ignored.  Unfortunately, the code below does not run
> > when I uncomment the solver.compute line.  The output of the code on my test machine is
> > 
> > CUDA error status: 0
> > CUDA error status: 0
> > CUDA error status: 77
> > an illegal memory access was encountered in test.cu 73
> > 
> > I ran the code with the offending line uncommented through cuda-memcheck (which somehow changed
> > the error status from 77 to 4).  The output of cuda-memcheck is attached (test.mem).  Running it
> > in cuda-gdb is not very helpful either (see attached test.gdb).  Do you have any advice on how to
> > debug this code?
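
For anyone reproducing this: to get usable device-side debug information for cuda-memcheck and
cuda-gdb, the example is typically built with nvcc's host and device debug flags, e.g. (the output
name is just a placeholder):

$ nvcc -std=c++11 -g -G -I/path/to/eigen/ test.cu -o test
$ cuda-memcheck ./test
$ cuda-gdb ./test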
> > 
> > Cheers, Henri
> > 
> > ---
> > 
> > #include <iostream>
> > #include <vector>
> > #include <thrust/complex.h>
> > #include <Eigen/Eigenvalues>
> > 
> > static void HandleError(cudaError_t err, const char *file, int line)
> > {
> >   std::cerr << "CUDA error status: " << err << '\n';
> >   if (err != cudaSuccess)
> >   {
> >     std::cerr << cudaGetErrorString(err) << " in " << file << ' ' << line << '\n';
> >     exit(EXIT_FAILURE);
> >   }
> > }
> > #define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
> > 
> > 
> > namespace Eigen {
> >   template<typename _Real> struct NumTraits<thrust::complex<_Real> >
> >     : NumTraits<std::complex<_Real> >
> >   {
> >     typedef _Real Real;
> >     typedef typename NumTraits<_Real>::Literal Literal;
> >     enum {
> >       IsComplex = 1,
> >       RequireInitialization = NumTraits<_Real>::RequireInitialization,
> >       ReadCost = 2 * NumTraits<_Real>::ReadCost,
> >       AddCost = 2 * NumTraits<Real>::AddCost,
> >       MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
> >     };
> >   };
> > }
> > 
> > 
> > namespace thrust {
> >   template < typename T >
> >   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> >   T real(complex<T> const& x) { return x.real(); }
> > 
> >   template < typename T >
> >   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> >   T imag(complex<T> const& x) { return x.imag(); }
> > }
> > 
> > 
> > __global__ void test_kernel(int n, float * r) {
> >   typedef Eigen::Matrix<thrust::complex<float>,2,2> cuMatrix2cf;
> > 
> >   cuMatrix2cf m;
> >   m << 1,0, 0,1;
> >   m *= threadIdx.x;
> > 
> >   Eigen::SelfAdjointEigenSolver< cuMatrix2cf > solver;
> > //solver.compute(m, Eigen::EigenvaluesOnly);  // uncommenting this line triggers the illegal memory access (error 77)
> > 
> >   Eigen::Vector2f const& D = solver.eigenvalues();
> > 
> >   for (int i = 0; i < n; ++i)
> >   {
> >     r[n*threadIdx.x + i] = D(i);
> >   }
> > }
> > 
> > int main()
> > {
> >   float * r;
> >   HANDLE_ERROR( cudaMalloc(&r, 4*10*sizeof(float)) );
> > 
> >   test_kernel<<<10,1>>>(4,r);
> >   HANDLE_ERROR( cudaGetLastError() );
> > 
> >   float * ev = new float[4*10];
> >   HANDLE_ERROR( cudaMemcpy(ev, r, 4*10*sizeof(float), cudaMemcpyDeviceToHost) );
> > 
> >   for (size_t i = 0; i < 4*10; ++i)
> >     std::cout << ev[i] << ' ';
> >   std::cout << '\n';
> > 
> >   delete[] ev;
> >   cudaFree(r);
> > }
> > 
> > 
> > 
> > > gael
> > >
> > > On Mon, Jul 3, 2017 at 6:36 AM, Henri Menke <henrimenke@xxxxxxxxx> wrote:
> > > > Dear list,
> > > >
> > > > I am very sorry.  I didn't compile the example with the latest development version but with
> > > > an outdated version from a GitHub mirror.  With the latest version from Bitbucket it compiles
> > > > without those loads of __host__ __device__ warnings.  A few warnings are left, however, namely
> > > >
> > > >
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function("real") from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function("imag") from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function from a __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function from a __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function("real") from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function("imag") from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function from a __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function from a __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > >
> > > >
> > > > This is slightly concerning.  Did I forget to define a function?  Because
> > > > thrust::abs(thrust::complex<T> const&) already exists.
> > > >
> > > > Cheers, Henri
> > > >
> > > > On Sun, 2017-07-02 at 13:01 +1200, Henri Menke wrote:
> > > > > Dear list,
> > > > >
> > > > > This is my first post to the list and I would like to use it to thank all the developers
> > > > > for their astonishing work.  Keep it up, guys!
> > > > >
> > > > > Now to my problem: I am using the latest development version of Eigen and I want to
> > > > > calculate the eigendecomposition of a matrix in a CUDA kernel.
> > > > >
> > > > > I followed »Using custom scalar types«
> > > > > (http://eigen.tuxfamily.org/dox-3.2/TopicCustomizingEigen.html#CustomScalarType) to
> > > > > implement support for thrust::complex.
> > > > >
> > > > > The example below compiles but is unusable.  The approximately 800 lines of nvcc V8.0.61
> > > > > output can be viewed at https://gist.github.com/anonymous/757218ae4daec63ed021b143a38e76b0.
> > > > > There, I see a lot of warnings which involve calling a __host__ function from a __host__
> > > > > __device__ function.  It appears to me that some functions are not correctly decorated
> > > > > (yet).  I am very interested in helping to fix this but have so far not worked with the
> > > > > internals of Eigen's expression template code.  Could you please provide some help?
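
To make the "decorated" remark concrete: when compiling with nvcc, Eigen's EIGEN_DEVICE_FUNC macro
roughly expands to the __host__ __device__ qualifiers, so a function the device code relies on has
to be marked along these lines (hypothetical helper, not Eigen code):

EIGEN_DEVICE_FUNC inline float squared(float x) { return x * x; }  // callable from host and device

An unqualified function is host-only, which is exactly what the warnings described above complain
about.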
> > > > >
> > > > > Best regards,
> > > > > Henri
> > > > >
> > > > > ---
> > > > >
> > > > > The compile command line is simply
> > > > > $ nvcc -std=c++11 -I/path/to/eigen/ test.cu
> > > > >
> > > > >
> > > > > #include <iostream>
> > > > > #include <thrust/complex.h>
> > > > > #include <Eigen/Eigenvalues>
> > > > >
> > > > > static void HandleError(cudaError_t err, const char *file, int line)
> > > > > {
> > > > >   if (err != cudaSuccess)
> > > > >   {
> > > > >     std::cerr << cudaGetErrorString(err) << " in " << file << ' ' << line << '\n';
> > > > >     exit(EXIT_FAILURE);
> > > > >   }
> > > > > }
> > > > > #define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
> > > > >
> > > > >
> > > > > namespace Eigen {
> > > > >   template<typename _Real> struct NumTraits<thrust::complex<_Real> >
> > > > >     : NumTraits<std::complex<_Real> >
> > > > >   {
> > > > >     typedef _Real Real;
> > > > >     typedef typename NumTraits<_Real>::Literal Literal;
> > > > >     enum {
> > > > >       IsComplex = 1,
> > > > >       RequireInitialization = NumTraits<_Real>::RequireInitialization,
> > > > >       ReadCost = 2 * NumTraits<_Real>::ReadCost,
> > > > >       AddCost = 2 * NumTraits<Real>::AddCost,
> > > > >       MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
> > > > >     };
> > > > >   };
> > > > > }
> > > > >
> > > > >
> > > > > namespace thrust {
> > > > >   template < typename T >
> > > > >   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > > > >   T real(complex<T> const& x) { return x.real(); }
> > > > >
> > > > >   template < typename T >
> > > > >   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > > > >   T imag(complex<T> const& x) { return x.imag(); }
> > > > > }
> > > > >
> > > > >
> > > > > __global__ void test_kernel(int n, float * r) {
> > > > >   typedef Eigen::Matrix<thrust::complex<float>,2,2> cuMatrixXcf;
> > > > >
> > > > >   cuMatrixXcf m;
> > > > >   m << 1,0, 0,1;
> > > > >   m *= threadIdx.x;
> > > > >
> > > > >   Eigen::SelfAdjointEigenSolver< cuMatrixXcf > solver;
> > > > >   solver.compute(m, Eigen::EigenvaluesOnly);
> > > > >
> > > > >   auto const& D = solver.eigenvalues();
> > > > >   
> > > > >   for (int i = 0; i < n; ++i)
> > > > >     r[n*threadIdx.x + i] = D(i);
> > > > > }
> > > > >
> > > > >
> > > > > int main()
> > > > > {
> > > > >   float * r;
> > > > >   HANDLE_ERROR( cudaMalloc(&r, 4*10*sizeof(float)) );
> > > > >
> > > > >   test_kernel<<<10,1>>>(4,r);
> > > > >
> > > > >   std::vector<float> ev(4*10);
> > > > >   HANDLE_ERROR( cudaMemcpy(ev.data(), r, 4*10*sizeof(float), cudaMemcpyDeviceToHost) );
> > > > >
> > > > >   for (auto const& p : ev)
> > > > >     std::cout << p << ' ';
> > > > >   std::cout << '\n';
> > > > > }
> > > >
> > > >
> > > >
> > 


