Re: [eigen] Re: How to use Eigen in CUDA kernels for eigendecomposition?
On Wed, 2017-07-05 at 15:28 +0200, Gael Guennebaud wrote:
> I did not pay attention, but currently only computeDirect is supported from CUDA kernels, and
> computeDirect supports only 2x2 and 3x3 real matrices.
>
> To support the more general compute() function, all its dependencies should be qualified with
> EIGEN_DEVICE_FUNC, but I cannot guarantee that all of them can be compiled on the device without
> some refactoring. It's a shame that nvcc does not trigger a compile-time error when device code
> that calls host-only code actually gets instantiated.
Ugh, that is inconvenient. The problem is that I need to compute the eigendecomposition of millions
of 8x8 matrices (which is the reason I want to use CUDA).
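
For reference, the path that does work today, as I understand the above, would look roughly like
this (a minimal, untested sketch: the kernel name eig3_kernel, the launch setup, and the row-major
packing of the input are only assumptions for illustration):

#include <Eigen/Eigenvalues>

// One 3x3 real symmetric matrix per thread; eigenvalues via the closed-form
// computeDirect() path, which is the entry point described above as usable in device code.
__global__ void eig3_kernel(int n, const float * in, float * out)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n) return;

  Eigen::Matrix3f A;
  for (int r = 0; r < 3; ++r)
    for (int c = 0; c < 3; ++c)
      A(r, c) = in[9*tid + 3*r + c];          // row-major packing assumed

  Eigen::SelfAdjointEigenSolver<Eigen::Matrix3f> es;
  es.computeDirect(A, Eigen::EigenvaluesOnly); // closed form, no iterative reduction

  for (int i = 0; i < 3; ++i)
    out[3*tid + i] = es.eigenvalues()(i);
}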
I am very interested in helping to debug this code and get it working. Unfortunately, I am not very
proficient with CUDA debugging and have not yet worked with Eigen's expression template code. Can
you perhaps provide some guidance on how to proceed, which functions to look at, and what tools to
use for debugging?
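
Concretely, is something along these lines the right way to start narrowing it down? (My guess at a
typical device-debug build, not something established in this thread; the flag names come from the
nvcc 8.0 warnings quoted below:)

$ nvcc -std=c++11 -g -G --expt-relaxed-constexpr -I/path/to/eigen/ test.cu -o test
$ cuda-memcheck ./test     # should name the faulting kernel and access
$ cuda-gdb ./test          # then: set cuda memcheck on; run; bt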
I appreciate your efforts very much!
Thanks again,
Henri
>
> gael
>
> On Mon, Jul 3, 2017 at 11:17 PM, Henri Menke <henrimenke@xxxxxxxxx> wrote:
> > On Mon, 2017-07-03 at 14:07 +0200, Gael Guennebaud wrote:
> > > Hi,
> > >
> > > is your code running fine despite these warnings? It could be that those warnings are
> > > generated from code that is never called. Compiling with --expt-relaxed-constexpr will also
> > > help.
> >
> > Thanks for your reply!
> >
> > The warnings don't seem to have an influence. It is also possible to get rid of them by simply
> > using C++98, in which case constexpr is ignored. Unfortunately, the code below does not run
> > when I uncomment the solver.compute line. The output of the code on my test machine is
> >
> > CUDA error status: 0
> > CUDA error status: 0
> > CUDA error status: 77
> > an illegal memory access was encountered in test.cu 73
> >
> > I ran the code with the offending line uncommented through cuda-memcheck (which somehow changed
> > the error status from 77 to 4). The output of cuda-memcheck is attached (test.mem). Running it
> > in cuda-gdb is not very helpful either (see attached test.gdb). Do you have any advice on how
> > to debug this code?
> >
> > Cheers, Henri
> >
> > ---
> >
> > #include <iostream>
> > #include <vector>
> > #include <thrust/complex.h>
> > #include <Eigen/Eigenvalues>
> >
> > static void HandleError(cudaError_t err, const char *file, int line)
> > {
> >   std::cerr << "CUDA error status: " << err << '\n';
> >   if (err != cudaSuccess)
> >   {
> >     std::cerr << cudaGetErrorString(err) << " in " << file << ' ' << line << '\n';
> >     exit(EXIT_FAILURE);
> >   }
> > }
> > #define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
> >
> >
> > namespace Eigen {
> > template<typename _Real> struct NumTraits<thrust::complex<_Real> >
> >   : NumTraits<std::complex<_Real> >
> > {
> >   typedef _Real Real;
> >   typedef typename NumTraits<_Real>::Literal Literal;
> >   enum {
> >     IsComplex = 1,
> >     RequireInitialization = NumTraits<_Real>::RequireInitialization,
> >     ReadCost = 2 * NumTraits<_Real>::ReadCost,
> >     AddCost = 2 * NumTraits<Real>::AddCost,
> >     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
> >   };
> > };
> > }
> >
> >
> > namespace thrust {
> > template < typename T >
> > EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > T real(complex<T> const& x) { return x.real(); }
> >
> > template < typename T >
> > EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > T imag(complex<T> const& x) { return x.imag(); }
> > }
> >
> >
> > __global__ void test_kernel(int n, float * r) {
> >   typedef Eigen::Matrix<thrust::complex<float>,2,2> cuMatrix2cf;
> >
> >   cuMatrix2cf m;
> >   m << 1,0, 0,1;
> >   m *= threadIdx.x;
> >
> >   Eigen::SelfAdjointEigenSolver< cuMatrix2cf > solver;
> >   //solver.compute(m, Eigen::EigenvaluesOnly);
> >
> >   Eigen::Vector2f const& D = solver.eigenvalues();
> >
> >   for (int i = 0; i < n; ++i)
> >   {
> >     r[n*threadIdx.x + i] = D(i);
> >   }
> > }
> >
> > int main()
> > {
> >   float * r;
> >   HANDLE_ERROR( cudaMalloc(&r, 4*10*sizeof(float)) );
> >
> >   test_kernel<<<10,1>>>(4,r);
> >   HANDLE_ERROR( cudaGetLastError() );
> >
> >   float * ev = new float[4*10];
> >   HANDLE_ERROR( cudaMemcpy(ev, r, 4*10*sizeof(float), cudaMemcpyDeviceToHost) );
> >
> >   for (size_t i = 0; i < 4*10; ++i)
> >     std::cout << ev[i] << ' ';
> >   std::cout << '\n';
> >
> >   delete[] ev;
> >   cudaFree(r);
> > }
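
As a stopgap for the 2x2 Hermitian case in test_kernel above, the eigenvalues can also be written
out in closed form, (a+d)/2 +/- sqrt(((a-d)/2)^2 + |b|^2), which avoids compute() on the device
entirely. A rough, untested sketch (the helper name eig2_hermitian is made up):

__device__ void eig2_hermitian(Eigen::Matrix<thrust::complex<float>,2,2> const& m,
                               float& lo, float& hi)
{
  float a  = m(0,0).real();           // diagonal entries of a Hermitian matrix are real
  float d  = m(1,1).real();
  float b2 = thrust::norm(m(0,1));    // |b|^2
  float mid = 0.5f * (a + d);
  float rad = sqrtf(0.25f * (a - d) * (a - d) + b2);
  lo = mid - rad;
  hi = mid + rad;
}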
> >
> >
> >
> > > gael
> > >
> > > On Mon, Jul 3, 2017 at 6:36 AM, Henri Menke <henrimenke@xxxxxxxxx> wrote:
> > > > Dear list,
> > > >
> > > > I am very sorry. I didn't compile the example with the latest development version but some
> > > > outdated
> > > > version from a GitHub mirror. With the latest version from BitBucket it compiles without
> > these
> > > > loads of __host__ __device__ warnings. A few warnings are left however, namely
> > > >
> > > >
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__
> > > > function("real") from a __host__ __device__ function("abs") is not allowed. The
> > > > experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__
> > > > function("imag") from a __host__ __device__ function("abs") is not allowed. The
> > > > experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function
> > > > from a __host__ __device__ function is not allowed. The experimental flag
> > > > '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function
> > > > from a __host__ __device__ function is not allowed. The experimental flag
> > > > '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__
> > > > function("real") from a __host__ __device__ function("abs") is not allowed. The
> > > > experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__
> > > > function("imag") from a __host__ __device__ function("abs") is not allowed. The
> > > > experimental flag '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function
> > > > from a __host__ __device__ function is not allowed. The experimental flag
> > > > '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function
> > > > from a __host__ __device__ function is not allowed. The experimental flag
> > > > '--expt-relaxed-constexpr' can be used to allow this.
> > > >
> > > >
> > > >
> > > > This is slightly concerning. Did I forget to define a function? Because
> > > > thrust::abs(thrust::complex<T> const&) already exists.
> > > >
> > > > Cheers, Henri
> > > >
> > > > On Sun, 2017-07-02 at 13:01 +1200, Henri Menke wrote:
> > > > > Dear list,
> > > > >
> > > > > This is my first post to the list and I would like to use it to thank all the developers
> > > > > for their astonishing work. Keep it up, guys!
> > > > >
> > > > > Now to my problem: I am using the latest development version of Eigen and I want to
> > > > > calculate the eigendecomposition of a matrix in a CUDA kernel.
> > > > >
> > > > > I followed »Using custom scalar types«
> > > > > (http://eigen.tuxfamily.org/dox-3.2/TopicCustomizingEigen.html#CustomScalarType) to
> > > > > implement support for thrust::complex.
> > > > >
> > > > > The example below compiles but is unusable. The approximately 800 lines of nvcc V8.0.61
> > > > > output can be viewed at
> > > > > https://gist.github.com/anonymous/757218ae4daec63ed021b143a38e76b0. There, I see a lot of
> > > > > warnings which involve calling a __host__ function from a __host__ __device__ function.
> > > > > It appears to me that some functions are not correctly decorated (yet). I am very
> > > > > interested in helping to fix this but have so far not worked with the internals of
> > > > > Eigen's expression template code. Could you please provide some help?
> > > > >
> > > > > Best regards,
> > > > > Henri
> > > > >
> > > > > ---
> > > > >
> > > > > The compile command line is simply
> > > > > $ nvcc -std=c++11 -I/path/to/eigen/ test.cu
> > > > >
> > > > >
> > > > > #include <iostream>
> > > > > #include <vector>
> > > > > #include <thrust/complex.h>
> > > > > #include <Eigen/Eigenvalues>
> > > > >
> > > > > static void HandleError(cudaError_t err, const char *file, int line)
> > > > > {
> > > > >   if (err != cudaSuccess)
> > > > >   {
> > > > >     std::cerr << cudaGetErrorString(err) << " in " << file << ' ' << line << '\n';
> > > > >     exit(EXIT_FAILURE);
> > > > >   }
> > > > > }
> > > > > #define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
> > > > >
> > > > >
> > > > > namespace Eigen {
> > > > > template<typename _Real> struct NumTraits<thrust::complex<_Real> >
> > > > >   : NumTraits<std::complex<_Real> >
> > > > > {
> > > > >   typedef _Real Real;
> > > > >   typedef typename NumTraits<_Real>::Literal Literal;
> > > > >   enum {
> > > > >     IsComplex = 1,
> > > > >     RequireInitialization = NumTraits<_Real>::RequireInitialization,
> > > > >     ReadCost = 2 * NumTraits<_Real>::ReadCost,
> > > > >     AddCost = 2 * NumTraits<Real>::AddCost,
> > > > >     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
> > > > >   };
> > > > > };
> > > > > }
> > > > >
> > > > >
> > > > > namespace thrust {
> > > > > template < typename T >
> > > > > EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > > > > T real(complex<T> const& x) { return x.real(); }
> > > > >
> > > > > template < typename T >
> > > > > EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > > > > T imag(complex<T> const& x) { return x.imag(); }
> > > > > }
> > > > >
> > > > >
> > > > > __global__ void test_kernel(int n, float * r) {
> > > > >   typedef Eigen::Matrix<thrust::complex<float>,2,2> cuMatrixXcf;
> > > > >
> > > > >   cuMatrixXcf m;
> > > > >   m << 1,0, 0,1;
> > > > >   m *= threadIdx.x;
> > > > >
> > > > >   Eigen::SelfAdjointEigenSolver< cuMatrixXcf > solver;
> > > > >   solver.compute(m, Eigen::EigenvaluesOnly);
> > > > >
> > > > >   auto const& D = solver.eigenvalues();
> > > > >
> > > > >   for (int i = 0; i < n; ++i)
> > > > >     r[n*threadIdx.x + i] = D(i);
> > > > > }
> > > > >
> > > > >
> > > > > int main()
> > > > > {
> > > > >   float * r;
> > > > >   HANDLE_ERROR( cudaMalloc(&r, 4*10*sizeof(float)) );
> > > > >
> > > > >   test_kernel<<<10,1>>>(4,r);
> > > > >
> > > > >   std::vector<float> ev(4*10);
> > > > >   HANDLE_ERROR( cudaMemcpy(ev.data(), r, 4*10*sizeof(float), cudaMemcpyDeviceToHost) );
> > > > >
> > > > >   for (auto const& p : ev)
> > > > >     std::cout << p << ' ';
> > > > >   std::cout << '\n';
> > > > > }
> > > >
> > > >
> > > >
> >