Re: [eigen] Re: How to use Eigen in CUDA kernels for eigendecomposition?

On Mon, Jul 3, 2017 at 11:17 PM, Henri Menke <henrimenke@xxxxxxxxx> wrote:

On Mon, 2017-07-03 at 14:07 +0200, Gael Guennebaud wrote:
> Hi,
>
> is your code running fine despite these warnings? It could be that those warnings are generated
> from code that is never called. Compiling with --expt-relaxed-constexpr will also help.

Thanks for your reply!

The warnings don't seem to have an influence. It also possible to get rid of them by simply using
C++98 code in which case constexpr is ignored. Unfortunately, the code below does not run when I
comment in the solver.compute line. The output of the code on my test machine is

CUDA error status: 0
CUDA error status: 0
CUDA error status: 77
an illegal memory access was encountered in test.cu 73

I ran the code with the offending line commented in through cuda-memcheck (which somehow changed the
error status from 77 to 4). The output of cuda-memcheck is attached (test.mem). Running it in
cuda-gdb is not very helpful either (see attached test.gdb). Do you have any advice how to debug
this code?

Cheers, Henri

---

#include <iostream>
#include <vector>
#include <thrust/complex.h>
#include <Eigen/Eigenvalues>

static void HandleError(cudaError_t err, const char *file, int line)
{
std::cerr << "CUDA error status: " << err << '\n';
if (err != cudaSuccess)
{
    std::cerr << cudaGetErrorString(err) << " in " << file << ' ' << line << '\n';
    exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

namespace Eigen {
template<typename _Real> struct NumTraits<thrust::complex<_Real> >
    : NumTraits<std::complex<_Real> >
{
    typedef _Real Real;
    typedef typename NumTraits<_Real>::Literal Literal;
    enum {
      IsComplex = 1,
      RequireInitialization = NumTraits<_Real>::RequireInitialization,
      ReadCost = 2 * NumTraits<_Real>::ReadCost,
      AddCost = 2 * NumTraits<Real>::AddCost,
      MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
    };
};
}

namespace thrust {
template < typename T >
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T real(complex<T> const& x) { return x.real(); }

template < typename T >
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T imag(complex<T> const& x) { return x.imag(); }
}

__global__ void test_kernel(int n, float * r) {
typedef Eigen::Matrix<thrust::complex<float>,2,2> cuMatrix2cf;

cuMatrix2cf m;
m << 1,0, 0,1;
m *= threadIdx.x;

Eigen::SelfAdjointEigenSolver< cuMatrix2cf > solver;
//solver.compute(m, Eigen::EigenvaluesOnly);

Eigen::Vector2f const& D = solver.eigenvalues();

for (int i = 0; i < n; ++i)
{
    r[n*threadIdx.x + i] = D(i);
}
}

int main()
{
float * r;
HANDLE_ERROR( cudaMalloc(&r, 4*10*sizeof(float)) );

test_kernel<<<10,1>>>(4,r);
HANDLE_ERROR( cudaGetLastError() );

float * ev = new float[4*10];
HANDLE_ERROR( cudaMemcpy(ev, r, 4*10*sizeof(float), cudaMemcpyDeviceToHost) );

for (size_t i = 0; i < 4*10; ++i)
    std::cout << ev[i] << ' ';
std::cout << '\n';

delete[] ev;
cudaFree(r);

}

> gael
>
> On Mon, Jul 3, 2017 at 6:36 AM, Henri Menke <henrimenke@xxxxxxxxx> wrote:
> > Dear list,
> >
> > I am very sorry. I didn't compile the example with the latest development version but some
> > outdated
> > version from a GitHub mirror. With the latest version from BitBucket it compiles without these
> > loads of __host__ __device__ warnings. A few warnings are left however, namely
> >
> >
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__
> > function("real")
> > from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-
> > relaxed-
> > constexpr' can be used to allow this.
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__
> > function("imag")
> > from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-
> > relaxed-
> > constexpr' can be used to allow this.
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function from
> > a
> > __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr'
> > can be
> > used to allow this.
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1265): warning: calling a constexpr __host__ function from
> > a
> > __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr'
> > can be
> > used to allow this.
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__
> > function("real")
> > from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-
> > relaxed-
> > constexpr' can be used to allow this.
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__
> > function("imag")
> > from a __host__ __device__ function("abs") is not allowed. The experimental flag '--expt-
> > relaxed-
> > constexpr' can be used to allow this.
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function from
> > a
> > __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr'
> > can be
> > used to allow this.
> >
> > eigen/Eigen/src/Core/MathFunctions.h(1270): warning: calling a constexpr __host__ function from
> > a
> > __host__ __device__ function is not allowed. The experimental flag '--expt-relaxed-constexpr'
> > can be
> > used to allow this.
> >
> >
> >
> > This is slightly concerning. Did I forget to define a function? Because
> > thrust::abs(thrust::complex<T> const&) already exists.
> >
> > Cheers, Henri
> >
> > On Sun, 2017-07-02 at 13:01 +1200, Henri Menke wrote:
> > > Dear list,
> > >
> > > This is my first post to the list and I would like to use it to thank all the developers for
> > their
> > > astonishing work. Keep it up guys!
> > >
> > > Now to my problem: I am using the latest development version of Eigen and I want to calculate
> > the
> > > eigendecomposition of a matrix in a CUDA kernel.
> > >
> > > I followed »Using custom scalar types« (http://eigen.tuxfamily.org/dox-3.2/TopicCustomizingEig
> > en.h
> > > tm
> > > l#CustomScalarType) to implement support for thrust::complex..
> > >
> > > The example below compiles but is unusable.  The approximately 800 lines of nvcc V8.0.61
> > output
> > > can
> > > be view at https://gist.github.com/anonymous/757218ae4daec63ed021b143a38e76b0. There, I see a
> > lot
> > > of warnings which involve calling a __host__ function from a __host__ __device__ function. It
> > > appears to me that some functions are not correctly decorated (yet). I am very interested in
> > > helping to fix this but have so far not worked with the internals of Eigen's _expression_
> > template
> > > code. Could you please provide some help?
> > >
> > > Best regards,
> > > Henri
> > >
> > > ---
> > >
> > > The compile command line is simply
> > > $ nvcc -std=c++11 -I/path/to/eigen/ test.cu
> > >
> > >
> > > #include <iostream>
> > > #include <thrust/complex.h>
> > > #include <Eigen/Eigenvalues>
> > >
> > > static void HandleError(cudaError_t err, const char *file, int line)
> > > {
> > > if (err != cudaSuccess)
> > > {
> > >     std::cerr << cudaGetErrorString(err) << " in " << file << ' ' << line << '\n';
> > >     exit(EXIT_FAILURE);
> > > }
> > > }
> > > #define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
> > >
> > >
> > > namespace Eigen {
> > > template<typename _Real> struct NumTraits<thrust::complex<_Real> >
> > >     : NumTraits<std::complex<_Real> >
> > > {
> > >     typedef _Real Real;
> > >     typedef typename NumTraits<_Real>::Literal Literal;
> > >     enum {
> > >       IsComplex = 1,
> > >       RequireInitialization = NumTraits<_Real>::RequireInitialization,
> > >       ReadCost = 2 * NumTraits<_Real>::ReadCost,
> > >       AddCost = 2 * NumTraits<Real>::AddCost,
> > >       MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
> > >     };
> > > };
> > > }
> > >
> > >
> > > namespace thrust {
> > > template < typename T >
> > > EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > > T real(complex<T> const& x) { return x.real(); }
> > >
> > > template < typename T >
> > > EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
> > > T imag(complex<T> const& x) { return x.imag(); }
> > > }
> > >
> > >
> > > __global__ void test_kernel(int n, float * r) {
> > > typedef Eigen::Matrix<thrust::complex<float>,2,2> cuMatrixXcf;
> > >
> > > cuMatrixXcf m;
> > > m << 1,0, 0,1;
> > > m *= threadIdx.x;
> > >
> > > Eigen::SelfAdjointEigenSolver< cuMatrixXcf > solver;
> > > solver.compute(m, Eigen::EigenvaluesOnly);
> > >
> > > auto const& D = solver.eigenvalues();
> > >
> > > for (int i = 0; i < n; ++i)
> > >     r[n*threadIdx.x + i] = D(i);
> > > }
> > >
> > >
> > > int main()
> > > {
> > > float * r;
> > > HANDLE_ERROR( cudaMalloc(&r, 4*10*sizeof(float)) );
> > >
> > > test_kernel<<<10,1>>>(4,r);
> > >
> > > std::vector<float> ev(4*10);
> > > HANDLE_ERROR( cudaMemcpy(ev.data(), r, 4*10*sizeof(float), cudaMemcpyDeviceToHost) );
> > >
> > > for (auto const& p : ev)
> > >     std::cout << p << ' ';
> > > std::cout << '\n';
> > > }
> >
> >
> >