Re: [eigen] two things

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


OK, so I benchmarked a = a + b on 400x400 matrices (10000 repetitions):

hand coded vector with loop peeling:
1.0101 sec 1.47521 GFlops

VectorXf(400*400):
1.50368s   0.990978 GFlops

MatrixXf, various configuration inner/outer:
400 x 400  1.69703s   0.878074 GFlops
320 x 500  1.61007s   0.925497 GFlops
256 x 625  1.65798s   0.898753 GFlops
250 x 640  1.72979s   0.861445 GFlops
200 x 800  1.7328s   0.859946 GFlops
160 x 1000  1.78083s   0.836755 GFlops
128 x 1250  1.70838s   0.872238 GFlops
125 x 1280  1.83812s   0.810676 GFlops
100 x 1600  1.79399s   0.830616 GFlops
80 x 2000  1.90025s   0.78417 GFlops
64 x 2500  1.91228s   0.779236 GFlops
50 x 3200  2.40844s   0.618706 GFlops
40 x 4000  2.08198s   0.715722 GFlops
32 x 5000  2.30892s   0.645372 GFlops
25 x 6400  3.05263s   0.488142 GFlops
20 x 8000  2.89561s   0.514611 GFlops
16 x 10000  3.36251s   0.443156 GFlops
10 x 16000  6.72561s   0.221559 GFlops
8 x 20000  8.03166s   0.18553 GFlops
5 x 32000  8.08158s   0.184384 GFlops
4 x 40000  9.30723s   0.160103 GFlops

so .... yeah why not !

gael.

On Wed, Jun 25, 2008 at 10:17 PM, Benoît Jacob <jacob@xxxxxxxxxxxxxxx> wrote:
> On Wednesday 25 June 2008 22:09:17 Gael Guennebaud wrote:
>> right, an intermediate solution would be to specialize Matrix<> if one
>> of the dim is 1, e.g. let's add an additional "hidden" template
>> parameter:
>>
>> template <typename T, int R, int C, ...,  int IsVector = R==1 || C==1>
>> class Matrix {....};
>>
>> template <typename T, int R, int C, ..., int Flags>
>> class Matrix<T,R,C, ...,  MaxSize,1,Flags,true> {  /* vector */ };
>>
>> but this only allows to improve a bit the constructors and this does
>> not offer any template Vector<> class, so I'm not sure it's  worth it,
>> just an idea....
>
> It's interesting, I'm not opposed to it; on the other hand like you i'm not
> sure if it's worth the effort, as the problems with the matrix constructors
> are essentially "virtual problems" which don't cause harm in practice.
>
>>
>> > 2) about linear vectorization: i remember you suggested adding a
>> > packet(int) method. I think this is still a good idea and would still
>> > improve linear vectorization despite the improvements you already made.
>> > OK?
>>
>> actually I'm not sure that's worth it: I've done some bench and if I
>> remember well the overhead between a MatrixXf(R,C); versus a
>> VectorXf(R*C); was negligible, like x1.1 or x1.2.....
>
> so it was a 10% or 20% speed difference? Not what I call negligible!
>
> Also, the speed difference becomes bigger as the InnerSize is small. What
> sizes did you test?
>
> Cheers,
>
> Benoit
>
#include <Eigen/Core>
#include <bench/BenchTimer.h>
using namespace Eigen;

#ifndef SIZE
#define SIZE 50
#endif

#ifndef REPEAT
#define REPEAT 10000
#endif

typedef float Scalar;

__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);

int main(int argc, char* argv[])
{
    int size = SIZE * 8;
    int size2 = size * size;
    Scalar* a = new Scalar[size2];
    Scalar* b = new Scalar[size2];
    Scalar* c = new Scalar[size2]; 
    
    for (int i=0; i<size; ++i)
    {
        a[i] = b[i] = c[i] = 0;
    }
    
    BenchTimer timer;
    
    timer.reset();
    for (int k=0; k<3; ++k)
    {
        timer.start();
        benchVec(a, b, c, size2);
        timer.stop();
    }
    std::cout << timer.value() << "s  " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
    
    for (int innersize = size; innersize>2 ; --innersize)
    {
        if (size2%innersize==0)
        {
            int outersize = size2/innersize;
            MatrixXf ma = MatrixXf::map(a, innersize, outersize );
            MatrixXf mb = MatrixXf::map(b, innersize, outersize );
            MatrixXf mc = MatrixXf::map(c, innersize, outersize );
            timer.reset();
            for (int k=0; k<3; ++k)
            {
                timer.start();
                benchVec(ma, mb, mc);
                timer.stop();
            }
            std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
        }
    }
    
    VectorXf va = VectorXf::map(a, size2);
    VectorXf vb = VectorXf::map(b, size2);
    VectorXf vc = VectorXf::map(c, size2);
    timer.reset();
    for (int k=0; k<3; ++k)
    {
        timer.start();
        benchVec(va, vb, vc);
        timer.stop();
    }
    std::cout << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";

    return 0;
}

// Time REPEAT evaluations of a = a + b through Eigen's matrix expression
// templates. (c is unused; it only keeps the signature uniform with the
// other overloads.)
void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
{
    int iterationsLeft = REPEAT;
    while (iterationsLeft-- > 0)
        a = a + b;
}

// Time REPEAT evaluations of a = a + b through Eigen's vector expression
// templates. (c is unused; it only keeps the signature uniform with the
// other overloads.)
void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
{
    for (int rep = REPEAT; rep > 0; --rep)
        a = a + b;
}

// Hand-coded baseline: a = a + b over `size` scalars, unrolled 8 packets per
// iteration with the loads and stores interleaved (software pipelined) so
// that stores of one group overlap the loads of the next. The statement
// order is deliberate — do not "clean it up".
// NOTE(review): assumes size is a multiple of PacketSize*8 and that a/b are
// suitably aligned for ei_pload/ei_pstore — confirm at the call site.
// c is unused; it only keeps the signature uniform with the other overloads.
void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
{
    typedef ei_packet_traits<Scalar>::type PacketScalar;
    const int PacketSize = ei_packet_traits<Scalar>::size;
    // Four rotating register pairs; each is reloaded with the next group's
    // data immediately after its result is stored.
    PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
    for (int k=0; k<REPEAT; ++k)
        for (int i=0; i<size; i+=PacketSize*8)
        {
            // Prime the pipeline: load packets 0..3 of this 8-packet group.
            a0 = ei_pload(&a[i]);
            b0 = ei_pload(&b[i]);
            a1 = ei_pload(&a[i+1*PacketSize]);
            b1 = ei_pload(&b[i+1*PacketSize]);
            a2 = ei_pload(&a[i+2*PacketSize]);
            b2 = ei_pload(&b[i+2*PacketSize]);
            a3 = ei_pload(&a[i+3*PacketSize]);
            b3 = ei_pload(&b[i+3*PacketSize]);
            // Steady state: store packet n, immediately reuse its registers
            // to load packet n+4, hiding load latency behind the stores.
            ei_pstore(&a[i], ei_padd(a0, b0));
            a0 = ei_pload(&a[i+4*PacketSize]);
            b0 = ei_pload(&b[i+4*PacketSize]);
            
            ei_pstore(&a[i+1*PacketSize], ei_padd(a1, b1));
            a1 = ei_pload(&a[i+5*PacketSize]);
            b1 = ei_pload(&b[i+5*PacketSize]);
            
            ei_pstore(&a[i+2*PacketSize], ei_padd(a2, b2));
            a2 = ei_pload(&a[i+6*PacketSize]);
            b2 = ei_pload(&b[i+6*PacketSize]);
            
            ei_pstore(&a[i+3*PacketSize], ei_padd(a3, b3));
            a3 = ei_pload(&a[i+7*PacketSize]);
            b3 = ei_pload(&b[i+7*PacketSize]);
            
            // Drain the pipeline: store packets 4..7.
            ei_pstore(&a[i+4*PacketSize], ei_padd(a0, b0));
            ei_pstore(&a[i+5*PacketSize], ei_padd(a1, b1));
            ei_pstore(&a[i+6*PacketSize], ei_padd(a2, b2));
            ei_pstore(&a[i+7*PacketSize], ei_padd(a3, b3));
            
            // Earlier non-pipelined variant, kept for comparison:
//             ei_pstore(&a[i+2*PacketSize], ei_padd(ei_pload(&a[i+2*PacketSize]), ei_pload(&b[i+2*PacketSize])));
//             ei_pstore(&a[i+3*PacketSize], ei_padd(ei_pload(&a[i+3*PacketSize]), ei_pload(&b[i+3*PacketSize])));
//             ei_pstore(&a[i+4*PacketSize], ei_padd(ei_pload(&a[i+4*PacketSize]), ei_pload(&b[i+4*PacketSize])));
//             ei_pstore(&a[i+5*PacketSize], ei_padd(ei_pload(&a[i+5*PacketSize]), ei_pload(&b[i+5*PacketSize])));
//             ei_pstore(&a[i+6*PacketSize], ei_padd(ei_pload(&a[i+6*PacketSize]), ei_pload(&b[i+6*PacketSize])));
//             ei_pstore(&a[i+7*PacketSize], ei_padd(ei_pload(&a[i+7*PacketSize]), ei_pload(&b[i+7*PacketSize])));
        }
}


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/