Re: [eigen] Poor performance on MSVC compiler

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


Ok, here is a working example.  Its a bit cludgy due to the copy/paste, but here goes.  The emitted ASM is 8MB, I can send it if you would like.  It takes pretty close to the same amount of runtime as my fully-integrated solution, so I think this is representative of the problem

This code takes around 3 us/call with Eigen, an STL solution (not shown), takes 0.9 us/call.  Compiled in Visual Studio 2010 Release mode, x64, /O2, /arch:AVX

The base VS flags are:

/I"C:\Users\Belli\Documents\Code\coolprop-v5-new\externals\Eigen" /I"C:\Program Files (x86)\Visual Leak Detector\include" /Zi /nologo /W3 /WX- /O2 /Oi /GL /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Gm- /EHsc /GS /Gy /fp:precise /Zc:wchar_t /Zc:forScope /Fp"x64\Release\eigen_looping.pch" /Fa"x64\Release\" /Fo"x64\Release\" /Fd"x64\Release\vc100.pdb" /Gd /errorReport:queue

Also, it takes about a minute to compile this example, which is pretty darn slow...

#include "Eigen/Core"
#include "time.h"
#include <cmath>
#include <iostream>

double beta2[18] = {0,0,0,0,0,0,0,0,0,0,0,2.33, 3.47, 3.15, 3.19, 0.92, 18.8, 547.8};
double epsilon2[18] = {0,0,0,0,0,0,0,0,0,0,0,1.283, 0..6936, 0.788, 0.473, 0.8577, 0.271, 0.948};
double eta2[18] = {0,0,0,0,0,0,0,0,0,0,0,0.963, 1.977, 1.917, 2.307, 2.546, 3.28, 14.6};
double gamma2[18] = {0,0,0,0,0,0,0,0,0,0,0,0.684, 0.829, 1.419, 0.817, 1.5, 1.426, 1.093};
double d[18] = {4, 1, 1, 2, 2, 1, 3, 6, 6, 2, 3 , 1, 1, 1, 2, 2, 4, 1};
double l[18] = {0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2,0,0,0,0,0,0,0};
double c[18] = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0};
double n[18] = {0.042910051, 1.7313671, -2.4516524, 0.34157466, -0.46047898, -0..66847295, 0.20889705, 0.19421381, -0.22917851, -0.60405866, 0.066680654, 0..017534618, 0.33874242, 0.22228777, -0.23219062, -0.09220694, -0.47575718, -0.017486824};
double t[18] = {1, 0.33, 0.8, 0.43, 0.9, 2.46, 2.09, 0.88, 1.09, 3.25, 4.62, 0.76, 2.5, 2.75, 3.05, 2.55, 8.4, 6.75};

double beta1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};
double eta1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};
double epsilon1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};
double gamma1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};

Eigen::Map<Eigen::ArrayXd> nE(n,18);
Eigen::Map<Eigen::ArrayXd> dE(d,18);
Eigen::Map<Eigen::ArrayXd> tE(t,18);
Eigen::Map<Eigen::ArrayXd> lE(l,18);
Eigen::Map<Eigen::ArrayXd> cE(c,18);
Eigen::Map<Eigen::ArrayXd> beta1E(beta1,18);
Eigen::Map<Eigen::ArrayXd> beta2E(beta2,18);
Eigen::Map<Eigen::ArrayXd> eta1E(eta1,18);
Eigen::Map<Eigen::ArrayXd> eta2E(eta2,18);

Eigen::Map<Eigen::ArrayXd> epsilon1E(epsilon1,18);
Eigen::Map<Eigen::ArrayXd> epsilon2E(epsilon2,18);
Eigen::Map<Eigen::ArrayXd> gamma1E(gamma1,18);
Eigen::Map<Eigen::ArrayXd> gamma2E(gamma2,18);

#define POW2(x) (x)*(x)
#define POW3(x) (x)*(x)*(x)

struct HelmholtzDerivatives
{
    double alphar, dalphar_ddelta, dalphar_dtau, d2alphar_ddelta2, d2alphar_dtau2, d2alphar_ddelta_dtau,
                d3alphar_ddelta3, d3alphar_ddelta_dtau2, d3alphar_ddelta2_dtau, d3alphar_dtau3;
    void reset(){alphar = 0; dalphar_ddelta = 0; dalphar_dtau = 0; d2alphar_ddelta2 = 0; d2alphar_dtau2 = 0; d2alphar_ddelta_dtau = 0;
                d3alphar_ddelta3 = 0; d3alphar_ddelta_dtau2 = 0; d3alphar_ddelta2_dtau = 0; d3alphar_dtau3 = 0;
                }
    HelmholtzDerivatives(){reset();};
};

void allEigen(double &tau, double &delta, HelmholtzDerivatives &derivs) throw()
{
    double log_tau = log(tau), log_delta = log(delta),
            _one_over_delta_ = 1/delta, _one_over_tau_ = 1/tau; // division is much slower than multiplication, so do one division here

    Eigen::ArrayXd delta_li; delta_li.resize(18);

    for (std::size_t i = 0; i < 18; ++i)
    {
        delta_li[i] = pow(delta, l[i]);
    }

    Eigen::ArrayXd uE = -cE*delta_li - eta1E*(delta-epsilon1E) - eta2E*(delta-epsilon2E).square() - beta2E*(tau-gamma2E).square() - beta1E*(tau-gamma1E) ;
    Eigen::ArrayXd du_ddeltaE = -cE*lE*delta_li*one_over_delta -eta1E - 2.0*eta2E*(delta-epsilon2E);
    Eigen::ArrayXd d2u_ddelta2E = -cE*lE*(lE-1)*delta_li*one_over_delta*one_over_delta - 2*eta2E;
    Eigen::ArrayXd d3u_ddelta3E = -cE*lE*(lE-1)*(lE-2)*delta_li*one_over_delta*one_over_delta*one_over_delta;

    Eigen::ArrayXd du_dtauE = -2.0*beta2E*(tau-gamma2E) - beta1E;
    Eigen::ArrayXd d2u_dtau2E = -2.0*beta2E;
    Eigen::ArrayXd d3u_dtau3E = 0*beta2E;
   
    Eigen::ArrayXd ndteuE = nE*exp(tE*log_tau + dE*log_delta + uE);
    Eigen::ArrayXd B_deltaE = delta*du_ddeltaE + dE;
    Eigen::ArrayXd B_tauE = tau*du_dtauE + tE;
    Eigen::ArrayXd B_delta2E = delta*delta*d2u_ddelta2E + (delta*du_ddeltaE + dE).square() + dE;
    Eigen::ArrayXd B_tau2E = tau*tau*d2u_dtau2E + (tau*du_dtauE + tE).square() + tE;
    Eigen::ArrayXd B_delta3E = POW3(delta)*d3u_ddelta3E + 3*dE*POW2(delta)*d2u_ddelta2E+3*POW3(delta)*d2u_ddelta2E*du_ddeltaE+3*dE*(delta*du_ddeltaE).square()+3*dE*(dE-1)*delta*du_ddeltaE+dE*(dE-1)*(dE-2)+POW3(delta)*du_ddeltaE.cube();
    Eigen::ArrayXd B_tau3E = POW3(tau)*d3u_dtau3E + 3*tE*POW2(tau)*d2u_dtau2E+3*POW3(tau)*d2u_dtau2E*du_dtauE+3*tE*(tau*du_dtauE).square()+3*tE*(tE-1)*tau*du_dtauE+tE*(tE-1)*(tE-2)+POW3(tau)*du_dtauE.cube();

    Eigen::ArrayXd ndteuE_B_deltaE = ndteuE*B_deltaE;
    Eigen::ArrayXd ndteuE_B_tauE = ndteuE*B_tauE;
   
    derivs.alphar                +=  ndteuE.sum();
    derivs.dalphar_ddelta        += (ndteuE_B_deltaE).sum()*one_over_delta;
    derivs.dalphar_dtau          += (ndteuE_B_tauE).sum()*one_over_tau;
    derivs.d2alphar_ddelta2      += (ndteuE*B_delta2E).sum()*POW2(one_over_delta);
    derivs.d2alphar_dtau2        += (ndteuE*B_tau2E).sum()*POW2(one_over_tau);
    derivs.d2alphar_ddelta_dtau  += (ndteuE_B_deltaE*B_tauE).sum()*one_over_delta*one_over_tau;
   
    derivs.d3alphar_ddelta3      += (ndteuE*B_delta3E).sum()*POW3(one_over_delta);
    derivs.d3alphar_dtau3        += (ndteuE*B_tau3E).sum()*POW3(one_over_tau);
    derivs.d3alphar_ddelta2_dtau += (ndteuE_B_tauE*B_delta2E).sum()*POW2(one_over_delta)*one_over_tau;
    derivs.d3alphar_ddelta_dtau2 += (ndteuE_B_deltaE*B_tau2E).sum()*one_over_delta*POW2(one_over_tau);
   
    return;
};

int main(){
    time_t t1, t2;
    double s;
    double tau = 0.8, delta = 1.1;
    long N = 100000;
    t1 = clock();
    for (int i = 0; i < N; ++i){
        HelmholtzDerivatives derivs;
        allEigen(tau, delta, derivs);
        s += derivs.alphar;
    }
    t2 = clock();
    double tA = (double)(t2-t1)/CLOCKS_PER_SEC/((double)N)*1e6;
    std::cout << "Time A: " << tA << " " << s << std::endl;

    return 0;
}


On Mon, Aug 18, 2014 at 1:44 PM, Ian Bell <ian.h.bell@xxxxxxxxx> wrote:
Yeah I know, but this function is pretty deeply embedded into a larger program.  I'm going to try to strip it down to something more reasonable with a copy-paste of coefficients, etc to see if I can duplicate this error for a minimal example.


On Mon, Aug 18, 2014 at 1:20 PM, Christoph Hertzberg <chtz@xxxxxxxxxxxxxxxxxxxxxxxx> wrote:
On 18.08.2014 00:06, Ian Bell wrote:
I have a code snippet that looks something like the code at the end of this
email.  I figured a copy/paste was better for future readability rather
than a gist, etc.

Could you please try to reduce your problem to something shorter and compilable? Understanding your problem requires a lot of guess-work, otherwise ....

Christoph



--
----------------------------------------------
Dipl.-Inf., Dipl.-Math. Christoph Hertzberg
Cartesium 0.049
Universität Bremen
Enrique-Schmidt-Straße 5
28359 Bremen

Tel: +49 (421) 218-64252
----------------------------------------------






Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/