Ok, here is a working example. Its a bit cludgy due to the copy/paste, but here goes. The emitted ASM is 8MB, I can send it if you would like. It takes pretty close to the same amount of runtime as my fully-integrated solution, so I think this is representative of the problem
This code takes around 3 us/call with Eigen, an STL solution (not shown), takes 0.9 us/call. Compiled in Visual Studio 2010 Release mode, x64, /O2, /arch:AVX
The base VS flags are:
/I"C:\Users\Belli\Documents\Code\coolprop-v5-new\externals\Eigen" /I"C:\Program Files (x86)\Visual Leak Detector\include" /Zi /nologo /W3 /WX- /O2 /Oi /GL /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Gm- /EHsc /GS /Gy /fp:precise /Zc:wchar_t /Zc:forScope /Fp"x64\Release\eigen_looping.pch" /Fa"x64\Release\" /Fo"x64\Release\" /Fd"x64\Release\vc100.pdb" /Gd /errorReport:queue
Also, it takes about a minute to compile this example, which is pretty darn slow...
#include "Eigen/Core"
#include "time.h"
#include <cmath>
#include <iostream>
double beta2[18] = {0,0,0,0,0,0,0,0,0,0,0,2.33, 3.47, 3.15, 3.19, 0.92, 18.8, 547.8};
double epsilon2[18] = {0,0,0,0,0,0,0,0,0,0,0,1.283, 0..6936, 0.788, 0.473, 0.8577, 0.271, 0.948};
double eta2[18] = {0,0,0,0,0,0,0,0,0,0,0,0.963, 1.977, 1.917, 2.307, 2.546, 3.28, 14.6};
double gamma2[18] = {0,0,0,0,0,0,0,0,0,0,0,0.684, 0.829, 1.419, 0.817, 1.5, 1.426, 1.093};
double d[18] = {4, 1, 1, 2, 2, 1, 3, 6, 6, 2, 3 , 1, 1, 1, 2, 2, 4, 1};
double l[18] = {0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2,0,0,0,0,0,0,0};
double c[18] = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0};
double n[18] = {0.042910051, 1.7313671, -2.4516524, 0.34157466, -0.46047898, -0..66847295, 0.20889705, 0.19421381, -0.22917851, -0.60405866, 0.066680654, 0..017534618, 0.33874242, 0.22228777, -0.23219062, -0.09220694, -0.47575718, -0.017486824};
double t[18] = {1, 0.33, 0.8, 0.43, 0.9, 2.46, 2.09, 0.88, 1.09, 3.25, 4.62, 0.76, 2.5, 2.75, 3.05, 2.55, 8.4, 6.75};
double beta1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};
double eta1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};
double epsilon1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};
double gamma1[18] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0};
Eigen::Map<Eigen::ArrayXd> nE(n,18);
Eigen::Map<Eigen::ArrayXd> dE(d,18);
Eigen::Map<Eigen::ArrayXd> tE(t,18);
Eigen::Map<Eigen::ArrayXd> lE(l,18);
Eigen::Map<Eigen::ArrayXd> cE(c,18);
Eigen::Map<Eigen::ArrayXd> beta1E(beta1,18);
Eigen::Map<Eigen::ArrayXd> beta2E(beta2,18);
Eigen::Map<Eigen::ArrayXd> eta1E(eta1,18);
Eigen::Map<Eigen::ArrayXd> eta2E(eta2,18);
Eigen::Map<Eigen::ArrayXd> epsilon1E(epsilon1,18);
Eigen::Map<Eigen::ArrayXd> epsilon2E(epsilon2,18);
Eigen::Map<Eigen::ArrayXd> gamma1E(gamma1,18);
Eigen::Map<Eigen::ArrayXd> gamma2E(gamma2,18);
#define POW2(x) (x)*(x)
#define POW3(x) (x)*(x)*(x)
struct HelmholtzDerivatives
{
double alphar, dalphar_ddelta, dalphar_dtau, d2alphar_ddelta2, d2alphar_dtau2, d2alphar_ddelta_dtau,
d3alphar_ddelta3, d3alphar_ddelta_dtau2, d3alphar_ddelta2_dtau, d3alphar_dtau3;
void reset(){alphar = 0; dalphar_ddelta = 0; dalphar_dtau = 0; d2alphar_ddelta2 = 0; d2alphar_dtau2 = 0; d2alphar_ddelta_dtau = 0;
d3alphar_ddelta3 = 0; d3alphar_ddelta_dtau2 = 0; d3alphar_ddelta2_dtau = 0; d3alphar_dtau3 = 0;
}
HelmholtzDerivatives(){reset();};
};
void allEigen(double &tau, double &delta, HelmholtzDerivatives &derivs) throw()
{
double log_tau = log(tau), log_delta = log(delta),
_one_over_delta_ = 1/delta, _one_over_tau_ = 1/tau; // division is much slower than multiplication, so do one division here
Eigen::ArrayXd delta_li; delta_li.resize(18);
for (std::size_t i = 0; i < 18; ++i)
{
delta_li[i] = pow(delta, l[i]);
}
Eigen::ArrayXd uE = -cE*delta_li - eta1E*(delta-epsilon1E) - eta2E*(delta-epsilon2E).square() - beta2E*(tau-gamma2E).square() - beta1E*(tau-gamma1E) ;
Eigen::ArrayXd du_ddeltaE = -cE*lE*delta_li*one_over_delta -eta1E - 2.0*eta2E*(delta-epsilon2E);
Eigen::ArrayXd d2u_ddelta2E = -cE*lE*(lE-1)*delta_li*one_over_delta*one_over_delta - 2*eta2E;
Eigen::ArrayXd d3u_ddelta3E = -cE*lE*(lE-1)*(lE-2)*delta_li*one_over_delta*one_over_delta*one_over_delta;
Eigen::ArrayXd du_dtauE = -2.0*beta2E*(tau-gamma2E) - beta1E;
Eigen::ArrayXd d2u_dtau2E = -2.0*beta2E;
Eigen::ArrayXd d3u_dtau3E = 0*beta2E;
Eigen::ArrayXd ndteuE = nE*exp(tE*log_tau + dE*log_delta + uE);
Eigen::ArrayXd B_deltaE = delta*du_ddeltaE + dE;
Eigen::ArrayXd B_tauE = tau*du_dtauE + tE;
Eigen::ArrayXd B_delta2E = delta*delta*d2u_ddelta2E + (delta*du_ddeltaE + dE).square() + dE;
Eigen::ArrayXd B_tau2E = tau*tau*d2u_dtau2E + (tau*du_dtauE + tE).square() + tE;
Eigen::ArrayXd B_delta3E = POW3(delta)*d3u_ddelta3E + 3*dE*POW2(delta)*d2u_ddelta2E+3*POW3(delta)*d2u_ddelta2E*du_ddeltaE+3*dE*(delta*du_ddeltaE).square()+3*dE*(dE-1)*delta*du_ddeltaE+dE*(dE-1)*(dE-2)+POW3(delta)*du_ddeltaE.cube();
Eigen::ArrayXd B_tau3E = POW3(tau)*d3u_dtau3E + 3*tE*POW2(tau)*d2u_dtau2E+3*POW3(tau)*d2u_dtau2E*du_dtauE+3*tE*(tau*du_dtauE).square()+3*tE*(tE-1)*tau*du_dtauE+tE*(tE-1)*(tE-2)+POW3(tau)*du_dtauE.cube();
Eigen::ArrayXd ndteuE_B_deltaE = ndteuE*B_deltaE;
Eigen::ArrayXd ndteuE_B_tauE = ndteuE*B_tauE;
derivs.alphar += ndteuE.sum();
derivs.dalphar_ddelta += (ndteuE_B_deltaE).sum()*one_over_delta;
derivs.dalphar_dtau += (ndteuE_B_tauE).sum()*one_over_tau;
derivs.d2alphar_ddelta2 += (ndteuE*B_delta2E).sum()*POW2(one_over_delta);
derivs.d2alphar_dtau2 += (ndteuE*B_tau2E).sum()*POW2(one_over_tau);
derivs.d2alphar_ddelta_dtau += (ndteuE_B_deltaE*B_tauE).sum()*one_over_delta*one_over_tau;
derivs.d3alphar_ddelta3 += (ndteuE*B_delta3E).sum()*POW3(one_over_delta);
derivs.d3alphar_dtau3 += (ndteuE*B_tau3E).sum()*POW3(one_over_tau);
derivs.d3alphar_ddelta2_dtau += (ndteuE_B_tauE*B_delta2E).sum()*POW2(one_over_delta)*one_over_tau;
derivs.d3alphar_ddelta_dtau2 += (ndteuE_B_deltaE*B_tau2E).sum()*one_over_delta*POW2(one_over_tau);
return;
};
int main(){
time_t t1, t2;
double s;
double tau = 0.8, delta = 1.1;
long N = 100000;
t1 = clock();
for (int i = 0; i < N; ++i){
HelmholtzDerivatives derivs;
allEigen(tau, delta, derivs);
s += derivs.alphar;
}
t2 = clock();
double tA = (double)(t2-t1)/CLOCKS_PER_SEC/((double)N)*1e6;
std::cout << "Time A: " << tA << " " << s << std::endl;
return 0;
}