Actually, I was running this code online;
most of the time was spent in this function:
// Computes one LSTM step: fills *cur from the input xt and the previous state.
//   xt   - input vector for this step
//   prev - previous state; only its h and c members are read here
//   cur  - state to fill; i, f, o, u, c, cTanh and h are all overwritten
// NOTE(review): ActFunc::logistic / ActFunc::tanh are presumably in-place
// element-wise activations (their return value is never used) - confirm.
void LSTM::forward(const VecD& xt, const LSTM::State* prev, LSTM::State* cur){
  const LSTM::State& before = *prev;
  LSTM::State& now = *cur;

  // Each gate: start from its bias, add the input and recurrent products,
  // then apply the activation.
  now.i = bi;
  now.i.noalias() += Wxi*xt + Whi*before.h;
  ActFunc::logistic(now.i);

  now.f = bf;
  now.f.noalias() += Wxf*xt + Whf*before.h;
  ActFunc::logistic(now.f);

  now.o = bo;
  now.o.noalias() += Wxo*xt + Who*before.h;
  ActFunc::logistic(now.o);

  now.u = bu;
  now.u.noalias() += Wxu*xt + Whu*before.h;
  ActFunc::tanh(now.u);

  // Cell state: element-wise i*u plus f times the previous cell state.
  now.c = now.i.array()*now.u.array() + now.f.array()*before.c.array();

  // Keep the activated cell state in its own field so c itself stays untouched.
  now.cTanh = now.c;
  ActFunc::tanh(now.cTanh);

  // Output: element-wise o times the activated cell state.
  now.h = now.o.array()*now.cTanh.array();
}
I have created a small sample program; its running time also varies somewhat for the same data, but I am not sure whether the cause is the same as for the function above.
#include <iostream>
#include <Eigen/Dense>
#include <Eigen/Core>
#include <sys/time.h>
using Eigen::MatrixXd;
using namespace Eigen;
using namespace Eigen::internal;
using namespace Eigen::Architecture;
using namespace std;
// Returns the current wall-clock time, as reported by gettimeofday(), in
// microseconds since the Unix epoch.
long long myTime()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    // Widen to long long BEFORE multiplying: tv_sec is a time_t, and where
    // time_t is 32 bits wide tv_sec * 1000 * 1000 would overflow.
    return static_cast<long long>(now.tv_sec) * 1000000LL
         + static_cast<long long>(now.tv_usec);
}
// Micro-benchmark comparing three ways of writing "bias + M_a * v_a + M_b * v_b"
// with Eigen:
//   1. plain `+=` on the sum of both products,
//   2. `noalias() +=` on the sum of both products,
//   3. one `noalias() +=` statement per product.
// For each of the 1000 trials it prints the trial number followed by three
// "mean time=" lines (same order as above), each giving the mean time of one
// repetition in microseconds.
int main(int argc, char* argv[])
{
    const int kOut = 128;      // rows of each matrix == size of each result vector
    const int kIn = 1024;      // columns of each matrix == size of each input vector
    const int kRepeats = 10;   // repetitions inside one timed section
    const int kTrials = 1000;  // number of times the whole comparison is run

    MatrixXd m1(kOut, kIn), m2(kOut, kIn), m3(kOut, kIn), m4(kOut, kIn);
    VectorXd v1(kIn), v2(kIn), v3(kIn), v4(kIn);
    for (int j = 0; j < kIn; ++j)
    {
        v1(j) = j * 0.234 + 1;
        v2(j) = j * 0.134 + 0.3;
        v3(j) = j * 0.124 + 0.7;
        v4(j) = j * 0.034 + 0.5;
        for (int i = 0; i < kOut; ++i)
        {
            m1(i, j) = i*0.378 + j*0.024 + 3.2;
            m2(i, j) = i*1.384 + j*0.124 + 0.2;
            m3(i, j) = i*0.778 + j*0.004 + 1.2;
            m4(i, j) = i*0.538 + j*1.024 + 0.7;
        }
    }

    // Written after every timed section so the computed vectors are observably
    // used and the timed work cannot be discarded as dead code.
    volatile double sink = 0.0;

    for (int t = 0; t < kTrials; ++t)
    {
        cout << "\nt=" << t << endl;
        VectorXd x(kOut), y(kOut);

        // The accumulators are seeded with the first kOut entries of v1 / v3.
        // Assigning the full kIn-sized vector would resize x / y to kIn and make
        // the following `+=` of a kOut-sized product a size mismatch.

        // Variant 1: plain += on the sum of two products.
        long long start = myTime();
        for (int i = 0; i < kRepeats; ++i)
        {
            x = v1.head(kOut);
            x += m1 * v1 + m2 * v2;
            y = v3.head(kOut);
            y += m3 * v3 + m4 * v4;
        }
        long long stop = myTime();
        sink = x.sum() + y.sum();
        // Elapsed microseconds divided by the repetition count gives the mean.
        cout << "mean time=" << (stop - start) / kRepeats << endl;

        // Variant 2: noalias() += on the sum of two products.
        start = myTime();
        for (int i = 0; i < kRepeats; ++i)
        {
            x = v1.head(kOut);
            x.noalias() += m1 * v1 + m2 * v2;
            y = v3.head(kOut);
            y.noalias() += m3 * v3 + m4 * v4;
        }
        stop = myTime();
        sink = x.sum() + y.sum();
        cout << "mean time=" << (stop - start) / kRepeats << endl;

        // Variant 3: one noalias() += per product. A fresh start timestamp is
        // taken here so the console output above is not part of the measurement.
        start = myTime();
        for (int i = 0; i < kRepeats; ++i)
        {
            x = v1.head(kOut);
            x.noalias() += m1 * v1;
            x.noalias() += m2 * v2;
            y = v3.head(kOut);
            y.noalias() += m3 * v3;
            y.noalias() += m4 * v4;
        }
        stop = myTime();
        sink = x.sum() + y.sum();
        cout << "mean time=" << (stop - start) / kRepeats << endl;
    }
    return 0;
}