Re: [eigen] Optimization advice for a specific expression

```--=-=-=
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

Christoph Hertzberg writes:

> On 2016-02-04 17:02, Alberto Luaces wrote:
>>> Other than that: * What exactly do you mean by homogeneous?
>>
>> G =3D [ v3x3 | 0]
>>      [  0=E1=B5=80  | 1]
>>
>> where "v3x3" is a 3x3 matrix, and "0" a vector of 3 zero-valued elements.
>>
>>> Can you make any assumptions on the determinant of G? (Calculating the
>>> det didn't not seem to be the bottleneck, though)
>>
>> No, I cannot, but as you point, removing the determinant computation for
>> testing purposes does not yield any improvement.
>
> Given the above, you have:
>   G.determinant() =3D=3D v3x3.determinant(),
> which should be significantly faster than the full determinant.
> If v3x3 happens to be a rotation matrix, G.determinant()=3D=3D1;
>
>>> Does it have any special form (e.g. low rank, or Identity+lowRank)
>>
>> It is just symmetric:
>>
>> Eplus << 2 , 1 , 1 , 5,
>>           1 , 2 , 1 , 5,
>>           1 , 1 , 2 , 5,
>>           5 , 5 , 5 , 20;
>
> The upper left is Identity + [1,1,1]^T*[1,1,1]. I'll abbreviate V=3Dv3x3.
> If you then define w=3DV*[1,1,1]^T =3D=3D V.rowwise().sum(), the resulting
> summand is (without the determinant factor):
>   [ V*V^T + w*w^T,  5*w ]
>   [ 5*w^T        ,  20  ]
>
> Again, if V happens to be a rotation, then V*V^T=3DIdentity
>
> So with a bit of hand-coding you may gain a lot efficiency here. It
> may be tricky to get it right that vectorization can still be
> used. E.g., instead of V.rowwise().sum() you can calculate Vector4d
> w=3DG.leftCols<3>().rowwise().sum(), which should be near optimal
> [again, check the generated assembly]
> Multiplication by 5 can be done at the end (if I4x4 initially is 0),
> and the lower right element is just 20*{sum of determinants}.

Thank you, Christoph: it certainly works better now, although it seems
that there exist some differences between Eigen versions.  I have
summarized them in the following table:

| Eigen version | General algorithm | Hand-coded algorithm |
| 3.2.7         | 0.10s             | 0.04s                |
| 3.3-beta1     | 0.21s             | 0.15s                |

I am attaching a minimal test case for reference.  The bottleneck lies
on the function InertiaTensor::addFace().  The data from the table were
computed with the compilation flags "-O3 -DNDEBUG".  Eigen 3.3-beta1
reports its version as "3.2.92"

--=-=-=
Content-Type: text/x-c++src; charset=utf-8
Content-Disposition: attachment; filename=main.cpp
Content-Transfer-Encoding: quoted-printable

#include <iostream>
#include <ctime>
#include <chrono>

#include <Eigen/Core>
#include <Eigen/Geometry>

// @InProceedings{Euler2006,
//   Title                    =3D {Fast Computation Of Inertia Through
//                               Affinely Extended Euler And Tensor},
//   Author                   =3D {DiCarlo, Antonio and Paoluzzi, Alberto},
//   Year                     =3D {2006}}

template<class REAL, class INT>
class InertiaTensor
{

public:

// --------------------------------------------------------------------=
--------------------- Constructor
/// Constructor without parameters.
InertiaTensor()
: dens_(0.0)
{
Eplus << 2 , 1 , 1 , 5,
1 , 2 , 1 , 5,
1 , 1 , 2 , 5,
5 , 5 , 5 , 20; // 120 * E+
}

// --------------------------------------------------------------------=
----------------------- Accessors
/// Set the density of the volume.
/// \param dens         Density of the volume
inline void setDensity(const REAL dens){ dens_ =3D dens; };

/// Returns the mass of the volume
/// \return The volume mass, a scalar
inline REAL  getMass()  { return mass_; };

/// Returns the position of the centre of mass of the volume (defined i=
n the same coordinates as the mesh)
/// \return A 3x1 array
inline REAL* getrG()  { return rG_; };

/// Returns the inertia tensor of the volume about its centre of mass (=
defined in the same coordinates as the mesh)
/// \return A 3x3 array
inline REAL* getInertiaTensor()  { return J_; };

// --------------------------------------------------------------------=
----------------------- Begin Vol
/// Starts the evaluation of the volumetric properties of a new volume.
/// Clears the integrals, mass, centre of mass position, and inertia te=
nsor.
void beginVolume()
{
InertiaTensor4x4.setZero();
}

// --------------------------------------------------------------------=
------------------------- End Vol
/// Finalizes the volume integration and evaluates the physical propert=
ies.
void endVolume()
{
computeMassProperties();
}

// --------------------------------------------------------------------=
/// Adds a new face to the integration process.
/// \param vertex_data  A 3xnum_vertices array with the x, y, and z mes=
h coordinates of the face vertices.
/// \param normal_data  A 3x1 array with the x, y, and z mesh coordinat=
es of the face normal. May be set to NULL if the face is a triangle.
/// \param num_vertices     Number of vertices in the face.
void addFace(const REAL* vertex_data, const REAL* normal_data, const IN=
T num_vertices)
{
// Check that we have at least 2 vertices per face
assert(num_vertices =3D=3D 3);

Eigen::Map<const Eigen::Matrix<REAL, 3, 3>> v(vertex_data);
Eigen::Matrix<REAL, 4, 4> G;

// G =3D [v0 v1 v2 0]
//     [ 0  0  0 1]
G.col(3).setZero();
G.row(3).setZero();
G(3, 3) =3D 1.0;

Eigen::Matrix<REAL, 4, 4>  temp;
Eigen::Vector4d w =3D G.template leftCols<3>().rowwise().sum();
temp << G.template block<3, 3>(0, 0) * G.template block<3, 3>(0, 0).transp=

// I4x4 =3D det(G) G E+ G=E1=B5=80
InertiaTensor4x4.noalias() +=3D  temp * (G.template block<3,3>(0,0).determ=
inant() / 120.0);

// Initial implementation
// InertiaTensor4x4.noalias() +=3D (G.template block<3,3>(0,0).determinant=
() / 120.0) * G * Eplus * G.transpose();
}

private:

REAL dens_;             // Density
REAL mass_;             // Mass of the volume
REAL rG_[3];        // Position of the centre of mass
REAL J_[9];             // Inertia tensor

Eigen::Matrix<REAL, 4, 4> Eplus, InertiaTensor4x4;

// --------------------------------------------------------------------=
--------- Compute mass properties
void computeMassProperties()
{
// Evaluate mass
mass_ =3D InertiaTensor4x4(3, 3) * dens_;
}
};

template<class F>
void massProps(const char *object, const char* method, unsigned int niters,=
double dens, double *mass1, double *J1, double *rg1, int nf, const F &feva=
l) {
std::cout << "--------------------------------------------------" << std:=
:endl;
std::cout << object << ": " << nf << " faces." << std::endl;
std::cout << "--------------------------------------------------" << std:=
:endl;

std::chrono::high_resolution_clock::time_point t1 =3D std::chrono::high_r=
esolution_clock::now();
for (unsigned int iters =3D 0; iters < niters; iters++)
{
feval();
}
std::chrono::high_resolution_clock::time_point t2 =3D std::chrono::high_r=
esolution_clock::now();
std::chrono::duration<double> time_span1 =3D std::chrono::duration_cast<s=
td::chrono::duration<double>>(t2 - t1);

std::cout << object << " - " << method << std::endl;
std::cout << "Elapsed time: " << time_span1.count() << " seconds." << std=
::endl;
std::cout << "Total mass: " << *mass1 << std::endl;
std::cout << std::endl;
}

template <class DT>
int computeMassPropertiesIT(DT *v, int *i, int nf, const double dens,=20
double *mass, double *r, double *J)
{
InertiaTensor<double, int> vp;
vp.setDensity(dens);

double faceverts[9];

Eigen::Map<Eigen::Matrix<DT, 3 , Eigen::Dynamic>> verts(v, 3, *std::max=
_element(i, i + nf) + 1);

vp.beginVolume();

for (int face =3D 0; face < nf; face++)
{
Eigen::Map<Eigen::Matrix3d> points(faceverts);

points.col(0) =3D verts.col(i[face * 3 + 0]).template cast<double>();
points.col(1) =3D verts.col(i[face * 3 + 1]).template cast<double>();
points.col(2) =3D verts.col(i[face * 3 + 2]).template cast<double>();

}

vp.endVolume();

*mass =3D vp.getMass();
Eigen::Map<Eigen::Vector3d>(r, 3, 1) =3D Eigen::Map<Eigen::Vector3d>(vp=
..getrG(), 3, 1);
Eigen::Map<Eigen::Matrix3d>(J, 3, 3) =3D Eigen::Map<Eigen::Matrix3d>(vp=
..getInertiaTensor(), 3, 3);
}

int main(int argc, char *argv[])
{
// Definition of a tetrahedron ___________________________________________=
_________________

// Number of faces
int nf =3D 4;

// Array of normals
double n[12] =3D {
1.0 / sqrt(3.0), 1.0 / sqrt(3.0), 1.0 / sqrt(3.0),
0.0, -1.0, 0.0,
0.0, 0.0, -1.0,
-1.0, 0.0, 0.0,
};

// Array of vertices (x, y, z coordinates of each vertex)
double v[12] =3D {
0.0, 0.0, 0.0,
1.0, 0.0, 0.0,
0.0, 1.0, 0.0,
0.0, 0.0, 1.0
};

// Array of vertex indices (vertices in each face)
int i[12] =3D {
1, 2, 3,
0, 1, 3,=20
0, 2, 1,=20
0, 3, 2
};

// Evaluation of mass properties _________________________________________=
_________________

double rg1[3], J1[9];
double mass1;
const double dens =3D 7780.0;

const unsigned int niters_pyramid =3D 1000000;

std::cout << "Using Eigen version: " << EIGEN_WORLD_VERSION << "." << EIGE=
N_MAJOR_VERSION << "." << EIGEN_MINOR_VERSION << "\n";

massProps("PYRAMID", "DICARLO", niters_pyramid, dens, &mass1, J1, rg1, nf,
[&](){ computeMassPropertiesIT(v, i, nf, dens, &mass1, rg1, J1);});

return 0;
}

--=-=-=
Content-Type: text/plain
Content-Disposition: inline

Regards,

Alberto

--=-=-=--

```

 Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/