ok, thank you.

I implemented my own too, and I get the following results ('u' stands for unaligned):

Vec3 /Iso3 : 0.000645647
Vec3u/Iso3u: 0.000645586
Vec4 /Iso3 : 0.00126092
Vec4u/Iso3u: 0.000688231
Vec4 /Mat4 : 0.000391356
Vec4u/Mat4u: 0.000981292

Vec3 /Iso3 : 0.000649383
Vec3u/Iso3u: 0.000649382
Vec4 /Iso3 : 0.00126092
Vec4u/Iso3u: 0.000689219
Vec4 /Mat4 : 0.000475642
Vec4u/Mat4u: 0.000979767

obtained with gcc 4.6, -O2 -DNDEBUG.

The only strange result is the case "Vec4 /Iso3", which is unexpectedly twice as slow as "Vec4u/Iso3u". Looking at the assembly, the only difference between the two versions is that in the "Vec4 /Iso3" case the result is first copied into a temporary (one coefficient at a time), which is then copied to the true result location using 2 movaps. Clearly, the compiler should be able to remove this extra copy, as it does in the "Vec4u/Iso3u" case, but here it does not; I do not know why.


On Mon, Dec 17, 2012 at 10:07 AM, Jakob Schwendner <jakob.schwendner@xxxxxxx> wrote:
So I cleaned up the benchmarking that I did a little bit, and added a test to the bench folder, which should be consistent with the rest of the benchmarks there:
I ran the benchmark results on two different systems:
And I still get better results for DontAlign in situations where I wouldn't expect it. Might be my benchmarking, though...

#include <bench/BenchTimer.h>
#include <iostream>
#include <Eigen/Geometry>

using namespace Eigen;

/// Benchmarked operation: evaluates the product `t * v` and assigns it to `res`.
///
/// \param res destination of the product (overwritten).
/// \param t   left-hand operand of the product.
/// \param v   right-hand operand of the product.
///
/// Declared with EIGEN_DONT_INLINE — presumably so that the product is timed
/// as a real out-of-line call instead of being folded into the timing loop.
template<typename R, typename T, typename V>
EIGEN_DONT_INLINE void kernel(R& res, const T& t, const V& v)
{
  res = t * v;
}

template<typename R, typename T, typename V=R> void bench(const std::string& msg)
  R res;
  T A;
  V v;
  BenchTimer t;
  BENCH(t, 100, 100000, kernel(res,A,v));
  std::cout << msg << ": " << << "\n";

template<typename S> void benchall()
  typedef Matrix<S, 3, 1> Vec3;
  typedef Matrix<S, 3, 1, DontAlign> Vec3u;
  typedef Matrix<S, 4, 1> Vec4;
  typedef Matrix<S, 4, 1, DontAlign> Vec4u;
  typedef Transform<S, 3, Isometry> Iso3;
  typedef Transform<S, 3, Isometry, DontAlign> Iso3u;
  typedef Matrix<S, 4, 4> Mat4;
  typedef Matrix<S, 4, 4, DontAlign> Mat4u;
//   bench<Vec3,Iso3>  ("Vec3 /Iso3 ");
//   bench<Vec3u,Iso3u>("Vec3u/Iso3u");
  bench<Vec4,Iso3>  ("Vec4 /Iso3 ");
//   bench<Vec4,Mat4>  ("Vec4 /Mat4 ");
//   bench<Vec4u,Mat4u>("Vec4u/Mat4u");

int main()
  std::cout << "Float:\n";
  std::cout << "\nDouble:\n";
//   benchall<double>();
  return 0;

