Following up.
I had missed the reference to the Goto paper on blocking matrix products, which sits far above the implementation in the source. After studying that paper and some of Eigen's source, I now know what's happening - or at least what should be happening - though the actual implementation in Eigen still loses me.
But I did find out about GeneralProduct, the product_type_selector, and CoeffBasedProductMode - which seems to be the naive C implementation I was after. So I intend to try things with the following patch - Gael, am I on the right path in terms of option and strategy?
diff -r c04c7f10894d Eigen/src/Core/GeneralProduct.h
--- a/Eigen/src/Core/GeneralProduct.h Mon Sep 05 17:14:20 2016 +0200
+++ b/Eigen/src/Core/GeneralProduct.h Mon Sep 05 18:00:29 2016 -0400
@@ -81,6 +81,16 @@
* This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
// FIXME I'm not sure the current mapping is the ideal one.
template<int M, int N> struct product_type_selector<M,N,1> { enum { ret = OuterProduct }; };
+#ifdef EIGEN_USE_COEFF_BASED_PRODUCTS
+template<int M, int N, int D> struct product_type_selector<M, N, Depth> { enum { ret = CoeffBasedProductMode }; };
+template<int M> struct product_type_selector<M, 1, 1> { enum { ret = LazyCoeffBasedProductMode }; };
+template<int N> struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; };
+template<> struct product_type_selector<Small, Small, 1> { enum { ret = LazyCoeffBasedProductMode }; };
+template<> struct product_type_selector<Small, Large, 1> { enum { ret = LazyCoeffBasedProductMode }; };
+template<> struct product_type_selector<Large, Small, 1> { enum { ret = LazyCoeffBasedProductMode }; };
+template<int Depth> struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; };
+template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; };
+#else
template<int M> struct product_type_selector<M, 1, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<int N> struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<int Depth> struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; };
@@ -104,6 +114,7 @@
template<> struct product_type_selector<Large,Small,Small> { enum { ret = CoeffBasedProductMode }; };
template<> struct product_type_selector<Small,Large,Small> { enum { ret = CoeffBasedProductMode }; };
template<> struct product_type_selector<Large,Large,Small> { enum { ret = GemmProduct }; };
+#endif
} // end namespace internal
But there is also CBLAS's way of doing matrix multiplications - which is to distribute each coefficient of the rhs term, in a single pass, to all the locations where it is a term. Maybe this is EIGEN_USE_DISTRIBUTE_BY_RHS_COEFF_PRODUCTS? I'm not sure I can do a packetmath implementation of this, but its purpose is validation with the present API, not speed.
-Jason