[eigen] [PATCH] preliminary AltiVec support

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


Hello all,

With great help from Gael and Benoit, I've just added preliminary 
AltiVec support (for PowerPC CPUs like the G4, G5, Power6 and Cell; 
on Cell only the PPU part, not the SPUs -- these will follow).

Benchmarks show gains from as little as 65% faster (for 8x8 matrices) 
to 177% faster (for 16x16 matrices). I'm pretty sure I could get more but 
it would require me to learn more about how Eigen works internally :)

Konstantinos
diff -x .svn -ru eigen2.orig/CMakeLists.txt eigen2/CMakeLists.txt
--- eigen2.orig/CMakeLists.txt	2008-05-03 14:06:46.167603863 +0300
+++ eigen2/CMakeLists.txt	2008-04-14 10:54:57.000000000 +0300
@@ -16,6 +16,10 @@
       SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
       MESSAGE("Enabling SSE2 in tests/examples")
     ENDIF(TEST_SSE2)
+    IF(TEST_ALTIVEC)
+      SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
+      MESSAGE("Enabling AltiVec in tests/examples")
+    ENDIF(TEST_ALTIVEC)
   ENDIF(CMAKE_SYSTEM_NAME MATCHES Linux)
 ENDIF(CMAKE_COMPILER_IS_GNUCXX)
 
diff -x .svn -ru eigen2.orig/Eigen/Core eigen2/Eigen/Core
--- eigen2.orig/Eigen/Core	2008-05-03 14:06:46.167603863 +0300
+++ eigen2/Eigen/Core	2008-05-03 14:03:45.850988724 +0300
@@ -8,14 +8,22 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 #endif
+#ifdef __ALTIVEC__  // There is zero chance of both __SSE2__ AND __ALTIVEC__ being defined
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ALTIVEC
+#include <altivec.h>
+// We _need_ to #undef bool as it's defined in <altivec.h> for some reason.
+#undef bool
 #endif
+#endif
+
+#include <cstdlib>
 
 #ifdef EIGEN_VECTORIZE
 // it seems we cannot assume posix_memalign is defined in the stdlib header
 extern "C" int posix_memalign (void **, size_t, size_t) throw ();
 #endif
 
-#include <cstdlib>
 #include <cmath>
 #include <complex>
 #include <cassert>
diff -x .svn -ru eigen2.orig/Eigen/src/Core/PacketMath.h eigen2/Eigen/src/Core/PacketMath.h
--- eigen2.orig/Eigen/src/Core/PacketMath.h	2008-05-03 14:06:46.164269716 +0300
+++ eigen2/Eigen/src/Core/PacketMath.h	2008-05-03 14:02:19.097631134 +0300
@@ -25,6 +25,10 @@
 #ifndef EIGEN_PACKET_MATH_H
 #define EIGEN_PACKET_MATH_H
 
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 		16
+#endif
+
 // Default implementation for types not supported by the vectorization.
 // In practice these functions are provided to make easier the writting
 // of generic vectorized code. However, at runtime, they should never be
@@ -53,6 +57,11 @@
 
 #ifdef EIGEN_VECTORIZE_SSE
 
+#ifdef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#undef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 		16
+#endif
+
 template<> struct ei_packet_traits<float>  { typedef __m128  type; enum {size=4}; };
 template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; };
 template<> struct ei_packet_traits<int>    { typedef __m128i type; enum {size=4}; };
@@ -116,7 +125,89 @@
 inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
 inline int    ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
 
-#endif // EIGEN_VECTORIZE_SSE
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+
+#ifdef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#undef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 		4
+#endif
+
+static const vector int   v0i   = vec_splat_s32(0);    // zero in every lane; signed splat matches the declared 'vector int' type
+static const vector int   v16i_ = vec_splat_s32(-16);  // -16 in every lane; used below as a rotate/shift count (cast to unsigned)
+static const vector float v0f   = (vector float) v0i;  // the all-zero bit pattern viewed as four floats
+
+template<> struct ei_packet_traits<float>  { typedef vector float type; enum {size=4}; };
+template<> struct ei_packet_traits<int>    { typedef vector int type; enum {size=4}; };
+
+inline vector float  ei_padd(const vector float   a, const vector float   b) { return vec_add(a,b); }
+inline vector int    ei_padd(const vector int     a, const vector int     b) { return vec_add(a,b); }
+
+inline vector float  ei_psub(const vector float   a, const vector float   b) { return vec_sub(a,b); }
+inline vector int    ei_psub(const vector int     a, const vector int     b) { return vec_sub(a,b); }
+
+inline vector float  ei_pmul(const vector float   a, const vector float   b) { return vec_madd(a,b, v0f); } // a*b + v0f: product formed via multiply-add with the zero vector as addend
+inline vector int    ei_pmul(const vector int     a, const vector int     b)
+{
+  // Low 32 bits of a*b built from 16-bit halves: lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 16).
+  // The halves are multiplied as unsigned shorts: a signed vec_mulo would sign-extend
+  // any low half >= 0x8000 and corrupt the upper 16 bits of the result.
+  const vector unsigned int zero = (vector unsigned int)v0i, sixteen = (vector unsigned int)v16i_;
+  vector unsigned int bswap, lowProduct, highProduct;
+
+  bswap = vec_rl( (vector unsigned int)b, sixteen );  // swap the 16-bit halves of each lane of b
+  lowProduct = vec_mulo( (vector unsigned short)a, (vector unsigned short)b );
+  highProduct = vec_msum( (vector unsigned short)a, (vector unsigned short)bswap, zero );
+  highProduct = vec_sl( highProduct, sixteen );
+  return (vector int) vec_add( lowProduct, highProduct );
+}
+
+inline vector float ei_pmadd(const vector float   a, const vector float   b, const vector float c) { return vec_madd(a, b, c); }
+
+inline vector float  ei_pmin(const vector float   a, const vector float   b) { return vec_min(a,b); }
+inline vector int    ei_pmin(const vector int     a, const vector int     b) { return vec_min(a,b); }
+
+inline vector float  ei_pmax(const vector float   a, const vector float   b) { return vec_max(a,b); }
+inline vector int    ei_pmax(const vector int     a, const vector int     b) { return vec_max(a,b); }
+
+inline vector float  ei_pload(const float*   from) { return vec_ld(0, from); } // NOTE(review): vec_ld presumably needs 'from' 16-byte aligned -- confirm callers guarantee it
+inline vector int    ei_pload(const int*     from) { return vec_ld(0, from); } // NOTE(review): same alignment assumption as the float overload
+
+inline vector float  ei_pset1(const float&  from)
+{
+  // Stage the scalar in a 16-byte aligned buffer local to this call (a shared static buffer would race between threads).
+  float __attribute__((aligned(16))) af[4] = { from };
+  vector float vc = vec_ld(0, af);
+  vc = vec_splat(vc, 0);  // broadcast element 0 to all four lanes
+  return vc;
+}
+
+inline vector int    ei_pset1(const int&    from)
+{
+  // Stage the scalar in a 16-byte aligned buffer local to this call (a shared static buffer would race between threads).
+  int __attribute__((aligned(16))) ai[4] = { from };
+  vector int vc = vec_ld(0, ai);
+  vc = vec_splat(vc, 0);  // broadcast element 0 to all four lanes
+  return vc;
+}
+
+inline void ei_pstore(float*   to, const vector float   from) { vec_st(from, 0, to); } // NOTE(review): vec_st presumably needs 'to' 16-byte aligned -- confirm callers guarantee it
+inline void ei_pstore(int*     to, const vector int     from) { vec_st(from, 0, to); } // NOTE(review): same alignment assumption as the float overload
+
+inline float  ei_pfirst(const vector float  a)
+{
+  float __attribute__((aligned(16))) af[4];  // per-call aligned scratch for vec_st; not static, so threads never share it
+  vec_st(a, 0, af);
+  return af[0];
+}
+
+inline int    ei_pfirst(const vector int    a)
+{
+  int __attribute__((aligned(16))) ai[4];  // per-call aligned scratch for vec_st; not static, so threads never share it
+  vec_st(a, 0, ai);
+  return ai[0];
+}
+
+#endif // EIGEN_VECTORIZE_ALTIVEC & SSE
 
 #endif // EIGEN_PACKET_MATH_H
 
diff -x .svn -ru eigen2.orig/Eigen/src/Core/Product.h eigen2/Eigen/src/Core/Product.h
--- eigen2.orig/Eigen/src/Core/Product.h	2008-05-03 14:06:46.164269716 +0300
+++ eigen2/Eigen/src/Core/Product.h	2008-05-03 14:02:37.497628306 +0300
@@ -133,8 +133,8 @@
   */
 template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 {
-  enum{ value =  Lhs::MaxRowsAtCompileTime >= 16
-              && Rhs::MaxColsAtCompileTime >= 16
+  enum{ value =  Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+              && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               && (!( (Lhs::Flags&RowMajorBit) && (Rhs::Flags&RowMajorBit xor RowMajorBit)))
               ? CacheOptimalProduct : NormalProduct };
 };


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/