[eigen] [PATCH] preliminary AltiVec support |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/eigen Archives
]
Hello all,
With great help from Gael and Benoit, I've just added preliminary
AltiVec support (for PowerPC CPUs like the G4, G5, Power6 and Cell --
the PPU part, not the SPUs; these will follow).
Benchmarks show gains ranging from as little as 65% faster (for 8x8
matrices) to 177% faster for 16x16 matrices. I'm pretty sure I could get
more, but it would require me to learn more about how Eigen works internally :)
Konstantinos
diff -x .svn -ru eigen2.orig/CMakeLists.txt eigen2/CMakeLists.txt
--- eigen2.orig/CMakeLists.txt 2008-05-03 14:06:46.167603863 +0300
+++ eigen2/CMakeLists.txt 2008-04-14 10:54:57.000000000 +0300
@@ -16,6 +16,10 @@
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
MESSAGE("Enabling SSE2 in tests/examples")
ENDIF(TEST_SSE2)
+ IF(TEST_ALTIVEC)
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
+ MESSAGE("Enabling AltiVec in tests/examples")
+ ENDIF(TEST_ALTIVEC)
ENDIF(CMAKE_SYSTEM_NAME MATCHES Linux)
ENDIF(CMAKE_COMPILER_IS_GNUCXX)
diff -x .svn -ru eigen2.orig/Eigen/Core eigen2/Eigen/Core
--- eigen2.orig/Eigen/Core 2008-05-03 14:06:46.167603863 +0300
+++ eigen2/Eigen/Core 2008-05-03 14:03:45.850988724 +0300
@@ -8,14 +8,22 @@
#include <emmintrin.h>
#include <xmmintrin.h>
#endif
+#ifdef __ALTIVEC__ // There is zero chance of both __SSE2__ AND __ALTIVEC__ being defined
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ALTIVEC
+#include <altivec.h>
+// We _need_ to #undef bool as it's defined in <altivec.h> for some reason.
+#undef bool
#endif
+#endif
+
+#include <cstdlib>
#ifdef EIGEN_VECTORIZE
// it seems we cannot assume posix_memalign is defined in the stdlib header
extern "C" int posix_memalign (void **, size_t, size_t) throw ();
#endif
-#include <cstdlib>
#include <cmath>
#include <complex>
#include <cassert>
diff -x .svn -ru eigen2.orig/Eigen/src/Core/PacketMath.h eigen2/Eigen/src/Core/PacketMath.h
--- eigen2.orig/Eigen/src/Core/PacketMath.h 2008-05-03 14:06:46.164269716 +0300
+++ eigen2/Eigen/src/Core/PacketMath.h 2008-05-03 14:02:19.097631134 +0300
@@ -25,6 +25,10 @@
#ifndef EIGEN_PACKET_MATH_H
#define EIGEN_PACKET_MATH_H
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
+#endif
+
// Default implementation for types not supported by the vectorization.
// In practice these functions are provided to make easier the writting
// of generic vectorized code. However, at runtime, they should never be
@@ -53,6 +57,11 @@
#ifdef EIGEN_VECTORIZE_SSE
+#ifdef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#undef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
+#endif
+
template<> struct ei_packet_traits<float> { typedef __m128 type; enum {size=4}; };
template<> struct ei_packet_traits<double> { typedef __m128d type; enum {size=2}; };
template<> struct ei_packet_traits<int> { typedef __m128i type; enum {size=4}; };
@@ -116,7 +125,89 @@
inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
inline int ei_pfirst(const __m128i& a) { return _mm_cvtsi128_si32(a); }
-#endif // EIGEN_VECTORIZE_SSE
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+
+#ifdef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#undef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#endif
+
+static const vector int v0i = vec_splat_u32(0);
+static const vector int v16i_ = vec_splat_u32(-16);
+static const vector float v0f = (vector float) v0i;
+
+template<> struct ei_packet_traits<float> { typedef vector float type; enum {size=4}; };
+template<> struct ei_packet_traits<int> { typedef vector int type; enum {size=4}; };
+
+inline vector float ei_padd(const vector float a, const vector float b) { return vec_add(a,b); }
+inline vector int ei_padd(const vector int a, const vector int b) { return vec_add(a,b); }
+
+inline vector float ei_psub(const vector float a, const vector float b) { return vec_sub(a,b); }
+inline vector int ei_psub(const vector int a, const vector int b) { return vec_sub(a,b); }
+
+inline vector float ei_pmul(const vector float a, const vector float b) { return vec_madd(a,b, v0f); }
+inline vector int ei_pmul(const vector int a, const vector int b)
+{
+  // Taken from http://
+  // Classic AltiVec has no 32-bit integer multiply, so build it from 16-bit halves:
+  //   a*b mod 2^32 = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 16)
+  vector int bswap, lowProduct, highProduct;
+  // Rotate by 16 (only the low 5 bits of v16i_ count) to swap the halves of b.
+  bswap = (vector int)vec_rl( (vector unsigned int)b, (vector unsigned int)v16i_ );
+  // lo*lo must be an unsigned 16x16->32 product; a signed one corrupts the upper half.
+  lowProduct = (vector int)vec_mulo( (vector unsigned short)a, (vector unsigned short)b );
+  // Only the low 16 bits of this sum survive the shift, so signedness is irrelevant here.
+  highProduct = vec_msum((vector short)a,(vector short)bswap, v0i);
+  highProduct = (vector int)vec_sl( (vector unsigned int)highProduct, (vector unsigned int)v16i_ );
+  return vec_add( lowProduct, highProduct );
+}
+
+inline vector float ei_pmadd(const vector float a, const vector float b, const vector float c) { return vec_madd(a, b, c); }
+
+inline vector float ei_pmin(const vector float a, const vector float b) { return vec_min(a,b); }
+inline vector int ei_pmin(const vector int a, const vector int b) { return vec_min(a,b); }
+
+inline vector float ei_pmax(const vector float a, const vector float b) { return vec_max(a,b); }
+inline vector int ei_pmax(const vector int a, const vector int b) { return vec_max(a,b); }
+
+inline vector float ei_pload(const float* from) { return vec_ld(0, from); }
+inline vector int ei_pload(const int* from) { return vec_ld(0, from); }
+
+// Broadcasts the scalar `from` into all four lanes of a vector float.
+inline vector float ei_pset1(const float& from)
+{
+  // vec_ld needs a 16-byte aligned source. GCC requires the double-parenthesis
+  // attribute form; a non-static buffer avoids sharing scratch space across threads.
+  float __attribute__((aligned(16))) af[4];
+  af[0] = from;
+  return vec_splat(vec_ld(0, af), 0); // only lane 0 is read, then replicated
+}
+
+// Broadcasts the scalar `from` into all four lanes of a vector int.
+inline vector int ei_pset1(const int& from)
+{
+  // vec_ld needs a 16-byte aligned source. GCC requires the double-parenthesis
+  // attribute form; a non-static buffer avoids sharing scratch space across threads.
+  int __attribute__((aligned(16))) ai[4];
+  ai[0] = from;
+  return vec_splat(vec_ld(0, ai), 0); // only lane 0 is read, then replicated
+}
+
+inline void ei_pstore(float* to, const vector float from) { vec_st(from, 0, to); }
+inline void ei_pstore(int* to, const vector int from) { vec_st(from, 0, to); }
+
+inline float ei_pfirst(const vector float a) // returns lane 0 of `a`
+{
+  float __attribute__((aligned(16))) af[4]; // aligned, non-static scratch for vec_st
+  vec_st(a, 0, af);
+  return af[0];
+}
+
+inline int ei_pfirst(const vector int a) // returns lane 0 of `a`
+{
+  int __attribute__((aligned(16))) ai[4]; // aligned, non-static scratch for vec_st
+  vec_st(a, 0, ai);
+  return ai[0];
+}
+
+#endif // EIGEN_VECTORIZE_ALTIVEC & SSE
#endif // EIGEN_PACKET_MATH_H
diff -x .svn -ru eigen2.orig/Eigen/src/Core/Product.h eigen2/Eigen/src/Core/Product.h
--- eigen2.orig/Eigen/src/Core/Product.h 2008-05-03 14:06:46.164269716 +0300
+++ eigen2/Eigen/src/Core/Product.h 2008-05-03 14:02:37.497628306 +0300
@@ -133,8 +133,8 @@
*/
template<typename Lhs, typename Rhs> struct ei_product_eval_mode
{
- enum{ value = Lhs::MaxRowsAtCompileTime >= 16
- && Rhs::MaxColsAtCompileTime >= 16
+ enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+ && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
&& (!( (Lhs::Flags&RowMajorBit) && (Rhs::Flags&RowMajorBit xor RowMajorBit)))
? CacheOptimalProduct : NormalProduct };
};