On 01/13/2010 10:08 PM, Benoit Jacob wrote:
> 2010/1/13 Mark Borgerding <mark@xxxxxxxxxxxxxx>:
>> I've found a certain amount of loop unrolling very beneficial to speed, even
>> with SIMD.
>> e.g. loading 4 SIMD registers, working on them, then storing the results can
>> be much faster than doing them one at a time.
>
> I see. This kind of unrolling is what we called peeling and I can
> believe that in some cases it brings benefits. Normally I would like
> to keep peeling completely orthogonal to vectorization, [...]
If you mean the definition of "loop peeling" given at
http://en.wikipedia.org/wiki/Loop_splitting, then that is not what I am
talking about.
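
In code, that kind of peeling looks roughly like this: a few scalar
iterations up front (an alignment prologue is one common reason to peel),
one SIMD packet per iteration of the main loop, and a scalar tail. This is
just a sketch; vector_add_peeled is only a name for it, and it reuses the
same ptr2int helper as the full example further down.

#include <xmmintrin.h>
#include <stdint.h>

static inline uintptr_t ptr2int(const void * p) { return (uintptr_t)p; }

// "Peeling" in the loop-splitting sense: run a few scalar iterations up
// front (here, until dst reaches 16-byte alignment), then the vector loop.
void vector_add_peeled(float * dst, const float * src1, const float * src2, int n)
{
    int k = 0;
    while (k < n && (15 & ptr2int(dst + k))) {   // scalar prologue: the "peel"
        dst[k] = src1[k] + src2[k];
        ++k;
    }
    for (; k + 4 <= n; k += 4)   // aligned stores; the sources may still be unaligned
        _mm_store_ps(dst + k,
                     _mm_add_ps(_mm_loadu_ps(src1 + k), _mm_loadu_ps(src2 + k)));
    for (; k < n; ++k)           // scalar epilogue for the leftover tail
        dst[k] = src1[k] + src2[k];
}

What I mean instead is keeping several independent SIMD packets in flight
per iteration of the main loop, as in the UNROLL4 branch below.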
Some example code may illustrate my point. The UNROLL4 SIMD loop below takes
about 25% less time than both the Eigen MapAligned path and the elementwise
SIMD loop.
#include <xmmintrin.h>   // SSE intrinsics
#include <stdint.h>      // uintptr_t
#ifdef USE_EIGEN
# include <Eigen/Core>
using Eigen::VectorXf;
#endif

// helper: pointer -> integer, for the alignment test
static inline uintptr_t ptr2int(const void * p) { return (uintptr_t)p; }

void vector_add(float * dst, const float * src1, const float * src2, int n)
{
    int k = 0;
    bool all_aligned =
        (0 == (15 & (ptr2int(dst) | ptr2int(src1) | ptr2int(src2))));
    if (all_aligned) {
# ifdef USE_EIGEN
        VectorXf::MapAligned(dst,n) =
            VectorXf::MapAligned(src1,n) + VectorXf::MapAligned(src2,n);
        return; // Eigen takes care of the remaining samples after the aligned packets end
# elif defined( UNROLL4 )
        // process 16 floats (4 SIMD packets) per loop iteration
        for (; k+16 <= n; k += 16) {
            __m128 a = _mm_add_ps(_mm_load_ps(src1+k),    _mm_load_ps(src2+k)   );
            __m128 b = _mm_add_ps(_mm_load_ps(src1+k+4),  _mm_load_ps(src2+k+4) );
            __m128 c = _mm_add_ps(_mm_load_ps(src1+k+8),  _mm_load_ps(src2+k+8) );
            __m128 d = _mm_add_ps(_mm_load_ps(src1+k+12), _mm_load_ps(src2+k+12));
            _mm_store_ps(dst+k,    a);
            _mm_store_ps(dst+k+4,  b);
            _mm_store_ps(dst+k+8,  c);
            _mm_store_ps(dst+k+12, d);
        }
# else
        // one SIMD packet (4 floats) at a time
        for (; k+4 <= n; k += 4)
            _mm_store_ps(dst+k, _mm_add_ps(_mm_load_ps(src1+k), _mm_load_ps(src2+k)));
# endif
    }
    // scalar cleanup for whatever is left (everything, if the pointers are unaligned)
    for (; k < n; ++k)
        dst[k] = src1[k] + src2[k];
}
Test specifics:
  n = 512 floats (small enough to fit in cache, so memory speed isn't what's being measured)
  Core 2 CPU
  32-bit Linux, g++ 4.4.2
  -DNDEBUG -O3 -msse -msse2 -msse3
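
A minimal driver of roughly this shape reproduces the setup (this is a
sketch, not the exact harness; the repetition count and the printed values
are just illustrative, and _mm_malloc is used to get 16-byte alignment):

#include <cstdio>
#include <ctime>
#include <xmmintrin.h>   // _mm_malloc / _mm_free for aligned buffers

void vector_add(float * dst, const float * src1, const float * src2, int n);

int main()
{
    const int n = 512;           // fits comfortably in cache
    const int reps = 1000000;    // arbitrary; just enough for a stable timing
    float * a   = (float*)_mm_malloc(n * sizeof(float), 16);
    float * b   = (float*)_mm_malloc(n * sizeof(float), 16);
    float * dst = (float*)_mm_malloc(n * sizeof(float), 16);
    for (int i = 0; i < n; ++i) { a[i] = float(i); b[i] = float(2*i); }

    std::clock_t t0 = std::clock();
    for (int r = 0; r < reps; ++r)
        vector_add(dst, a, b, n);
    std::clock_t t1 = std::clock();

    // print something from dst so the compiler can't discard the work
    std::printf("%.3f s, dst[0]=%g dst[n-1]=%g\n",
                double(t1 - t0) / CLOCKS_PER_SEC, dst[0], dst[n-1]);

    _mm_free(a); _mm_free(b); _mm_free(dst);
    return 0;
}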
For you assembly gurus:
the elementwise SIMD loop and the Eigen MapAligned code both compile to code
like this:
.L6:
# basic block 5
movaps (%eax,%ecx,4), %xmm0 #* src1, tmp110
addps (%edx,%ecx,4), %xmm0 #* src2, tmp110
movaps %xmm0, (%ebx,%ecx,4) # tmp110,* dst
addl $4, %ecx #, index
cmpl %ecx, %esi # index, index
jg .L6 #,
and the unrolled SIMD loop compiles to:
.L8:
# basic block 5
movaps (%edi,%eax,4), %xmm3 #* src1, tmp100
movaps 16(%edi,%eax,4), %xmm2 #, tmp103
movaps 32(%edi,%eax,4), %xmm1 #, tmp106
movaps 48(%edi,%eax,4), %xmm0 #, tmp109
addps (%esi,%eax,4), %xmm3 #* src2, tmp100
addps 16(%esi,%eax,4), %xmm2 #, tmp103
addps 32(%esi,%eax,4), %xmm1 #, tmp106
addps 48(%esi,%eax,4), %xmm0 #, tmp109
movaps %xmm3, (%ebx,%eax,4) # tmp100,* dst
movaps %xmm2, 16(%ebx,%eax,4) # tmp103,
movaps %xmm1, 32(%ebx,%eax,4) # tmp106,
movaps %xmm0, 48(%ebx,%eax,4) # tmp109,
addl $16, %eax #, k
cmpl %edx, %eax # D.54688, k
jne .L8 #,