[eigen] Vectorized(SSE) integer multiplication

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


Hi,

This file has my vectorized implementation (SSE2) of multiplication of
4 integers. The Eigen routine was taken from the packetMath.h file. The
benchmarks show a small but noticeable difference.

~/Documents/numerical@rpg> g++ vec4i_mul.cpp -msse3 -O3 -march=native
~/Documents/numerical@rpg> ./a.out > /dev/null
1236491601 ei mul begins
1236491618 ei mul ends
1236491618my mul
1236491633 end

The macros could be defined better, I admit. They were taken from my
implementation of vec4i multiplication, which I wrote for my own needs
earlier. They are the same as for the quaternion routine I sent earlier.
So please consider unifying them.

BTW, the multiplication instruction that you (and I) are using does
only unsigned multiplication. Signed multiplication is available as a
single instruction in SSE4.1, so a small patch could be added for that
too. The exact intrinsic is _mm_mul_epi32. My CPU doesn't have that,
so I can't test it.

Regards,
-- 
Rohit Garg

http://rpg-314.blogspot.com/

Senior Undergraduate
Department of Physics
Indian Institute of Technology
Bombay
#include<iostream>
#include<ctime>
#include<pmmintrin.h>
using namespace std;

const int testCount=25000;

// Packs four 2-bit lane selectors into one 8-bit shuffle immediate:
// p lands in bits 7:6, q in bits 5:4, r in bits 3:2 and s in bits 1:0.
// Every argument is parenthesized before shifting so that expression
// arguments (for example `a|b` or `c ? 1 : 2`) are shifted as a whole
// instead of being split apart by operator precedence.
#define VECTOR4UI_SHUFFLE_MASK(p,q,r,s) ((((p)<<6)|((q)<<4)|((r)<<2)|(s)))

// Reorders the four 32-bit lanes of v: result lane 0 is taken from source
// lane p, lane 1 from q, lane 2 from r and lane 3 from s. The selectors end
// up in the shuffle instruction's immediate operand, so they are expected to
// be compile-time constants in the range 0..3.
#define vector4ui_swizzle(v,p,q,r,s) \
    (_mm_shuffle_epi32((v), _MM_SHUFFLE((s),(r),(q),(p))))

// Lane-wise product of two vectors of four 32-bit integers, keeping the low
// 32 bits of every product.
//
// _mm_mul_epu32 multiplies only lanes 0 and 2 and yields two 64-bit results,
// so the work is done in two passes: one on the inputs as given (lanes 0, 2)
// and one on the inputs shifted down by a lane (lanes 1, 3). Each pass is
// masked to its low 32-bit halves and the two passes are then interleaved.
inline __m128i ei_mul(const __m128i a, const __m128i b)
{
  // All bits set in lanes 0 and 2, clear in lanes 1 and 3.
  const __m128i keep_low_halves = _mm_setr_epi32(-1, 0, -1, 0);

  // [a0*b0, 0, a2*b2, 0]
  const __m128i even_lanes =
    _mm_and_si128(_mm_mul_epu32(a, b), keep_low_halves);

  // Shifting right by 4 bytes moves lanes 1 and 3 into positions 0 and 2,
  // giving [a1*b1, 0, a3*b3, 0] after masking.
  const __m128i a_odd = _mm_srli_si128(a, 4);
  const __m128i b_odd = _mm_srli_si128(b, 4);
  const __m128i odd_lanes =
    _mm_and_si128(_mm_mul_epu32(a_odd, b_odd), keep_low_halves);

  // Move the odd products back up one lane and merge: [p0, p1, p2, p3].
  return _mm_or_si128(even_lanes, _mm_slli_si128(odd_lanes, 4));
}

// Lane-wise product of two vectors of four 32-bit integers, keeping the low
// 32 bits of every product. Uses two _mm_mul_epu32 calls (which only multiply
// lanes 0 and 2) and gathers the four low halves with shuffles.
inline __m128i my_mul(const __m128i v1, const __m128i v2)
    {
    // 64-bit products of lanes 0 and 2; their low halves sit in lanes 0 and 2.
    const __m128i evenProducts=_mm_mul_epu32(v1,v2);

    // Swap neighbouring lanes so the original lanes 1 and 3 land in
    // positions 0 and 2, then multiply those the same way.
    const __m128i oddProducts=_mm_mul_epu32(
        vector4ui_swizzle(v1,1,0,3,2),
        vector4ui_swizzle(v2,1,0,3,2));

    // Pick lanes 2 and 0 from each product vector. With p_i denoting the low
    // 32 bits of v1[i]*v2[i], the gathered order is [p2, p0, p3, p1].
    const __m128 gathered=_mm_shuffle_ps(
        _mm_castsi128_ps(evenProducts),
        _mm_castsi128_ps(oddProducts),
        VECTOR4UI_SHUFFLE_MASK(0,2,0,2));

    // Put the products back in lane order: [p0, p1, p2, p3].
    return vector4ui_swizzle(_mm_castps_si128(gathered),1,3,0,2);
    }

// Scratch storage used by print(). It is 16-byte aligned because print()
// fills it with _mm_store_si128, which needs an aligned destination.
int __attribute__((aligned(16))) printBuf[4];

// Writes the four 32-bit lanes of a to standard output as ints separated by
// single spaces, followed by a newline and a flush. Goes through the shared
// printBuf buffer.
void print( __m128i a)
    {
    _mm_store_si128(reinterpret_cast<__m128i*>(printBuf),a);
    for(int lane=0; lane<3; ++lane)
        std::cout<<printBuf[lane]<<' ';
    std::cout<<printBuf[3]<<std::endl;
    }

int main()
    {
    __m128i num1[testCount], num2[testCount], out[testCount];
    int i,j;
    for(i=0; i<testCount; i++)
	{
	num1[i]=_mm_setr_epi32(i,i+1,i+2,i+3);
	num2[i]=_mm_setr_epi32(i+3,i+2,i+1,i);
	}
    cerr<<time(NULL)<<" ei mul begins\n";
    for(j=0;j<testCount; j++)
	for(i=0; i<testCount; i++)
	    {
		out[i]=ei_mul(num1[i],num2[i]);
	    }
    cerr<<time(NULL)<<" ei mul ends\n";
    for(i=0;i<testCount; i++)
	print(out[i]);
    cerr<<time(NULL)<<"my mul\n";
    for(j=0;j<testCount; j++)
	for(i=0; i<testCount; i++)
	    {
		out[i]=my_mul(num1[i],num2[i]);
	    }
    cerr<<time(NULL)<<" end\n";
    for(i=0;i<testCount; i++)
	print(out[i]);
    return 0;
    }


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/