[eigen] Vectorized(SSE) integer multiplication |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/eigen Archives
]
- To: eigen@xxxxxxxxxxxxxxxxxxx
- Subject: [eigen] Vectorized(SSE) integer multiplication
- From: Rohit Garg <rpg.314@xxxxxxxxx>
- Date: Sun, 8 Mar 2009 11:32:36 +0530
- Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:mime-version:received:date:message-id:subject :from:to:content-type; bh=sanad5a9eF/86Ep/+mAekTUQ4bKpAs91P1I/Dkt+ZnU=; b=nQID/U9SG+uSHbmn6Rl1RKZUzQVe4mw2oph9VU+StVYWh3Mo9Uagm69E2+HHqT6Gum sRfxJQeinXe0X/Wf4wlMnyqC6pRUKv6JTPpEwaRy63a90ZTZJGzRma2vV9ARXif6GmBa KKl9k7owgB+ujONjcAlbM2pfV4Yyf7TRjkwdI=
- Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=mime-version:date:message-id:subject:from:to:content-type; b=X+EXSC0jcUNubJZR2z5FEbsRhu9YefK3B8ow9TZVqRFDayDSw9Srtbn7EVj+YP1Gnp ynFsVhJNWOfCPF5F13jYgK+24/0h42U9YBCAqs+aTrntUHFIhLb2Nn+vJTyEGVEf7ojJ FPxjG5DQs7hcLUgj3Ys/phIoWLzXGorhMKH0M=
Hi,
This file has my vectorized (SSE2) implementation of multiplication of
4 integers. The Eigen routine was taken from the packetMath.h file. The
benchmarks show a small but noticeable difference.
~/Documents/numerical@rpg> g++ vec4i_mul.cpp -msse3 -O3 -march=native
~/Documents/numerical@rpg> ./a.out > /dev/null
1236491601 ei mul begins
1236491618 ei mul ends
1236491618my mul
1236491633 end
The macros could be defined better, I admit. They were taken from my
implementation of vec4i multiplication, which I wrote for my own needs
earlier. They are the same as for the quaternion routine I sent earlier.
So please consider unifying them.
BTW, the multiplication instruction that you (and I) are using does
only unsigned multiplication. Signed multiplication is available as a
single instruction in SSE4.1, so a small patch could be added for that
too. The exact intrinsic is _mm_mul_epi32. My CPU doesn't have that,
so I can't test it.
Regards,
--
Rohit Garg
http://rpg-314.blogspot.com/
Senior Undergraduate
Department of Physics
Indian Institute of Technology
Bombay
#include<iostream>
#include<ctime>
#include<pmmintrin.h>
using namespace std;
// Length of the benchmark arrays in main(). main() also uses it as the
// number of passes made over those arrays, so each timed section performs
// testCount * testCount vector multiplications.
const int testCount=25000;
// Builds an 8-bit shuffle immediate from four 2-bit lane selectors, most
// significant field first: p lands in bits 7:6, q in bits 5:4, r in bits 3:2
// and s in bits 1:0. Every argument is parenthesized so that an argument
// containing a low-precedence operator (e.g. `a|b`) is shifted as a whole.
#define VECTOR4UI_SHUFFLE_MASK(p,q,r,s) ((((p)<<6)|((q)<<4)|((r)<<2)|(s)))
// Reorders the four 32-bit lanes of v: result lane 0 takes lane p of v,
// lane 1 takes lane q, lane 2 takes lane r and lane 3 takes lane s.
// Note that the selectors are listed lowest lane first, i.e. in the reverse
// order of VECTOR4UI_SHUFFLE_MASK. They must be compile-time constants
// because they form the immediate operand of _mm_shuffle_epi32.
#define vector4ui_swizzle(v,p,q,r,s) ((_mm_shuffle_epi32( (v), VECTOR4UI_SHUFFLE_MASK((s),(r),(q),(p)))))
// Lane-wise product of two vectors of four 32-bit integers, keeping the low
// 32 bits of every product.
//
// _mm_mul_epu32 multiplies only lanes 0 and 2 and yields two 64-bit results,
// so the work is done in two passes: one for the even lanes, and one for the
// odd lanes after they have been shifted down into the even positions.
inline __m128i ei_mul(const __m128i a, const __m128i b)
{
  // Keeps the low dword of each 64-bit product (lanes 0 and 2).
  const __m128i low_dwords = _mm_setr_epi32(0xffffffff, 0, 0xffffffff, 0);

  // Truncated a0*b0 and a2*b2, sitting in lanes 0 and 2.
  const __m128i even = _mm_and_si128(_mm_mul_epu32(a, b), low_dwords);

  // A 4-byte right shift moves lanes 1 and 3 into lanes 0 and 2.
  const __m128i a_odd = _mm_srli_si128(a, 4);
  const __m128i b_odd = _mm_srli_si128(b, 4);
  const __m128i odd = _mm_and_si128(_mm_mul_epu32(a_odd, b_odd), low_dwords);

  // Move the odd products back up into lanes 1 and 3 and merge both halves.
  return _mm_or_si128(even, _mm_slli_si128(odd, 4));
}
// Lane-wise product of two vectors of four 32-bit integers, keeping the low
// 32 bits of every product.
//
// _mm_mul_epu32 multiplies only lanes 0 and 2, so both inputs are also
// multiplied with the lanes of each pair swapped to reach lanes 1 and 3.
// The low dwords of the four 64-bit products are then gathered with two
// shuffles rather than with masks and byte shifts.
inline __m128i my_mul(const __m128i v1, const __m128i v2)
{
  // 64-bit products of lanes 0 and 2.
  const __m128i even_prod = _mm_mul_epu32(v1, v2);

  // Swap the lanes inside each pair so lanes 1 and 3 sit in positions 0 and 2.
  const __m128i v1_swapped = vector4ui_swizzle(v1, 1, 0, 3, 2);
  const __m128i v2_swapped = vector4ui_swizzle(v2, 1, 0, 3, 2);

  // 64-bit products of the original lanes 1 and 3.
  const __m128i odd_prod = _mm_mul_epu32(v1_swapped, v2_swapped);

  // Collect the low dword of every product. With pN = low32(v1[N]*v2[N])
  // the shuffle yields (p2, p0, p3, p1). The float shuffle is used only
  // because it can draw lanes from two different source registers.
  const __m128 even_ps = _mm_castsi128_ps(even_prod);
  const __m128 odd_ps = _mm_castsi128_ps(odd_prod);
  const __m128i gathered = _mm_castps_si128(
      _mm_shuffle_ps(even_ps, odd_ps, VECTOR4UI_SHUFFLE_MASK(0,2,0,2)));

  // Put the products back into natural order (p0, p1, p2, p3).
  return vector4ui_swizzle(gathered, 1, 3, 0, 2);
}
// Scratch buffer used by print(). It is declared 16-byte aligned because
// print() fills it with the aligned store _mm_store_si128.
int __attribute__((aligned(16))) printBuf[4];

// Writes the four 32-bit lanes of a to standard output as ints separated by
// single spaces, then ends the line with endl (newline plus flush).
void print( __m128i a)
{
  _mm_store_si128(reinterpret_cast<__m128i*>(printBuf), a);
  for (int lane = 0; lane < 4; ++lane)
  {
    std::cout << printBuf[lane];
    if (lane != 3)
    {
      std::cout << ' ';
    }
  }
  std::cout << std::endl;
}
// Benchmark driver.
//
// Fills two input arrays, then for each of ei_mul and my_mul makes testCount
// passes over the arrays and prints every result vector to standard output.
// The value of time(NULL) is written to standard error before and after each
// timed section, so the elapsed time is the difference of the two stamps.
int main()
{
  // Static storage rather than automatic: the three arrays together take
  // about 1.2 MB (3 * 25000 * 16 bytes), which can overflow a small stack.
  static __m128i num1[testCount], num2[testCount], out[testCount];

  // Ascending lanes in num1, descending lanes in num2.
  for (int i = 0; i < testCount; i++)
  {
    num1[i] = _mm_setr_epi32(i, i + 1, i + 2, i + 3);
    num2[i] = _mm_setr_epi32(i + 3, i + 2, i + 1, i);
  }

  std::cerr << std::time(NULL) << " ei mul begins\n";
  for (int j = 0; j < testCount; j++)
  {
    for (int i = 0; i < testCount; i++)
    {
      out[i] = ei_mul(num1[i], num2[i]);
    }
  }
  std::cerr << std::time(NULL) << " ei mul ends\n";

  // Printing the results keeps them observable and lets the two runs be
  // compared by eye or by diffing the output.
  for (int i = 0; i < testCount; i++)
  {
    print(out[i]);
  }

  // Leading space added so the timestamp no longer runs into the label.
  std::cerr << std::time(NULL) << " my mul\n";
  for (int j = 0; j < testCount; j++)
  {
    for (int i = 0; i < testCount; i++)
    {
      out[i] = my_mul(num1[i], num2[i]);
    }
  }
  std::cerr << std::time(NULL) << " end\n";

  for (int i = 0; i < testCount; i++)
  {
    print(out[i]);
  }
  return 0;
}