[eigen] Vectorized quaternion multiplication. |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/eigen Archives
]
- To: eigen@xxxxxxxxxxxxxxxxxxx
- Subject: [eigen] Vectorized quaternion multiplication.
- From: Rohit Garg <rpg.314@xxxxxxxxx>
- Date: Sat, 7 Mar 2009 14:54:09 +0530
- Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:mime-version:received:date:message-id:subject :from:to:content-type; bh=t6Ljt36tpK9xVD8bsXMW0WAAxazWdyDgYcUz97mOc1M=; b=FGUFL+NnjQ/CHSKTcgDpv4m8Z+/k9MIKuKJ9zx1uGEcWMN/um5kZNxo9xlIDYpAiYN m1Kq2eC3xaZFCaVY6up9iVF6/A9H6PfsUBrmN0RMbAx6DPsR8ZFBs3Kzxq1gKXw1uj8V L+XOzyXgTiS6K+j57/lrZVju5ElMhjikEGXvU=
- Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=mime-version:date:message-id:subject:from:to:content-type; b=Gxzu7r332mASZ4SAZ5f0Bju+1Bv4i6WSliTG3jZNTwncq3KMSHza6nDmf5qSy7Q+Q0 MNQLGu2TDvWjp+5rPzWzOrXIrGMAikEzRZR/YjDddbSdskduH2GRakmciJOJ2OBJ4/xW ii4y91sqNTR74Rd98S6i+qEoOjUHv73+M03ww=
Hi,
The attached file has a the code for vectorized quaternion
multiplication. (SSE only). I have not made a patch because I do not
have access to svn. My school's proxy apparently doesn't play nice
with svn. The results match against eigen's results. The convention is
that x,y,z are stored first and then the scalar part.
The assembly is 23 instructions, but they are highly pipelined.
00000000004009e0 <_Z8quat_mulU8__vectorfS_>:
4009e0: 0f 28 f9 movaps %xmm1,%xmm7
4009e3: 0f 28 f1 movaps %xmm1,%xmm6
4009e6: 0f 28 d8 movaps %xmm0,%xmm3
4009e9: 0f 28 e9 movaps %xmm1,%xmm5
4009ec: 0f 28 d0 movaps %xmm0,%xmm2
4009ef: 0f c6 f9 ff shufps $0xff,%xmm1,%xmm7
4009f3: 0f c6 f1 09 shufps $0x9,%xmm1,%xmm6
4009f7: 0f c6 e9 64 shufps $0x64,%xmm1,%xmm5
4009fb: 0f 28 e0 movaps %xmm0,%xmm4
4009fe: 0f c6 d8 7f shufps $0x7f,%xmm0,%xmm3
400a02: 0f c6 d0 89 shufps $0x89,%xmm0,%xmm2
400a06: 0f c6 c9 92 shufps $0x92,%xmm1,%xmm1
400a0a: 0f c6 e0 12 shufps $0x12,%xmm0,%xmm4
400a0e: 0f 59 dd mulps %xmm5,%xmm3
400a11: 0f 59 d1 mulps %xmm1,%xmm2
400a14: 0f 28 0d 45 0f 20 00 movaps 0x200f45(%rip),%xmm1 #
601960 <_ZL9quat_mask>
400a1b: 0f 59 e6 mulps %xmm6,%xmm4
400a1e: 0f 59 c7 mulps %xmm7,%xmm0
400a21: 0f 57 d1 xorps %xmm1,%xmm2
400a24: 0f 57 d9 xorps %xmm1,%xmm3
400a27: 0f 5c c4 subps %xmm4,%xmm0
400a2a: 0f 58 d3 addps %xmm3,%xmm2
400a2d: 0f 58 c2 addps %xmm2,%xmm0
400a30: c3 retq
400a31: 66 66 66 66 66 66 2e nopw %cs:0x0(%rax,%rax,1)
400a38: 0f 1f 84 00 00 00 00
400a3f: 00
I checked for eigen's code, and it has ~55 instructions, so clearly a
win. With this I hope the major arguments against quaternion class
being scalar willl go away. If some can build a scaffolding for simd
quaternion code. I'll be happy to chip in with the optimized
implementations for the stuff. Add, subtract should be fairly trivial.
I am interested in implementing a fast quaternion slerp as well. Fast
== approximate.
On the topic of complex math. I feel Eigen should make specialized
classes. It is highly unlikely that gcc will be able to vectorize
complex mutiplication anytime soon.
Regards,
--
Rohit Garg
http://rpg-314.blogspot.com/
Senior Undergraduate
Department of Physics
Indian Institute of Technology
Bombay
#include<iostream>
#include<cmath>
#include<pmmintrin.h>
#include<Eigen/Geometry>
using namespace std;
using namespace Eigen;
float __attribute__((aligned(16))) printBuf[4];
#define vec4f_swizzle(v,p,q,r,s) (_mm_shuffle_ps( (v),(v), ((s)<<6|(r)<<4|(q)<<2|(p))))
const int zero=0;
const int flipSign=0x80000000;
const __m128 quat_mask=_mm_setr_ps( *(float*)&zero,
*(float*)&zero,
*(float*)&zero,
*(float*)&flipSign);
void print( __m128 a)
{
_mm_store_ps(printBuf,a);
cout<<printBuf[0]<<' '<<
printBuf[1]<<' '<<
printBuf[2]<<' '<<
printBuf[3]<<endl;
}
__m128 quat_mul(__m128 a, __m128 b)
{
__m128 swiz1=vec4f_swizzle(b,3,3,3,3);
// print(swiz1);
__m128 swiz2=vec4f_swizzle(a,2,0,1,0);
// print(swiz2);
__m128 swiz3=vec4f_swizzle(b,1,2,0,0);
// print(swiz3);
__m128 swiz4=vec4f_swizzle(a,3,3,3,1);
// print(swiz4);
__m128 swiz5=vec4f_swizzle(b,0,1,2,1);
// print(swiz5);
__m128 swiz6=vec4f_swizzle(a,1,2,0,2);
// print(swiz6);
__m128 swiz7=vec4f_swizzle(b,2,0,1,2);
// print(swiz7);
// cout<<"mul\n";
__m128 mul4=_mm_mul_ps(swiz6,swiz7);
// print(mul4);
__m128 mul3=_mm_mul_ps(swiz4,swiz5);
// print(mul3);
__m128 mul2=_mm_mul_ps(swiz2,swiz3);
// print(mul2);
__m128 mul1=_mm_mul_ps(a,swiz1);
// print(mul1);
__m128 flip1=_mm_xor_ps(mul4,quat_mask);
__m128 flip2=_mm_xor_ps(mul3,quat_mask);
/* cout<<"sign\n";
print(flip1);
print(flip2);
cout<<"ret\n";*/
__m128 retVal=_mm_sub_ps(mul1,mul2);
// print(retVal);
__m128 retVal2=_mm_add_ps(flip1,flip2);
// print(retVal2);
return _mm_add_ps(retVal,retVal2);
}
int main()
{
Quaternionf a(1,2,3,4);
Quaternionf b(11,12,13,14);
__m128 a1,b1;
a1=_mm_setr_ps(2.0,3.0,4.0,1);
b1=_mm_setr_ps(12,13,14,11);
print(a1);
print(b1);
// print(vec4f_swizzle(a1,1,2,0,3));
cout<<"mul\n";
print(quat_mul(a1,b1));
cout<<"quat\n";
asm("#start");
Quaternionf c=a*b;
asm("#end");
cout<<c.x()<<' '<<
c.y()<<' '<<
c.z()<<' '<<
c.w()<<"\n";
cout<<"test\n";
cout<<a.x()<<' '<<
a.y()<<' '<<
a.z()<<' '<<
a.w()<<"\n";
return 0;
}