[eigen] Vectorized quaternion multiplication.

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


Hi,

The attached file has a the code for vectorized quaternion
multiplication. (SSE only). I have not made a patch because I do not
have access to svn. My school's proxy apparently doesn't play nice
with svn. The results match against eigen's results. The convention is
that x,y,z are stored first and then the scalar part.

The assembly is 23 instructions, but they are highly pipelined.

00000000004009e0 <_Z8quat_mulU8__vectorfS_>:
  4009e0:	0f 28 f9             	movaps %xmm1,%xmm7
  4009e3:	0f 28 f1             	movaps %xmm1,%xmm6
  4009e6:	0f 28 d8             	movaps %xmm0,%xmm3
  4009e9:	0f 28 e9             	movaps %xmm1,%xmm5
  4009ec:	0f 28 d0             	movaps %xmm0,%xmm2
  4009ef:	0f c6 f9 ff            	shufps $0xff,%xmm1,%xmm7
  4009f3:	0f c6 f1 09          	shufps $0x9,%xmm1,%xmm6
  4009f7:	0f c6 e9 64          	shufps $0x64,%xmm1,%xmm5
  4009fb:	0f 28 e0             	movaps %xmm0,%xmm4
  4009fe:	0f c6 d8 7f          	shufps $0x7f,%xmm0,%xmm3
  400a02:	0f c6 d0 89          	shufps $0x89,%xmm0,%xmm2
  400a06:	0f c6 c9 92          	shufps $0x92,%xmm1,%xmm1
  400a0a:	0f c6 e0 12          	shufps $0x12,%xmm0,%xmm4
  400a0e:	0f 59 dd             	mulps  %xmm5,%xmm3
  400a11:	0f 59 d1             	mulps  %xmm1,%xmm2
  400a14:	0f 28 0d 45 0f 20 00 	movaps 0x200f45(%rip),%xmm1        #
601960 <_ZL9quat_mask>
  400a1b:	0f 59 e6             	mulps  %xmm6,%xmm4
  400a1e:	0f 59 c7             	mulps  %xmm7,%xmm0
  400a21:	0f 57 d1             	xorps  %xmm1,%xmm2
  400a24:	0f 57 d9             	xorps  %xmm1,%xmm3
  400a27:	0f 5c c4             	subps  %xmm4,%xmm0
  400a2a:	0f 58 d3             	addps  %xmm3,%xmm2
  400a2d:	0f 58 c2             	addps  %xmm2,%xmm0
  400a30:	c3                   	        retq
  400a31:	66 66 66 66 66 66 2e 	nopw   %cs:0x0(%rax,%rax,1)
  400a38:	0f 1f 84 00 00 00 00
  400a3f:	00

I checked for eigen's code, and it has ~55 instructions, so clearly a
win. With this I hope the major arguments against quaternion class
being scalar willl go away. If some can build a scaffolding for simd
quaternion code. I'll be happy to chip in with the optimized
implementations for the stuff. Add, subtract should be fairly trivial.
I am interested in implementing a fast quaternion slerp as well. Fast
== approximate.

On the topic of complex math. I feel Eigen should make specialized
classes. It is highly unlikely that gcc will be able to vectorize
complex mutiplication anytime soon.

Regards,
-- 
Rohit Garg

http://rpg-314.blogspot.com/

Senior Undergraduate
Department of Physics
Indian Institute of Technology
Bombay
#include<iostream>
#include<cmath>
#include<pmmintrin.h>
#include<Eigen/Geometry>
using namespace std;
using namespace Eigen;

float __attribute__((aligned(16))) printBuf[4];

#define vec4f_swizzle(v,p,q,r,s) (_mm_shuffle_ps( (v),(v), ((s)<<6|(r)<<4|(q)<<2|(p))))

const int zero=0;
const int flipSign=0x80000000;


const __m128 quat_mask=_mm_setr_ps( *(float*)&zero,
    *(float*)&zero,
    *(float*)&zero,
    *(float*)&flipSign);

void print( __m128 a)
    {
    _mm_store_ps(printBuf,a);
    cout<<printBuf[0]<<' '<<
	printBuf[1]<<' '<<
	printBuf[2]<<' '<<
	printBuf[3]<<endl;
    }

__m128 quat_mul(__m128 a, __m128 b)
    {
    __m128 swiz1=vec4f_swizzle(b,3,3,3,3);
//    print(swiz1);
    __m128 swiz2=vec4f_swizzle(a,2,0,1,0);
//    print(swiz2);
    __m128 swiz3=vec4f_swizzle(b,1,2,0,0);
//    print(swiz3);
    __m128 swiz4=vec4f_swizzle(a,3,3,3,1);
//    print(swiz4);
    __m128 swiz5=vec4f_swizzle(b,0,1,2,1);
//    print(swiz5);
    __m128 swiz6=vec4f_swizzle(a,1,2,0,2);
//    print(swiz6);
    __m128 swiz7=vec4f_swizzle(b,2,0,1,2);
//    print(swiz7);
//    cout<<"mul\n";
    __m128 mul4=_mm_mul_ps(swiz6,swiz7);
//    print(mul4);
    __m128 mul3=_mm_mul_ps(swiz4,swiz5);
//    print(mul3);
    __m128 mul2=_mm_mul_ps(swiz2,swiz3);
//    print(mul2);
    __m128 mul1=_mm_mul_ps(a,swiz1);
//    print(mul1);
    __m128 flip1=_mm_xor_ps(mul4,quat_mask);
    __m128 flip2=_mm_xor_ps(mul3,quat_mask);
/*    cout<<"sign\n";
    print(flip1);
    print(flip2);
    cout<<"ret\n";*/
    __m128 retVal=_mm_sub_ps(mul1,mul2);
//    print(retVal);
    __m128 retVal2=_mm_add_ps(flip1,flip2);
//    print(retVal2);
    return _mm_add_ps(retVal,retVal2);
    }

int main()
    {
    Quaternionf a(1,2,3,4);
    Quaternionf b(11,12,13,14);
    __m128 a1,b1;
    a1=_mm_setr_ps(2.0,3.0,4.0,1);
    b1=_mm_setr_ps(12,13,14,11);
    print(a1);
    print(b1);
//    print(vec4f_swizzle(a1,1,2,0,3));
    cout<<"mul\n";
    print(quat_mul(a1,b1));
    cout<<"quat\n";
    asm("#start");
    Quaternionf c=a*b;
    asm("#end");

    cout<<c.x()<<' '<<
	c.y()<<' '<<
	c.z()<<' '<<
	c.w()<<"\n";
    cout<<"test\n";
    cout<<a.x()<<' '<<
	a.y()<<' '<<
	a.z()<<' '<<
	a.w()<<"\n";
    return 0;
    }


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/