Re: [eigen] request for help: 4x4 matrix inverse

Hi again,

I just finished (well 99% but it produces the right results so far)
the AltiVec version of the matrix inverse routine using matrix
partitioning technique mentioned in this list (btw, I did a typo in
one expression, thanks to Jacob for noticing this :)

Anyway I attach the function (it computes the matrix but doesn't save
it back yet, I was too tired last night at 5am to continue :)
I also included in the file, all the macros/typedefs used in this

About the performance: it's ~400% faster than a plain matrix inverse
function in SIMDx86, unoptimized, using the coefficients method.

About matrix inverse SIMD, you might like to check the following URLs
also:

http://www.cellperformance.com/articles/2006/06/a_4x4_matrix_inverse_1.html
(Cell and AltiVec, different approach than mine, haven't seen any
benchmarks on it yet, though i intend to compare them)

(SSE2 approach, again using the coefficients method).

Though I haven't tested it, I believe that the matrix partitioning
method is faster overall, since the calculations are fewer than the
coefficients method. But as I said, I intend to compare these soon.

Konstantinos
#define ALIGNMENT_PREFIX
#define ALIGNMENT_SUFFIX __attribute__((  aligned(16)  ))

typedef struct ALIGNMENT_PREFIX SIMDx86Matrix
{
float m[16];
} SIMDx86Matrix ALIGNMENT_SUFFIX;

// Calculate -0.0 and put it in a vector (needed for IEEE754 compliance)
// We also do two rounds of Newton-Rahpson
#define INVERSE_RECIPROCAL(inv, vec)							\
{											\
float ALIGNMENT_PREFIX c05[4] ALIGNMENT_SUFFIX;					\
c05[0] = 0.5f;									\
vector float _0, y, v1, v05;							\
vector unsigned int vtmp;							\
vtmp = vec_splat_u32(-1);							\
v1 = (vector float) vec_splat_u32(1);						\
v05 = vec_ld(0, c05);								\
v05 = vec_splat(v05, 0);							\
_0 = (vector float)(vec_sl(vtmp, vtmp));					\
inv = vec_re(vec);								\
}

{							\
vr = vec_ld(0, (float *)vs);			\
}

#define STORE_VECTOR(vr, vs)				\
{							\
vec_st(vr,  0, (float *)vs);			\
}

#define LOAD_MATRIX(m, vm1, vm2, vm3, vm4)		\
{							\
vm1 = vec_ld(0,  (float *)m);			\
vm2 = vec_ld(16, (float *)m);			\
vm3 = vec_ld(32, (float *)m);			\
vm4 = vec_ld(48, (float *)m);			\
}

#define STORE_MATRIX(m, vm1, vm2, vm3, vm4)		\
{							\
vec_st(vm1,  0, (float *)m);			\
vec_st(vm2, 16, (float *)m);			\
vec_st(vm3, 32, (float *)m);			\
vec_st(vm4, 48, (float *)m);			\
}

SIMDx86Matrix *SIMDx86Matrix_InverseOf_AltiVec(SIMDx86Matrix* pOut, const SIMDx86Matrix* pIn) {

// Calculate the determinant, using the following formula:
// detM = detP * det(S - R*P^(-1)*Q)
// where:   | a1 a2 a3 a4 |
//      M = | b1 b2 b3 b4 | = | P Q |
//          | c1 c2 c3 c4 |   | R S |
//          | d1 d2 d3 d4 |
// We use Block matrices and Liebniz Formula:
// http://en.wikipedia.org/wiki/Determinant

// Calculate detP first. if detP = 0 just return 0
// Make sure we align it properly to be used later in a vector
// detSRPQ will in fact hold det(S - R*P^(-1)*Q)
//
float ALIGNMENT_PREFIX dummy[4] ALIGNMENT_SUFFIX;

vector float vm_1, vm_2, vm_3, vm_4, vp_1, vp_2, vq_1, vq_2,
vpq_1, vpq_2, vrp_1, vrp_2,
vrpq_1, vrpq_2, vsrpq_1, vsrpq_2, vt_1, vt_2,
vdet, invdet, v0f;
vector unsigned int v0i, v0n, u_znzn, u_nznz;

// Trick to do only one load vector and minimize memory access.
union ALIGNMENT_PREFIX {
float f[4];
unsigned int i[4];
} ALIGNMENT_SUFFIX det, detSRPQ;
det.i[1] = 0x80000000L;
det.f[0] = pIn->m[0]*pIn->m[5] - pIn->m[1]*pIn->m[4];
if (det.f[0] == 0.0f) return NULL; // FIXME: there are cases where detP = 0 !=> detM = 0

//	printf("detP = %f\n", det.f[0]);
// Load detP onto a vector and splat the value.

v0i = vec_splat_u32(0);
v0f = (vector float)v0i;
// This is why we did the trick with the union. We did only one vector load,
// but from that we splat 2 vectors, for the det and the -0.0 float.
v0n = vec_splat((vector unsigned int)vdet, 1);
u_nznz = vec_mergeh(v0n, v0i);
u_znzn = vec_mergeh(v0i, v0n);

// Splat det
vdet = vec_splat(vdet, 0);

// Get 1/detP
INVERSE_RECIPROCAL(invdet, vdet);

// Now calculate S - R*P^(-1)*Q.
// Or to make advantage of the SIMD data layout, we could use 2x4 matrix:
// we first have to multiply [R 0 ] * [P^(-1) 0] and the result with [Q 0]
// and finally subtract the result from [S 0]
// But first we have to find the inverse of P^(-1) (since detP != 0 it is certain it exists)
// The inverse of a 2x2 matrix is:
// P^(-1) =  |  b2 -a2 |/detP
//           | -b1  a1 |

// We need to permute, but to save loading the permute masks from memory, we'll use mergeh
// vp_2 -> mergeh(vm_2, vm_1) -> | b1 a1 b2 a2 |
// vp_1 -> vp_2 << 64 = | b2 a2 0 0 |
vp_2 = vec_mergeh(vm_2, vm_1);
vp_1 = vec_sld(vp_2, vp_2, 8);

// Fix the signs in vp_1:
// vp_1 -> |  b2 -a2 |
// vp_2 -> | -b1  a1 |
// This part inspired from http://www.cellperformance.com/articles/2006/06/a_4x4_matrix_inverse_1.html
vp_1 = (vector float) vec_xor((vector unsigned int) vp_1, u_znzn);
vp_2 = (vector float) vec_xor((vector unsigned int) vp_2, u_nznz);

/*	printf("after permute and xor:\n");
STORE_VECTOR(vp_1, &dummy[0]);
printf("vp_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vp_2, &dummy[0]);
printf("vp_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

// We now have the adjoint of P
// Multiply each element with inverse detP to get P^(-1)

/*	printf("divide with det = %f:\n", det.f[0]);
STORE_VECTOR(vp_1, &dummy[0]);
printf("vp_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vp_2, &dummy[0]);
printf("vp_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

// Now P^(-1) is calculated, we can proceed to the first product: R*P^(-1) (1)
// The product is done like for a 4x4 matrix but only for the first 2 lines
// and the first 2 columns
// Calculate the first column of p1
vt_1 = vec_madd( vec_splat( vm_3, 0 ), vp_1, v0f );
vt_2 = vec_madd( vec_splat( vm_4, 0 ), vp_1, v0f );

// Multiply the 2nd column of p1, add to previous result
vrp_1 = vec_madd( vec_splat( vm_3, 1 ), vp_2, vt_1 );
vrp_2 = vec_madd( vec_splat( vm_4, 1 ), vp_2, vt_2 );

/*	printf("R*P^(-1):\n");
STORE_VECTOR(vrp_1, &dummy[0]);
printf("vrp_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vrp_2, &dummy[0]);
printf("vrp_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

// Calculate R*P^(-1)*Q, just multiply previous result with Q
// using same method
vrpq_1 = vec_madd( vec_splat( vrp_1, 0 ), vm_1, v0f );
vrpq_2 = vec_madd( vec_splat( vrp_2, 0 ), vm_1, v0f );

// Multiply the 2nd column of p1, add to previous result
vrpq_1 = vec_madd( vec_splat( vrp_1, 1 ), vm_2, vrpq_1 );
vrpq_2 = vec_madd( vec_splat( vrp_2, 1 ), vm_2, vrpq_2 );

// Calculate P^(-1)*Q also
vq_1 = vec_sld(vm_1, v0f, 8);
vq_2 = vec_sld(vm_2, v0f, 8);
vpq_1 = vec_madd( vec_splat( vp_1, 0 ), vq_1, v0f );
vpq_2 = vec_madd( vec_splat( vp_2, 0 ), vq_1, v0f );

// Multiply the 2nd column of p1, add to previous result
vpq_1 = vec_madd( vec_splat( vp_1, 1 ), vq_2, vpq_1 );
vpq_2 = vec_madd( vec_splat( vp_2, 1 ), vq_2, vpq_2 );

/*	printf("R*P^(-1)*Q:\n");
STORE_VECTOR(vrpq_1, &dummy[0]);
printf("vrpq_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vrpq_2, &dummy[0]);
printf("vrpq_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

// S - R*P^(-1)*Q
// Finally, just subtract the previous result from S
vsrpq_1 = vec_sub(vm_3, vrpq_1);
vsrpq_2 = vec_sub(vm_4, vrpq_2);

/*	printf("S - R*P^(-1)*Q:\n");
STORE_VECTOR(vsrpq_1, &dummy[0]);
printf("vsrpq_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vsrpq_2, &dummy[0]);
printf("vsrpq_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

vt_2 = vec_sld(vsrpq_2, v0f, 8);
vt_1 = vec_sld(vsrpq_1, vt_2, 8);
// Get the determinant of (S - R*P^(-1)*Q) and multiply with detP
STORE_VECTOR(vt_1, &detSRPQ.f[0]);
// FIXME: I don't like how it's done, it should be done completely in AltiVec
//	printf("detSRPQ: %08x: %f %f %f %f\n", &detSRPQ.f[0], detSRPQ.f[0], detSRPQ.f[1], detSRPQ.f[2], detSRPQ.f[3]);
detSRPQ.f[0] = detSRPQ.f[0]*detSRPQ.f[3] - detSRPQ.f[1]*detSRPQ.f[2];

det.f[0] = det.f[0]*detSRPQ.f[0];
//	printf("det = %f\n", det.f[0]);
if (det.f[0] == 0.0f) return NULL;
// Get the inverse of (S - R*P^(-1)*Q), repeat the procedure
//	printf("detSRPQ = %f\n", detSRPQ.f[0]);
vdet = vec_splat(vdet, 0);
// Get 1/detP
INVERSE_RECIPROCAL(invdet, vdet);

vt_2 = vec_mergeh(vt_2, vt_1);
vt_1 = vec_sld(vt_2, vt_2, 8);

vector float vP_1, vP_2, vQ_1, vQ_2, vR_1, vR_2, vS_1, vS_2;
// Fix the signs in vp_1:
vS_1 = (vector float) vec_xor((vector unsigned int) vt_1, u_znzn);
vS_2 = (vector float) vec_xor((vector unsigned int) vt_2, u_nznz);

// We now have the adjoint of P
// Multiply each element with inverse detP to get (S - R*P^(-1)*Q)^(-1)
// Checkpoint #1: We got S'
printf("S':\n");
STORE_VECTOR(vS_1, &dummy[0]);
printf("vS_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vS_2, &dummy[0]);
printf("vS_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

// R' = -S'*(R*P^(-1))
// Just multiply previous result with vrp
// using same method
vR_1 = vec_madd( vec_splat( vS_1, 0 ), vrp_1, v0f );
vR_2 = vec_madd( vec_splat( vS_2, 0 ), vrp_1, v0f );

// Multiply the 2nd column of p1, add to previous result
vR_1 = vec_madd( vec_splat( vS_1, 1 ), vrp_2, vR_1 );
vR_2 = vec_madd( vec_splat( vS_2, 1 ), vrp_2, vR_2 );
vR_1 = (vector float) vec_xor((vector unsigned int) vR_1, v0n);
vR_2 = (vector float) vec_xor((vector unsigned int) vR_2, v0n);

printf("R':\n");
STORE_VECTOR(vR_1, &dummy[0]);
printf("vR_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vR_2, &dummy[0]);
printf("vR_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

// Q' = -(P^(-1)*Q)*S'
vQ_1 = vec_madd( vec_splat( vpq_1, 0 ), vS_1, v0f );
vQ_2 = vec_madd( vec_splat( vpq_2, 0 ), vS_1, v0f );

// Multiply the 2nd column of p1, add to previous result
vQ_1 = vec_madd( vec_splat( vpq_1, 1 ), vS_2, vQ_1 );
vQ_2 = vec_madd( vec_splat( vpq_2, 1 ), vS_2, vQ_2 );
vQ_1 = (vector float) vec_xor((vector unsigned int) vQ_1, v0n);
vQ_2 = (vector float) vec_xor((vector unsigned int) vQ_2, v0n);

printf("Q':\n");
STORE_VECTOR(vQ_1, &dummy[0]);
printf("vQ_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vQ_2, &dummy[0]);
printf("vQ_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

// P' = P^(-1) - (P^(-1)*Q)*R'
vP_1 = vec_madd( vec_splat( vpq_1, 0 ), vR_1, v0f );
vP_2 = vec_madd( vec_splat( vpq_2, 0 ), vR_1, v0f );

// Multiply the 2nd column of p1, add to previous result
vP_1 = vec_madd( vec_splat( vpq_1, 1 ), vR_2, vP_1 );
vP_2 = vec_madd( vec_splat( vpq_2, 1 ), vR_2, vP_2 );

vP_1 = vec_sub(vp_1, vP_1);
vP_2 = vec_sub(vp_2, vP_2);

printf("P':\n");
STORE_VECTOR(vP_1, &dummy[0]);
printf("vP_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
STORE_VECTOR(vP_2, &dummy[0]);
printf("vP_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

// Finished, now let's put these back together to one matrix (4 vectors)
// all P', Q', R', S' matrices have the needed block in the left 2x2 part
// in the vP_*, vQ_*, vR_*, vS_* vectors. We need to shift right by 64 bits
// the vQ_* and vS_* vectors and merge left 2x2 block of vP_* with the right
// 2x2 block of vQ_*. Same with vR_* and vS_*. And then we have our inverse
// matrix
// Still TODO/FIXME:
// vm_1 = vec_or(vP_1, vQ_1, mask);
// vm_2 = vec_or(vP_2, vQ_2, mask);
// vm_3 = vec_or(vR_1, vS_1, mask);
// vm_4 = vec_or(vR_2, vS_2, mask);

// Store the resulting inverse matrix
STORE_MATRIX(&pOut->m[0], vm_1, vm_2, vm_3, vm_4);
}

 Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/