Re: [eigen] request for help: 4x4 matrix inverse

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


Hi again,

I just finished (well 99% but it produces the right results so far)
the AltiVec version of the matrix inverse routine using matrix 
partitioning technique mentioned in this list (btw, I did a typo in 
one expression, thanks to Jacob for noticing this :)

Anyway I attach the function (it computes the matrix but doesn't save 
it back yet, I was too tired last night at 5am to continue :)
I also included in the file, all the macros/typedefs used in this 
function, for easier reading.

About the performance: it's ~400% faster than a plain matrix inverse 
function in SIMDx86, unoptimized, using the coefficients method.

About matrix inverse SIMD, you might like to check the following URLs 
also:

http://www.cellperformance.com/articles/2006/06/a_4x4_matrix_inverse_1.html 
(Cell and AltiVec, different approach than mine, haven't seen any 
benchmarks on it yet, though i intend to compare them)

ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
(SSE2 approach, again using the coefficients method).

Though I haven't tested it, I believe that the matrix partitioning 
method is faster overall, since the calculations are fewer than the 
coefficients method. But as I said, I intend to compare these soon.

Konstantinos
#define ALIGNMENT_PREFIX
#define ALIGNMENT_SUFFIX __attribute__((  aligned(16)  ))

typedef struct ALIGNMENT_PREFIX SIMDx86Matrix
{
	float m[16];
} SIMDx86Matrix ALIGNMENT_SUFFIX;

// Calculate -0.0 and put it in a vector (needed for IEEE754 compliance)
// We also do two rounds of Newton-Rahpson
#define INVERSE_RECIPROCAL(inv, vec)							\
{											\
	float ALIGNMENT_PREFIX c05[4] ALIGNMENT_SUFFIX;					\
	c05[0] = 0.5f;									\
	vector float _0, y, v1, v05;							\
	vector unsigned int vtmp;							\
	vtmp = vec_splat_u32(-1);							\
	v1 = (vector float) vec_splat_u32(1);						\
	v05 = vec_ld(0, c05);								\
	v05 = vec_splat(v05, 0);							\
	_0 = (vector float)(vec_sl(vtmp, vtmp));					\
	inv = vec_re(vec);								\
}

#define LOAD_VECTOR(vr, vs)				\
{							\
	vr = vec_ld(0, (float *)vs);			\
}

#define STORE_VECTOR(vr, vs)				\
{							\
	vec_st(vr,  0, (float *)vs);			\
}

#define LOAD_MATRIX(m, vm1, vm2, vm3, vm4)		\
{							\
	vm1 = vec_ld(0,  (float *)m);			\
	vm2 = vec_ld(16, (float *)m);			\
	vm3 = vec_ld(32, (float *)m);			\
	vm4 = vec_ld(48, (float *)m);			\
}

#define STORE_MATRIX(m, vm1, vm2, vm3, vm4)		\
{							\
	vec_st(vm1,  0, (float *)m);			\
	vec_st(vm2, 16, (float *)m);			\
	vec_st(vm3, 32, (float *)m);			\
	vec_st(vm4, 48, (float *)m);			\
}

SIMDx86Matrix *SIMDx86Matrix_InverseOf_AltiVec(SIMDx86Matrix* pOut, const SIMDx86Matrix* pIn) {

	// Calculate the determinant, using the following formula:
	// detM = detP * det(S - R*P^(-1)*Q)
	// where:   | a1 a2 a3 a4 |   
	//      M = | b1 b2 b3 b4 | = | P Q |
	//          | c1 c2 c3 c4 |   | R S |
	//          | d1 d2 d3 d4 |
	// We use Block matrices and Liebniz Formula:
	// http://en.wikipedia.org/wiki/Determinant
	
	// Calculate detP first. if detP = 0 just return 0
	// Make sure we align it properly to be used later in a vector
	// detSRPQ will in fact hold det(S - R*P^(-1)*Q)
	// 
	float ALIGNMENT_PREFIX dummy[4] ALIGNMENT_SUFFIX;

	vector float vm_1, vm_2, vm_3, vm_4, vp_1, vp_2, vq_1, vq_2, 
		     vpq_1, vpq_2, vrp_1, vrp_2,
		     vrpq_1, vrpq_2, vsrpq_1, vsrpq_2, vt_1, vt_2,
		     vdet, invdet, v0f;
	vector unsigned int v0i, v0n, u_znzn, u_nznz;

	// Load SIMDx86 matrix
	LOAD_MATRIX(&pIn->m[0], vm_1, vm_2, vm_3, vm_4);

	// Trick to do only one load vector and minimize memory access.
	union ALIGNMENT_PREFIX {
		float f[4];
		unsigned int i[4];
	} ALIGNMENT_SUFFIX det, detSRPQ;
	det.i[1] = 0x80000000L;
	det.f[0] = pIn->m[0]*pIn->m[5] - pIn->m[1]*pIn->m[4];
	if (det.f[0] == 0.0f) return NULL; // FIXME: there are cases where detP = 0 !=> detM = 0

//	printf("detP = %f\n", det.f[0]);
	// Load detP onto a vector and splat the value.
	LOAD_VECTOR(vdet, &det.f[0]);

	v0i = vec_splat_u32(0);
	v0f = (vector float)v0i;
	// This is why we did the trick with the union. We did only one vector load,
	// but from that we splat 2 vectors, for the det and the -0.0 float.
	v0n = vec_splat((vector unsigned int)vdet, 1);
	u_nznz = vec_mergeh(v0n, v0i);
	u_znzn = vec_mergeh(v0i, v0n);

	// Splat det
	vdet = vec_splat(vdet, 0);
	
	// Get 1/detP
	INVERSE_RECIPROCAL(invdet, vdet);

	// Now calculate S - R*P^(-1)*Q.
	// Or to make advantage of the SIMD data layout, we could use 2x4 matrix:
	// we first have to multiply [R 0 ] * [P^(-1) 0] and the result with [Q 0]
	// and finally subtract the result from [S 0]
	// But first we have to find the inverse of P^(-1) (since detP != 0 it is certain it exists)
	// The inverse of a 2x2 matrix is:
	// P^(-1) =  |  b2 -a2 |/detP
	//           | -b1  a1 |

	// We need to permute, but to save loading the permute masks from memory, we'll use mergeh
	// vp_2 -> mergeh(vm_2, vm_1) -> | b1 a1 b2 a2 |
	// vp_1 -> vp_2 << 64 = | b2 a2 0 0 |
	vp_2 = vec_mergeh(vm_2, vm_1);
	vp_1 = vec_sld(vp_2, vp_2, 8);

	// Fix the signs in vp_1:
	// vp_1 -> |  b2 -a2 |
	// vp_2 -> | -b1  a1 |
	// This part inspired from http://www.cellperformance.com/articles/2006/06/a_4x4_matrix_inverse_1.html
	vp_1 = (vector float) vec_xor((vector unsigned int) vp_1, u_znzn);
	vp_2 = (vector float) vec_xor((vector unsigned int) vp_2, u_nznz);

/*	printf("after permute and xor:\n");
	STORE_VECTOR(vp_1, &dummy[0]);
	printf("vp_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vp_2, &dummy[0]);
	printf("vp_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

	// We now have the adjoint of P
	// Multiply each element with inverse detP to get P^(-1)
	vp_1 = vec_madd(vp_1, invdet, v0f);
	vp_2 = vec_madd(vp_2, invdet, v0f);

/*	printf("divide with det = %f:\n", det.f[0]);
	STORE_VECTOR(vp_1, &dummy[0]);
	printf("vp_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vp_2, &dummy[0]);
	printf("vp_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

	// Now P^(-1) is calculated, we can proceed to the first product: R*P^(-1) (1)
	// The product is done like for a 4x4 matrix but only for the first 2 lines
	// and the first 2 columns
	// Calculate the first column of p1
	vt_1 = vec_madd( vec_splat( vm_3, 0 ), vp_1, v0f );
	vt_2 = vec_madd( vec_splat( vm_4, 0 ), vp_1, v0f );
 
	// Multiply the 2nd column of p1, add to previous result
	vrp_1 = vec_madd( vec_splat( vm_3, 1 ), vp_2, vt_1 );
	vrp_2 = vec_madd( vec_splat( vm_4, 1 ), vp_2, vt_2 );

/*	printf("R*P^(-1):\n");
	STORE_VECTOR(vrp_1, &dummy[0]);
	printf("vrp_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vrp_2, &dummy[0]);
	printf("vrp_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

	// Calculate R*P^(-1)*Q, just multiply previous result with Q
	// using same method
	vrpq_1 = vec_madd( vec_splat( vrp_1, 0 ), vm_1, v0f );
	vrpq_2 = vec_madd( vec_splat( vrp_2, 0 ), vm_1, v0f );
 
	// Multiply the 2nd column of p1, add to previous result
	vrpq_1 = vec_madd( vec_splat( vrp_1, 1 ), vm_2, vrpq_1 );
	vrpq_2 = vec_madd( vec_splat( vrp_2, 1 ), vm_2, vrpq_2 );

	// Calculate P^(-1)*Q also 
	vq_1 = vec_sld(vm_1, v0f, 8);
	vq_2 = vec_sld(vm_2, v0f, 8);
	vpq_1 = vec_madd( vec_splat( vp_1, 0 ), vq_1, v0f );
	vpq_2 = vec_madd( vec_splat( vp_2, 0 ), vq_1, v0f );
 
	// Multiply the 2nd column of p1, add to previous result
	vpq_1 = vec_madd( vec_splat( vp_1, 1 ), vq_2, vpq_1 );
	vpq_2 = vec_madd( vec_splat( vp_2, 1 ), vq_2, vpq_2 );

/*	printf("R*P^(-1)*Q:\n");
	STORE_VECTOR(vrpq_1, &dummy[0]);
	printf("vrpq_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vrpq_2, &dummy[0]);
	printf("vrpq_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

	// S - R*P^(-1)*Q
	// Finally, just subtract the previous result from S
	vsrpq_1 = vec_sub(vm_3, vrpq_1);
	vsrpq_2 = vec_sub(vm_4, vrpq_2);

/*	printf("S - R*P^(-1)*Q:\n");
	STORE_VECTOR(vsrpq_1, &dummy[0]);
	printf("vsrpq_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vsrpq_2, &dummy[0]);
	printf("vsrpq_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);*/

	vt_2 = vec_sld(vsrpq_2, v0f, 8);
	vt_1 = vec_sld(vsrpq_1, vt_2, 8);
	// Get the determinant of (S - R*P^(-1)*Q) and multiply with detP
	STORE_VECTOR(vt_1, &detSRPQ.f[0]);
	// FIXME: I don't like how it's done, it should be done completely in AltiVec
//	printf("detSRPQ: %08x: %f %f %f %f\n", &detSRPQ.f[0], detSRPQ.f[0], detSRPQ.f[1], detSRPQ.f[2], detSRPQ.f[3]);
	detSRPQ.f[0] = detSRPQ.f[0]*detSRPQ.f[3] - detSRPQ.f[1]*detSRPQ.f[2];
	
	det.f[0] = det.f[0]*detSRPQ.f[0];
//	printf("det = %f\n", det.f[0]);
	if (det.f[0] == 0.0f) return NULL;
	// Get the inverse of (S - R*P^(-1)*Q), repeat the procedure
	// Load determinant and splat
//	printf("detSRPQ = %f\n", detSRPQ.f[0]);
	LOAD_VECTOR(vdet, &detSRPQ.f[0]);
	vdet = vec_splat(vdet, 0);
	// Get 1/detP
	INVERSE_RECIPROCAL(invdet, vdet);

	vt_2 = vec_mergeh(vt_2, vt_1);
	vt_1 = vec_sld(vt_2, vt_2, 8);

	vector float vP_1, vP_2, vQ_1, vQ_2, vR_1, vR_2, vS_1, vS_2;
	// Fix the signs in vp_1:
	vS_1 = (vector float) vec_xor((vector unsigned int) vt_1, u_znzn);
	vS_2 = (vector float) vec_xor((vector unsigned int) vt_2, u_nznz);

	// We now have the adjoint of P
	// Multiply each element with inverse detP to get (S - R*P^(-1)*Q)^(-1)
	vS_1 = vec_madd(vS_1, invdet, v0f);
	vS_2 = vec_madd(vS_2, invdet, v0f);
	// Checkpoint #1: We got S' 
	printf("S':\n");
	STORE_VECTOR(vS_1, &dummy[0]);
	printf("vS_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vS_2, &dummy[0]);
	printf("vS_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

	// R' = -S'*(R*P^(-1))
	// Just multiply previous result with vrp 
	// using same method
	vR_1 = vec_madd( vec_splat( vS_1, 0 ), vrp_1, v0f );
	vR_2 = vec_madd( vec_splat( vS_2, 0 ), vrp_1, v0f );
 
	// Multiply the 2nd column of p1, add to previous result
	vR_1 = vec_madd( vec_splat( vS_1, 1 ), vrp_2, vR_1 );
	vR_2 = vec_madd( vec_splat( vS_2, 1 ), vrp_2, vR_2 );
	vR_1 = (vector float) vec_xor((vector unsigned int) vR_1, v0n);
	vR_2 = (vector float) vec_xor((vector unsigned int) vR_2, v0n);

	printf("R':\n");
	STORE_VECTOR(vR_1, &dummy[0]);
	printf("vR_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vR_2, &dummy[0]);
	printf("vR_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

	// Q' = -(P^(-1)*Q)*S'
	vQ_1 = vec_madd( vec_splat( vpq_1, 0 ), vS_1, v0f );
	vQ_2 = vec_madd( vec_splat( vpq_2, 0 ), vS_1, v0f );
 
	// Multiply the 2nd column of p1, add to previous result
	vQ_1 = vec_madd( vec_splat( vpq_1, 1 ), vS_2, vQ_1 );
	vQ_2 = vec_madd( vec_splat( vpq_2, 1 ), vS_2, vQ_2 );
	vQ_1 = (vector float) vec_xor((vector unsigned int) vQ_1, v0n);
	vQ_2 = (vector float) vec_xor((vector unsigned int) vQ_2, v0n);

	printf("Q':\n");
	STORE_VECTOR(vQ_1, &dummy[0]);
	printf("vQ_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vQ_2, &dummy[0]);
	printf("vQ_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

	// P' = P^(-1) - (P^(-1)*Q)*R'
	vP_1 = vec_madd( vec_splat( vpq_1, 0 ), vR_1, v0f );
	vP_2 = vec_madd( vec_splat( vpq_2, 0 ), vR_1, v0f );
 
	// Multiply the 2nd column of p1, add to previous result
	vP_1 = vec_madd( vec_splat( vpq_1, 1 ), vR_2, vP_1 );
	vP_2 = vec_madd( vec_splat( vpq_2, 1 ), vR_2, vP_2 );

	vP_1 = vec_sub(vp_1, vP_1);
	vP_2 = vec_sub(vp_2, vP_2);

	printf("P':\n");
	STORE_VECTOR(vP_1, &dummy[0]);
	printf("vP_1: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);
	STORE_VECTOR(vP_2, &dummy[0]);
	printf("vP_2: %f %f %f %f\n", dummy[0], dummy[1], dummy[2], dummy[3]);

	// Finished, now let's put these back together to one matrix (4 vectors)
	// all P', Q', R', S' matrices have the needed block in the left 2x2 part
	// in the vP_*, vQ_*, vR_*, vS_* vectors. We need to shift right by 64 bits
	// the vQ_* and vS_* vectors and merge left 2x2 block of vP_* with the right
	// 2x2 block of vQ_*. Same with vR_* and vS_*. And then we have our inverse
	// matrix
	// Still TODO/FIXME:
	// vm_1 = vec_or(vP_1, vQ_1, mask);
	// vm_2 = vec_or(vP_2, vQ_2, mask);
	// vm_3 = vec_or(vR_1, vS_1, mask);
	// vm_4 = vec_or(vR_2, vS_2, mask);

	// Store the resulting inverse matrix
	STORE_MATRIX(&pOut->m[0], vm_1, vm_2, vm_3, vm_4);
}



Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/