Re: [eigen] sse asin implementation |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/eigen Archives
]
- To: eigen@xxxxxxxxxxxxxxxxxxx
- Subject: Re: [eigen] sse asin implementation
- From: Rohit Garg <rpg.314@xxxxxxxxx>
- Date: Tue, 31 Mar 2009 14:08:22 +0530
- Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:mime-version:received:in-reply-to:references :date:message-id:subject:from:to:content-type; bh=xzAklnq9/G4ZZ97/Kb6prvzj9Gg52GsNZXIt8dpa374=; b=Xl6zg0xUSx7vNYeVdiawojWVE5aSG/IVi3FXBR1LN8CwHh3sKXnNgzPelMJ8M4BWjc EvVcwg5W6x385B5e1/SusZK0+qWIfFjUdyFXTBm4k61d3KnPu6A7Bu05DjsZQpIma2Wa 728Ch4Gqv/mDYabirMO6xf2Py07fX/Rk9t1ko=
- Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=mime-version:in-reply-to:references:date:message-id:subject:from:to :content-type; b=aVYjHeMtPPudhGw87Bq49ewL6w+7bo1hiy19/Zu72Oi/lrsVTNjEtPymMBSjg9PEYv h7bijMPL71xOTy3BAylMtu9y8DDz1NS75TrWanfLRFZxhvxlUBApdkDvBqtO0a7VU3B7 C+qOweD3hnn4RxNPOhHHaf0ushLsFTWdSzsSc=
This should satisfy your concerns.
I fiddled around elsewhere w/o changing anything in the core ei_pasin
function that I had written. I copied over stuff that I needed from
eigen's files. The max error is reasonable.
On a side note, I think that defining the sign_mask in PacketMath.h
file and using the _mm_andnot_ps might be faster for calculating the
absolute value.
Cheers,
--
Rohit Garg
http://rpg-314.blogspot.com/
Senior Undergraduate
Department of Physics
Indian Institute of Technology
Bombay
#include<pmmintrin.h>
#include<iostream>
#include<cmath>
using namespace std;
typedef __m128 Packet4f ;
/*
template<> inline Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); }
template<> inline Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); }
*/
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f ei_p4f_##NAME = _mm_set1_ps(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(_mm_set1_epi32(X))
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
_EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet4f(3half, 1.5);
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
_EIGEN_DECLARE_CONST_Packet4f(pi, 3.141592654);
_EIGEN_DECLARE_CONST_Packet4f(pi_over_2, 3.141592654*0.5);
_EIGEN_DECLARE_CONST_Packet4f(asin1, 4.2163199048E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin2, 2.4181311049E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin3, 4.5470025998E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin4, 7.4953002686E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin5, 1.6666752422E-1);
inline Packet4f ei_pabs(const Packet4f &x)
{return _mm_andnot_ps(ei_p4f_sign_mask,x);}
inline Packet4f ei_pmadd(const Packet4f &a,const Packet4f &b,const Packet4f &c)
{return _mm_add_ps(_mm_mul_ps(a,b),c);}
inline Packet4f ei_padd(const Packet4f &a,const Packet4f &b)
{return _mm_add_ps(a,b);}
inline Packet4f ei_psub(const Packet4f &a,const Packet4f &b)
{return _mm_sub_ps(a,b);}
inline Packet4f ei_pmul(const Packet4f &a,const Packet4f &b)
{return _mm_mul_ps(a,b);}
inline Packet4f ei_psqrt(Packet4f _x)
{
Packet4f half = ei_pmul(_x, ei_p4f_half);
Packet4f x = _mm_rsqrt_ps(_x);
x = ei_pmul(x, ei_psub(ei_p4f_3half, ei_pmul(half, ei_pmul(x,x))));
return ei_pmul(_x,x);
}
Packet4f ei_pasin(Packet4f _x)
{
Packet4f a = ei_pabs(_x);//got the absolute value
Packet4f sign_bit= _mm_and_ps(_x, ei_p4f_sign_mask);//extracted the sign bit
Packet4f z1,z2;//will need them during computation
//will compute the two branches for asin
//so first compare with half
Packet4f branch_mask= _mm_cmpgt_ps(a, ei_p4f_half);//this is to select which branch to take
//both will be taken, and finally results will be merged
//the branch for values >0.5
{
//the core series expansion
z1=ei_pmadd(ei_p4f_minus_half,a,ei_p4f_half);
Packet4f x1=ei_psqrt(z1);
Packet4f s1=ei_pmadd(ei_p4f_asin1, z1, ei_p4f_asin2);
Packet4f s2=ei_pmadd(s1, z1, ei_p4f_asin3);
Packet4f s3=ei_pmadd(s2,z1, ei_p4f_asin4);
Packet4f s4=ei_pmadd(s3,z1, ei_p4f_asin5);
Packet4f temp=ei_pmul(s4,z1);//not really a madd but a mul by z so that the next term can be a madd
z1=ei_pmadd(temp,x1,x1);
z1=ei_padd(z1,z1);
z1=ei_psub(ei_p4f_pi_over_2,z1);
}
{
//the core series expansion
Packet4f x2=a;
z2=ei_pmul(x2,x2);
Packet4f s1=ei_pmadd(ei_p4f_asin1, z2, ei_p4f_asin2);
Packet4f s2=ei_pmadd(s1, z2, ei_p4f_asin3);
Packet4f s3=ei_pmadd(s2,z2, ei_p4f_asin4);
Packet4f s4=ei_pmadd(s3,z2, ei_p4f_asin5);
Packet4f temp=ei_pmul(s4,z2);//not really a madd but a mul by z so that the next term can be a madd
z2=ei_pmadd(temp,x2,x2);
}
/* select the correct result from the two branch evaluations */
z1 = _mm_and_ps(branch_mask, z1);
z2 = _mm_andnot_ps(branch_mask, z2);
Packet4f z = _mm_or_ps(z1,z2);
/* update the sign */
return _mm_xor_ps(z, sign_bit);
}
const int MAX=100000;
float __attribute__((aligned(16))) in[MAX];
float __attribute__((aligned(16))) out[MAX];
const float frac=1.0/MAX;
int main()
{
int i;
__m128 temp;
float err, maxerr;
maxerr=0;
for(i=0; i<MAX; i++)
in[i]=float(i)*frac;
for(i=0; i< (MAX>>2); i++)
{
temp=ei_pasin(_mm_load_ps(in+4*i));
_mm_store_ps(out+4*i,temp);
//verify
err=(out[i]-asinf(in[i]));
maxerr=max(err,maxerr);
err=(out[i+1]-asinf(in[i+1]));
maxerr=max(err,maxerr);
err=(out[i+2]-asinf(in[i+2]));
maxerr=max(err,maxerr);
err=(out[i+3]-asinf(in[i+3]));
maxerr=max(err,maxerr);
}
cout<<maxerr<<endl;
return 0;
}