Re: [eigen] sse asin implementation

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


This should satisfy your concerns.

I fiddled around elsewhere w/o changing anything in the core ei_pasin
function that I had written. I copied over stuff that I needed from
eigen's files. The max error is reasonable.

On a side note, I think that defining the sign_mask in PacketMath.h
file and using the _mm_andnot_ps might be faster for calculating the
absolute value.

Cheers,

-- 
Rohit Garg

http://rpg-314.blogspot.com/

Senior Undergraduate
Department of Physics
Indian Institute of Technology
Bombay
#include<pmmintrin.h>
#include<iostream>
#include<cmath>
using namespace std;

typedef __m128 Packet4f ;
/*
template<> inline Packet4f ei_pset1<float>(const float&  from) { return _mm_set1_ps(from); }

template<> inline Packet4i ei_pset1<int>(const int&    from) { return _mm_set1_epi32(from); }
*/

#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f ei_p4f_##NAME = _mm_set1_ps(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(_mm_set1_epi32(X))

_EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
_EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet4f(3half, 1.5);

_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);

_EIGEN_DECLARE_CONST_Packet4f(pi, 3.141592654);
_EIGEN_DECLARE_CONST_Packet4f(pi_over_2, 3.141592654*0.5);

_EIGEN_DECLARE_CONST_Packet4f(asin1, 4.2163199048E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin2, 2.4181311049E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin3, 4.5470025998E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin4, 7.4953002686E-2);
_EIGEN_DECLARE_CONST_Packet4f(asin5, 1.6666752422E-1);

inline Packet4f ei_pabs(const Packet4f &x)
    {return _mm_andnot_ps(ei_p4f_sign_mask,x);}

inline Packet4f ei_pmadd(const Packet4f &a,const Packet4f &b,const Packet4f &c) 
    {return _mm_add_ps(_mm_mul_ps(a,b),c);}

inline Packet4f ei_padd(const Packet4f &a,const Packet4f &b)  
    {return _mm_add_ps(a,b);}

inline Packet4f ei_psub(const Packet4f &a,const Packet4f &b)  
    {return _mm_sub_ps(a,b);}

inline Packet4f ei_pmul(const Packet4f &a,const Packet4f &b)
    {return _mm_mul_ps(a,b);}

inline Packet4f ei_psqrt(Packet4f _x)
{
  Packet4f half = ei_pmul(_x, ei_p4f_half);
  Packet4f x = _mm_rsqrt_ps(_x);
  x = ei_pmul(x, ei_psub(ei_p4f_3half, ei_pmul(half, ei_pmul(x,x))));
  return ei_pmul(_x,x);
}

Packet4f ei_pasin(Packet4f _x)
{

  Packet4f a = ei_pabs(_x);//got the absolute value

  Packet4f sign_bit= _mm_and_ps(_x, ei_p4f_sign_mask);//extracted the sign bit

  Packet4f z1,z2;//will need them during computation	


//will compute the two branches for asin
//so first compare with half

  Packet4f branch_mask= _mm_cmpgt_ps(a, ei_p4f_half);//this is to select which branch to take
//both will be taken, and finally results will be merged
//the branch for values >0.5

    {
//the core series expansion 
    z1=ei_pmadd(ei_p4f_minus_half,a,ei_p4f_half);
    Packet4f x1=ei_psqrt(z1);
    Packet4f s1=ei_pmadd(ei_p4f_asin1, z1, ei_p4f_asin2);
    Packet4f s2=ei_pmadd(s1, z1, ei_p4f_asin3);
    Packet4f s3=ei_pmadd(s2,z1, ei_p4f_asin4);
    Packet4f s4=ei_pmadd(s3,z1, ei_p4f_asin5);
    Packet4f temp=ei_pmul(s4,z1);//not really a madd but a mul by z so that the next term can be a madd
    z1=ei_pmadd(temp,x1,x1);
    z1=ei_padd(z1,z1);
    z1=ei_psub(ei_p4f_pi_over_2,z1);
    }

    {
//the core series expansion 
    Packet4f x2=a;
    z2=ei_pmul(x2,x2);
    Packet4f s1=ei_pmadd(ei_p4f_asin1, z2, ei_p4f_asin2);
    Packet4f s2=ei_pmadd(s1, z2, ei_p4f_asin3);
    Packet4f s3=ei_pmadd(s2,z2, ei_p4f_asin4);
    Packet4f s4=ei_pmadd(s3,z2, ei_p4f_asin5);
    Packet4f temp=ei_pmul(s4,z2);//not really a madd but a mul by z so that the next term can be a madd
    z2=ei_pmadd(temp,x2,x2);
    }

/* select the correct result from the two branch evaluations */
  z1  = _mm_and_ps(branch_mask, z1);
  z2  = _mm_andnot_ps(branch_mask, z2);
  Packet4f z  = _mm_or_ps(z1,z2);

/* update the sign */
  return _mm_xor_ps(z, sign_bit);

}

const int MAX=100000;
float __attribute__((aligned(16))) in[MAX];
float __attribute__((aligned(16))) out[MAX];

const float frac=1.0/MAX;
int main()
    {
    int i;
    __m128 temp;
    float err, maxerr;
    maxerr=0;
    for(i=0; i<MAX; i++)
	in[i]=float(i)*frac;
    for(i=0; i< (MAX>>2); i++)
	{
	temp=ei_pasin(_mm_load_ps(in+4*i));
	_mm_store_ps(out+4*i,temp);
//verify
	err=(out[i]-asinf(in[i]));
	maxerr=max(err,maxerr);
	err=(out[i+1]-asinf(in[i+1]));
	maxerr=max(err,maxerr);	
	err=(out[i+2]-asinf(in[i+2]));
	maxerr=max(err,maxerr);
	err=(out[i+3]-asinf(in[i+3]));
	maxerr=max(err,maxerr);
	}
    cout<<maxerr<<endl;
    return 0;
    }


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/