[eigen] vectorization for ints not working

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]


I have a simple piece of code involving matrix4i, and it is not
being compiled with vectorization. This is on Fedora 11, 64-bit, gcc 4.4.1.
To compile, use make eigen. I am interested in the vectorization of the
code in the addressesGen function, which is enclosed in asm directives.
There are clear signs of Eigen unrolling the loop, but I'd like my
vectorization please. :)

To compile, do make eigen.

-- 
Rohit Garg

http://rpg-314.blogspot.com/

Senior Undergraduate
Department of Physics
Indian Institute of Technology
Bombay

Attachment: Makefile
Description: Binary data

#include<vector8s.h>
#include<vector4ui.h>
#include<Eigen/Core>
#include<Eigen/Dense>
using namespace std;
using namespace Eigen;

//Sizing constants for the neighbour arrays.
const int cordNum      = 4;   //coordination number
const int batchSize    = 8;   //neighbours are processed in batches of 8, the simd width
//number of batches needed to cover cordNum neighbours: cordNum/batchSize rounded up
const int totalBatches = (cordNum + batchSize - 1) / batchSize;
//final size that we will declare: cordNum padded up to a whole number of batches
const int totalSize    = batchSize * totalBatches;

//A triple of short coordinates (x,y,z).
class cord
    {
public:
    short x;
    short y;
    short z;

    //sets all three coordinates to 0
    cord();
    //sets x, y and z from the three arguments, in that order
    cord(const short&, const short&, const short&);
    };

//Default constructor: all three coordinates start at 0.
cord::cord()
    : x(0), y(0), z(0)
    {
    }

//Constructs a coordinate with x=p, y=q, z=r.
inline cord::cord(const short & p,const short &q,const short& r)
    : x(p), y(q), z(r)
    {
    }

//Fixed-size Eigen column vector of cordNum ints (cordNum rows, 1 column);
//addressesGen() uses it for the per-neighbour addresses it accumulates in 'bytes'.
typedef Matrix<int, cordNum, 1> AddressVector_t ;

//Constants read by addressesGen() through the global 'ptr'.
//main() allocates the single instance with posix_memalign (64-byte alignment) and
//fills in the fields noted below.
struct alignedConsts
    {
    char            atomClass[64];          //not referenced by the code in this file
    vector8s        xDelta[totalBatches];   //per-neighbour x offsets (loaded from xNeighbourDelta8)
    vector8s        yDelta[totalBatches];   //per-neighbour y offsets (loaded from yNeighbourDelta8)
    vector8s        zDelta[totalBatches];   //per-neighbour z offsets (loaded from zNeighbourDelta8)
    vector8s        bitMask8;               //mask applied to coordinates for the 'bits' output (main sets 0x03)
    vector8s        one;                    //main sets this to 1; used in the bounds expression
    vector8s        maxBounds[3];           //upper bounds indexed by X__/Y__/Z__; NOTE(review): never assigned in this file
    vector8s        byteLocalMask8;         //mask applied to coordinates for the local byte offset (main sets 0x0c)
    AddressVector_t xFactor;                //multiplier for (x>>4) in the address computation
    AddressVector_t yFactor;                //multiplier for (y>>4) in the address computation
    };

//Axis indices, used to index alignedConsts::maxBounds and the neighbours[][] array in main().
//NOTE(review): identifiers containing a double underscore are reserved for the
//implementation in C++; kept as-is because other code refers to these names.
const int X__ = 0,
          Y__ = 1,
          Z__ = 2;

//Global constants block: allocated and filled in by main(), read by addressesGen(),
//released by main() with free(). Null until main() allocates it.
alignedConsts* ptr = 0;

//Computes, for the site at (x,y,z), the coordinates of its neighbours (root coordinate
//plus the per-lane deltas held in ptr->xDelta/yDelta/zDelta) and derives several
//per-neighbour quantities from them.
//
//Parameters:
//  bits                    - receives ((nx & bitMask8)<<4) | ((ny & bitMask8)<<2) | (nz & bitMask8)
//  bytes                   - receives the local byte offset, then has the x/y/z address terms added
//  neighboursX/Y/Z         - receive the neighbour coordinates
//  bounds                  - receives the result of the bounds expression
//  x, y, z                 - coordinates of the root site
//  negate                  - selects the sign applied to zDelta (see the zVec line)
//
//NOTE(review): every pointer parameter is declared pointer-to-const, yet is passed to
//store()/first4()/Map through a C-style cast and is evidently used as an output
//(main() prints bits, bytes and bounds after the call). The signature is kept for callers.
//NOTE(review): the semantics of vector8s (broadcast constructor, lanewise operators,
//store, first4) come from vector8s.h, which is not visible here; comments below assume
//eight 16-bit lanes operated on lanewise - confirm against that header.
//The casts to __m128i* suggest 16-byte aligned buffers are expected - confirm.
void addressesGen(const short *bits, const int*bytes, const short *neighboursX,const short *neighboursY, const short *neighboursZ,
    const short* bounds, const short x, const short y, const short z,const short negate)
    {
//NOTE(review): j is never used.
    int i,j;
    vector8s xVec[totalBatches];
    vector8s yVec[totalBatches];
    vector8s zVec[totalBatches];
    vector8s minusNegateVec(-negate);
    vector8s negateVec(negate);
    vector8s rootX(x);
    vector8s rootY(y);
    vector8s rootZ(z);

    for(i=0; i<totalBatches; i++)
	{
	xVec[i]=rootX+(ptr->xDelta[i]);
	yVec[i]=rootY+(ptr->yDelta[i]);
//negate==0: 0 ^ zDelta == zDelta, so z + zDelta.
//negate==1: -1 ^ zDelta == ~zDelta, and z + 1 + ~zDelta == z - zDelta (two's complement).
	zVec[i] = (rootZ + negateVec) + (minusNegateVec ^ (ptr->zDelta[i]) );
//generated neighbours
	( ((xVec[i] & (ptr->bitMask8))<<4) | ((yVec[i] & (ptr->bitMask8))<<2) | (zVec[i] & (ptr->bitMask8)) ).store((__m128i*)&bits[i*batchSize]);
//stored bit offsets
	xVec[i].store((__m128i*)&neighboursX[i*batchSize]);
	yVec[i].store((__m128i*)&neighboursY[i*batchSize]);
	zVec[i].store((__m128i*)&neighboursZ[i*batchSize]);
//stored neighbours
//Bounds: for each axis, one term built from the coordinate's bit 15 and one built from
//bit 15 of (maxBounds - coordinate).
//NOTE(review): if >>15 smears the sign bit, each (one | ...) term is 1 for a passing
//check and -1 for a failing one; every term then has bit 0 set, so ANDing them gives -1
//only when all six checks fail - confirm this is the intended encoding.
	    (	
	    ( (ptr->one | (xVec[i] >> 15)) & (ptr->one | ((ptr->maxBounds[X__]-xVec[i]) >> 15)) ) &
	    ( (ptr->one | (yVec[i] >> 15)) & (ptr->one | ((ptr->maxBounds[Y__]-yVec[i]) >> 15)) ) &
	    ( (ptr->one | (zVec[i] >> 15)) & (ptr->one | ((ptr->maxBounds[Z__]-zVec[i]) >> 15)) )
	    ).store((__m128i*)&bounds[i*batchSize]);

//generated and stored bounds
	}
//NOTE: this part is custom/handwritten for each lattice type, this one is for diamond
//Only batch 0 is used from here on.
//generate and store the local byte offsets first. 
//first4() presumably widens the first four 16-bit lanes to ints and stores them - confirm.
    ( ( (xVec[0] & ptr->byteLocalMask8) <<2 ) | ( yVec[0] & ptr->byteLocalMask8 ) | ( (zVec[0] & ptr->byteLocalMask8)>>2 ) ).first4((__m128i*)bytes);
//map the memory for integer operations
//bytes has the byteLocal offsets preloaded
//The two asm statements only emit comment lines into the generated assembly, marking
//the region whose vectorization is being inspected.
    asm("#it begins here!");
//Eigen view over 'bytes'; ForceAligned tells Eigen to assume the pointer is aligned.
    Map<AddressVector_t,ForceAligned> addresses(bytes);
    AddressVector_t temp;
    __m128i * tempPtr=(__m128i*)temp.data();

//addresses += (x>>4)*xFactor + (y>>4)*yFactor + ((z>>4)<<6), coefficient-wise,
//with each term staged through 'temp'.
    (xVec[0]>>4).first4(tempPtr);
    addresses+=(temp.cwise()*(ptr->xFactor));
    (yVec[0]>>4).first4(tempPtr);
    addresses+=(temp.cwise()*(ptr->yFactor));
    ((zVec[0]>>4)<<6).first4(tempPtr);
    addresses+=temp;
    asm("#it ends here!");

    }

//Per-neighbour coordinate offsets, one table per axis. main() loads each table into
//the matching ptr->xDelta/yDelta/zDelta entry. The first four entries are the
//neighbour offsets; the remaining entries are zero.
static const short xNeighbourDelta8[totalSize] =
    {
     1, -1, -1,  1,
     0,  0,  0,  0
    };
static const short yNeighbourDelta8[totalSize] =
    {
     1,  1, -1, -1,
     0,  0,  0,  0
    };
//you add it if you are sitting on a black lattice site
static const short zNeighbourDelta8[totalSize] =
    {
    -1,  1, -1,  1,
     0,  0,  0,  0
    };

//Test driver: allocates and fills the global constants block, calls addressesGen()
//for the site (10,20,30) and prints the bits, bytes and bounds it produced.
//Returns 1 if the aligned allocation fails, 0 otherwise.
//
//NOTE(review): ptr->maxBounds is never assigned here, but addressesGen() reads it, and
//posix_memalign does not initialise the memory it returns - the printed bounds therefore
//depend on uninitialised data. The intended bounds are not visible in this file.
//NOTE(review): the members of *ptr are used without any constructor having been run.
//NOTE(review): neighbours[][] is filled by addressesGen() but never printed.
int main()
    {
    cord u(10,20,30);

    //output buffers, 16-byte aligned
    short __attribute__((aligned(16))) bits[totalSize];
    int   __attribute__((aligned(16))) bytes[cordNum];
    short __attribute__((aligned(16))) neighbours[3][totalSize];
    short __attribute__((aligned(16))) bounds[totalSize];

    //test the vec8 way
    cout<<"test shorts\n";

    if(posix_memalign((void**)&ptr, 64, sizeof(alignedConsts))!=0)
        return 1;

    //masks, address factors and the constant 1
    ptr->bitMask8.set(0x03);
    ptr->byteLocalMask8.set(0x0c);
    ptr->xFactor = AddressVector_t::Constant(40*64);
    ptr->yFactor = AddressVector_t::Constant(20*64);
    ptr->one.set(1);

    //neighbour offsets for the single batch
    ptr->xDelta[0].load((__m128i*)xNeighbourDelta8);
    ptr->yDelta[0].load((__m128i*)yNeighbourDelta8);
    ptr->zDelta[0].load((__m128i*)zNeighbourDelta8);

    //assuming that current x,y,z, ie those of u are legal:
    //odd u.x  -> current atom (u) is on the black lattice, negate = 0
    //even u.x -> current atom (u) is on the white lattice, negate = 1
    const bool  onBlack = (u.x & 3 & 1) != 0;
    const short negate  = onBlack ? 0 : 1;
    cout<<(onBlack ? "want negate=0; " : "want negate=1; ")<<negate<<endl;

    //quick check of >> on a positive and a negative short
    const short plusOne  = 1;
    const short minusOne = -1;
    cout<<"Test on x=1"<<((plusOne>>1)&1)<<endl;
    cout<<"Test on x=-1"<<((minusOne>>1)&1)<<endl;

    cout<<"calling addresseGen()\n";
    addressesGen(bits, bytes, neighbours[X__], neighbours[Y__], neighbours[Z__], bounds, u.x, u.y, u.z, negate );

    cout<<"Bits\n";
    for(int k=0; k<totalSize; ++k)
        cout<<bits[k]<<' ';

    cout<<"\nBytes\n";
    for(int k=0; k<cordNum; ++k)
        cout<<bytes[k]<<' ';

    cout<<"\nBounds\n";
    for(int k=0; k<totalSize; ++k)
        cout<<bounds[k]<<' ';

    free(ptr);
    return 0;
    }


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/