| [eigen] vectorization for ints not working |
[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]
I have a simple piece of code involving matrix4i, and the compiler is not generating vectorized code for it. This is on Fedora 11, 64-bit, gcc 4.4.1. To compile, use "make eigen". I am interested in vectorization of the code in the addressesGen function, which is enclosed in asm directives. There are clear signs of Eigen unrolling the loop, but I'd like my vectorization please. :) To compile, do "make eigen". -- Rohit Garg http://rpg-314.blogspot.com/ Senior Undergraduate Department of Physics Indian Institute of Technology Bombay
Attachment:
Makefile
Description: Binary data
#include<vector8s.h>
#include<vector4ui.h>
#include<Eigen/Core>
#include<Eigen/Dense>
using namespace std;
using namespace Eigen;
// Coordination number: how many neighbours each lattice site has.
const int cordNum = 4;
// Neighbours are processed in groups of this many, which is the SIMD width.
const int batchSize = 8;
// Number of batches needed to cover all neighbours: cordNum / batchSize, rounded up.
const int totalBatches = (cordNum + batchSize - 1) / batchSize;
// Padded neighbour count; this is the length used when declaring per-neighbour arrays.
const int totalSize = batchSize * totalBatches;
// Position of a lattice site, held as three short coordinates.
class cord
{
public:
    short x;
    short y;
    short z;

    // Builds the site at the origin (0, 0, 0).
    cord();
    // Builds the site at the given (x, y, z) coordinates.
    cord(const short&, const short&, const short&);
};

// Default constructor: every coordinate starts at zero.
cord::cord()
    : x(0), y(0), z(0)
{
}

// Coordinate constructor: p, q and r become x, y and z respectively.
inline cord::cord(const short& p, const short& q, const short& r)
    : x(p), y(q), z(r)
{
}
// Fixed-size Eigen column vector with one int per neighbour (cordNum entries);
// used for the integer byte-address arithmetic in addressesGen().
typedef Matrix<int, cordNum, 1> AddressVector_t ;
// Table of constants read by addressesGen() through the global `ptr`.
// main() allocates it with posix_memalign() on a 64-byte boundary and fills
// the fields in by hand; no constructor is run on that storage.
// NOTE(review): member order determines the offsets of the SIMD fields inside
// the 64-byte-aligned allocation - do not reorder without checking alignment.
struct alignedConsts
{
// Not referenced anywhere in the visible code.
// NOTE(review): presumably fills the first 64 bytes so the following SIMD
// members start on a 64-byte offset - confirm.
char atomClass[64];
// Per-batch coordinate offsets of the neighbours; added to the root x/y/z
// (zDelta is combined with the `negate` flag) in addressesGen().
vector8s xDelta[totalBatches];
vector8s yDelta[totalBatches];
vector8s zDelta[totalBatches];
// Mask ANDed with each coordinate before the three are packed into `bits`.
vector8s bitMask8;
// Constant ORed into every term of the bounds computation.
vector8s one;
// Per-axis values (indexed by X__, Y__, Z__) from which the neighbour
// coordinates are subtracted in the bounds computation.
vector8s maxBounds[3];
// Mask ANDed with each coordinate before the three are packed into the local byte offset.
vector8s byteLocalMask8;
// Per-element multipliers applied to (x >> 4) and (y >> 4) when the byte
// addresses are accumulated.
AddressVector_t xFactor,yFactor;
};
// Axis indices, used to select the x/y/z entry of per-axis arrays such as
// alignedConsts::maxBounds and the `neighbours` rows in main().
// NOTE: identifiers containing a double underscore are reserved in C++; the
// names are kept unchanged because the rest of the file refers to them.
const int X__ = 0;
const int Y__ = X__ + 1;
const int Z__ = Y__ + 1;
// Shared constant table read by addressesGen(); null until main() allocates it.
alignedConsts *ptr = 0;
// Generates the neighbour data for the lattice site (x, y, z) using the
// constants in the global table `ptr`, and writes the results to the caller's buffers.
//
// Output buffers - all are written through casts to __m128i*, even though the
// parameters are declared pointer-to-const, so they must refer to writable
// storage. main() declares them 16-byte aligned.
//   bits         - per neighbour, the masked x/y/z coordinates packed into one short
//                  (totalBatches*batchSize entries).
//   bytes        - per neighbour, an int byte address (cordNum entries); only
//                  batch 0 contributes to it.
//   neighboursX/Y/Z - per neighbour, its x / y / z coordinate.
//   bounds       - per neighbour, a flag built from the sign tests below.
// Inputs:
//   x, y, z      - coordinates of the root site.
//   negate       - with negate == 1 the z offset is subtracted, with negate == 0
//                  it is added (see the zVec computation below).
//
// NOTE(review): the lane-wise meaning of vector8s operators (+, -, ^, &, |,
// <<, >>), of store() and of first4() is defined in vector8s.h, which is not
// visible here; the comments below assume 8 lanes of signed 16-bit values and
// an arithmetic right shift - confirm.
void addressesGen(const short *bits, const int*bytes, const short *neighboursX,const short *neighboursY, const short *neighboursZ,
const short* bounds, const short x, const short y, const short z,const short negate)
{
// j is never used.
int i,j;
vector8s xVec[totalBatches];
vector8s yVec[totalBatches];
vector8s zVec[totalBatches];
vector8s minusNegateVec(-negate);
vector8s negateVec(negate);
vector8s rootX(x);
vector8s rootY(y);
vector8s rootZ(z);
for(i=0; i<totalBatches; i++)
{
// Neighbour coordinates: root coordinate plus the per-neighbour delta.
xVec[i]=rootX+(ptr->xDelta[i]);
yVec[i]=rootY+(ptr->yDelta[i]);
// Branch-free conditional negation of zDelta: for negate == 1,
// (-1 ^ d) + 1 is -d in two's complement, giving z - d; for negate == 0
// the XOR and the added term are both no-ops, giving z + d.
zVec[i] = (rootZ + negateVec) + (minusNegateVec ^ (ptr->zDelta[i]) );
// Bit offsets: masked x, y, z packed as (x << 4) | (y << 2) | z and stored to `bits`.
( ((xVec[i] & (ptr->bitMask8))<<4) | ((yVec[i] & (ptr->bitMask8))<<2) | (zVec[i] & (ptr->bitMask8)) ).store((__m128i*)&bits[i*batchSize]);
// Store the neighbour coordinates themselves.
xVec[i].store((__m128i*)&neighboursX[i*batchSize]);
yVec[i].store((__m128i*)&neighboursY[i*batchSize]);
zVec[i].store((__m128i*)&neighboursZ[i*batchSize]);
// Bounds flags: for each axis, (v >> 15) and ((maxBounds - v) >> 15) expose
// the sign of v and of (maxBounds - v); each is ORed with `one` and all six
// terms are ANDed together, then stored to `bounds`.
// NOTE(review): if each term is 1 or -1, the bitwise AND is -1 only when
// every term is -1 - confirm this is the intended way to combine the tests.
(
( (ptr->one | (xVec[i] >> 15)) & (ptr->one | ((ptr->maxBounds[X__]-xVec[i]) >> 15)) ) &
( (ptr->one | (yVec[i] >> 15)) & (ptr->one | ((ptr->maxBounds[Y__]-yVec[i]) >> 15)) ) &
( (ptr->one | (zVec[i] >> 15)) & (ptr->one | ((ptr->maxBounds[Z__]-zVec[i]) >> 15)) )
).store((__m128i*)&bounds[i*batchSize]);
}
//NOTE: this part is custom/handwritten for each lattice type, this one is for diamond
// Only batch 0 is used from here on.
// Generate and store the local byte offsets first: masked x, y, z packed as
// (x << 2) | y | (z >> 2), written to `bytes` via first4().
( ( (xVec[0] & ptr->byteLocalMask8) <<2 ) | ( yVec[0] & ptr->byteLocalMask8 ) | ( (zVec[0] & ptr->byteLocalMask8)>>2 ) ).first4((__m128i*)bytes);
// Map the memory for integer operations; `bytes` already holds the local byte offsets.
// The asm strings only mark this region in the generated assembly
// so the Eigen code between them can be inspected for vectorization.
asm("#it begins here!");
Map<AddressVector_t,ForceAligned> addresses(bytes);
AddressVector_t temp;
// tempPtr aliases temp's storage: each first4() call below overwrites temp
// just before temp is read by the following Eigen expression.
__m128i * tempPtr=(__m128i*)temp.data();
// addresses += (x >> 4) * xFactor, element-wise.
(xVec[0]>>4).first4(tempPtr);
addresses+=(temp.cwise()*(ptr->xFactor));
// addresses += (y >> 4) * yFactor, element-wise.
(yVec[0]>>4).first4(tempPtr);
addresses+=(temp.cwise()*(ptr->yFactor));
// addresses += (z >> 4) << 6.
((zVec[0]>>4)<<6).first4(tempPtr);
addresses+=temp;
asm("#it ends here!");
}
// Per-neighbour coordinate offsets, padded with zeros up to totalSize entries.
// main() hands these tables to vector8s::load() through an __m128i* cast, so
// they are given the same 16-byte alignment as the other SIMD buffers in this
// file; an aligned 128-bit load from an unaligned address would fault.
// NOTE(review): vector8s::load() is not visible here - if it performs an
// unaligned load the attribute is merely harmless.
static const short __attribute__((aligned(16))) xNeighbourDelta8[totalSize]={1,-1,-1,1,0,0,0,0};
static const short __attribute__((aligned(16))) yNeighbourDelta8[totalSize]={1,1,-1,-1,0,0,0,0};
// The z offsets are added when sitting on a black lattice site (negate == 0)
// and subtracted otherwise.
static const short __attribute__((aligned(16))) zNeighbourDelta8[totalSize]={-1,1,-1,1,0,0,0,0};
// Test driver: allocates and fills the shared constant table, runs
// addressesGen() for the site (10, 20, 30) and prints the generated bits,
// byte addresses and bounds flags.
// Returns 1 if the constant table cannot be allocated, 0 otherwise.
int main()
{
    cord u(10,20,30);

    // Output buffers for addressesGen(); 16-byte aligned because they are
    // written through __m128i pointers.
    short __attribute__((aligned(16))) bits[totalSize];
    int __attribute__((aligned(16))) bytes[cordNum];
    short __attribute__((aligned(16))) neighbours[3][totalSize];
    short __attribute__((aligned(16))) bounds[totalSize];

    //test the vec8 way
    cout<<"test shorts\n";
    {
        // posix_memalign() returns uninitialised storage and no constructor is
        // run on it, so every field addressesGen() reads must be set below.
        if(posix_memalign((void**)&ptr, 64, sizeof(alignedConsts))!=0)
        {
            return 1;
        }
        ptr->bitMask8.set(0x03);
        ptr->byteLocalMask8.set(0x0c);
        ptr->xFactor = AddressVector_t::Constant(40*64);
        ptr->yFactor = AddressVector_t::Constant(20*64);
        ptr->one.set(1);

        // maxBounds is read by addressesGen() when it builds the bounds flags;
        // previously it was left uninitialised, so the printed bounds depended
        // on whatever happened to be in the freshly allocated memory.
        // The lattice extents are not defined anywhere in this file, so the
        // largest positive short is used as a placeholder.
        // TODO(review): replace with the real per-axis lattice extents.
        const short noUpperBound = 0x7fff;
        ptr->maxBounds[X__].set(noUpperBound);
        ptr->maxBounds[Y__].set(noUpperBound);
        ptr->maxBounds[Z__].set(noUpperBound);

        ptr->xDelta[0].load((__m128i*)xNeighbourDelta8);
        ptr->yDelta[0].load((__m128i*)yNeighbourDelta8);
        ptr->zDelta[0].load((__m128i*)zNeighbourDelta8);

        // negate is the inverse of the lowest bit of u.x.
        short negate = (u.x & 3 & 1) ^ 1;
        if( u.x & 3 & 1 )
        {
            //current atom (u) is on black lattice
            //assuming that current x,y,z, ie those of u are legal
            cout<<"want negate=0; "<<negate<<endl;
        }
        else
        {
            //current atom, (u) on the white lattice
            //assuming that current x,y,z, ie those of u are legal
            cout<<"want negate=1; "<<negate<<endl;
        }

        // Sanity output: bit 1 of a positive and of a negative short.
        short x=1;
        cout<<"Test on x=1"<<((x>>1)&1)<<endl;
        x=-1;
        cout<<"Test on x=-1"<<((x>>1)&1)<<endl;

        cout<<"calling addresseGen()\n";
        addressesGen(bits, bytes, neighbours[X__], neighbours[Y__], neighbours[Z__], bounds, u.x, u.y, u.z, negate );

        int i;
        cout<<"Bits\n";
        for(i=0; i<totalSize; i++)
            cout<<bits[i]<<' ';
        cout<<"\nBytes\n";
        for(i=0; i<cordNum; i++)
            cout<<bytes[i]<<' ';
        cout<<"\nBounds\n";
        for(i=0; i<totalSize; i++)
            cout<<bounds[i]<<' ';
    }
    free(ptr);
    // Do not leave the global pointing at released memory.
    ptr = 0;
    return 0;
}
| Mail converted by MHonArc 2.6.19+ | http://listengine.tuxfamily.org/ |