[eigen] vectorization for ints not working |
[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]
I have a simple piece of code involving matrix4i, and it is not being vectorized when compiled. This is on Fedora 11, 64-bit, gcc 4.4.1. To compile, use make eigen. I am interested in vectorization of the code in the addressesGen function which is enclosed between the asm directives. There are clear signs of Eigen unrolling the loop, but I'd like my vectorization please. :) To compile, do make eigen. -- Rohit Garg http://rpg-314.blogspot.com/ Senior Undergraduate Department of Physics Indian Institute of Technology Bombay
Attachment:
Makefile
Description: Binary data
// Test driver for neighbour-address generation on a lattice, using the
// project's 8 x 16-bit SIMD wrapper (vector8s) and Eigen fixed-size int vectors.
//
// NOTE(review): vector8s / vector4ui are project headers not visible here; the
// comments below assume their operators act lane-wise on eight shorts and that
// they bring __m128i into scope -- confirm against those headers.
#include <stdlib.h>   // posix_memalign, free
#include <iostream>   // cout, endl

#include<vector8s.h>
#include<vector4ui.h>
#include<Eigen/Core>
#include<Eigen/Dense>

using namespace std;
using namespace Eigen;

const int cordNum = 4;   //coordination number
const int batchSize = 8; //neighbours are processed in batches of 8, the simd width
//final size that we will declare: cordNum rounded up to whole batches
const int totalBatches = (cordNum%batchSize)?((cordNum/batchSize)+1):cordNum/batchSize;
const int totalSize = totalBatches*batchSize;

// Three 16-bit lattice coordinates.
class cord
{
public:
    short x,y,z;
    cord();
    cord(const short&, const short&, const short&);
};

// Default-constructs at the origin.
cord::cord()
{
    x=y=z=0;
}

// Constructs from explicit (x, y, z).
inline cord::cord(const short & p,const short &q,const short& r)
{
    x=p;
    y=q;
    z=r;
}

// One 32-bit address per neighbour.
typedef Matrix<int, cordNum, 1> AddressVector_t ;

// Constants consumed by addressesGen(); main() allocates this with 64-byte
// alignment via posix_memalign.
struct alignedConsts
{
    char atomClass[64];
    vector8s xDelta[totalBatches];   // per-neighbour x offsets
    vector8s yDelta[totalBatches];   // per-neighbour y offsets
    vector8s zDelta[totalBatches];   // per-neighbour z offsets
    vector8s bitMask8;               // mask applied before packing the bit offsets
    vector8s one;
    vector8s maxBounds[3];           // indexed by X__, Y__, Z__
    vector8s byteLocalMask8;         // mask applied before packing the local byte offsets
    AddressVector_t xFactor,yFactor; // multipliers for the (coordinate >> 4) terms
};

// Indices into alignedConsts::maxBounds and main()'s neighbours[] array.
const int X__=0;
const int Y__=1;
const int Z__=2;

// Shared constants; must point at an initialised alignedConsts before
// addressesGen() is called.
alignedConsts *ptr;

// Generates neighbour coordinates, packed bit offsets, a bounds word and the
// final integer addresses for the site (x, y, z).
//
// Outputs (all written through despite the const-qualified pointer types;
// every buffer is accessed via __m128i* so it should be 16-byte aligned):
//   bits         - totalSize shorts: ((x&mask)<<4) | ((y&mask)<<2) | (z&mask)
//   bytes        - cordNum ints: local byte offsets, then accumulated addresses
//   neighboursX/Y/Z - totalSize shorts each: neighbour coordinates
//   bounds       - totalSize shorts: combined sign tests against 0 and maxBounds
// Inputs:
//   x, y, z      - coordinates of the current site
//   negate       - 0 or 1; with two's-complement lanes,
//                  (z + negate) + ((-negate) ^ dz) equals z + dz when negate==0
//                  and z - dz when negate==1
//
// NOTE(review): the output pointers are declared const but are stored to via
// C-style casts. The signature is kept as-is for existing callers; callers must
// pass writable memory.
void addressesGen(const short *bits, const int*bytes,
                  const short *neighboursX,const short *neighboursY, const short *neighboursZ,
                  const short* bounds,
                  const short x, const short y, const short z,const short negate)
{
    int i;
    vector8s xVec[totalBatches];
    vector8s yVec[totalBatches];
    vector8s zVec[totalBatches];
    vector8s minusNegateVec(-negate);
    vector8s negateVec(negate);
    vector8s rootX(x);
    vector8s rootY(y);
    vector8s rootZ(z);

    for(i=0; i<totalBatches; i++)
    {
        xVec[i]=rootX+(ptr->xDelta[i]);
        yVec[i]=rootY+(ptr->yDelta[i]);
        zVec[i] = (rootZ + negateVec) + (minusNegateVec ^ (ptr->zDelta[i]) );
        //generated neighbours

        ( ((xVec[i] & (ptr->bitMask8))<<4)
        | ((yVec[i] & (ptr->bitMask8))<<2)
        |  (zVec[i] & (ptr->bitMask8)) ).store((__m128i*)&bits[i*batchSize]);
        //stored bit offsets

        xVec[i].store((__m128i*)&neighboursX[i*batchSize]);
        yVec[i].store((__m128i*)&neighboursY[i*batchSize]);
        zVec[i].store((__m128i*)&neighboursZ[i*batchSize]);
        //stored neigbours

        // NOTE(review): every factor below is (one | something), so bit 0 is
        // set in each factor and therefore in their AND -- confirm that this
        // is the intended in-bounds encoding.
        ( ( (ptr->one | (xVec[i] >> 15)) &
            (ptr->one | ((ptr->maxBounds[X__]-xVec[i]) >> 15)) )
        & ( (ptr->one | (yVec[i] >> 15)) &
            (ptr->one | ((ptr->maxBounds[Y__]-yVec[i]) >> 15)) )
        & ( (ptr->one | (zVec[i] >> 15)) &
            (ptr->one | ((ptr->maxBounds[Z__]-zVec[i]) >> 15)) )
        ).store((__m128i*)&bounds[i*batchSize]);
        //generated and stored bounds
    }

    //NOTE: this part is custom/handwritten for each lattice type, this one is for diamond
    //generate and store the local byte offsets first.
    // presumably first4() widens the first four lanes to 32-bit ints -- verify
    ( ( (xVec[0] & ptr->byteLocalMask8) <<2 )
    | (  yVec[0] & ptr->byteLocalMask8 )
    | ( (zVec[0] & ptr->byteLocalMask8)>>2 ) ).first4((__m128i*)bytes);

    //map the memory for integer operations
    //bytes has the byteLocal offsets preloaded
    // The asm comments only mark this region in the generated assembly so the
    // Eigen-generated code can be inspected.
    asm("#it begins here!");
    Map<AddressVector_t,ForceAligned> addresses(bytes);
    AddressVector_t temp;
    __m128i * tempPtr=(__m128i*)temp.data();

    (xVec[0]>>4).first4(tempPtr);
    addresses+=(temp.cwise()*(ptr->xFactor));

    (yVec[0]>>4).first4(tempPtr);
    addresses+=(temp.cwise()*(ptr->yFactor));

    ((zVec[0]>>4)<<6).first4(tempPtr);
    addresses+=temp;
    asm("#it ends here!");
}

// Neighbour offset tables, padded with zeros to a full batch. They are read
// through __m128i* casts, so they carry the same 16-byte alignment as the
// other SIMD buffers in this file.
static const short __attribute__((aligned(16))) xNeighbourDelta8[totalSize]={1,-1,-1,1,0,0,0,0};
static const short __attribute__((aligned(16))) yNeighbourDelta8[totalSize]={1,1,-1,-1,0,0,0,0};
//you add it if you are sitting on a black lattice site
static const short __attribute__((aligned(16))) zNeighbourDelta8[totalSize]={-1,1,-1,1,0,0,0,0};

// Sets up the shared constants, runs addressesGen() once for site (10,20,30)
// and prints the bits, bytes and bounds outputs. Returns 1 if the aligned
// allocation fails, 0 otherwise.
int main()
{
    cord u(10,20,30);
    short __attribute__((aligned(16))) bits[totalSize];
    int __attribute__((aligned(16))) bytes[cordNum];
    short __attribute__((aligned(16))) neighbours[3][totalSize];
    short __attribute__((aligned(16))) bounds[totalSize];

    //test the vec8 way
    cout<<"test shorts\n";
    {
        if(posix_memalign((void**)&ptr, 64, sizeof(alignedConsts))!=0)
        {
            return 1;
        }
        // NOTE(review): ptr->maxBounds[] is never initialised below, yet
        // addressesGen() reads it when computing bounds -- the printed bounds
        // depend on uninitialised memory. Confirm the intended limits.
        ptr->bitMask8.set(0x03);
        ptr->byteLocalMask8.set(0x0c);
        ptr->xFactor = AddressVector_t::Constant(40*64);
        ptr->yFactor = AddressVector_t::Constant(20*64);
        ptr->one.set(1);

        int i;
        ptr->xDelta[0].load((__m128i*)xNeighbourDelta8);
        ptr->yDelta[0].load((__m128i*)yNeighbourDelta8);
        ptr->zDelta[0].load((__m128i*)zNeighbourDelta8);

        short negate = (u.x & 3 & 1) ^ 1;
        if( u.x & 3 & 1 )
        {
            //current atom (u) is on black lattice
            //assuming that current x,y,z, ie those of u are legal
            cout<<"want negate=0; "<<negate<<endl;
        }
        else
        {
            //current atom, (u) on the white lattice
            //assuming that current x,y,z, ie those of u are legal
            cout<<"want negate=1; "<<negate<<endl;
        }

        short x=1;
        cout<<"Test on x=1"<<((x>>1)&1)<<endl;
        x=-1;
        cout<<"Test on x=-1"<<((x>>1)&1)<<endl;

        cout<<"calling addresseGen()\n";
        addressesGen(bits, bytes, neighbours[X__], neighbours[Y__], neighbours[Z__], bounds, u.x, u.y, u.z, negate );

        cout<<"Bits\n";
        for(i=0; i<totalSize; i++)
            cout<<bits[i]<<' ';
        cout<<"\nBytes\n";
        for(i=0; i<cordNum; i++)
            cout<<bytes[i]<<' ';
        cout<<"\nBounds\n";
        for(i=0; i<totalSize; i++)
            cout<<bounds[i]<<' ';
    }
    free(ptr);
    return 0;
}
Mail converted by MHonArc 2.6.19+ | http://listengine.tuxfamily.org/ |