Re: [eigen] Help needed to run a benchmark on many machines |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/eigen Archives
]
- To: Eigen <eigen@xxxxxxxxxxxxxxxxxxx>
- Subject: Re: [eigen] Help needed to run a benchmark on many machines
- From: Hauke Heibel <hauke.heibel@xxxxxxxxx>
- Date: Wed, 18 Feb 2015 18:22:36 +0100
- Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=mime-version:in-reply-to:references:date:message-id:subject:from:to :content-type; bh=QXa8edtWRabxuCcIKZuVuS6HntwfMkzhKkuifgwqnPs=; b=OHGYLfe9T5V0V7wJZSm+t6yi9YRe45jfz/M4mtAwwVA7PSwQUIM2Nhpv0CjCdeJCHr nfFHUum+dufXIGXjrCZpvNRhSgViLzAWFG7ctjtZk8MWxwyz4RO7jy0ZFuKyrjB/hJ13 X0hw9zE73n1FPyF2BK8/YdDZTdKESJ+rPWvrBQL12JVHaIjZL9TV4yjLo6XcncBIv8dR Yq9xRDHMNFbtC0nmhJrv4VGY5WRDSJ7/XuAFw/BBDBkd2ZG60wUYLoOH3j98wJMTsHx3 M/Nir0pZY0CqA/usFnm3gdXVEkBy2wvqqXXJOVTNeFPNQFreuH0+NUuBzJa4NxFpjU4K OPww==
Hi Benoit,
I am running the test right now. I needed to add a few changes (see
attached main.cpp). Classical windows specific stuff.
I also needed to change std::min<SizeType> to std::min<Index> in the
default branch.
I will let you know once the test finishes.
Regards,
Hauke
On Wed, Feb 18, 2015 at 5:58 PM, Benoit Jacob <jacob.benoit.1@xxxxxxxxx> wrote:
> Hi List,
>
> I'm looking into tuning Eigen's matrix product blocking parameters on
> various machines. That involves actually measuring the impact of different
> parameter values on performance. The tracking bug for this effort is
> http://eigen.tuxfamily.org/bz/show_bug.cgi?id=937
>
> I'm attaching a benchmark to this email. It tests all sorts of matrix
> product sizes with all sorts of blocking parameters, and outputs both a
> complete log and a table that might be what we need to directly provide
> Eigen with.
>
> It would be very helpful if some people could help me run it on different
> machines. It runs at least on:
> - GNU Linux
> - Android
> - Mac
>
> If you run it, please record its output into a text file and send it back to
> me together with some description of your device (e.g. "nexus 6"). The
> benchmark already records some hardware info such as /proc/cpuinfo on linux.
>
> Please also tell me your Eigen changeset, compiler and command line.
>
> Compilation instructions:
> - most important: please use today's devel branch of Eigen - must have the
> patch from http://eigen.tuxfamily.org/bz/show_bug.cgi?id=958 .
> - c++11 mode
> - on intel, please compile with -mavx if available
> - on ARM, please compile with -mfpu=neon-vfpv4 if available, otherwise
> -mfpu=neon for older devices. DO NOT pass a -march flag as that triggers
> compiler bugs spilling registers.
> - on ARM, bonus points if you can also apply the patch from
> http://eigen.tuxfamily.org/bz/show_bug.cgi?id=955 - not necessary but
> better.
>
> Example desktop command line:
> c++ -mavx -DNDEBUG -O3 --std=c++0x benchmark-blocking-sizes.cpp -o b -I
> ../eigen && ./b | tee log-blocking-sizes-mac
>
> Example Android command line:
> $CXX $SRC -o $EXE -save-temps \
> -O3 -DNDEBUG \
> --std=c++0x -Wall -Wextra -pedantic \
> -fPIE -pie -mfpu=neon-vfpv4 -mfloat-abi=softfp \
> -I $HOME/eigen
>
> Thanks!
> Benoit
>
#include <iostream>
#include <ctime>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>
#include <fstream>
#ifndef __linux__
#ifndef _WIN32
#include <sys/time.h>
#endif
#endif
#if defined(_WIN32)
#include <windows.h>
LARGE_INTEGER frequency;
#endif
int testk, testm, testn;
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K testk
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M testm
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N testn
#include <Eigen/Core>
using namespace Eigen;
using namespace std;
const int measurement_passes = 4;
const float min_accurate_time = 1e-2f;
const float conservativeness = 1.02f;
const size_t maxsize = 1024;
const size_t minsize = 16;
typedef MatrixXf MatrixType;
static_assert( ( maxsize & ( maxsize - 1 ) ) == 0, "maxsize must be a power of two" );
static_assert( ( minsize & ( minsize - 1 ) ) == 0, "minsize must be a power of two" );
static_assert( maxsize > minsize, "maxsize must be larger than minsize" );
static_assert( maxsize < ( minsize << 16 ), "maxsize must be less than (minsize<<16)" );
double time()
{
#ifdef __linux__
timespec t;
clock_gettime( CLOCK_THREAD_CPUTIME_ID, &t );
return t.tv_sec + 1e-9 * t.tv_nsec;
#elif _WIN32
LARGE_INTEGER now;
::QueryPerformanceCounter( &now );
return now.QuadPart / static_cast<double>( frequency.QuadPart );
#else
timeval t;
gettimeofday( &t, nullptr );
return t.tv_sec + 1e-6 * t.tv_usec;
#endif
}
struct humansize
{
size_t value;
humansize( size_t v ) : value( v ) {}
};
ostream& operator<<( ostream& s, const humansize& h )
{
if ( h.value >> 20 )
{
return s << ( h.value / float( 1 << 20 ) ) << "M";
}
else if ( h.value >> 10 )
{
return s << ( h.value / float( 1 << 10 ) ) << "k";
}
else
{
return s << h.value;
}
}
struct size_triple_t
{
size_t k, m, n;
size_triple_t() : k( 0 ), m( 0 ), n( 0 ) {}
size_triple_t( size_t _k, size_t _m, size_t _n ) : k( _k ), m( _m ), n( _n ) {}
size_triple_t( const size_triple_t& o ) : k( o.k ), m( o.m ), n( o.n ) {}
};
ostream& operator<<( ostream& s, const size_triple_t& t )
{
return s << "(k=" << humansize( t.k ) << ", m=" << humansize( t.m ) << ", n=" << humansize( t.n ) << ")";
}
struct result_t
{
size_triple_t productsizes;
size_triple_t blocksizes;
float time;
result_t() : time( 0 ) {}
result_t( const size_triple_t& p,
const size_triple_t& b,
float t )
: productsizes( p )
, blocksizes( b )
, time( t )
{}
result_t( const result_t& o )
: productsizes( o.productsizes )
, blocksizes( o.blocksizes )
, time( o.time )
{}
size_t pressure() const
{
return blocksizes.k * ( blocksizes.m + blocksizes.n );
}
};
ostream& operator<<( ostream& s, const result_t& r )
{
const size_triple_t& p = r.productsizes;
const size_triple_t& b = r.blocksizes;
s << "product: " << p
<< ", block: " << b;
s << ", GFlop/s: " << 2e-9 * p.k * p.m * p.n / r.time;
return s;
}
void benchmark( result_t& result,
const size_triple_t& productsizes,
const size_triple_t& blocksizes )
{
testk = blocksizes.k;
testm = blocksizes.m;
testn = blocksizes.n;
static MatrixType a, b, c;
int iters_at_a_time = 1;
float timing = 0.0f;
while ( true )
{
a = MatrixType::Ones( productsizes.m, productsizes.k );
b = MatrixType::Ones( productsizes.k, productsizes.n );
c = MatrixType::Ones( productsizes.m, productsizes.n );
double starttime = time();
for ( int i = 0; i < iters_at_a_time; i++ )
{
c = a * b;
}
double endtime = time();
timing = float( endtime - starttime );
if ( timing >= min_accurate_time )
{
break;
}
iters_at_a_time *= 2;
}
timing /= iters_at_a_time;
result = result_t( productsizes,
blocksizes,
timing );
}
bool operator<( const result_t& r1, const result_t& r2 )
{
return r1.time < r2.time;
}
uint8_t log2_pot( size_t x )
{
size_t l = 0;
while ( x >>= 1 ) { l++; }
return l;
}
void print_results( const char* name, const vector<result_t>& results )
{
cout << "Encoding for " << name << ":" << endl;
cout << "minsize = " << minsize << endl;
cout << "maxsize = " << maxsize << endl;
size_t sizes = log2_pot( maxsize ) - log2_pot( minsize ) + 1;
cout << "number of sizes = " << sizes << endl;
size_t entries = sizes * sizes * sizes;
cout << "number of entries = (number of sizes)^3 = " << entries << endl;
if ( entries != results.size() )
{
cout << "fatal: wrong number of entries." << endl;
exit( 1 );
}
size_t i = 0;
cout << endl;
cout << "// Each row corresponds to a value of the (k, m) pair." << endl;
cout << "// Each columns corresponds to a value of n." << endl;
cout << "// k, m, n take the power-of-two values from " << minsize << " to " << maxsize << "." << endl;
cout << "// Each cell in the table gives the corresponding optimal triple" << endl;
cout << "// (kblock, mblock, nblock) encoded as a 12-bit hex value of the form 0xabc" << endl;
cout << "// where a, b, c are the log2 of kblock, mblock, nblock respectively." << endl;
cout << "uint16_t data[" << entries << "] = {" << endl;
for ( size_t ksize = minsize; ksize <= maxsize; ksize *= 2 )
{
for ( size_t msize = minsize; msize <= maxsize; msize *= 2 )
{
cout << " ";
for ( size_t nsize = minsize; nsize <= maxsize; nsize *= 2 )
{
const result_t& r = results[i++];
if ( r.productsizes.k != ksize || r.productsizes.m != msize || r.productsizes.n != nsize )
{
cout << endl << "fatal: entries not in expected order" << endl;
exit( 1 );
}
uint16_t lk = log2_pot( r.blocksizes.k );
uint16_t lm = log2_pot( r.blocksizes.m );
uint16_t ln = log2_pot( r.blocksizes.n );
if ( ( lk | lm | ln ) > 0xf )
{
cout << "fatal: logs should fit in 4 bits each" << endl;
exit( 1 );
}
uint16_t val = ( lk << 8 ) | ( lm << 4 ) | ln;
cout << hex << "0x" << val << dec << ", ";
}
cout << "// k=" << ksize << ", m=" << msize << endl;
}
}
cout << "};" << endl << endl;
}
void print_cpuinfo()
{
#ifdef __linux__
cout << "contents of /proc/cpuinfo:" << endl;
string line;
ifstream cpuinfo( "/proc/cpuinfo" );
if ( cpuinfo.is_open() )
{
while ( getline( cpuinfo, line ) )
{
cout << line << endl;
}
cpuinfo.close();
}
cout << endl;
#elif defined __APPLE__
cout << "output of sysctl hw:" << endl;
system( "sysctl hw" );
cout << endl;
#endif
}
template <typename T>
string type_name()
{
return "unknown";
}
template<>
string type_name<float>()
{
return "float";
}
template<>
string type_name<double>()
{
return "double";
}
int main()
{
#if defined(_WIN32)
::QueryPerformanceFrequency( &frequency );
#endif
print_cpuinfo();
cout << "benchmark parameters:" << endl;
cout << "pointer size: " << 8 * sizeof( void* ) << " bits" << endl;
cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
cout << "minsize = " << minsize << endl;
cout << "maxsize = " << minsize << endl;
cout << "measurement_passes = " << measurement_passes << endl;
cout << "min_accurate_time = " << min_accurate_time << endl;
cout << "conservativeness = " << conservativeness << endl;
cout << endl;
cout.precision( 3 );
vector<result_t> fastest_results;
vector<result_t> conservative_results;
for ( size_t ksize = minsize; ksize <= maxsize; ksize *= 2 )
{
for ( size_t msize = minsize; msize <= maxsize; msize *= 2 )
{
for ( size_t nsize = minsize; nsize <= maxsize; nsize *= 2 )
{
size_triple_t p( ksize, msize, nsize );
vector<result_t> results_passes[measurement_passes];
cout << "***************************************************" << endl;
cout << "Optimizing product size (" << ksize << "," << msize << "," << nsize << ")" << endl << endl;
for ( int pass = 0; pass < measurement_passes; pass++ )
{
cout << "Pass " << ( pass + 1 ) << "/" << measurement_passes << endl;
for ( size_t kblock = minsize; kblock <= ksize; kblock *= 2 )
{
for ( size_t mblock = minsize; mblock <= msize; mblock *= 2 )
{
for ( size_t nblock = minsize; nblock <= nsize; nblock *= 2 )
{
size_triple_t b( kblock, mblock, nblock );
result_t r;
benchmark( r, p, b );
results_passes[pass].push_back( r );
cout << r << endl;
}
}
}
}
vector<result_t> results;
for ( size_t i = 0; i < results_passes[0].size(); i++ )
{
result_t best_result = results_passes[0][i];
for ( int pass = 0; pass < measurement_passes; pass++ )
{
if ( results_passes[pass][i].time < best_result.time )
{
best_result = results_passes[pass][i];
}
}
results.push_back( best_result );
}
cout << endl << "best results for size " << p << ":" << endl;
std::sort( results.begin(), results.end() );
fastest_results.push_back( results[0] );
size_t best_pressure = results[0].pressure();
size_t best_pressure_index = 0;
for ( size_t i = 0; i < results.size(); ++i )
{
if ( results[i].time > conservativeness * results[0].time )
{
break;
}
cout << results[i] << endl;
size_t pressure = results[i].pressure();
if ( pressure < best_pressure )
{
best_pressure = pressure;
best_pressure_index = i;
}
}
conservative_results.push_back( results[best_pressure_index] );
cout << endl;
cout << "fastest choice:" << endl << fastest_results.back() << endl;
cout << "conservative choice:" << endl << conservative_results.back() << endl;
cout << endl;
}
}
}
print_results( "strictly fastest results (not recommended)", fastest_results );
print_results( "conservative results (recommended)", conservative_results );
}