[eigen] Help needed to run a benchmark on many machines |
[ Thread Index |
Date Index
| More lists.tuxfamily.org/eigen Archives
]
- To: eigen <eigen@xxxxxxxxxxxxxxxxxxx>
- Subject: [eigen] Help needed to run a benchmark on many machines
- From: Benoit Jacob <jacob.benoit.1@xxxxxxxxx>
- Date: Wed, 18 Feb 2015 11:58:00 -0500
- Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=mime-version:date:message-id:subject:from:to:content-type; bh=ge9rhN2Cca48xy9UmC3LQO/yTGLuc6OdpkyBWjX3qQ8=; b=BdAFAqWHPpYfrf89m439NgaEgZ5/H1X6O9j4tQ+3w2LJp95mTL9KrU/4n9wtg7bo4s FssP7DBeip4dLeeeuxZk9gmDphUNsEO9fnPn9QkV3FCUNnyUhh6/doKtv3qxbcEkC7Ly GZlLMTmJlN2FTOk2rADnIhPB9ywFebnLkBLnLxztwBLS4EOST1EQ8JCqxxwEYEe/EyLS WkHM6JlwETB+03Zf0vv98LOL9/2njOA8riVkY8mdM+PIhEXuN5M67QAL31G4pYET/uVf 1XKzw8GAqlCcsihD23kfBjZn9C0+WoTtEyXxvGEEtu6XA//X7G8uzwSUkxdqHPQsz7aQ dp0Q==
Hi List,
I'm looking into tuning Eigen's matrix product blocking parameters on various machines. That involves actually measuring the impact of different parameter values on performance. The tracking bug for this effort is
http://eigen.tuxfamily.org/bz/show_bug.cgi?id=937I'm attaching a benchmark to this email. It tests all sorts of matrix product sizes with all sorts of blocking parameters, and outputs both a complete log and a table that might be what we need to directly provide Eigen with.
It would be very helpful if some people could help me run it on different machines. It runs at least on:
- GNU Linux
- Android
- Mac
If you run it, please record its output into a text file and send it back to me together with some description of your device (e.g. "nexus 6"). The benchmark already records some hardware info such as /proc/cpuinfo on linux.
Please also tell me your Eigen changeset, compiler and command line.
Compilation instructions:
- c++11 mode
- on intel, please compile with -mavx if available
- on ARM, please compile with -mfpu=neon-vfpv4 if available, otherwise -mfpu=neon for older devices. DO NOT pass a -march flag as that triggers compiler bugs spilling registers.
Example desktop command line:
c++ -mavx -DNDEBUG -O3 --std=c++0x benchmark-blocking-sizes.cpp -o b -I ../eigen && ./b | tee log-blocking-sizes-mac
Example Android command line:
$CXX $SRC -o $EXE -save-temps \
-O3 -DNDEBUG \
--std=c++0x -Wall -Wextra -pedantic \
-fPIE -pie -mfpu=neon-vfpv4 -mfloat-abi=softfp \
-I $HOME/eigen
Thanks!
Benoit
#include <iostream>
#include <ctime>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>
#include <fstream>
#ifndef __linux__
#include <sys/time.h>
#endif
int testk, testm, testn;
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K testk
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M testm
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N testn
#include <Eigen/Core>
using namespace Eigen;
using namespace std;
const int measurement_passes = 4;
const float min_accurate_time = 1e-2f;
const float conservativeness = 1.02f;
const size_t maxsize = 1024;
const size_t minsize = 16;
typedef MatrixXf MatrixType;
static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
static_assert(maxsize > minsize, "maxsize must be larger than minsize");
static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
double time() {
#ifdef __linux__
timespec t;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
return t.tv_sec + 1e-9 * t.tv_nsec;
#else
timeval t;
gettimeofday(&t, nullptr);
return t.tv_sec + 1e-6 * t.tv_usec;
#endif
}
struct humansize
{
size_t value;
humansize(size_t v) : value(v) {}
};
ostream& operator<<(ostream& s, const humansize& h)
{
if (h.value >> 20) {
return s << (h.value / float(1 << 20)) << "M";
} else if (h.value >> 10) {
return s << (h.value / float(1 << 10)) << "k";
} else {
return s << h.value;
}
}
struct size_triple_t
{
size_t k, m, n;
size_triple_t() : k(0), m(0), n(0) {}
size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
};
ostream& operator<<(ostream& s, const size_triple_t& t)
{
return s << "(k=" << humansize(t.k) << ", m=" << humansize(t.m) << ", n=" << humansize(t.n) << ")";
}
struct result_t
{
size_triple_t productsizes;
size_triple_t blocksizes;
float time;
result_t() : time(0) {}
result_t(const size_triple_t& p,
const size_triple_t& b,
float t)
: productsizes(p)
, blocksizes(b)
, time(t)
{}
result_t(const result_t& o)
: productsizes(o.productsizes)
, blocksizes(o.blocksizes)
, time(o.time)
{}
size_t pressure() const
{
return blocksizes.k * (blocksizes.m + blocksizes.n);
}
};
ostream& operator<<(ostream& s, const result_t& r)
{
const size_triple_t& p = r.productsizes;
const size_triple_t& b = r.blocksizes;
s << "product: " << p
<< ", block: " << b;
s << ", GFlop/s: " << 2e-9 * p.k * p.m * p.n / r.time;
return s;
}
void benchmark(result_t& result,
const size_triple_t& productsizes,
const size_triple_t& blocksizes)
{
testk = blocksizes.k;
testm = blocksizes.m;
testn = blocksizes.n;
static MatrixType a, b, c;
int iters_at_a_time = 1;
float timing = 0.0f;
while (true) {
a = MatrixType::Ones(productsizes.m, productsizes.k);
b = MatrixType::Ones(productsizes.k, productsizes.n);
c = MatrixType::Ones(productsizes.m, productsizes.n);
double starttime = time();
for (int i = 0; i < iters_at_a_time; i++) {
c = a * b;
}
double endtime = time();
timing = float(endtime - starttime);
if (timing >= min_accurate_time) {
break;
}
iters_at_a_time *= 2;
}
timing /= iters_at_a_time;
result = result_t(productsizes,
blocksizes,
timing);
}
bool operator<(const result_t& r1, const result_t& r2)
{
return r1.time < r2.time;
}
uint8_t log2_pot(size_t x) {
size_t l = 0;
while (x >>= 1) l++;
return l;
}
void print_results(const char* name, const vector<result_t>& results)
{
cout << "Encoding for " << name << ":" << endl;
cout << "minsize = " << minsize << endl;
cout << "maxsize = " << maxsize << endl;
size_t sizes = log2_pot(maxsize) - log2_pot(minsize) + 1;
cout << "number of sizes = " << sizes << endl;
size_t entries = sizes * sizes * sizes;
cout << "number of entries = (number of sizes)^3 = " << entries << endl;
if (entries != results.size()) {
cout << "fatal: wrong number of entries." << endl;
exit(1);
}
size_t i = 0;
cout << endl;
cout << "// Each row corresponds to a value of the (k, m) pair." << endl;
cout << "// Each columns corresponds to a value of n." << endl;
cout << "// k, m, n take the power-of-two values from " << minsize << " to " << maxsize << "." << endl;
cout << "// Each cell in the table gives the corresponding optimal triple" << endl;
cout << "// (kblock, mblock, nblock) encoded as a 12-bit hex value of the form 0xabc" << endl;
cout << "// where a, b, c are the log2 of kblock, mblock, nblock respectively." << endl;
cout << "uint16_t data[" << entries << "] = {" << endl;
for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
cout << " ";
for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
const result_t& r = results[i++];
if (r.productsizes.k != ksize || r.productsizes.m != msize || r.productsizes.n != nsize) {
cout << endl << "fatal: entries not in expected order" << endl;
exit(1);
}
uint16_t lk = log2_pot(r.blocksizes.k);
uint16_t lm = log2_pot(r.blocksizes.m);
uint16_t ln = log2_pot(r.blocksizes.n);
if ((lk | lm | ln) > 0xf) {
cout << "fatal: logs should fit in 4 bits each" << endl;
exit(1);
}
uint16_t val = (lk << 8) | (lm << 4) | ln;
cout << hex << "0x" << val << dec << ", ";
}
cout << "// k=" << ksize << ", m=" << msize << endl;
}
}
cout << "};" << endl << endl;
}
void print_cpuinfo() {
#ifdef __linux__
cout << "contents of /proc/cpuinfo:" << endl;
string line;
ifstream cpuinfo("/proc/cpuinfo");
if (cpuinfo.is_open()) {
while (getline(cpuinfo, line)) {
cout << line << endl;
}
cpuinfo.close();
}
cout << endl;
#elif defined __APPLE__
cout << "output of sysctl hw:" << endl;
system("sysctl hw");
cout << endl;
#endif
}
template <typename T>
string type_name()
{
return "unknown";
}
template<>
string type_name<float>()
{
return "float";
}
template<>
string type_name<double>()
{
return "double";
}
int main()
{
print_cpuinfo();
cout << "benchmark parameters:" << endl;
cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
cout << "minsize = " << minsize << endl;
cout << "maxsize = " << minsize << endl;
cout << "measurement_passes = " << measurement_passes << endl;
cout << "min_accurate_time = " << min_accurate_time << endl;
cout << "conservativeness = " << conservativeness << endl;
cout << endl;
cout.precision(3);
vector<result_t> fastest_results;
vector<result_t> conservative_results;
for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
size_triple_t p(ksize, msize, nsize);
vector<result_t> results_passes[measurement_passes];
cout << "***************************************************" << endl;
cout << "Optimizing product size (" << ksize << "," << msize << "," << nsize << ")" << endl << endl;
for (int pass = 0; pass < measurement_passes; pass++) {
cout << "Pass " << (pass+1) << "/" << measurement_passes << endl;
for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
size_triple_t b(kblock, mblock, nblock);
result_t r;
benchmark(r, p, b);
results_passes[pass].push_back(r);
cout << r << endl;
}
}
}
}
vector<result_t> results;
for (size_t i = 0; i < results_passes[0].size(); i++) {
result_t best_result = results_passes[0][i];
for (int pass = 0; pass < measurement_passes; pass++) {
if (results_passes[pass][i].time < best_result.time) {
best_result = results_passes[pass][i];
}
}
results.push_back(best_result);
}
cout << endl << "best results for size " << p << ":" << endl;
std::sort(results.begin(), results.end());
fastest_results.push_back(results[0]);
size_t best_pressure = results[0].pressure();
size_t best_pressure_index = 0;
for (size_t i = 0; i < results.size(); ++i) {
if (results[i].time > conservativeness * results[0].time) {
break;
}
cout << results[i] << endl;
size_t pressure = results[i].pressure();
if (pressure < best_pressure) {
best_pressure = pressure;
best_pressure_index = i;
}
}
conservative_results.push_back(results[best_pressure_index]);
cout << endl;
cout << "fastest choice:" << endl << fastest_results.back() << endl;
cout << "conservative choice:" << endl << conservative_results.back() << endl;
cout << endl;
}
}
}
print_results("strictly fastest results (not recommended)", fastest_results);
print_results("conservative results (recommended)", conservative_results);
}