Open
Description
Hi,
I wrote a small benchmark program for arithmetic functions I'd like to use (see below). I observe a drastic performance problem when using function1 which slightly differs from function2. Here is the output:
Benchmark function1
t1 accumulated: 0.019393
t2 accumulated: 0.021725
t3 accumulated: 0.401697
t3 max within single iteration: 0.369303
Benchmark function2
t1 accumulated: 0.022363
t2 accumulated: 0.025782
t3 accumulated: 0.014175
There appears to be a point within the loop when the calculation takes a very long time (see t3 max compared to t3 accumulated). For function1 we have a *= someArray
while for function2 we have a = someArray*someArray
. How does this huge difference (factor ~30 between both functions) result and how can I fix it as I clearly want to use function1?
I use a GeForce GTX TITAN with ArrayFire version 3.6.2.
#include "arrayfire.h"
#include <iostream>
af::array function1(const af::array &a, const af::array &b, double c, double &t1, double &t2, double &t3){
af::timer start1 = af::timer::start();
af::array temp = b * c;
t1 = af::timer::stop(start1);
af::timer start2 = af::timer::start();
temp = af::exp(temp);
t2 = af::timer::stop(start2);
af::timer start3 = af::timer::start();
af::array ret = a * temp;
t3 = af::timer::stop(start3);
return ret;
}
af::array function2(const af::array &a, const af::array &b, double c, double &t1, double &t2, double &t3){
af::timer start1 = af::timer::start();
af::array temp = b * c;
t1 = af::timer::stop(start1);
af::timer start2 = af::timer::start();
temp = af::exp(temp);
t2 = af::timer::stop(start2);
af::timer start3 = af::timer::start();
af::array dummy = a * temp; // only to show that a * temp is performed as well, but the result isn't used
af::array ret = temp * temp; // difference to function1 HERE
t3 = af::timer::stop(start3);
return ret;
}
void benchmarkComposedFn(){
// parameters
double t1,t2,t3,t1_acc,t2_acc,t3_acc,t3_max;
t1_acc = 0.0; t2_acc = 0.0; t3_acc = 0.0; t3_max=0.0;
int reps = 1000;
long n = pow(2,20);
af::dim4 af_dim = af::dim4(n, 2);
// data
af::array a = af::randu(af_dim, c64);
af::array b = af::randu(af_dim, c64);
double c = 0.0; // this results in a = a * exp(0) = a
// check function1
for (int j = 0; j<reps; j++)
{
a = function1(a,b,c,t1,t2,t3);
t1_acc += t1; t2_acc += t2; t3_acc += t3;
t3_max = t3 > t3_max ? t3 : t3_max;
}
std::cout << "Benchmark function1" << '\n';
std::cout << "t1 accumulated: " << t1_acc << '\n';
std::cout << "t2 accumulated: " << t2_acc << '\n';
std::cout << "t3 accumulated: " << t3_acc << '\n';
std::cout << "t3 max within single iteration: " << t3_max << '\n';
t1_acc = 0.0; t2_acc = 0.0; t3_acc = 0.0;
// check function2
for (int j = 0; j<reps; j++)
{
a = function2(a,b,c,t1,t2,t3);
t1_acc += t1; t2_acc += t2; t3_acc += t3;
}
std::cout << "Benchmark function2" << '\n';
std::cout << "t1 accumulated: " << t1_acc << '\n';
std::cout << "t2 accumulated: " << t2_acc << '\n';
std::cout << "t3 accumulated: " << t3_acc << '\n';
t1_acc = 0.0; t2_acc = 0.0; t3_acc = 0.0;
}
int main(){
af::setBackend(AF_BACKEND_CUDA);
benchmarkComposedFn();
return 0;
}