Built-in mod ('%') vs custom mod function: improve the performance of modulus operation
According to Chandler Carruth's benchmarks at CppCon 2015, the fastest modulo operator (on x86, when compiled with Clang) is:
int fast_mod(const int input, const int ceil) {
// apply the modulo operator only when needed
// (i.e. when the input is greater than the ceiling)
return input >= ceil ? input % ceil : input;
// NB: the assumption here is that the numbers are positive
}
I suggest that you watch the whole talk, he goes into more details on why this method is faster than just using %
unconditionally.
This will likely be compiler and platform dependent.
But I was interested and on my system you appear to be correct in my benchmarks. However the method from @865719's answer is fastest:
#include <chrono>
#include <iostream>
class Timer
{
using clk = std::chrono::steady_clock;
using microseconds = std::chrono::microseconds;
clk::time_point tsb;
clk::time_point tse;
public:
void clear() { tsb = tse = clk::now(); }
void start() { tsb = clk::now(); }
void stop() { tse = clk::now(); }
friend std::ostream& operator<<(std::ostream& o, const Timer& timer)
{
return o << timer.secs();
}
// return time difference in seconds
double secs() const
{
if(tse <= tsb)
return 0.0;
auto d = std::chrono::duration_cast<microseconds>(tse - tsb);
return d.count() / 1000000.0;
}
};
int mod(int a, int b)
{
int tmp=a/b;
return a-(b*tmp);
}
int fast_mod(const int input, const int ceil) {
// apply the modulo operator only when needed
// (i.e. when the input is greater than the ceiling)
return input < ceil ? input : input % ceil;
// NB: the assumption here is that the numbers are positive
}
int main()
{
auto N = 1000000000U;
unsigned sum = 0;
Timer timer;
for(auto times = 0U; times < 3; ++times)
{
std::cout << " run: " << (times + 1) << '\n';
sum = 0;
timer.start();
for(decltype(N) n = 0; n < N; ++n)
sum += n % (N - n);
timer.stop();
std::cout << " %: " << sum << " " << timer << "s" << '\n';
sum = 0;
timer.start();
for(decltype(N) n = 0; n < N; ++n)
sum += mod(n, N - n);
timer.stop();
std::cout << " mod: " << sum << " " << timer << "s" << '\n';
sum = 0;
timer.start();
for(decltype(N) n = 0; n < N; ++n)
sum += fast_mod(n, N - n);
timer.stop();
std::cout << "fast_mod: " << sum << " " << timer << "s" << '\n';
}
}
Build: GCC 5.1.1
(x86_64)
g++ -std=c++14 -march=native -O3 -g0 ...
Output:
run: 1
%: 3081207628 5.49396s
mod: 3081207628 4.30814s
fast_mod: 3081207628 2.51296s
run: 2
%: 3081207628 5.5522s
mod: 3081207628 4.25427s
fast_mod: 3081207628 2.52364s
run: 3
%: 3081207628 5.4947s
mod: 3081207628 4.29646s
fast_mod: 3081207628 2.56916s