Why is (i&1) | ((i&mostSig)?onesComp:0)
, where all three variables are integers, faster than i&1
on its own when compiled on GCC without optimisation?
The first expression obviously has to evaluate i&1
and then OR it with ((i&shift)?oneortwo:0)
, so I would expect it to be faster than i%2
but slower than i&1
, and that is the case on Clang (3.8 and newer):(mine is
(i&1) | ((i&mostSig)?onesComp:0)
, broken i&1
(because it would break on a system that uses one's complement), and naive i%2
)
But on GCC (5.5 - 8.1) my expression is marginally faster than the other two, all without optimisations:(As above)
The full code for the first expression is:
int i = -3; // Number to test
int onesComp = !((-1) & 1); // Check if machine uses One's complement
int mostSig = 1<<(sizeof(i)*8-1); // Represents the most significant bit (Should be unsigned, but serves for demonstration)
int odd = (i&1) | ((i&mostSig)?onesComp:0);// The bit being benchmarked
where the last line runs in a loop when benchmarking.
I don't think GCC is optimising the code despite the lack of -O flags, because all three expressions produce different assembly, and when I compile the code with -O3 all three perform exactly the same.
I have obviously run the benchmarks multiple times, and whilst the performance gap varies a bit, my expression is always the fastest, and i%2
is always the slowest.
EDIT: Here's the assembly GCC 8.1 produced for (i&1) | ((i&mostSig)?onesComp:0)
using Google Benchmark:
push %rbp
mov %rsp,%rbp
sub $0x90,%rsp
mov %rdi,-0x88(%rbp)
movl $0x0,-0x54(%rbp)
movl $0xfffffffd,-0x4(%rbp)
movl $0x0,-0x8(%rbp)
movl $0x80000000,-0xc(%rbp)
mov -0x88(%rbp),%rax
mov %rax,-0x18(%rbp)
mov -0x18(%rbp),%rax
mov %rax,-0x28(%rbp)
mov -0x28(%rbp),%rax
mov %rax,-0x30(%rbp)
mov -0x30(%rbp),%rax
movzbl 0x3c(%rax),%eax
test %al,%al
je 4075ae <mine(benchmark::State&)+0x5c>
mov $0x0,%eax
jmp 4075b6 <mine(benchmark::State&)+0x64>
mov -0x30(%rbp),%rax
mov 0x78(%rax),%rax
mov %rax,-0x40(%rbp)
mov -0x30(%rbp),%rax
mov %rax,-0x38(%rbp)
mov -0x40(%rbp),%rax
mov -0x38(%rbp),%rdx
mov %rax,-0x70(%rbp)
mov %rdx,-0x68(%rbp)
mov -0x18(%rbp),%rax
mov %rax,-0x20(%rbp)
mov -0x20(%rbp),%rax
mov %rax,%rdi
callq 416b60 <benchmark::State::StartKeepRunning()>
movq $0x0,-0x50(%rbp)
movq $0x0,-0x48(%rbp)
mov -0x50(%rbp),%rax
mov -0x48(%rbp),%rdx
mov %rax,-0x80(%rbp)
mov %rdx,-0x78(%rbp)
mov -0x70(%rbp),%rax
test %rax,%rax
setne %al
movzbl %al,%eax
test %rax,%rax
je 40761f <mine(benchmark::State&)+0xcd>
mov $0x1,%eax
jmp 407630 <mine(benchmark::State&)+0xde>
mov -0x68(%rbp),%rax
mov %rax,%rdi
callq 4168e0 <benchmark::State::FinishKeepRunning()>
mov $0x0,%eax
test %al,%al
je 407693 <mine(benchmark::State&)+0x141>
mov -0x4(%rbp),%eax
and $0x1,%eax
mov %eax,%edx
mov -0x4(%rbp),%eax
and -0xc(%rbp),%eax
test %eax,%eax
je 40764b <mine(benchmark::State&)+0xf9>
mov -0x8(%rbp),%eax
jmp 407650 <mine(benchmark::State&)+0xfe>
mov $0x0,%eax
or %edx,%eax
mov %eax,-0x54(%rbp)
mov -0x54(%rbp),%eax
mov -0x70(%rbp),%rax
test %rax,%rax
jne 40767a <mine(benchmark::State&)+0x128>
mov $0x426ca0,%ecx
mov $0x274,%edx
mov $0x426c40,%esi
mov $0x426c69,%edi
callq 404240 <__assert_fail@plt>
mov -0x70(%rbp),%rax
sub $0x1,%rax
mov %rax,-0x70(%rbp)
jmpq 407606 <mine(benchmark::State&)+0xb4>
mov %rax,%rdi
callq 404790 <_Unwind_Resume@plt>
nop
leaveq
retq
And this is for i&1
:
push %rbp
mov %rsp,%rbp
sub $0x90,%rsp
mov %rdi,-0x88(%rbp)
movl $0x0,-0x54(%rbp)
movl $0xfffffffd,-0x4(%rbp)
mov -0x88(%rbp),%rax
mov %rax,-0x10(%rbp)
mov -0x10(%rbp),%rax
mov %rax,-0x20(%rbp)
mov -0x20(%rbp),%rax
mov %rax,-0x28(%rbp)
mov -0x28(%rbp),%rax
movzbl 0x3c(%rax),%eax
test %al,%al
je 4076e4 <broken(benchmark::State&)+0x4e>
mov $0x0,%eax
jmp 4076ec <broken(benchmark::State&)+0x56>
mov -0x28(%rbp),%rax
mov 0x78(%rax),%rax
mov %rax,-0x40(%rbp)
mov -0x28(%rbp),%rax
mov %rax,-0x38(%rbp)
mov -0x40(%rbp),%rax
mov -0x38(%rbp),%rdx
mov %rax,-0x70(%rbp)
mov %rdx,-0x68(%rbp)
mov -0x10(%rbp),%rax
mov %rax,-0x18(%rbp)
mov -0x18(%rbp),%rax
mov %rax,%rdi
callq 416b60 <benchmark::State::StartKeepRunning()>
movq $0x0,-0x50(%rbp)
movq $0x0,-0x48(%rbp)
mov -0x50(%rbp),%rax
mov -0x48(%rbp),%rdx
mov %rax,-0x80(%rbp)
mov %rdx,-0x78(%rbp)
mov -0x70(%rbp),%rax
test %rax,%rax
setne %al
movzbl %al,%eax
test %rax,%rax
je 407755 <broken(benchmark::State&)+0xbf>
mov $0x1,%eax
jmp 407766 <broken(benchmark::State&)+0xd0>
mov -0x68(%rbp),%rax
mov %rax,%rdi
callq 4168e0 <benchmark::State::FinishKeepRunning()>
mov $0x0,%eax
test %al,%al
je 4077ae <broken(benchmark::State&)+0x118>
mov -0x4(%rbp),%eax
and $0x1,%eax
mov %eax,-0x54(%rbp)
mov -0x54(%rbp),%eax
mov -0x70(%rbp),%rax
test %rax,%rax
jne 407798 <broken(benchmark::State&)+0x102>
mov $0x426ca0,%ecx
mov $0x274,%edx
mov $0x426c40,%esi
mov $0x426c69,%edi
callq 404240 <__assert_fail@plt>
mov -0x70(%rbp),%rax
sub $0x1,%rax
mov %rax,-0x70(%rbp)
jmp 40773c <broken(benchmark::State&)+0xa6>
mov %rax,%rdi
callq 404790 <_Unwind_Resume@plt>
nop
leaveq
retq
User contributions licensed under CC BY-SA 3.0