Why is this expression faster than one of its components on its own?

0

Why is (i&1) | ((i&mostSig)?onesComp:0), where all three variables are integers, faster than i&1 on its own when compiled on GCC without optimisation?

The first expression obviously has to evaluate i&1 and then OR it with ((i&shift)?oneortwo:0), so I would expect it to be faster than i%2 but slower than i&1, and that is the case on Clang (3.8 and newer):enter image description here(mine is (i&1) | ((i&mostSig)?onesComp:0), broken i&1 (because it would break on a system that uses one's complement), and naive i%2)

But on GCC (5.5 - 8.1) my expression is marginally faster than the other two, all without optimisations:enter image description here(As above)

The full code for the first expression is:

  int i = -3;                                // Number to test
  int onesComp = !((-1) & 1);                // Check if machine uses One's complement
  int mostSig = 1<<(sizeof(i)*8-1);          // Represents the most significant bit (Should be unsigned, but serves for demonstration) 
  int odd = (i&1) | ((i&mostSig)?onesComp:0);// The bit being benchmarked

where the last line runs in a loop when benchmarking.

I don't think GCC is optimising the code despite the lack of -O flags, because all three expressions produce different assembly, and when I compile the code with -O3 all three perform exactly the same.

I have obviously run the benchmarks multiple times, and whilst the performance gap varies a bit, my expression is always the fastest, and i%2 is always the slowest.

EDIT: Here's the assembly GCC 8.1 produced for (i&1) | ((i&mostSig)?onesComp:0) using Google Benchmark:

   push   %rbp
   mov    %rsp,%rbp
   sub    $0x90,%rsp
   mov    %rdi,-0x88(%rbp)
   movl   $0x0,-0x54(%rbp)
   movl   $0xfffffffd,-0x4(%rbp)
   movl   $0x0,-0x8(%rbp)
   movl   $0x80000000,-0xc(%rbp)
   mov    -0x88(%rbp),%rax
   mov    %rax,-0x18(%rbp)
   mov    -0x18(%rbp),%rax
   mov    %rax,-0x28(%rbp)
   mov    -0x28(%rbp),%rax
   mov    %rax,-0x30(%rbp)
   mov    -0x30(%rbp),%rax
   movzbl 0x3c(%rax),%eax
   test   %al,%al
   je     4075ae <mine(benchmark::State&)+0x5c>
   mov    $0x0,%eax
   jmp    4075b6 <mine(benchmark::State&)+0x64>
   mov    -0x30(%rbp),%rax
   mov    0x78(%rax),%rax
   mov    %rax,-0x40(%rbp)
   mov    -0x30(%rbp),%rax
   mov    %rax,-0x38(%rbp)
   mov    -0x40(%rbp),%rax
   mov    -0x38(%rbp),%rdx
   mov    %rax,-0x70(%rbp)
   mov    %rdx,-0x68(%rbp)
   mov    -0x18(%rbp),%rax
   mov    %rax,-0x20(%rbp)
   mov    -0x20(%rbp),%rax
   mov    %rax,%rdi
   callq  416b60 <benchmark::State::StartKeepRunning()>
   movq   $0x0,-0x50(%rbp)
   movq   $0x0,-0x48(%rbp)
   mov    -0x50(%rbp),%rax
   mov    -0x48(%rbp),%rdx
   mov    %rax,-0x80(%rbp)
   mov    %rdx,-0x78(%rbp)
   mov    -0x70(%rbp),%rax
   test   %rax,%rax
   setne  %al
   movzbl %al,%eax
   test   %rax,%rax
   je     40761f <mine(benchmark::State&)+0xcd>
   mov    $0x1,%eax
   jmp    407630 <mine(benchmark::State&)+0xde>
   mov    -0x68(%rbp),%rax
   mov    %rax,%rdi
   callq  4168e0 <benchmark::State::FinishKeepRunning()>
   mov    $0x0,%eax
   test   %al,%al
   je     407693 <mine(benchmark::State&)+0x141>
   mov    -0x4(%rbp),%eax
   and    $0x1,%eax
   mov    %eax,%edx
   mov    -0x4(%rbp),%eax
   and    -0xc(%rbp),%eax
   test   %eax,%eax
   je     40764b <mine(benchmark::State&)+0xf9>
   mov    -0x8(%rbp),%eax
   jmp    407650 <mine(benchmark::State&)+0xfe>
   mov    $0x0,%eax
   or     %edx,%eax
   mov    %eax,-0x54(%rbp)
   mov    -0x54(%rbp),%eax
   mov    -0x70(%rbp),%rax
   test   %rax,%rax
   jne    40767a <mine(benchmark::State&)+0x128>
   mov    $0x426ca0,%ecx
   mov    $0x274,%edx
   mov    $0x426c40,%esi
   mov    $0x426c69,%edi
   callq  404240 <__assert_fail@plt>
   mov    -0x70(%rbp),%rax
   sub    $0x1,%rax
   mov    %rax,-0x70(%rbp)
   jmpq   407606 <mine(benchmark::State&)+0xb4>
   mov    %rax,%rdi
   callq  404790 <_Unwind_Resume@plt>
   nop
   leaveq 
   retq   

And this is for i&1:

   push   %rbp
   mov    %rsp,%rbp
   sub    $0x90,%rsp
   mov    %rdi,-0x88(%rbp)
   movl   $0x0,-0x54(%rbp)
   movl   $0xfffffffd,-0x4(%rbp)
   mov    -0x88(%rbp),%rax
   mov    %rax,-0x10(%rbp)
   mov    -0x10(%rbp),%rax
   mov    %rax,-0x20(%rbp)
   mov    -0x20(%rbp),%rax
   mov    %rax,-0x28(%rbp)
   mov    -0x28(%rbp),%rax
   movzbl 0x3c(%rax),%eax
   test   %al,%al
   je     4076e4 <broken(benchmark::State&)+0x4e>
   mov    $0x0,%eax
   jmp    4076ec <broken(benchmark::State&)+0x56>
   mov    -0x28(%rbp),%rax
   mov    0x78(%rax),%rax
   mov    %rax,-0x40(%rbp)
   mov    -0x28(%rbp),%rax
   mov    %rax,-0x38(%rbp)
   mov    -0x40(%rbp),%rax
   mov    -0x38(%rbp),%rdx
   mov    %rax,-0x70(%rbp)
   mov    %rdx,-0x68(%rbp)
   mov    -0x10(%rbp),%rax
   mov    %rax,-0x18(%rbp)
   mov    -0x18(%rbp),%rax
   mov    %rax,%rdi
   callq  416b60 <benchmark::State::StartKeepRunning()>
   movq   $0x0,-0x50(%rbp)
   movq   $0x0,-0x48(%rbp)
   mov    -0x50(%rbp),%rax
   mov    -0x48(%rbp),%rdx
   mov    %rax,-0x80(%rbp)
   mov    %rdx,-0x78(%rbp)
   mov    -0x70(%rbp),%rax
   test   %rax,%rax
   setne  %al
   movzbl %al,%eax
   test   %rax,%rax
   je     407755 <broken(benchmark::State&)+0xbf>
   mov    $0x1,%eax
   jmp    407766 <broken(benchmark::State&)+0xd0>
   mov    -0x68(%rbp),%rax
   mov    %rax,%rdi
   callq  4168e0 <benchmark::State::FinishKeepRunning()>
   mov    $0x0,%eax
   test   %al,%al
   je     4077ae <broken(benchmark::State&)+0x118>
   mov    -0x4(%rbp),%eax
   and    $0x1,%eax
   mov    %eax,-0x54(%rbp)
   mov    -0x54(%rbp),%eax
   mov    -0x70(%rbp),%rax
   test   %rax,%rax
   jne    407798 <broken(benchmark::State&)+0x102>
   mov    $0x426ca0,%ecx
   mov    $0x274,%edx
   mov    $0x426c40,%esi
   mov    $0x426c69,%edi
   callq  404240 <__assert_fail@plt>
   mov    -0x70(%rbp),%rax
   sub    $0x1,%rax
   mov    %rax,-0x70(%rbp)
   jmp    40773c <broken(benchmark::State&)+0xa6>
   mov    %rax,%rdi
   callq  404790 <_Unwind_Resume@plt>
   nop
   leaveq 
   retq   
c
performance
gcc
asked on Stack Overflow Jul 13, 2018 by DividedByZero • edited Jul 13, 2018 by Mark Setchell

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0