Tail Call Optimization (TCO) in Clang on O2

1

I was playing with Tail Call Optimization (TCO) and various optimization levels in clang on godbolt. I have the very simple function (lifted from here):

int factorial (int x, int y){
  if (x==0)
    return y;
  else
    return factorial(x-1,y*x);
}

If I set -O1, all is good and I understand the generated assembly. However, if I set -O2, a lot more happens and I don't understand what is happening. Could someone explain?

Generated Assembly at -02:

.LCPI0_0:
        .long   0                       # 0x0
        .long   4294967295              # 0xffffffff
        .long   4294967294              # 0xfffffffe
        .long   4294967293              # 0xfffffffd
.LCPI0_1:
        .zero   4
        .long   1                       # 0x1
        .long   1                       # 0x1
        .long   1                       # 0x1
.LCPI0_2:
        .long   1                       # 0x1
        .long   1                       # 0x1
        .long   1                       # 0x1
        .long   1                       # 0x1
.LCPI0_3:
        .long   4294967292              # 0xfffffffc
        .long   4294967292              # 0xfffffffc
        .long   4294967292              # 0xfffffffc
        .long   4294967292              # 0xfffffffc
.LCPI0_4:
        .long   4294967288              # 0xfffffff8
        .long   4294967288              # 0xfffffff8
        .long   4294967288              # 0xfffffff8
        .long   4294967288              # 0xfffffff8
.LCPI0_5:
        .long   4294967284              # 0xfffffff4
        .long   4294967284              # 0xfffffff4
        .long   4294967284              # 0xfffffff4
        .long   4294967284              # 0xfffffff4
.LCPI0_6:
        .long   4294967280              # 0xfffffff0
        .long   4294967280              # 0xfffffff0
        .long   4294967280              # 0xfffffff0
        .long   4294967280              # 0xfffffff0
.LCPI0_7:
        .long   4294967276              # 0xffffffec
        .long   4294967276              # 0xffffffec
        .long   4294967276              # 0xffffffec
        .long   4294967276              # 0xffffffec
.LCPI0_8:
        .long   4294967272              # 0xffffffe8
        .long   4294967272              # 0xffffffe8
        .long   4294967272              # 0xffffffe8
        .long   4294967272              # 0xffffffe8
.LCPI0_9:
        .long   4294967268              # 0xffffffe4
        .long   4294967268              # 0xffffffe4
        .long   4294967268              # 0xffffffe4
        .long   4294967268              # 0xffffffe4
.LCPI0_10:
        .long   4294967264              # 0xffffffe0
        .long   4294967264              # 0xffffffe0
        .long   4294967264              # 0xffffffe0
        .long   4294967264              # 0xffffffe0
factorial(int, int):                         # @factorial(int, int)
        mov     eax, esi
        test    edi, edi
        je      .LBB0_12
        cmp     edi, 8
        jb      .LBB0_11
        mov     ecx, edi
        and     ecx, -8
        movd    xmm0, edi
        pshufd  xmm6, xmm0, 0           # xmm6 = xmm0[0,0,0,0]
        paddd   xmm6, xmmword ptr [rip + .LCPI0_0]
        movd    xmm0, eax
        movaps  xmm1, xmmword ptr [rip + .LCPI0_1] # xmm1 = <u,1,1,1>
        movss   xmm1, xmm0              # xmm1 = xmm0[0],xmm1[1,2,3]
        lea     esi, [rcx - 8]
        mov     edx, esi
        shr     edx, 3
        add     edx, 1
        mov     eax, edx
        and     eax, 3
        cmp     esi, 24
        jae     .LBB0_4
        movdqa  xmm2, xmmword ptr [rip + .LCPI0_2] # xmm2 = [1,1,1,1]
        test    eax, eax
        jne     .LBB0_7
        jmp     .LBB0_9
.LBB0_4:
        mov     esi, 1
        sub     esi, edx
        lea     edx, [rax + rsi]
        add     edx, -1
        movdqa  xmm2, xmmword ptr [rip + .LCPI0_2] # xmm2 = [1,1,1,1]
        movdqa  xmm9, xmmword ptr [rip + .LCPI0_4] # xmm9 = [4294967288,4294967288,4294967288,4294967288]
        movdqa  xmm10, xmmword ptr [rip + .LCPI0_5] # xmm10 = [4294967284,4294967284,4294967284,4294967284]
        movdqa  xmm11, xmmword ptr [rip + .LCPI0_6] # xmm11 = [4294967280,4294967280,4294967280,4294967280]
        movdqa  xmm12, xmmword ptr [rip + .LCPI0_7] # xmm12 = [4294967276,4294967276,4294967276,4294967276]
        movdqa  xmm13, xmmword ptr [rip + .LCPI0_8] # xmm13 = [4294967272,4294967272,4294967272,4294967272]
        movdqa  xmm14, xmmword ptr [rip + .LCPI0_9] # xmm14 = [4294967268,4294967268,4294967268,4294967268]
        movdqa  xmm15, xmmword ptr [rip + .LCPI0_10] # xmm15 = [4294967264,4294967264,4294967264,4294967264]
.LBB0_5:                                # =>This Inner Loop Header: Depth=1
        movdqa  xmm0, xmm6
        paddd   xmm0, xmmword ptr [rip + .LCPI0_3]
        pshufd  xmm7, xmm6, 245         # xmm7 = xmm6[1,1,3,3]
        pshufd  xmm3, xmm1, 245         # xmm3 = xmm1[1,1,3,3]
        pmuludq xmm3, xmm7
        pmuludq xmm1, xmm6
        pshufd  xmm7, xmm2, 245         # xmm7 = xmm2[1,1,3,3]
        pshufd  xmm4, xmm0, 245         # xmm4 = xmm0[1,1,3,3]
        pmuludq xmm4, xmm7
        pmuludq xmm0, xmm2
        movdqa  xmm2, xmm6
        paddd   xmm2, xmm9
        movdqa  xmm7, xmm6
        paddd   xmm7, xmm10
        pmuludq xmm1, xmm2
        pshufd  xmm2, xmm2, 245         # xmm2 = xmm2[1,1,3,3]
        pmuludq xmm2, xmm3
        pmuludq xmm0, xmm7
        pshufd  xmm3, xmm7, 245         # xmm3 = xmm7[1,1,3,3]
        pmuludq xmm3, xmm4
        movdqa  xmm4, xmm6
        paddd   xmm4, xmm11
        movdqa  xmm7, xmm6
        paddd   xmm7, xmm12
        pshufd  xmm5, xmm4, 245         # xmm5 = xmm4[1,1,3,3]
        pmuludq xmm5, xmm2
        pmuludq xmm4, xmm1
        pshufd  xmm8, xmm7, 245         # xmm8 = xmm7[1,1,3,3]
        pmuludq xmm8, xmm3
        pmuludq xmm7, xmm0
        movdqa  xmm0, xmm6
        paddd   xmm0, xmm13
        movdqa  xmm3, xmm6
        paddd   xmm3, xmm14
        pmuludq xmm4, xmm0
        pshufd  xmm1, xmm4, 232         # xmm1 = xmm4[0,2,2,3]
        pshufd  xmm0, xmm0, 245         # xmm0 = xmm0[1,1,3,3]
        pmuludq xmm0, xmm5
        pshufd  xmm0, xmm0, 232         # xmm0 = xmm0[0,2,2,3]
        punpckldq       xmm1, xmm0      # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
        pmuludq xmm7, xmm3
        pshufd  xmm2, xmm7, 232         # xmm2 = xmm7[0,2,2,3]
        pshufd  xmm0, xmm3, 245         # xmm0 = xmm3[1,1,3,3]
        pmuludq xmm0, xmm8
        pshufd  xmm0, xmm0, 232         # xmm0 = xmm0[0,2,2,3]
        punpckldq       xmm2, xmm0      # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
        paddd   xmm6, xmm15
        add     edx, 4
        jne     .LBB0_5
        test    eax, eax
        je      .LBB0_9
.LBB0_7:
        neg     eax
        movdqa  xmm3, xmmword ptr [rip + .LCPI0_3] # xmm3 = [4294967292,4294967292,4294967292,4294967292]
        movdqa  xmm4, xmmword ptr [rip + .LCPI0_4] # xmm4 = [4294967288,4294967288,4294967288,4294967288]
.LBB0_8:                                # =>This Inner Loop Header: Depth=1
        movdqa  xmm0, xmm6
        paddd   xmm0, xmm3
        pshufd  xmm5, xmm1, 245         # xmm5 = xmm1[1,1,3,3]
        pmuludq xmm1, xmm6
        pshufd  xmm1, xmm1, 232         # xmm1 = xmm1[0,2,2,3]
        pshufd  xmm7, xmm6, 245         # xmm7 = xmm6[1,1,3,3]
        pmuludq xmm7, xmm5
        pshufd  xmm5, xmm7, 232         # xmm5 = xmm7[0,2,2,3]
        punpckldq       xmm1, xmm5      # xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
        pshufd  xmm5, xmm2, 245         # xmm5 = xmm2[1,1,3,3]
        pmuludq xmm2, xmm0
        pshufd  xmm2, xmm2, 232         # xmm2 = xmm2[0,2,2,3]
        pshufd  xmm0, xmm0, 245         # xmm0 = xmm0[1,1,3,3]
        pmuludq xmm0, xmm5
        pshufd  xmm0, xmm0, 232         # xmm0 = xmm0[0,2,2,3]
        punpckldq       xmm2, xmm0      # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
        paddd   xmm6, xmm4
        inc     eax
        jne     .LBB0_8
.LBB0_9:
        pshufd  xmm0, xmm2, 245         # xmm0 = xmm2[1,1,3,3]
        pmuludq xmm2, xmm1
        pshufd  xmm2, xmm2, 232         # xmm2 = xmm2[0,2,2,3]
        pshufd  xmm1, xmm1, 245         # xmm1 = xmm1[1,1,3,3]
        pmuludq xmm1, xmm0
        pshufd  xmm0, xmm1, 232         # xmm0 = xmm1[0,2,2,3]
        punpckldq       xmm2, xmm0      # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
        pshufd  xmm0, xmm2, 78          # xmm0 = xmm2[2,3,0,1]
        pmuludq xmm0, xmm2
        pshufd  xmm0, xmm0, 232         # xmm0 = xmm0[0,2,2,3]
        pshufd  xmm2, xmm1, 10          # xmm2 = xmm1[2,2,0,0]
        pmuludq xmm2, xmm1
        pshufd  xmm1, xmm2, 232         # xmm1 = xmm2[0,2,2,3]
        punpckldq       xmm0, xmm1      # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
        pshufd  xmm1, xmm0, 229         # xmm1 = xmm0[1,1,2,3]
        pmuludq xmm1, xmm0
        movd    eax, xmm1
        cmp     ecx, edi
        je      .LBB0_12
        sub     edi, ecx
.LBB0_11:                               # =>This Inner Loop Header: Depth=1
        imul    eax, edi
        add     edi, -1
        jne     .LBB0_11
.LBB0_12:
        ret

Generated assembly at -01:

factorial(int, int):                         # @factorial(int, int)
        mov     eax, esi
        test    edi, edi
        je      .LBB0_2
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        imul    eax, edi
        add     edi, -1
        jne     .LBB0_1
.LBB0_2:
        ret
c++
recursion
optimization
clang
tail-recursion
asked on Stack Overflow Jun 17, 2019 by Bob Jansen • edited Jun 17, 2019 by Bob Jansen

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0