GCC -O3 generates strange assembly for std::uninitialized_default_construct

1

I'm looking at the assembly generated by GCC 9.1 for the new (C++17) function std::uninitialized_default_construct. With most compilers (MSVC, Clang, even older GCC versions) it is what you would expect: a loop that calls (or inlines) the default constructor of the type T.

For some reason, GCC versions from 7 up to the current one, when run with -O3, generate assembly that I do not fully understand, especially the part at the beginning that looks like hashing.

I'm compiling with "-O3 -std=c++17 -fno-stack-protector -fno-exceptions -fomit-frame-pointer -fno-unroll-loops". Am I missing something here?

I tested the same code on various compilers, and none of the others behaves this strangely. I could not find any explanation for the hash-like computation at the start of the function.

Test code (run via godbolt.org):

#include <iostream>
#include <algorithm>
#include <memory>
#include <string>
#include <stdio.h>

// Three-component float vector used as the element type of the test.
// The default member initializers make the default constructor non-trivial:
// constructing a Vector3 must store zero into all three fields.
struct Vector3 {
    float x = 0.0f;
    float y = 0.0f;
    float z = 0.0f;
};

// Variant under test: constructs `len` objects of type T in the storage
// starting at `mem` by passing the range [mem, mem + len) to
// std::uninitialized_default_construct.
//
// mem  pointer to the first slot of the destination storage
// len  number of T objects to construct
//
// Always returns 0.
template< typename T >
static int init(T* mem, size_t len)
{
    // The standard algorithm default-initializes every element of the range
    // (as if by `::new (p) T;`), in contrast to init2, which writes `T()`.
    std::uninitialized_default_construct(mem, mem+len);     
    return 0;
}

// Hand-written reference variant: walks the `len` slots starting at `mem`
// and value-initializes a T in each one with placement new.
//
// mem  pointer to the first slot of the destination storage
// len  number of T objects to construct
//
// Always returns 0.
template <typename T>
static int init2(T* mem, size_t len)
{
    for (T* const last = mem + len; mem < last; ++mem) {
        new (mem) T();
    }
    return 0;
}

// Test driver.
//
// Usage: <program> <count> [extra]
//   count  number of Vector3 objects to construct, 0..100
//   extra  any additional argument selects init<Vector3>; without it
//          init2<Vector3> is used
//
// Prints mem[0].x and returns 0 on success. Returns -1 when the count
// argument is missing or out of range, or when the selected routine reports
// a non-zero status.
int main(int argc, char** argv)
{
    constexpr int kCapacity = 100;
    Vector3 mem[kCapacity];

    // argv[1] is a null pointer when no argument was supplied; passing it to
    // atoi would be undefined behaviour.
    if (argc < 2)
        return -1;

    // Reject counts that would run past the end of `mem`. A negative value
    // would otherwise be converted to a huge size_t.
    const int len = atoi(argv[1]);
    if (len < 0 || len > kCapacity)
        return -1;

    // The routine is chosen at run time through a function pointer,
    // presumably so the compiler has to emit both functions out of line.
    auto funcPtr = (argc > 2) ? &init<Vector3> : &init2<Vector3>;

    if (0 != funcPtr(mem, static_cast<size_t>(len)))
        return -1;
    printf("%f\n", mem[0].x);
    return 0;
}

Generated disassembly for function "init" (brace yourself):

# GCC -O3 output. On entry rdi = mem and rsi = len (see the address
# arithmetic below); sizeof(Vector3) is 12 bytes.
int init<Vector3>(Vector3*, unsigned long):
        # rcx = mem + len*12, i.e. the end pointer.
        lea     rax, [rsi+rsi*2]
        lea     rcx, [rdi+rax*4]
        # Empty range: nothing to construct.
        cmp     rcx, rdi
        je      .L2
        # --- Trip count, not hashing -------------------------------------
        # 3074457345618258603 = 0x2AAAAAAAAAAAAAAB is the multiplicative
        # inverse of 3 modulo 2^62 (3 * 0x2AAAAAAAAAAAAAAB = 2^63 + 1).
        movabs  rsi, 3074457345618258603
        mov     rdx, rcx
        # rax = cursor used by the vector loop.
        mov     rax, rdi
        # rdx = (end - mem) - 12 bytes ...
        sub     rdx, rdi
        sub     rdx, 12
        # ... divided by 4 with a shift ...
        shr     rdx, 2
        # ... then exactly divided by 3 by multiplying with the inverse and
        # keeping the low 62 bits (4611686018427387903 = 2^62 - 1).
        imul    rdx, rsi
        movabs  rsi, 4611686018427387903
        and     rdx, rsi
        # rdx = element count - 1, rsi = element count.
        lea     rsi, [rdx+1]
        # Three elements or fewer: skip the vector loop, use the scalar tail.
        cmp     rdx, 2
        jbe     .L3
        # rdx = mem + (count / 4) * 48 = end of the part handled in groups
        # of four elements (4 * 12 bytes = 48 bytes = three 16-byte stores).
        mov     rdx, rsi
        pxor    xmm0, xmm0
        shr     rdx, 2
        lea     rdx, [rdx+rdx*2]
        sal     rdx, 4
        add     rdx, rdi
.L4:
        # Vectorised body: zero 48 bytes (four Vector3) per iteration.
        movups  XMMWORD PTR [rax], xmm0
        add     rax, 48
        movups  XMMWORD PTR [rax-32], xmm0
        movups  XMMWORD PTR [rax-16], xmm0
        cmp     rax, rdx
        jne     .L4
        # rax = count rounded down to a multiple of 4; advance rdi past the
        # elements already written (rax * 12 bytes).
        mov     rax, rsi
        and     rax, -4
        lea     rdx, [rax+rax*2]
        lea     rdi, [rdi+rdx*4]
        # No remainder: done.
        cmp     rsi, rax
        je      .L2
.L3:
        # Scalar tail: up to three remaining elements, each zeroed with one
        # 8-byte and one 4-byte store, checking against the end pointer (rcx)
        # after the first and second element.
        lea     rax, [rdi+12]
        mov     QWORD PTR [rdi], 0
        mov     DWORD PTR [rdi+8], 0x00000000
        cmp     rcx, rax
        je      .L2
        lea     rax, [rdi+24]
        mov     QWORD PTR [rdi+12], 0
        mov     DWORD PTR [rdi+20], 0x00000000
        cmp     rcx, rax
        je      .L2
        mov     QWORD PTR [rdi+24], 0
        mov     DWORD PTR [rdi+32], 0x00000000
.L2:
        # return 0
        xor     eax, eax
        ret

For comparison, this is what MSVC generates for the same code:

; MSVC output. On entry rcx = mem and rdx = len (see the address arithmetic
; below); sizeof(Vector3) is 12 bytes.
int init<Vector3>(Vector3 *,unsigned __int64) PROC         ; init<Vector3>, COMDAT
        ; rdx = mem + len*12, i.e. the end pointer.
        lea     rax, QWORD PTR [rdx+rdx*2]
        lea     rdx, QWORD PTR [rcx+rax*4]
        ; eax = 0 serves both as the value stored into each element and as
        ; the return value.
        xor     eax, eax
        ; Empty range: nothing to construct.
        cmp     rcx, rdx
        je      SHORT $LN5@init
        npad    1
$LL6@init:
        ; One element per iteration: an 8-byte and a 4-byte zero store, then
        ; advance the cursor by 12 bytes until it reaches the end pointer.
        mov     QWORD PTR [rcx], rax
        mov     DWORD PTR [rcx+8], eax
        add     rcx, 12
        cmp     rcx, rdx
        jne     SHORT $LL6@init
$LN5@init:
        ret     0

Is there a way to make GCC behave properly?

c++
gcc
assembly
optimization
x86-64
asked on Stack Overflow Sep 13, 2019 by RexDex • edited Sep 13, 2019 by Jester

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0