Why is calling snprintf() so slow?

3

Our internal program is written in C makes extensive use of snprintf() for many piece, and I noticed that during debugging with perf record/report, it's spending a lot of time on the following:

       │      _IO_vfprintf_internal():                                                                                                                                                                                             ▒
       │        mov    -0x510(%rbp),%rdx                                                                                                                                                                                           ▒
       │        mov    %r12,%rsi                                                                                                                                                                                                   ▒
       │        mov    %r15,%rdi                                                                                                                                                                                                   ▒
       │      → callq  *0x38(%rax)                                                                                                                                                                                                 ▒
       │        cmp    %rax,-0x510(%rbp)                                                                                                                                                                                           ▒
       │        mov    -0x530(%rbp),%r9                                                                                                                                                                                            ▒
       │      ↑ jne    91a                                                                                                                                                                                                         ▒
       │        mov    -0x4d0(%rbp),%esi                                                                                                                                                                                           ▒
       │        mov    -0x540(%rbp),%ecx                                                                                                                                                                                           ▒
       │        mov    $0x7fffffff,%eax                                                                                                                                                                                            ▒
       │        sub    %esi,%eax                                                                                                                                                                                                   ▒
       │        add    %esi,%ecx                                                                                                                                                                                                   ▒
       │        cltq                                                                                                                                                                                                               ▒
       │        cmp    %rax,-0x510(%rbp)                                                                                                                                                                                           ▒
       │      ↑ jbe    252b                                                                                                                                                                                                        ▒
       │      ↑ jmpq   28f0                                                                                                                                                                                                        ▒
       │4a70:   xor    %eax,%eax                                                                                                                                                                                                   ▒
       │        or     $0xffffffffffffffff,%rcx                                                                                                                                                                                    ▒
       │        mov    %r12,%rdi                                                                                                                                                                                                   ▒
 99.52 │        repnz  scas %es:(%rdi),%al

It seems regardless of the GCC version/glibc, I've tried GCC 4.8.5 all the way to GCC 9, O2 to O3, etc. I've even used a sandbox with glibc from -HEAD.

Edit: I found a test program that demonstrates this problem quite well with perf:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

char dst[20000];

static void test_strcpy(int i) {
    size_t len = sizeof(dst);
    char* src = (char*)malloc(len + 1);
    memset(src, 'a', len);
    src[len] = 0;

    while (i--)
        strcpy(dst, src);

    free(src);
}

static void test_strncat(int i) {
    size_t len = sizeof(dst);
    char* src = (char*)malloc(len + 1);
    memset(src, 'a', len);
    src[len] = 0;

    while (i--) {
    dst[0] = 0;
        strncat(dst, src, sizeof(dst));
    }

    free(src);
}

#define BENCH(FN, ARG)                          \
do {                                    \
    struct timeval start, end;                      \
    long sec, msec;                         \
    gettimeofday(&start, NULL);                     \
    FN(ARG);                                \
    gettimeofday(&end, NULL);                       \
    sec = end.tv_sec - start.tv_sec;                    \
    msec = end.tv_usec - start.tv_usec;                 \
    if (msec < 0) {                         \
    --sec;                              \
    msec += 1000000;                        \
    }                                   \
    printf("%13s(%5d): %2ld sec %6ld msec\n", #FN, ARG, sec, msec); \
} while (0)

static void test_snprintf(int i) {
    size_t len = sizeof(dst);
    char* src = (char*)malloc(len + 1);
    memset(src, 'a', len);
    src[len] = 0;

    while (i--)
        snprintf(dst, sizeof(dst), "%s", src);

    free(src);
}

int main(int argc, char *argv[]) {
int i;
for (i = 0; i < 10; i++) {
    BENCH(test_strcpy, 10);
    BENCH(test_strcpy, 100);
    BENCH(test_strcpy, 1000);
    BENCH(test_strcpy, 10000);

    BENCH(test_strncat, 10);
    BENCH(test_strncat, 100);
    BENCH(test_strncat, 1000);
    BENCH(test_strncat, 10000);

    BENCH(test_snprintf, 10);
    BENCH(test_snprintf, 100);
    BENCH(test_snprintf, 1000);
    BENCH(test_snprintf, 10000);
    printf("\n");
}
    return 0;
}

It seems as if renz scas is the culprit, regardless of the string size. I've read that renz scas shouldn't be used for small strings because the setup cost on silicon is quite high.

c
performance
assembly
gcc
x86
asked on Stack Overflow Oct 29, 2020 by user1016031 • edited Oct 29, 2020 by user1016031

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0