Our internal program is written in C makes extensive use of snprintf() for many piece, and I noticed that during debugging with perf record/report, it's spending a lot of time on the following:
│ _IO_vfprintf_internal(): ▒
│ mov -0x510(%rbp),%rdx ▒
│ mov %r12,%rsi ▒
│ mov %r15,%rdi ▒
│ → callq *0x38(%rax) ▒
│ cmp %rax,-0x510(%rbp) ▒
│ mov -0x530(%rbp),%r9 ▒
│ ↑ jne 91a ▒
│ mov -0x4d0(%rbp),%esi ▒
│ mov -0x540(%rbp),%ecx ▒
│ mov $0x7fffffff,%eax ▒
│ sub %esi,%eax ▒
│ add %esi,%ecx ▒
│ cltq ▒
│ cmp %rax,-0x510(%rbp) ▒
│ ↑ jbe 252b ▒
│ ↑ jmpq 28f0 ▒
│4a70: xor %eax,%eax ▒
│ or $0xffffffffffffffff,%rcx ▒
│ mov %r12,%rdi ▒
99.52 │ repnz scas %es:(%rdi),%al
It seems regardless of the GCC version/glibc, I've tried GCC 4.8.5 all the way to GCC 9, O2 to O3, etc. I've even used a sandbox with glibc from -HEAD.
Edit: I found a test program that demonstrates this problem quite well with perf:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
char dst[20000];
static void test_strcpy(int i) {
size_t len = sizeof(dst);
char* src = (char*)malloc(len + 1);
memset(src, 'a', len);
src[len] = 0;
while (i--)
strcpy(dst, src);
free(src);
}
static void test_strncat(int i) {
size_t len = sizeof(dst);
char* src = (char*)malloc(len + 1);
memset(src, 'a', len);
src[len] = 0;
while (i--) {
dst[0] = 0;
strncat(dst, src, sizeof(dst));
}
free(src);
}
#define BENCH(FN, ARG) \
do { \
struct timeval start, end; \
long sec, msec; \
gettimeofday(&start, NULL); \
FN(ARG); \
gettimeofday(&end, NULL); \
sec = end.tv_sec - start.tv_sec; \
msec = end.tv_usec - start.tv_usec; \
if (msec < 0) { \
--sec; \
msec += 1000000; \
} \
printf("%13s(%5d): %2ld sec %6ld msec\n", #FN, ARG, sec, msec); \
} while (0)
static void test_snprintf(int i) {
size_t len = sizeof(dst);
char* src = (char*)malloc(len + 1);
memset(src, 'a', len);
src[len] = 0;
while (i--)
snprintf(dst, sizeof(dst), "%s", src);
free(src);
}
int main(int argc, char *argv[]) {
int i;
for (i = 0; i < 10; i++) {
BENCH(test_strcpy, 10);
BENCH(test_strcpy, 100);
BENCH(test_strcpy, 1000);
BENCH(test_strcpy, 10000);
BENCH(test_strncat, 10);
BENCH(test_strncat, 100);
BENCH(test_strncat, 1000);
BENCH(test_strncat, 10000);
BENCH(test_snprintf, 10);
BENCH(test_snprintf, 100);
BENCH(test_snprintf, 1000);
BENCH(test_snprintf, 10000);
printf("\n");
}
return 0;
}
It seems as if renz scas is the culprit, regardless of the string size. I've read that renz scas shouldn't be used for small strings because the setup cost on silicon is quite high.
User contributions licensed under CC BY-SA 3.0