Why does the “mov” instruction cost so much time?

Question

Why does the “mov” instruction cost so much time?

I profiled my code with Linux perf, but the results are not easy to understand. Why does the mov instruction take so much time in the following code?

55.85 │      mov    %rdx,-0x48(%rbp)

Could anyone help with that?

 15.93 │      mov    -0x48(%rbp),%rax                                                                                                                                               
  0.00 │      mov    %rdx,%rcx                                                                                                                                                      
       │      mov    %rdx,-0x50(%rbp)                                                                                                                                               
  0.59 │      xor    %edx,%edx                                                                                                                                                      
  0.12 │      div    %rsi                                                                                                                                                           
  6.96 │      mov    %rcx,%rax                                                                                                                                                      
  0.00 │      not    %ecx                                                                                                                                                           
       │      shr    $0x3,%rax                                                                                                                                                      
  0.62 │      and    $0x7,%ecx                                                                                                         
  0.17 │      movzbl 0x20(%r14,%rax,1),%eax                                                                                                                                         
  55.85│      mov    %rdx,-0x48(%rbp)    <= here
  0.02 │      sar    %cl,%eax

The C code is here for reference:

/*
 * Fixed header followed by a byte vector that compute_result() probes
 * bit by bit through get_array().
 */
struct resource {
    uint64_t  magic_num;        /* not referenced by the code shown here */
    uint64_t  m;                /* modulus applied to each hash value before it indexes c_vector */
    uint64_t  k;                /* loop bound: number of rounds compute_result() runs */
    uint64_t  count;            /* not referenced by the code shown here */
    /*
     * Bit storage read via get_array(). Declared with one element but
     * indexed with values up to (m - 1) >> 3, so the object is presumably
     * over-allocated by whoever creates it -- TODO confirm against the
     * allocation site.
     */
    unsigned char c_vector[1];
};
/*
 * Test bit `n` of byte array `v`, counting bits from the most significant
 * bit of each byte (bit 0 is mask 0x80 of v[0], bit 7 is mask 0x01).
 * Yields the masked value: non-zero when the bit is set, 0 otherwise.
 * Note: `n` is evaluated twice, so pass an expression without side effects.
 */
#define get_array(v, n)    ((v)[(n) >> 3] & (0x80 >> ((n) & 0x7)))
/*
 * Run res->k rounds over `key`. Each round calls get_x64_result() with the
 * round number, reduces the two 64-bit outputs modulo res->m, and checks
 * the corresponding bits of res->c_vector.
 *
 * Returns 0 as soon as either bit of a round is clear, and 1 when every
 * round found both bits set (including the case res->k == 0).
 *
 * NOTE(review): get_x64_result() is defined elsewhere; it is presumably a
 * 128-bit hash seeded by the round number -- confirm.
 * NOTE(review): res->m == 0 would make the modulo undefined, and a res->k
 * above UINT32_MAX can never be reached by the 32-bit counter; callers
 * presumably guarantee neither happens -- confirm.
 */
int
compute_result(const struct resource *res, const void *key, size_t len)
{
    uint64_t  hash[2];
    uint32_t  round = 0;

    while (round < res->k) {
        get_x64_result(key, len, round, &hash);

        /* Reduce both halves before probing, as bit indices into c_vector. */
        hash[0] %= res->m;
        hash[1] %= res->m;

        if (get_array(res->c_vector, hash[0]) == 0 ||
            get_array(res->c_vector, hash[1]) == 0) {
            return 0;
        }
        round++;
    }

    return 1;
}

Maybe I could check whether both result and res->m are less than 0xffffffff and, if so, use a 32-bit div instead, but I am not sure that would help.

performance

assembly

x86-64

perf

asked on Stack Overflow Aug 2, 2019 by

skywalker-2020 • edited Aug 4, 2019 by

skywalker-2020

0 Answers

Nobody has answered this question yet.

User contributions licensed under CC BY-SA 3.0