I profiled my code with Linux perf, but the results are hard to interpret. Why does the mov instruction account for so much time in the following code?
55.85 │ mov %rdx,-0x48(%rbp)
Could anyone help with that?
15.93 │ mov -0x48(%rbp),%rax
0.00 │ mov %rdx,%rcx
│ mov %rdx,-0x50(%rbp)
0.59 │ xor %edx,%edx
0.12 │ div %rsi
6.96 │ mov %rcx,%rax
0.00 │ not %ecx
│ shr $0x3,%rax
0.62 │ and $0x7,%ecx
0.17 │ movzbl 0x20(%r14,%rax,1),%eax
55.85│ mov %rdx,-0x48(%rbp) <= here
0.02 │ sar %cl,%eax
The C code is here for reference:
/*
 * Header of the bit-vector structure that compute_result() queries.
 * NOTE(review): the m / k / bit-array layout looks like a Bloom filter;
 * confirm against the code that builds this structure.
 */
struct resource {
    uint64_t magic_num;        /* not read by compute_result() */
    uint64_t m;                /* modulus applied to every hash value before it is used as a bit index */
    uint64_t k;                /* number of hash rounds compute_result() performs per lookup */
    uint64_t count;            /* not read by compute_result() */
    unsigned char c_vector[1]; /* start of the bit array; get_array() indexes beyond [0], so the
                                  object is presumably over-allocated -- TODO confirm with the allocator */
};
/*
 * Test bit n of the byte array v. Bits are numbered MSB-first inside each
 * byte, i.e. bit 0 is the 0x80 bit of v[0] and bit 7 is the 0x01 bit.
 * Expands to a non-zero int (the isolated mask bit) when the bit is set and
 * to 0 otherwise. n is expanded twice, so it must not have side effects.
 */
#define get_array(v, n) ((v)[(n) >> 3] & (0x80 >> ((n) & 0x7)))
/*
 * Check whether every bit selected by key is set in res->c_vector.
 *
 * For each round i in [0, res->k), get_x64_result() fills two 64-bit values
 * for (key, len, i); each one is reduced modulo res->m and used as a bit
 * index into res->c_vector through get_array().
 *
 * res: structure holding m, k and the bit array; must not be NULL.
 * key: passed through unchanged to get_x64_result().
 * len: passed through unchanged to get_x64_result().
 *
 * Returns 1 when all probed bits are set (including the case res->k == 0),
 * and 0 as soon as one probed bit is clear. Also returns 0 when res->m is 0
 * while res->k is non-zero, because no bit can be addressed then.
 */
int
compute_result(const struct resource *res, const void *key, size_t len)
{
    /*
     * Read the parameters once. get_x64_result() is an opaque call, so
     * without the locals the compiler may have to reload res->m and res->k
     * from memory on every iteration.
     */
    const uint64_t m = res->m;
    const uint64_t k = res->k;
    uint64_t result[2];
    uint32_t i;

    /* A zero modulus would make the % below divide by zero (undefined behavior). */
    if (m == 0 && k != 0) {
        return 0;
    }

    for (i = 0; i < k; i++) {
        uint64_t bit;

        get_x64_result(key, len, i, &result);

        /*
         * get_array() expands its index twice, so reduce into a local first.
         * The second 64-bit modulo is only paid for when the first probe hits.
         */
        bit = result[0] % m;
        if (!get_array(res->c_vector, bit)) {
            return 0;
        }

        bit = result[1] % m;
        if (!get_array(res->c_vector, bit)) {
            return 0;
        }
    }
    return 1;
}
Maybe I could check whether both result and res->m are below 0xffffffff and use a 32-bit div in that case, but I am not sure whether that would help.
User contributions licensed under CC BY-SA 3.0