I have implemented a program for the recurrence a[i] = a[i-1] + c, shown below. I use the macros begin_rdtsc and end_rdtsc to read and store the time-stamp counter (rdtsc) in order to measure the speedup.
The program is as follows; it uses x86intrin.h:
// Computes the recurrence a[i] = a[i-1] + c over a LEN-element array,
// 8 ints at a time, using 256-bit integer intrinsics from x86intrin.h.
#define MAX1 512
#define LEN MAX1*MAX1 // array size for time measurements
// NOTE(review): LEN expands without parentheses; that is harmless in `i<LEN-1`
// below, but an expression such as `x/LEN` would mis-expand.
// The array is 32-byte aligned because the loop below uses the aligned
// 256-bit load/store intrinsics on it.
int __attribute__(( aligned(32))) a[LEN];
int main(){
singleCore // macro (defined elsewhere) that assigns the program to a single core of the processor
int i, b, c;
begin_rdtsc // macro (defined elsewhere): reads and stores rdtsc at the start of the timed region
// b=1 and c=2 in this case
b = 1;
c = 2;
i = 0;
a[i++] = b;//0 --> a[0] = 1
// step 1:
// Compute the first 8 elements with scalar code. Each one is written as
// a[0] + k*c, so none of them depends on its immediate predecessor.
// The vectorization factor is 8 (eight 32-bit ints per 256-bit vector).
a[i++] = a[0] + 1*c; //1 --> a[1] = 1 + 2 = 3
a[i++] = a[0] + 2*c; //2 --> a[2] = 1 + 4 = 5
a[i++] = a[0] + 3*c; //3 --> a[3] = 1 + 6 = 7
a[i++] = a[0] + 4*c; //4 --> a[4] = 1 + 8 = 9
a[i++] = a[0] + 5*c; //5 --> a[5] = 1 + 10 = 11
a[i++] = a[0] + 6*c; //6 --> a[6] = 1 + 12 = 13
a[i++] = a[0] + 7*c; //7 --> a[7] = 1 + 14 = 15
// Vectorization factor reached: i == 8 here.
// From now on a[i] = a[i-8] + 8*c holds for every lane, so a single
// broadcast constant 8*c works for all of them.
__m256i dep1; // working vector: 8 consecutive elements of a
//__m256i dep2; // dep = { 1, 3, 5, 7, 9, 11, 13, 15 }
__m256i coeff = _mm256_set1_epi32(8*c); //coeff = { 16, 16, 16, 16, 16, 16, 16, 16 }
// step 2:
// Each iteration loads the previous 8 elements, adds 8*c to every lane and
// stores the next 8 elements. i takes the values 8, 16, ..., LEN-8, so the
// last store fills a[LEN-8..LEN-1].
for(; i<LEN-1; i+=8){
dep1 = _mm256_load_si256((__m256i *) &a[i-8]); // reloads what the previous iteration stored (step 1's results on the first pass)
dep1 = _mm256_add_epi32(dep1, coeff);
_mm256_store_si256((__m256i *) &a[i], dep1);
}
end_rdtsc // macro (defined elsewhere): reads and stores rdtsc at the end of the timed region
return 0;
}
I compiled this program with three different compilers: icc 18, gcc 7.2 and clang 4.
The OS is Fedora 27.
The CPU is a Core i7-6700HQ (Skylake).
The scalar implementation, compiled with icc -D _GNU_SOURCE -O3 -no-vec -march=native,
is the baseline for the speedup measurements.
The asm output for each compiler is as follows. Because the behavior of ICC is unexpected, I copied its entire output. The timed section of the C program is delimited in the assembly by the marker comments "#mmm...mmm1" and "#mmm...mmm2".
ICC
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 18.0.1.163 Build 20171018";
# mark_description "-D _GNU_SOURCE -O3 -no-vec -march=native -c -S -o AIC3iccnovec";
.file "AIC3.c"
.text
..TXTST0:
.L_2__routine_start_main_0:
# -- Begin main
.text
# mark_begin;
.align 16,0x90
.globl main
# --- main()
main:
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_main.1:
..L2:
#7.11
pushq %rbp #7.11
.cfi_def_cfa_offset 16
movq %rsp, %rbp #7.11
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-128, %rsp #7.11
subq $128, %rsp #7.11
xorl %esi, %esi #7.11
movl $3, %edi #7.11
call __intel_new_feature_proc_init #7.11
# LOE rbx r12 r13 r14 r15
..B1.21: # Preds ..B1.1
# Execution count [1.00e+00]
vstmxcsr (%rsp) #7.11
vpxor %ymm0, %ymm0, %ymm0 #9.2
orl $32832, (%rsp) #7.11
vldmxcsr (%rsp) #7.11
vmovups %ymm0, mask(%rip) #9.2
vmovups %ymm0, 32+mask(%rip) #9.2
vmovups %ymm0, 64+mask(%rip) #9.2
vmovups %ymm0, 96+mask(%rip) #9.2
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.21
# Execution count [5.00e-01]
xorl %edi, %edi #9.2
movl $128, %esi #9.2
movl $mask, %edx #9.2
orq $12, mask(%rip) #9.2
vzeroupper #9.2
..___tag_value_main.6:
# sched_setaffinity(__pid_t, size_t, const cpu_set_t *)
call sched_setaffinity #9.2
..___tag_value_main.7:
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
# Execution count [1.72e+00]
movq $0xdf84757ff, %rax #12.5
movq $.L_2__STRING.1, programName(%rip) #10.2
movq $100000000, elapsed_rdtsc(%rip) #12.5
movq %rax, overal_time(%rip) #12.5
movq $0, ttime(%rip) #12.5
vmovdqu .L_2il0floatpacket.2(%rip), %ymm0 #33.21
# LOE rbx r12 r13 r14 r15
..B1.4: # Preds ..B1.12 ..B1.3
# Execution count [2.91e+00]
# Begin ASM
# #mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm1
# End ASM
# LOE rbx r12 r13 r14 r15
# --- Start of the timed region (between the two "#mmm..." markers) ---
# Read the time-stamp counter and merge EDX:EAX into one 64-bit value in %rax.
..B1.23: # Preds ..B1.4
# Execution count [2.91e+00]
vzeroupper #12.5
rdtsc #12.5
shlq $32, %rdx #12.5
orq %rdx, %rax #12.5
# LOE rax rbx r12 r13 r14 r15
# Save the start timestamp, zero the iteration counter (%edx) and the byte
# offset into a (%rax), write the eight seed values a[0..7] = 1,3,...,15 as
# immediates, and load the constant vector .L_2il0floatpacket.2 (eight
# 32-bit 0x10 values, i.e. 8*c = 16) into %ymm1.
..B1.5: # Preds ..B1.23
# Execution count [2.62e+00]
movq %rax, t1_rdtsc(%rip) #12.5
xorl %edx, %edx #35.5
movl $1, a(%rip) #18.5
xorl %eax, %eax #35.5
movl $3, 4+a(%rip) #21.5
movl $5, 8+a(%rip) #21.5
movl $7, 12+a(%rip) #21.5
movl $9, 16+a(%rip) #21.5
movl $11, 20+a(%rip) #21.5
movl $13, 24+a(%rip) #21.5
movl $15, 28+a(%rip) #21.5
vmovdqu .L_2il0floatpacket.2(%rip), %ymm1 #35.5
# LOE rax rbx r12 r13 r14 r15 edx ymm1
# Vector loop: 2047 iterations, 32 bytes (8 ints) per iteration.
# Each iteration loads the 32 bytes at a+%rax from memory, adds %ymm1, and
# stores the sum at a+%rax+32. The load therefore reads exactly what the
# previous iteration stored, so the loop-carried dependency passes through a
# store followed by a reload instead of staying in a register.
# NOTE(review): 2047 iterations * 8 ints + 8 seed ints = 16384 elements, which
# matches `.comm a,65536,32` in this listing rather than the 512*512 in the C
# source; presumably this listing was built with a smaller MAX1 -- confirm.
..B1.6: # Preds ..B1.6 ..B1.5
# Execution count [4.29e+04]
vpaddd a(%rax), %ymm1, %ymm0 #38.16
incl %edx #35.5
vmovdqu %ymm0, 32+a(%rax) #39.41
addq $32, %rax #35.5
cmpl $2047, %edx #35.5
jb ..B1.6 # Prob 99% #35.5
# LOE rax rbx r12 r13 r14 r15 edx ymm1
# End of the timed work: read the time-stamp counter again into %rax.
..B1.7: # Preds ..B1.6
# Execution count [2.91e+00]
vzeroupper #46.5
rdtsc #46.5
shlq $32, %rdx #46.5
orq %rdx, %rax #46.5
# LOE rax rbx r12 r13 r14 r15
# Save the end timestamp.
..B1.8: # Preds ..B1.7
# Execution count [2.91e+00]
movq %rax, t2_rdtsc(%rip) #46.5
# LOE rbx r12 r13 r14 r15
..B1.26: # Preds ..B1.8
# Execution count [2.91e+00]
# Begin ASM
# #mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm2
# End ASM
# LOE rbx r12 r13 r14 r15
..B1.25: # Preds ..B1.26
# Execution count [2.91e+00]
movq t2_rdtsc(%rip), %rdx #46.5
subq t1_rdtsc(%rip), %rdx #46.5
movq ttbest_rdtsc(%rip), %rsi #46.5
movq %rdx, ttotal_rdtsc(%rip) #46.5
cmpq %rsi, %rdx #46.5
jge ..B1.10 # Prob 50% #46.5
# LOE rdx rbx rsi r12 r13 r14 r15
..B1.9: # Preds ..B1.25
# Execution count [1.45e+00]
movq elapsed_rdtsc(%rip), %rcx #46.5
movq %rcx, %rax #46.5
negq %rax #46.5
movq %rdx, %rsi #46.5
addq $100000000, %rax #46.5
movq %rdx, ttbest_rdtsc(%rip) #46.5
movq %rax, elapsed(%rip) #46.5
jmp ..B1.11 # Prob 100% #46.5
# LOE rdx rcx rbx rsi r12 r13 r14 r15
..B1.10: # Preds ..B1.25
# Execution count [1.45e+00]
movq elapsed_rdtsc(%rip), %rcx #46.5
# LOE rdx rcx rbx rsi r12 r13 r14 r15
..B1.11: # Preds ..B1.9 ..B1.10
# Execution count [2.91e+00]
movq ttime(%rip), %rax #46.5
addq %rdx, %rax #46.5
movq %rax, ttime(%rip) #46.5
testq %rcx, %rcx #46.5
je ..B1.14 # Prob 50% #46.5
# LOE rax rcx rbx rsi r12 r13 r14 r15
..B1.12: # Preds ..B1.11
# Execution count [1.45e+00]
decq %rcx #46.5
movq %rcx, elapsed_rdtsc(%rip) #46.5
cmpq overal_time(%rip), %rax #46.5
jl ..B1.4 # Prob 82% #46.5
jmp ..B1.15 # Prob 100% #46.5
# LOE rcx rbx rsi r12 r13 r14 r15
..B1.14: # Preds ..B1.11
# Execution count [1.45e+00]
movq $-1, elapsed_rdtsc(%rip) #46.5
movq $-1, %rcx #46.5
# LOE rcx rbx rsi r12 r13 r14 r15
..B1.15: # Preds ..B1.12 ..B1.14
# Execution count [1.00e+00]
negq %rcx #46.5
movl $.L_2__STRING.2, %edi #46.5
addq $100000000, %rcx #46.5
xorl %eax, %eax #46.5
movq elapsed(%rip), %rdx #46.5
..___tag_value_main.8:
# printf(const char *__restrict__, ...)
call printf #46.5
..___tag_value_main.9:
# LOE rbx r12 r13 r14 r15
..B1.16: # Preds ..B1.15
# Execution count [1.00e+00]
movl $.L_2__STRING.3, %edi #46.5
movl $.L_2__STRING.4, %esi #46.5
# fopen(const char *__restrict__, const char *__restrict__)
call fopen #46.5
# LOE rax rbx r12 r13 r14 r15
..B1.17: # Preds ..B1.16
# Execution count [1.00e+00]
movl $128, %ecx #46.5
movq %rax, %rdi #46.5
movq %rax, fileForSpeedups(%rip) #46.5
movl $.L_2__STRING.5, %esi #46.5
movl %ecx, %r8d #46.5
xorl %eax, %eax #46.5
movq programName(%rip), %rdx #46.5
movq ttbest_rdtsc(%rip), %r9 #46.5
# fprintf(FILE *__restrict__, const char *__restrict__, ...)
call fprintf #46.5
# LOE rbx r12 r13 r14 r15
..B1.18: # Preds ..B1.17
# Execution count [1.00e+00]
xorl %eax, %eax #47.9
movq %rbp, %rsp #47.9
popq %rbp #47.9
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #47.9
.align 16,0x90
# LOE
.cfi_endproc
# mark_end;
.type main,@function
.size main,.-main
..LNmain.0:
.data
# -- End main
.bss
.align 8
.align 8
.globl fileForSpeedups
fileForSpeedups:
.type fileForSpeedups,@object
.size fileForSpeedups,8
.space 8 # pad
.align 8
.globl ttime
ttime:
.type ttime,@object
.size ttime,8
.space 8 # pad
.data
.align 8
.align 8
.globl programName
programName:
.quad .L_2__STRING.0
.type programName,@object
.size programName,8
.align 8
.globl ttbest_rdtsc
ttbest_rdtsc:
.long 0x5d89ffff,0x01634578
.type ttbest_rdtsc,@object
.size ttbest_rdtsc,8
.align 8
.globl elapsed_rdtsc
elapsed_rdtsc:
.long 0x05f5e100,0x00000000
.type elapsed_rdtsc,@object
.size elapsed_rdtsc,8
.align 8
.globl overal_time
overal_time:
.long 0xf84757ff,0x0000000d
.type overal_time,@object
.size overal_time,8
.section .rodata, "a"
.align 32
.align 32
.L_2il0floatpacket.2:
.long 0x00000010,0x00000010,0x00000010,0x00000010,0x00000010,0x00000010,0x00000010,0x00000010
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,32
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
.L_2__STRING.1:
.long 860047681
.byte 0
.type .L_2__STRING.1,@object
.size .L_2__STRING.1,5
.space 3, 0x00 # pad
.align 4
.L_2__STRING.2:
.long 1701344266
.long 1936024096
.long 1936269428
.long 1819026720
.long 1852383332
.long 1819026720
.long 543716452
.long 1919251561
.long 1869182049
.long 1851859054
.long 1814372452
.long 1914725484
.long 1952804965
.long 1869182057
.long 684910
.type .L_2__STRING.2,@object
.size .L_2__STRING.2,60
.align 4
.L_2__STRING.3:
.long 1701603686
.long 1400008518
.long 1684366704
.long 7565429
.type .L_2__STRING.3,@object
.size .L_2__STRING.3,16
.align 4
.L_2__STRING.4:
.word 97
.type .L_2__STRING.4,@object
.size .L_2__STRING.4,2
.space 2, 0x00 # pad
.align 4
.L_2__STRING.5:
.long 539783973
.long 628646949
.long 622865508
.long 174353516
.byte 0
.type .L_2__STRING.5,@object
.size .L_2__STRING.5,17
.space 3, 0x00 # pad
.align 4
.L_2__STRING.0:
.word 32
.type .L_2__STRING.0,@object
.size .L_2__STRING.0,2
.data
.comm mask1,128,32
.comm t1_rdtsc,8,8
.comm t2_rdtsc,8,8
.comm ttotal_rdtsc,8,8
.comm elapsed,8,8
.comm mask,128,32
.comm a,65536,32
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End
GCC
//gcc -D _GNU_SOURCE -O3 -fno-tree-vectorize -fno-tree-slp-vectorize -march=native -c -S -o "AIC3" "AIC3.c"
# Excerpt of the timed region only: %r10, %r9, %r8, %rdi, %ymm2 and %rcx are
# set up outside this excerpt (presumably %ymm2 is the 8*c constant vector and
# %rcx the end address of the loop -- confirm against the full listing).
rdtsc
salq $32, %rdx
# Four 8-byte stores (interleaved with the timestamp merge) fill the first
# 32 bytes of a, i.e. a[0..7].
movq %r10, a(%rip)
orq %rdx, %rax
movq %r9, a+8(%rip)
movq %r8, a+16(%rip)
movq %rdi, a+24(%rip)
# Load a[0..7] into %ymm1 once, before the loop.
vmovdqa a(%rip), %ymm1
movq %rax, t1_rdtsc(%rip)
# %rax = address of a+32 (a[8]), the first vector the loop stores.
movl $a+32, %eax
# Loop: the running vector lives in %ymm1 and nothing is reloaded from a.
# Each iteration computes %ymm0 = %ymm1 + %ymm2, stores it at the current
# position (-32(%rax) after the pointer bump), and copies it back into %ymm1
# with a register-to-register move for the next iteration.
.p2align 4,,10
.p2align 3
.L2:
vpaddd %ymm1, %ymm2, %ymm0
addq $32, %rax
vmovdqa %ymm0, -32(%rax)
vmovdqa %ymm0, %ymm1
cmpq %rax, %rcx
jne .L2
rdtsc
Clang
//clang -D _GNU_SOURCE -O3 -fno-vectorize -fno-slp-vectorize -march=native -c -S -o "AIC3"clang "
# Excerpt of the timed region only: %r8, %r9, %r10, %rcx and %ymm0-%ymm7 are
# set up outside this excerpt (presumably %ymm0-%ymm7 hold successive
# multiples of the 8*c step -- confirm against the full listing).
rdtsc
shlq $32, %rdx
orq %rax, %rdx
movq %rdx, t1_rdtsc(%rip)
# Four 8-byte stores fill the first 32 bytes of a, i.e. a[0..7].
movq %r8, a(%rip)
movq %r9, a+8(%rip)
movq %r10, a+16(%rip)
movq %rcx, a+24(%rip)
# Load a[0..7] into %ymm8 once; %ymm8 is the running base vector.
vmovdqa a(%rip), %ymm8
# %rax is an element index (scaled by 4 in the addressing modes below).
# Starting at 64 makes the first store, a-224(,%rax,4), land on a+32 (a[8]).
movl $64, %eax
jmp .LBB0_2
.p2align 4, 0x90
# Back-edge block: the only loop-carried update. Advance the base vector
# (%ymm8 += %ymm7), store it as the eighth vector of the current group of 64
# elements, and move on to the next group.
.LBB0_9: # in Loop: Header=BB0_2 Depth=2
vpaddd %ymm7, %ymm8, %ymm8
vmovdqa %ymm8, a(,%rax,4)
addq $64, %rax
# Loop header: seven add/store pairs. Every vpaddd reads only the base vector
# %ymm8 plus one of %ymm0-%ymm6, so the seven adds do not depend on each other
# and nothing is reloaded from a.
.LBB0_2: # Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
vpaddd %ymm0, %ymm8, %ymm9
vmovdqa %ymm9, a-224(,%rax,4)
vpaddd %ymm1, %ymm8, %ymm9
vmovdqa %ymm9, a-192(,%rax,4)
vpaddd %ymm2, %ymm8, %ymm9
vmovdqa %ymm9, a-160(,%rax,4)
vpaddd %ymm3, %ymm8, %ymm9
vmovdqa %ymm9, a-128(,%rax,4)
vpaddd %ymm4, %ymm8, %ymm9
vmovdqa %ymm9, a-96(,%rax,4)
vpaddd %ymm5, %ymm8, %ymm9
vmovdqa %ymm9, a-64(,%rax,4)
vpaddd %ymm6, %ymm8, %ymm9
vmovdqa %ymm9, a-32(,%rax,4)
# Continue while the element index is below 16383; the exit path skips the
# eighth store of the final group.
cmpq $16383, %rax # imm = 0x3FFF
jl .LBB0_9
# BB#3: # in Loop: Header=BB0_1 Depth=1
rdtsc
The speedups are ~1.30, ~4.10 and 4.00 using icc, gcc and clang, respectively.
As I mentioned, I compiled the same code with different compilers and recorded the rdtsc values. The speedup for ICC is not what I expected. I used IACA to analyze the inner loop; the summarized output is:
-----------------------------------------------------
| compilers | icc | gcc | clang |
------------------------------------------------------
| Throughput |1.49 cycle |1.00 cycle |1.49 cycle |
------------------------------------------------------
| bottleneck | Front End | dependency | Front End |
------------------------------------------------------
UPDATE-0: I compared the code generated with and without the IACA marks. The reason IACA does not help in this case is that the two outputs are not the same. It seems that injecting the IACA marks prevents the compilers from performing some of their optimizations: with the marks, GCC generates the same code as ICC and Clang do. However, GCC's address calculation is more efficient from a throughput point of view. In summary, IACA cannot help for this code.
UPDATE-1: The perf output for each compiler
is as follows:
512*512
ICC:
86.06 │loop: vpaddd 0x604580(%rax),%ymm1,%ymm0
0.17 │ inc %edx
4.73 │ vmovdq %ymm0,0x6045a0(%rax)
│ add $0x20,%rax
│ cmp $0x7fff,%edx
8.98 │ jb loop
GCC:
30.62 │loop: vpaddd %ymm1,%ymm2,%ymm0
15.12 │ add $0x20,%rax
46.03 │ vmovdq %ymm0,-0x20(%rax)
2.40 │ vmovdq %ymm0,%ymm1
0.01 │ cmp %rax,%rcx
5.62 │ jne loop
LLVM:
3.00 │loop: vpaddd %ymm0,%ymm7,%ymm8
6.61 │ vmovdq %ymm8,0x6020e0(,%rax,4)
15.96 │ vpaddd %ymm1,%ymm7,%ymm8
5.19 │ vmovdq %ymm8,0x602100(,%rax,4)
1.89 │ vpaddd %ymm2,%ymm7,%ymm8
6.16 │ vmovdq %ymm8,0x602120(,%rax,4)
13.25 │ vpaddd %ymm3,%ymm7,%ymm8
8.01 │ vmovdq %ymm8,0x602140(,%rax,4)
2.10 │ vpaddd %ymm4,%ymm7,%ymm8
5.37 │ vmovdq %ymm8,0x602160(,%rax,4)
13.92 │ vpaddd %ymm5,%ymm7,%ymm8
7.95 │ vmovdq %ymm8,0x602180(,%rax,4)
0.89 │ vpaddd %ymm6,%ymm7,%ymm7
4.34 │ vmovdq %ymm7,0x6021a0(,%rax,4)
2.82 │ add $0x38,%rax
│ cmp $0x3ffff,%rax
2.24 │ jl loop
The ICC assembly output shows that there are some SIMD instructions inside the region timed with rdtsc.
I do not know whether I am missing something or whether something is wrong. I have spent a lot of time trying to understand the problem, without success. If somebody knows the reason, please help me.
Thanks in advance.
The different compilers actually use fairly different implementation strategies here.
GCC notices that it never has to re-load a[i-8],
which was calculated in the previous iteration and can therefore be sourced from a register. This relies somewhat on mov-elimination; otherwise the reg-reg move would still add some latency, though even without mov-elimination it would be a lot faster than reloading every time.
ICC's codegen is very naive: it does it exactly the way you wrote it. The store/reload adds quite a lot of latency.
Clang does approximately the same thing as GCC, but unrolls by 8 (minus the first iteration). Clang often likes to unroll more. I'm not sure why it's slightly worse than what GCC does.
You can avoid the reloading by explicitly not doing it in the first place (not tested):
// Load the first 8 elements once, before the loop. This assumes a[0..7] have
// already been written by the scalar step and that i == 8 on entry.
dep1 = _mm256_load_si256((__m256i *) &a[0]);
for(; i<LEN-1; i+=8){
// dep1 still holds the vector stored by the previous iteration (or a[0..7]
// on the first pass), so a[i-8] never has to be read back from memory.
dep1 = _mm256_add_epi32(dep1, coeff);
_mm256_store_si256((__m256i *) &a[i], dep1);
}
User contributions licensed under CC BY-SA 3.0