I think code tells everything:
➜ project cat branch_prediction_victim.cpp
#include <algorithm>#include <ctime>#include <iostream>int main(int argc, char* argv[]){ const size_t array_length = 32768; int array_aka[array_length]; std::srand(std::time(nullptr)); for (size_t i = 0; i < array_length; ++i) array_aka[i] = std::rand() % 256; // !!! With this, the next loop runs faster. //std::sort(array_aka, array_aka + array_length); clock_t start; long long sum; double elapsed_seconds; sum = 0; start = clock(); for (int i = 0; i < 100000; ++i) for (size_t j = 0; j < array_length; ++j) if (array_aka[j] >= 128) sum += array_aka[j]; elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << "unsorted(>= 128): \t"<< elapsed_seconds << std::endl; std::cout << "sum = \t"<< sum << std::endl; sum = 0; start = clock(); for (int i = 0; i < 100000; ++i) for (size_t j = 0; j < array_length; ++j) if (array_aka[j] >= 64) sum += array_aka[j]; elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << "unsorted(>= 64): \t"<< elapsed_seconds << std::endl; std::cout << "sum = \t"<< sum << std::endl; sum = 0; start = clock(); for (int i = 0; i < 100000; ++i) for (size_t j = 0; j < array_length; ++j) if (__builtin_expect((array_aka[j] >= 64), 1)) sum += array_aka[j]; elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << "unsorted(>= 64)(hint):\t"<< elapsed_seconds << std::endl; std::cout << "sum = \t"<< sum << std::endl; std::sort(array_aka, array_aka + array_length); sum = 0; start = clock(); for (int i = 0; i < 100000; ++i) for (size_t j = 0; j < array_length; ++j) if (array_aka[j] >= 128) sum += array_aka[j]; elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << "sorted(>= 128): \t"<< elapsed_seconds << std::endl; std::cout << "sum = \t"<< sum << std::endl; sum = 0; start = clock(); for (int i = 0; i < 100000; ++i) for (size_t j = 0; j < array_length; ++j) if (array_aka[j] >= 64) sum += array_aka[j]; elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << "sorted(>= 64): \t"<< elapsed_seconds << std::endl; std::cout << "sum = \t"<< sum << std::endl; sum = 0; start = clock(); for (int i = 0; i < 100000; ++i) for (size_t j = 0; j < array_length; ++j) if (__builtin_expect((array_aka[j] >= 64), 1)) sum += array_aka[j]; elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << "sorted(>= 64)(hint): \t"<< elapsed_seconds << std::endl; std::cout << "sum = \t"<< sum << std::endl;}
➜ project g++ -o run branch_prediction_victim.cpp && ./run
unsorted(>= 128): 19.3221sum = 312708500000unsorted(>= 64): 13.5077 <-- branch prediction friendly there is majoritysum = 391463200000unsorted(>= 64)(hint): 13.6219 <-- __builtin_expect has no effectsum = 391463200000sorted(>= 128): 6.38736 <-- branch prediction friendly with sorted arraysum = 312708500000sorted(>= 64): 6.35014sum = 391463200000sorted(>= 64)(hint): 6.18703sum = 391463200000
➜ project g++ -O3 -o run branch_prediction_victim.cpp && ./run
unsorted(>= 128): 0.787082sum = 314110000000unsorted(>= 64): 0.777937sum = 392066300000unsorted(>= 64)(hint): 0.783885sum = 392066300000sorted(>= 128): 0.782302sum = 314110000000sorted(>= 64): 0.782116sum = 392066300000sorted(>= 64)(hint): 0.785842sum = 392066300000
As we can see, 1. with -O3
we need neither make majority
nor sorted array
, 2. __builtin_expect
has no use here. It seems we need to check out assembly for the reasons
➜ project g++ -O3 -Wa,-a -g branch_prediction_victim.cpp > branch_prediction_victim.S
834 .L74: 835 0080 660FEFD2 pxor %xmm2, %xmm2 836 0084 4889EB movq %rbp, %rbx 837 .LVL108: 838 0087 660F1F84 .p2align 4,,10 838 00000000 838 00 839 .p2align 3 840 .L73: 841 .LBB547: 842 .LBB548: 22:branch_prediction_victim.cpp **** for (int i = 0; i < 100000; ++i) 23:branch_prediction_victim.cpp **** for (size_t j = 0; j < array_length; ++j) 24:branch_prediction_victim.cpp **** if (array_aka[j] >= 128) 843 .loc 5 24 0 844 0090 660F6F03 movdqa (%rbx), %xmm0 845 0094 4883C310 addq $16, %rbx 846 0098 4939DC cmpq %rbx, %r12 847 009b 660F6FC8 movdqa %xmm0, %xmm1 848 009f 660F66CC pcmpgtd %xmm4, %xmm1 849 00a3 660FDBC1 pand %xmm1, %xmm0 850 00a7 660F6FCD movdqa %xmm5, %xmm1 851 00ab 660F6FD8 movdqa %xmm0, %xmm3 852 00af 660F66C8 pcmpgtd %xmm0, %xmm1 853 00b3 660F62D9 punpckldq %xmm1, %xmm3 854 00b7 660F6AC1 punpckhdq %xmm1, %xmm0 855 00bb 660FD4D3 paddq %xmm3, %xmm2 856 00bf 660FD4D0 paddq %xmm0, %xmm2 857 00c3 75CB jne .L73 858 00c5 660F6FC2 movdqa %xmm2, %xmm0 859 00c9 660F73D8 psrldq $8, %xmm0 859 08 860 00ce 660FD4C2 paddq %xmm2, %xmm0 861 00d2 66480F7E movq %xmm0, %rax 861 C0 862 00d7 4901C6 addq %rax, %r14 863 .LVL109: 864 .LBE548: 22:branch_prediction_victim.cpp **** for (int i = 0; i < 100000; ++i) 865 .loc 5 22 0 866 00da 83EA01 subl $1, %edx 867 00dd 75A1 jne .L74 868 00df 0F292424 movaps %xmm4, (%rsp) 869 .LBE547: 25:branch_prediction_victim.cpp **** sum += array_aka[j];
I also checked the assembly of the without -O3
174 .LBB3: 22:branch_prediction_victim.cpp **** for (int i = 0; i < 100000; ++i) 175 .loc 3 22 0 176 00a4 C78580FF movl $0, -131200(%rbp) 176 FDFF0000 176 0000 177 .L15: 178 .loc 3 22 0 is_stmt 0 discriminator 1 179 00ae 81BD80FF cmpl $99999, -131200(%rbp) 179 FDFF9F86 179 0100 180 00b8 7F55 jg .L11 181 .LBB4: 182 .LBB5: 23:branch_prediction_victim.cpp **** for (size_t j = 0; j < array_length; ++j) 183 .loc 3 23 0 is_stmt 1 184 00ba 48C785A8 movq $0, -131160(%rbp) 184 FFFDFF00 184 000000 185 .L14: 186 .loc 3 23 0 is_stmt 0 discriminator 1 187 00c5 4881BDA8 cmpq $32767, -131160(%rbp) 187 FFFDFFFF 187 7F0000 188 00d0 7734 ja .L12 24:branch_prediction_victim.cpp **** if (array_aka[j] >= 128) 189 .loc 3 24 0 is_stmt 1 190 00d2 488B85A8 movq -131160(%rbp), %rax 190 FFFDFF 191 00d9 8B8485F0 movl -131088(%rbp,%rax,4), %eax 191 FFFDFF 192 00e0 83F87F cmpl $127, %eax 193 00e3 7E17 jle .L13 25:branch_prediction_victim.cpp **** sum += array_aka[j]; 194 .loc 3 25 0 195 00e5 488B85A8 movq -131160(%rbp), %rax 195 FFFDFF 196 00ec 8B8485F0 movl -131088(%rbp,%rax,4), %eax 196 FFFDFF 197 00f3 4898 cltq 198 00f5 480185A0 addq %rax, -131168(%rbp) 198 FFFDFF 199 .L13: 23:branch_prediction_victim.cpp **** for (size_t j = 0; j < array_length; ++j) 200 .loc 3 23 0 discriminator 2 201 00fc 488385A8 addq $1, -131160(%rbp) 201 FFFDFF01 202 0104 EBBF jmp .L14 203 .L12: 204 .LBE5: 205 .LBE4: 22:branch_prediction_victim.cpp **** for (size_t j = 0; j < array_length; ++j) 206 .loc 3 22 0 discriminator 2 207 0106 838580FF addl $1, -131200(%rbp) 207 FDFF01 208 010d EB9F jmp .L15 209 .L11: 210 .LBE3:
there is too much difference
- what is the logic here with so much
xmm
things? - why such things can speed up performance?
- It seems
__builtin_expect
hardly has any effect, so why there islikely
aka__builtin_expect
everywhere in Linux kernel? - without
-O3
, we can seefor (0..100k)
andfor (0..arr_len)
two loops they are separate, I'm sure there is100k * arr_len
loops then. - For the
-O3
versionfor (0..100k)
andfor (0..arr_len)
are together, so there is only100k
loops?