Quantcast
Channel: Active questions tagged gcc - Stack Overflow
Viewing all articles
Browse latest Browse all 22461

How does `g++ -O3` generated `xmm` things affect branch prediction for unsorted array?

$
0
0

I think code tells everything:

➜ project cat branch_prediction_victim.cpp

#include <algorithm>#include <ctime>#include <iostream>int main(int argc, char* argv[]){    const size_t array_length = 32768;    int array_aka[array_length];    std::srand(std::time(nullptr));    for (size_t i = 0; i < array_length; ++i)        array_aka[i] = std::rand() % 256;    // !!! With this, the next loop runs faster.    //std::sort(array_aka, array_aka + array_length);    clock_t start;    long long sum;    double elapsed_seconds;    sum = 0;    start = clock();    for (int i = 0; i < 100000; ++i)        for (size_t j = 0; j < array_length; ++j)            if (array_aka[j] >= 128)                sum += array_aka[j];    elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;    std::cout << "unsorted(>= 128):     \t"<< elapsed_seconds << std::endl;    std::cout << "sum =                 \t"<< sum << std::endl;    sum = 0;    start = clock();    for (int i = 0; i < 100000; ++i)        for (size_t j = 0; j < array_length; ++j)            if (array_aka[j] >= 64)                sum += array_aka[j];    elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;    std::cout << "unsorted(>= 64):      \t"<< elapsed_seconds << std::endl;    std::cout << "sum =                 \t"<< sum << std::endl;    sum = 0;    start = clock();    for (int i = 0; i < 100000; ++i)        for (size_t j = 0; j < array_length; ++j)            if (__builtin_expect((array_aka[j] >= 64), 1))                sum += array_aka[j];    elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;    std::cout << "unsorted(>= 64)(hint):\t"<< elapsed_seconds << std::endl;    std::cout << "sum =                 \t"<< sum << std::endl;    std::sort(array_aka, array_aka + array_length);    sum = 0;    start = clock();    for (int i = 0; i < 100000; ++i)        for (size_t j = 0; j < array_length; ++j)            if (array_aka[j] >= 128)                sum += array_aka[j];    elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;    std::cout << "sorted(>= 128):       \t"<< elapsed_seconds << std::endl;    std::cout << "sum =                 \t"<< sum << std::endl;    sum = 0;    start = clock();    for (int i = 0; i < 100000; ++i)        for (size_t j = 0; j < array_length; ++j)            if (array_aka[j] >= 64)                sum += array_aka[j];    elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;    std::cout << "sorted(>= 64):        \t"<< elapsed_seconds << std::endl;    std::cout << "sum =                 \t"<< sum << std::endl;    sum = 0;    start = clock();    for (int i = 0; i < 100000; ++i)        for (size_t j = 0; j < array_length; ++j)            if (__builtin_expect((array_aka[j] >= 64), 1))                sum += array_aka[j];    elapsed_seconds = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;    std::cout << "sorted(>= 64)(hint):  \t"<< elapsed_seconds << std::endl;    std::cout << "sum =                 \t"<< sum << std::endl;}

➜ project g++ -o run branch_prediction_victim.cpp && ./run

unsorted(>= 128):       19.3221sum =                   312708500000unsorted(>= 64):        13.5077         <-- branch prediction friendly there is majoritysum =                   391463200000unsorted(>= 64)(hint):  13.6219         <-- __builtin_expect has no effectsum =                   391463200000sorted(>= 128):         6.38736         <-- branch prediction friendly with sorted arraysum =                   312708500000sorted(>= 64):          6.35014sum =                   391463200000sorted(>= 64)(hint):    6.18703sum =                   391463200000

➜ project g++ -O3 -o run branch_prediction_victim.cpp && ./run

unsorted(>= 128):       0.787082sum =                   314110000000unsorted(>= 64):        0.777937sum =                   392066300000unsorted(>= 64)(hint):  0.783885sum =                   392066300000sorted(>= 128):         0.782302sum =                   314110000000sorted(>= 64):          0.782116sum =                   392066300000sorted(>= 64)(hint):    0.785842sum =                   392066300000

As we can see, 1. with -O3 we need neither make majority nor sorted array, 2. __builtin_expect has no use here. It seems we need to check out assembly for the reasons

➜ project g++ -O3 -Wa,-a -g branch_prediction_victim.cpp > branch_prediction_victim.S

 834                    .L74: 835 0080 660FEFD2              pxor    %xmm2, %xmm2 836 0084 4889EB                movq    %rbp, %rbx 837                    .LVL108: 838 0087 660F1F84              .p2align 4,,10 838      00000000 838      00 839                            .p2align 3 840                    .L73: 841                    .LBB547: 842                    .LBB548:  22:branch_prediction_victim.cpp ****     for (int i = 0; i < 100000; ++i)  23:branch_prediction_victim.cpp ****         for (size_t j = 0; j < array_length; ++j)  24:branch_prediction_victim.cpp ****             if (array_aka[j] >= 128) 843                            .loc 5 24 0 844 0090 660F6F03              movdqa  (%rbx), %xmm0 845 0094 4883C310              addq    $16, %rbx 846 0098 4939DC                cmpq    %rbx, %r12 847 009b 660F6FC8              movdqa  %xmm0, %xmm1 848 009f 660F66CC              pcmpgtd %xmm4, %xmm1 849 00a3 660FDBC1              pand    %xmm1, %xmm0 850 00a7 660F6FCD              movdqa  %xmm5, %xmm1 851 00ab 660F6FD8              movdqa  %xmm0, %xmm3 852 00af 660F66C8              pcmpgtd %xmm0, %xmm1 853 00b3 660F62D9              punpckldq       %xmm1, %xmm3 854 00b7 660F6AC1              punpckhdq       %xmm1, %xmm0 855 00bb 660FD4D3              paddq   %xmm3, %xmm2 856 00bf 660FD4D0              paddq   %xmm0, %xmm2 857 00c3 75CB                  jne     .L73 858 00c5 660F6FC2              movdqa  %xmm2, %xmm0 859 00c9 660F73D8              psrldq  $8, %xmm0 859      08 860 00ce 660FD4C2              paddq   %xmm2, %xmm0 861 00d2 66480F7E              movq    %xmm0, %rax 861      C0 862 00d7 4901C6                addq    %rax, %r14 863                    .LVL109: 864                    .LBE548:  22:branch_prediction_victim.cpp ****     for (int i = 0; i < 100000; ++i) 865                            .loc 5 22 0 866 00da 83EA01                subl    $1, %edx 867 00dd 75A1                  jne     .L74 868 00df 0F292424              movaps  %xmm4, (%rsp) 869                    .LBE547:  25:branch_prediction_victim.cpp ****                 sum += array_aka[j];

I also checked the assembly of the without -O3

 174                .LBB3:  22:branch_prediction_victim.cpp ****     for (int i = 0; i < 100000; ++i) 175                    .loc 3 22 0 176 00a4 C78580FF      movl    $0, -131200(%rbp) 176      FDFF0000  176      0000 177                .L15: 178                    .loc 3 22 0 is_stmt 0 discriminator 1 179 00ae 81BD80FF      cmpl    $99999, -131200(%rbp) 179      FDFF9F86  179      0100 180 00b8 7F55          jg  .L11 181                .LBB4: 182                .LBB5:  23:branch_prediction_victim.cpp ****         for (size_t j = 0; j < array_length; ++j) 183                    .loc 3 23 0 is_stmt 1 184 00ba 48C785A8      movq    $0, -131160(%rbp) 184      FFFDFF00  184      000000 185                .L14: 186                    .loc 3 23 0 is_stmt 0 discriminator 1 187 00c5 4881BDA8      cmpq    $32767, -131160(%rbp) 187      FFFDFFFF  187      7F0000 188 00d0 7734          ja  .L12  24:branch_prediction_victim.cpp ****             if (array_aka[j] >= 128) 189                    .loc 3 24 0 is_stmt 1 190 00d2 488B85A8      movq    -131160(%rbp), %rax 190      FFFDFF 191 00d9 8B8485F0      movl    -131088(%rbp,%rax,4), %eax 191      FFFDFF 192 00e0 83F87F        cmpl    $127, %eax 193 00e3 7E17          jle .L13  25:branch_prediction_victim.cpp ****                 sum += array_aka[j]; 194                    .loc 3 25 0 195 00e5 488B85A8      movq    -131160(%rbp), %rax 195      FFFDFF 196 00ec 8B8485F0      movl    -131088(%rbp,%rax,4), %eax 196      FFFDFF 197 00f3 4898          cltq 198 00f5 480185A0      addq    %rax, -131168(%rbp) 198      FFFDFF 199                .L13:  23:branch_prediction_victim.cpp ****         for (size_t j = 0; j < array_length; ++j) 200                    .loc 3 23 0 discriminator 2 201 00fc 488385A8      addq    $1, -131160(%rbp) 201      FFFDFF01  202 0104 EBBF          jmp .L14 203                .L12: 204                .LBE5: 205                .LBE4:  22:branch_prediction_victim.cpp ****         for (size_t j = 0; j < array_length; ++j) 206                    .loc 3 22 0 discriminator 2 207 0106 838580FF      addl    $1, -131200(%rbp) 207      FDFF01 208 010d EB9F          jmp .L15 209                .L11: 210                .LBE3:

there is too much difference

  1. what is the logic here with so much xmm things?
  2. why such things can speed up performance?
  3. It seems __builtin_expect hardly has any effect, so why there is likely aka __builtin_expect everywhere in Linux kernel?
  4. without -O3, we can see for (0..100k) and for (0..arr_len) two loops they are separate, I'm sure there is 100k * arr_len loops then.
  5. For the -O3 version for (0..100k) and for (0..arr_len) are together, so there is only 100k loops?

Viewing all articles
Browse latest Browse all 22461

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>