C Loop Unrolling very inefficient (Clang, GCC,...)

Why do the compilers produce inefficient loop unrolling.

GCC Version 14.2Clang Version 19.1.0Compile args for both: -Ofast -lm

Example code:

#define MIN(a,b) ((a) < (b) ? (a) : (b))void to_test(size_t num_iters){    const size_t exec_iters = MIN(num_iters, 10);    #if __GNUC__    #pragma GCC unroll 10    #elif __clang__    #pragma unroll 10    #endif    for (size_t i = 0; i < exec_iters; ++i) {        printf("%d\n", i);    }}

In this case, what both compilers do, is duplicate the code (unrolling).and after every execution it does a conditional jump to see if its finishedgcc Godbolt for gcc variant 1clang Godbolt for clang variant 1

So both do basically this, which is quite suboptimal.

to_test:        test    rdi, rdi        je      .L33        push    rbx        xor     esi, esi        mov     rbx, rdi        xor     eax, eax        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 1        je      .L1        xor     eax, eax        mov     esi, 1        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 2        je      .L1        xor     eax, eax        mov     esi, 2        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 3        je      .L1        xor     eax, eax        mov     esi, 3        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 4        je      .L1        xor     eax, eax        mov     esi, 4        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 5        je      .L1        xor     eax, eax        mov     esi, 5        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 6        je      .L1        xor     eax, eax        mov     esi, 6        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 7        je      .L1        xor     eax, eax        mov     esi, 7        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 8        je      .L1        xor     eax, eax        mov     esi, 8        mov     edi, OFFSET FLAT:.LC0        call    printf        cmp     rbx, 9        je      .L1        mov     esi, 9        mov     edi, OFFSET FLAT:.LC0        xor     eax, eax        pop     rbx        jmp     printf

Now if we slighly modify the code to

void to_test(size_t num_iters){    const size_t exec_iters = fmin(num_iters, 10);    #if __GNUC__    #pragma GCC unroll 10    #elif __clang__    #pragma unroll 10    #endif    for (size_t i = 0; i < exec_iters; ++i) {        printf("%d\n", i);    }}

We see them producing different code.GCC Godbolt for gcc variant 2 performs a linear search thus lots of comparisons.

.L5:        test    r12, r12        je      .L1        mov     rax, r12        xor     ebx, ebx        and     eax, 7        je      .L7        cmp     rax, 1        je      .L32        cmp     rax, 2        je      .L33        cmp     rax, 3        je      .L34        cmp     rax, 4        je      .L35        cmp     rax, 5        je      .L36        cmp     rax, 6        jne     .L49

Clang makes it even weirder.It first checks if the value results in 10.if this is the case, it will use the fully unrolled loop (no if checks afterwards).But if its smaller, it doesn't do any loop unrolling and just performs the ordinary loop.Clang Godbolt for clang variant 2

        cmp     r12, 10        jb      .LBB0_4        sub     r12, r15        lea     r14, [rip + .L.str]        xor     ebx, ebx.LBB0_3:        mov     rdi, r14        mov     rsi, rbx        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 1]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 2]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 3]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 4]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 5]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 6]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 7]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 8]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        lea     rsi, [rbx + 9]        mov     rdi, r14        xor     eax, eax        call    printf@PLT        add     rbx, 10        cmp     rbx, r12        jne     .LBB0_3.LBB0_4:        cmp     r13, 10        je      .LBB0_7        lea     r14, [rip + .L.str].LBB0_6:        mov     rdi, r14        mov     rsi, rbx        xor     eax, eax        call    printf@PLT        inc     rbx        dec     r15        jne     .LBB0_6

Final question, these variants all seem quite ineffective.Especially the linear search for the correct value by GCC or clang not doing any unrolling at all in the second case.

Is it possible to let the compiler do something more clever.For example something like this (so it calcs an initial start_index)(it has the max number of possible iterations)and then jump directly to the final location.

// assumption (exec_iters = 4)// then start_idx = 4 - 10 = -6const start_idx = -6;jump label_fabs({start_idx}):label_0printf("%d\n", start_idx + 0);     // i = 0:label_1printf("%d\n", start_idx + 1);     // i = 1:label_2printf("%d\n", start_idx + 2);     // i = 2:label_3printf("%d\n", start_idx + 3);     // i = 3:label_4printf("%d\n", start_idx + 4);     // i = 4:label_5printf("%d\n", start_idx + 5);     // i = 5:label_6printf("%d\n", start_idx + 6);     // i = 6:label_7printf("%d\n", start_idx + 7);     // i = 7:label_8printf("%d\n", start_idx + 8);     // i = 8:label_9printf("%d\n", start_idx + 9);     // i = 9

Or even building 10 functions and jump directly to the one with the exact number of iterations.

Even fixing the end of the loop and having a variable start (which should be easier for the compiler to optimize), does not result in something more clever.

#define MIN(a,b) ((a) < (b) ? (a) : (b))void to_test(size_t num_iters){    const size_t exec_iters = MIN(num_iters, 10);    int loop_start = 10 - exec_iters;    int base_offset = exec_iters - 10;    int running_offset = loop_start;    #if __GNUC__    #pragma GCC unroll 10    #elif __clang__    #pragma unroll 10    #endif    for (size_t i = loop_start; i < 10; ++i) {        printf("%d\n", base_offset + running_offset);++running_offset;    }}

GCC Version 14.2 Clang Version 19.1.0 Compile args for both: -Ofast -lm

Latest Images

Trending Articles

Latest Images