Disclaimer: full code can be found here.
16-byte alignment
Given a fairly simple type with proper alignment for SSE:

#include <cstddef>
#include <cstdint>

struct alignas(16) simd_pack
{
    std::int32_t data[4];
};
and a function that adds two arrays together element-wise:

void add_packed(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
        for (std::size_t j = 0; j < 4; j++)
            lhs_and_result[i].data[j] += rhs[i].data[j];
}
We compile the code with clang and gcc using -O3.
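Something like this should reproduce the listings (main.cpp is just a placeholder name):

# Intel-syntax assembly output from both compilers
clang++ -O3 -S -masm=intel main.cpp -o clang.s
g++     -O3 -S -masm=intel main.cpp -o gcc.s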
Clang produces the following assembly:
add_packed(simd_pack*, simd_pack*, unsigned long): # @add_packed(simd_pack*, simd_pack*, unsigned long)
test rdx, rdx
je .LBB0_3
mov eax, 12
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 16
add rdx, -1
jne .LBB0_2
.LBB0_3:
ret
I'm not very literate in assembly, but to me it looks like clang is simply unrolling the inner for loop.
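If my reading is right, the unrolled body corresponds to something like this scalar C++ (my interpretation written out by hand, not anything the compiler actually emits):

void add_packed_unrolled(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
    {
        // the inner j-loop flattened into four independent scalar adds
        lhs_and_result[i].data[0] += rhs[i].data[0];
        lhs_and_result[i].data[1] += rhs[i].data[1];
        lhs_and_result[i].data[2] += rhs[i].data[2];
        lhs_and_result[i].data[3] += rhs[i].data[3];
    }
}

If we take a look at gcc we get: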
add_packed(simd_pack*, simd_pack*, unsigned long):
test rdx, rdx
je .L1
sal rdx, 4
xor eax, eax
.L3:
movdqa xmm0, XMMWORD PTR [rdi+rax]
paddd xmm0, XMMWORD PTR [rsi+rax]
movaps XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rdx
jne .L3
.L1:
ret
which is what I expect: one aligned 16-byte load, one paddd straight from memory, and one aligned store per simd_pack.
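For reference, here is my approximation of what gcc's loop does, written with SSE2 intrinsics (a sketch of my reading, not gcc's actual source):

#include <emmintrin.h>

void add_packed_sse(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
    {
        // movdqa: aligned loads are legal here because of alignas(16)
        __m128i l = _mm_load_si128(reinterpret_cast<const __m128i*>(lhs_and_result[i].data));
        __m128i r = _mm_load_si128(reinterpret_cast<const __m128i*>(rhs[i].data));
        // paddd: four 32-bit additions in one instruction, then an aligned store
        _mm_store_si128(reinterpret_cast<__m128i*>(lhs_and_result[i].data),
                        _mm_add_epi32(l, r));
    }
}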
64-byte alignment
The difference gets even bigger (obviously) if we go to 64-byte alignment, which, if I'm not mistaken, usually matches a cache line:
struct alignas(64) cache_line
{
    std::int32_t data[16];
};

void add_cache_line(cache_line* lhs_and_result, cache_line* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
        for (std::size_t j = 0; j < 16; j++)
            lhs_and_result[i].data[j] += rhs[i].data[j];
}
Clang simply keeps unrolling:
add_cache_line(cache_line*, cache_line*, unsigned long): # @add_cache_line(cache_line*, cache_line*, unsigned long)
test rdx, rdx
je .LBB1_3
mov eax, 60
.LBB1_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 60]
add dword ptr [rdi + rax - 60], ecx
mov ecx, dword ptr [rsi + rax - 56]
add dword ptr [rdi + rax - 56], ecx
mov ecx, dword ptr [rsi + rax - 52]
add dword ptr [rdi + rax - 52], ecx
mov ecx, dword ptr [rsi + rax - 48]
add dword ptr [rdi + rax - 48], ecx
mov ecx, dword ptr [rsi + rax - 44]
add dword ptr [rdi + rax - 44], ecx
mov ecx, dword ptr [rsi + rax - 40]
add dword ptr [rdi + rax - 40], ecx
mov ecx, dword ptr [rsi + rax - 36]
add dword ptr [rdi + rax - 36], ecx
mov ecx, dword ptr [rsi + rax - 32]
add dword ptr [rdi + rax - 32], ecx
mov ecx, dword ptr [rsi + rax - 28]
add dword ptr [rdi + rax - 28], ecx
mov ecx, dword ptr [rsi + rax - 24]
add dword ptr [rdi + rax - 24], ecx
mov ecx, dword ptr [rsi + rax - 20]
add dword ptr [rdi + rax - 20], ecx
mov ecx, dword ptr [rsi + rax - 16]
add dword ptr [rdi + rax - 16], ecx
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 64
add rdx, -1
jne .LBB1_2
.LBB1_3:
ret
while gcc uses SSE and unrolls the SSE loop on top of that:
add_cache_line(cache_line*, cache_line*, unsigned long):
mov rcx, rdx
test rdx, rdx
je .L9
sal rcx, 6
mov rax, rdi
mov rdx, rsi
add rcx, rdi
.L11:
movdqa xmm2, XMMWORD PTR [rdx+16]
movdqa xmm3, XMMWORD PTR [rax]
add rax, 64
add rdx, 64
movdqa xmm1, XMMWORD PTR [rdx-32]
movdqa xmm0, XMMWORD PTR [rdx-16]
paddd xmm3, XMMWORD PTR [rdx-64]
paddd xmm2, XMMWORD PTR [rax-48]
paddd xmm1, XMMWORD PTR [rax-32]
paddd xmm0, XMMWORD PTR [rax-16]
movaps XMMWORD PTR [rax-64], xmm3
movaps XMMWORD PTR [rax-48], xmm2
movaps XMMWORD PTR [rax-32], xmm1
movaps XMMWORD PTR [rax-16], xmm0
cmp rax, rcx
jne .L11
.L9:
ret
No alignment
It gets interesting if we use plain 32-bit integer arrays with no alignment guarantees at all, with the exact same compiler flags:
void add_unaligned(std::int32_t* lhs_and_result, std::int32_t* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
        lhs_and_result[i] += rhs[i];
}
Clang
Clang's assembly exploded a fair bit, adding a number of branches:
add_unaligned(int*, int*, unsigned long): # @add_unaligned(int*, int*, unsigned long)
test rdx, rdx
je .LBB2_16
cmp rdx, 7
jbe .LBB2_2
lea rax, [rsi + 4*rdx]
cmp rax, rdi
jbe .LBB2_9
lea rax, [rdi + 4*rdx]
cmp rax, rsi
jbe .LBB2_9
.LBB2_2:
xor r10d, r10d
.LBB2_3:
mov r8, r10
not r8
add r8, rdx
mov rcx, rdx
and rcx, 3
je .LBB2_5
.LBB2_4: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
add r10, 1
add rcx, -1
jne .LBB2_4
.LBB2_5:
cmp r8, 3
jb .LBB2_16
.LBB2_6: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
mov eax, dword ptr [rsi + 4*r10 + 4]
add dword ptr [rdi + 4*r10 + 4], eax
mov eax, dword ptr [rsi + 4*r10 + 8]
add dword ptr [rdi + 4*r10 + 8], eax
mov eax, dword ptr [rsi + 4*r10 + 12]
add dword ptr [rdi + 4*r10 + 12], eax
add r10, 4
cmp rdx, r10
jne .LBB2_6
jmp .LBB2_16
.LBB2_9:
mov r10, rdx
and r10, -8
lea rax, [r10 - 8]
mov r9, rax
shr r9, 3
add r9, 1
mov r8d, r9d
and r8d, 1
test rax, rax
je .LBB2_10
sub r9, r8
xor ecx, ecx
.LBB2_12: # =>This Inner Loop Header: Depth=1
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rdi + 4*rcx + 32]
movdqu xmm3, xmmword ptr [rdi + 4*rcx + 48]
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
movdqu xmm0, xmmword ptr [rsi + 4*rcx + 32]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 48]
paddd xmm1, xmm3
movdqu xmmword ptr [rdi + 4*rcx + 32], xmm0
movdqu xmmword ptr [rdi + 4*rcx + 48], xmm1
add rcx, 16
add r9, -2
jne .LBB2_12
test r8, r8
je .LBB2_15
.LBB2_14:
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
.LBB2_15:
cmp r10, rdx
jne .LBB2_3
.LBB2_16:
ret
.LBB2_10:
xor ecx, ecx
test r8, r8
jne .LBB2_14
jmp .LBB2_15
What is happening at .LBB2_4 and .LBB2_6? It looks like it's unrolling a loop again, but I'm not sure what happens there (mainly because of the registers used). In .LBB2_12 it even unrolls the SSE part. I think it's only unrolled two-fold, though, because it needs two SIMD registers to load the operands now that they are unaligned. .LBB2_14 contains the SSE part without the unrolling.
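If I squint, the two labels look like a scalar fallback path along these lines (a hand-written reconstruction of my reading, with the hypothetical name scalar_tail and the parameter start standing in for r10 — not actual compiler output):

void scalar_tail(std::int32_t* lhs_and_result, std::int32_t* rhs,
                 std::size_t size, std::size_t start)
{
    std::size_t i = start;
    // .LBB2_4: peel off (size - i) % 4 elements one at a time; the assembly
    // computes size & 3, which is the same thing since start is a multiple of 4
    for (std::size_t k = (size - i) % 4; k != 0; --k, ++i)
        lhs_and_result[i] += rhs[i];
    // .LBB2_6: what remains is a multiple of 4, handled by a 4-fold
    // unrolled scalar loop
    for (; i != size; i += 4)
    {
        lhs_and_result[i]     += rhs[i];
        lhs_and_result[i + 1] += rhs[i + 1];
        lhs_and_result[i + 2] += rhs[i + 2];
        lhs_and_result[i + 3] += rhs[i + 3];
    }
}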
How is the control flow here? I'm assuming it should be:
- keep using the unrolled SSE part until the remaining data is too small to fill all the registers (xmm0..3)
- switch to the single-stage SSE part and do it once if we have enough data remaining to fill xmm0 (4 integers in our case)
- process the remaining data (3 operations at most, otherwise it would be SSE-suitable again)
The order of the labels and the jump instructions is confusing; is that (approximately) what happens here?
GCC
GCC's assembly is a bit easier to read:
add_unaligned(int*, int*, unsigned long):
test rdx, rdx
je .L16
lea rcx, [rsi+4]
mov rax, rdi
sub rax, rcx
cmp rax, 8
jbe .L22
lea rax, [rdx-1]
cmp rax, 2
jbe .L22
mov rcx, rdx
xor eax, eax
shr rcx, 2
sal rcx, 4
.L19:
movdqu xmm0, XMMWORD PTR [rdi+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddd xmm0, xmm1
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L19
mov rax, rdx
and rax, -4
test dl, 3
je .L16
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
lea rcx, [rax+1]
cmp rdx, rcx
jbe .L16
add rax, 2
mov r8d, DWORD PTR [rsi+rcx*4]
add DWORD PTR [rdi+rcx*4], r8d
cmp rdx, rax
jbe .L16
mov edx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], edx
ret
.L22:
xor eax, eax
.L18:
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
add rax, 1
cmp rdx, rax
jne .L18
.L16:
ret
I assume the control flow is similar to clang's:
- keep using the single-stage SSE part until the remaining data is too small to fill xmm0 and xmm1
- process the remaining data (3 operations at most, otherwise it would be SSE-suitable again)
It looks like exactly this is happening in .L19, but what is .L18 doing then?
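My current guess (and it is only a guess) is that .L18, entered via .L22, is the completely unvectorized fallback for when the arrays are very short or the two pointers are too close together, i.e. just the original loop:

// my guess at .L18: plain element-by-element addition, no SSE at all
void add_unaligned_fallback(std::int32_t* lhs_and_result, std::int32_t* rhs, std::size_t size)
{
    for (std::size_t i = 0; i != size; i++)
        lhs_and_result[i] += rhs[i];
}

Is that what the pointer comparison at the top of the function is checking for?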
Summary
Here is the full code, including assembly. My questions are:
- Why does clang unroll the functions that use aligned data instead of using SSE, or a combination of both (like gcc)?
- What are .LBB2_4 and .LBB2_6 in clang's assembly doing?
- Are my assumptions about the control flow of the function with the unaligned data correct?
- What is .L18 in gcc's assembly doing?