I have created a simple, vectorized C function to square each element in an array. The code is as follows:
#include <immintrin.h>void square(const double* arr, uint len, double* outarr) { __m256d v; for (uint i = 0; i <= len - 4; i += 4) { v = _mm256_load_pd(&arr[i]); _mm256_stream_pd(&outarr[i], _mm256_mul_pd(v, v)); } for (uint i = len-(len&3u); i < len; i++) { outarr[i] = arr[i]*arr[i]; }}int main() { double* inp = aligned_alloc(32, 100* sizeof(double)); double* out = aligned_alloc(32, 100* sizeof(double)); square(inp, 100u, out); return 0;}
When I compile this code with:
gcc main.c -mavx -o main
I get the following disassembly for the square function:
0x0000000000400546 <+0>: lea 0x8(%rsp),%r10
0x000000000040054b <+5>: and $0xffffffffffffffe0,%rsp
0x000000000040054f <+9>: pushq -0x8(%r10)
0x0000000000400553 <+13>: push %rbp
0x0000000000400554 <+14>: mov %rsp,%rbp
0x0000000000400557 <+17>: push %r10
0x0000000000400559 <+19>: sub $0x50,%rsp
0x000000000040055d <+23>: mov %rdi,-0xb8(%rbp)
0x0000000000400564 <+30>: mov %esi,-0xbc(%rbp)
0x000000000040056a <+36>: mov %rdx,-0xc8(%rbp)
0x0000000000400571 <+43>: movl $0x0,-0xa8(%rbp)
0x000000000040057b <+53>: jmpq 0x400615 <square+207>
0x0000000000400580 <+58>: mov -0xa8(%rbp),%eax
0x0000000000400586 <+64>: cltq
0x0000000000400588 <+66>: lea 0x0(,%rax,8),%rdx
0x0000000000400590 <+74>: mov -0xb8(%rbp),%rax
0x0000000000400597 <+81>: add %rdx,%rax
0x000000000040059a <+84>: mov %rax,-0xa0(%rbp)
0x00000000004005a1 <+91>: mov -0xa0(%rbp),%rax
0x00000000004005a8 <+98>: vmovapd (%rax),%ymm0
0x00000000004005ac <+102>: vmovapd %ymm0,-0x90(%rbp)
0x00000000004005b4 <+110>: vmovapd -0x90(%rbp),%ymm0
0x00000000004005bc <+118>: vmovapd %ymm0,-0x70(%rbp)
0x00000000004005c1 <+123>: vmovapd -0x90(%rbp),%ymm0
0x00000000004005c9 <+131>: vmovapd %ymm0,-0x30(%rbp)
0x00000000004005ce <+136>: vmovapd -0x70(%rbp),%ymm0
0x00000000004005d3 <+141>: vmulpd -0x30(%rbp),%ymm0,%ymm0
0x00000000004005d8 <+146>: mov -0xa8(%rbp),%eax
0x00000000004005de <+152>: cltq
0x00000000004005e0 <+154>: lea 0x0(,%rax,8),%rdx
0x00000000004005e8 <+162>: mov -0xc8(%rbp),%rax
0x00000000004005ef <+169>: add %rdx,%rax
0x00000000004005f2 <+172>: mov %rax,-0x98(%rbp)
0x00000000004005f9 <+179>: vmovapd %ymm0,-0x50(%rbp)
0x00000000004005fe <+184>: mov -0x98(%rbp),%rax
0x0000000000400605 <+191>: vmovapd -0x50(%rbp),%ymm0
0x000000000040060a <+196>: vmovntpd %ymm0,(%rax)
0x000000000040060e <+200>: addl $0x4,-0xa8(%rbp)
0x0000000000400615 <+207>: mov -0xbc(%rbp),%eax
0x000000000040061b <+213>: lea -0x4(%rax),%edx
0x000000000040061e <+216>: mov -0xa8(%rbp),%eax
0x0000000000400624 <+222>: cmp %eax,%edx
0x0000000000400626 <+224>: jae 0x400580 <square+58>
0x000000000040062c <+230>: mov -0xbc(%rbp),%eax
0x0000000000400632 <+236>: and $0xfffffffc,%eax
0x0000000000400635 <+239>: mov %eax,-0xa4(%rbp)
0x000000000040063b <+245>: jmp 0x40069c <square+342>
0x000000000040063d <+247>: mov -0xa4(%rbp),%eax
0x0000000000400643 <+253>: lea 0x0(,%rax,8),%rdx
0x000000000040064b <+261>: mov -0xc8(%rbp),%rax
0x0000000000400652 <+268>: add %rdx,%rax
Then, when I compile the code with:
gcc main.c -mavx -O3 -o main
I get the following disassembly for the square function:
0x00000000004005c0 <+0>: lea -0x4(%rsi),%r9d
0x00000000004005c4 <+4>: mov %rdi,%r8
0x00000000004005c7 <+7>: mov %rdx,%rcx
0x00000000004005ca <+10>: xor %eax,%eax
0x00000000004005cc <+12>: nopl 0x0(%rax)
0x00000000004005d0 <+16>: vmovapd (%r8),%ymm0
0x00000000004005d5 <+21>: add $0x4,%eax
0x00000000004005d8 <+24>: add $0x20,%r8
0x00000000004005dc <+28>: add $0x20,%rcx
0x00000000004005e0 <+32>: vmulpd %ymm0,%ymm0,%ymm0
0x00000000004005e4 <+36>: vmovntpd %ymm0,-0x20(%rcx)
0x00000000004005e9 <+41>: cmp %eax,%r9d
0x00000000004005ec <+44>: jae 0x4005d0 <square+16>
0x00000000004005ee <+46>: mov %esi,%eax
0x00000000004005f0 <+48>: and $0xfffffffc,%eax
0x00000000004005f3 <+51>: cmp %eax,%esi
0x00000000004005f5 <+53>: jbe 0x400617 <square+87>
0x00000000004005f7 <+55>: nopw 0x0(%rax,%rax,1)
0x0000000000400600 <+64>: mov %eax,%ecx
0x0000000000400602 <+66>: add $0x1,%eax
0x0000000000400605 <+69>: vmovsd (%rdi,%rcx,8),%xmm0
0x000000000040060a <+74>: cmp %eax,%esi
0x000000000040060c <+76>: vmulsd %xmm0,%xmm0,%xmm0
0x0000000000400610 <+80>: vmovsd %xmm0,(%rdx,%rcx,8)
0x0000000000400615 <+85>: jne 0x400600 <square+64>
0x0000000000400617 <+87>: vzeroupper
0x000000000040061a <+90>: retq
My GCC version is:
gcc (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
I was just wondering if someone could explain what GCC is doing in the first disassembly (the one compiled without -O3). There seem to be a bunch of random `vmovapd` instructions with no particular purpose. For example, the instruction at <+110> seems to be useless, since the contents of `%ymm0` are already the same as `-0x90(%rbp)` after the store at <+102>. I can understand what the `-O3` code does, but I'm confused about the non-optimized code. Thank you.